In [None]:
# STEP 1: Install required libraries
!pip install transformers pandas matplotlib wordcloud -q


# STEP 2: Import Libraries
import pandas as pd
import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration, pipeline
import matplotlib.pyplot as plt
from wordcloud import WordCloud
from collections import Counter
from IPython.display import display


# STEP 3: Load Models
# Use the GPU when torch can see one, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load FLAN-T5 for feature extraction. The model weights are moved to `device`
# here; tokenizer outputs are moved to the same device at call time.
t5_model = T5ForConditionalGeneration.from_pretrained("google/flan-t5-base").to(device)
t5_tokenizer = T5Tokenizer.from_pretrained("google/flan-t5-base")

# Load DistilBERT sentiment analysis.
# The checkpoint is named explicitly instead of relying on the task's implicit
# default (which makes the library warn and may change between releases), and
# `device` is passed so the classifier is placed on the same device as FLAN-T5
# regardless of the pipeline's version-dependent default placement.
sentiment_analyzer = pipeline(
    "sentiment-analysis",
    model="distilbert/distilbert-base-uncased-finetuned-sst-2-english",
    device=device,
)


# STEP 4: Upload CSV File
from google.colab import files

# `files.upload()` returns a mapping keyed by uploaded file name; it is empty
# when the upload dialog is cancelled, so fail with a clear message instead of
# letting `next(iter(...))` raise a bare StopIteration.
uploaded = files.upload()
if not uploaded:
    raise ValueError("No file was uploaded; please upload a CSV with a 'review' column.")

# Only the first uploaded file is used.
filename = next(iter(uploaded))
df = pd.read_csv(filename)

# Every later step reads df["review"], so check for it up front and report
# which columns were actually found.
if "review" not in df.columns:
    raise KeyError(
        f"Column 'review' not found in {filename!r}; available columns: {list(df.columns)}"
    )

# Drop rows without review text and renumber the remaining rows from 0.
df = df.dropna(subset=["review"])
df = df.reset_index(drop=True)
display(df.head())


# STEP 5: Helper Functions
def clean_and_split(text):
    """Split a comma-separated feature string into normalized feature names.

    Each comma-delimited piece is stripped of surrounding whitespace and
    lower-cased; pieces that are empty after stripping are dropped. Order and
    duplicates are preserved.

    Args:
        text: Comma-separated feature string. Any non-string value (for
            example ``None`` or the NaN float pandas uses for empty cells)
            is treated as "no features".

    Returns:
        A list of cleaned feature names, possibly empty.
    """
    # Guard against missing values so callers iterating a DataFrame column
    # do not crash on an empty cell.
    if not isinstance(text, str):
        return []
    stripped_pieces = (piece.strip() for piece in text.split(","))
    return [piece.lower() for piece in stripped_pieces if piece]

def extract_features_with_sentiment(review):
    """Extract product features from a review and classify each one's sentiment.

    The review is embedded in an instruction prompt and passed to the
    module-level FLAN-T5 model; the decoded output is treated as a
    comma-separated list of features. Each cleaned feature is then combined
    with the review text and scored by the module-level sentiment pipeline.

    Args:
        review: Review text to analyze (interpolated into the prompts via
            f-strings).

    Returns:
        A ``(features, sentiments)`` tuple where ``features`` is the raw,
        stripped string generated by the model and ``sentiments`` is a list of
        ``(feature, label, score)`` tuples, one per cleaned feature.
    """
    prompt = f"Extract product features from the following review:\n\n{review}\n\nFeatures:"
    inputs = t5_tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512).to(device)
    outputs = t5_model.generate(**inputs, max_new_tokens=50)
    features = t5_tokenizer.decode(outputs[0], skip_special_tokens=True).strip()

    sentiments = []
    for feature in clean_and_split(features):
        # The generation prompt above is truncated, but the classifier input
        # was not: a long review could exceed the sentiment model's maximum
        # input length and make the pipeline fail. With truncation enabled the
        # tail of the review is clipped instead; the feature name sits at the
        # start of the text, so it is always kept.
        result = sentiment_analyzer(
            f"{feature} in context of: {review}",
            truncation=True,
        )[0]
        sentiments.append((feature, result["label"], result["score"]))
    return features, sentiments


# STEP 6: Extract Features and Sentiment
# Run the extractor once per review and keep the (features, sentiments) pairs.
# Building the two columns from a plain list avoids constructing a pd.Series
# per row and also works when `df` has no rows, where the previous
# apply-based two-column assignment could not be unpacked.
extraction_results = [extract_features_with_sentiment(review) for review in df["review"]]
df["features"] = pd.Series(
    [features for features, _ in extraction_results],
    index=df.index,
    dtype=object,
)
df["feature_sentiments"] = pd.Series(
    [sentiments for _, sentiments in extraction_results],
    index=df.index,
    dtype=object,
)

# A bare expression is only rendered when it is the last statement of a cell,
# so show the preview explicitly.
display(df.head(10))


# STEP 7: Analyze Most/Least Mentioned Features
# Flatten every row's feature string into one list of cleaned feature names.
all_features = [
    feature
    for feature_string in df["features"]
    for feature in clean_and_split(feature_string)
]

# Tally mentions per feature and report both ends of the ranking.
feature_counts = Counter(all_features)
most_mentioned = feature_counts.most_common(10)
least_mentioned = feature_counts.most_common()[-10:]
print("Top 10 Features:", most_mentioned)
print("Least 10 Features:", least_mentioned)


# STEP 8: Word Cloud
# Size each feature by how often it was mentioned.
wordcloud = WordCloud(width=800, height=400, background_color="white")
wordcloud.generate_from_frequencies(feature_counts)

# Render the cloud as an image on a single axes with the frame hidden.
fig, ax = plt.subplots(figsize=(15, 7))
ax.imshow(wordcloud, interpolation="bilinear")
ax.axis("off")
ax.set_title("Product Feature Mentions Word Cloud", fontsize=20)
plt.show()


# STEP 9: Sentiment Bar Chart
# Flatten the per-review lists of (feature, label, score) tuples.
sentiment_data = [
    record
    for row in df["feature_sentiments"]
    for record in row
]

sentiment_df = pd.DataFrame(sentiment_data, columns=["feature", "sentiment", "confidence"])

# Count mentions per (feature, sentiment) pair, then pivot the sentiment
# labels into columns, filling absent combinations with 0.
pair_sizes = sentiment_df.groupby(["feature", "sentiment"]).size()
sentiment_counts = pair_sizes.unstack(fill_value=0)

# Plot
ax = sentiment_counts.plot(kind="bar", stacked=True, figsize=(14, 6), colormap="Set3")
ax.set_title("Sentiment Distribution by Feature")
ax.set_ylabel("Mention Count")
ax.set_xlabel("Feature")
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()


# STEP 10: Export Results
# Write the annotated reviews to disk, then hand the file to the Colab
# download helper.
output_csv = "extracted_feature_sentiment_reviews.csv"
df.to_csv(output_csv, index=False)
files.download(output_csv)

