# Flipkart Reviews Sentiment Analysis
This notebook performs sentiment analysis on Flipkart reviews using TF-IDF and a Decision Tree classifier.

In [1]:
# Install required libraries (run if needed)
# %pip (rather than !pip) installs into the environment of the running kernel,
# not whichever pip executable happens to be first on the shell PATH.
%pip install pandas nltk scikit-learn matplotlib seaborn wordcloud

Collecting nltk
  Using cached nltk-3.9.2-py3-none-any.whl.metadata (3.2 kB)
Collecting matplotlib
  Downloading matplotlib-3.10.8-cp310-cp310-win_amd64.whl.metadata (52 kB)
Collecting seaborn
  Downloading seaborn-0.13.2-py3-none-any.whl.metadata (5.4 kB)
Collecting wordcloud
  Downloading wordcloud-1.9.6-cp310-cp310-win_amd64.whl.metadata (3.5 kB)
Collecting contourpy>=1.0.1 (from matplotlib)
  Downloading contourpy-1.3.2-cp310-cp310-win_amd64.whl.metadata (5.5 kB)
Collecting cycler>=0.10 (from matplotlib)
  Downloading cycler-0.12.1-py3-none-any.whl.metadata (3.8 kB)
Collecting fonttools>=4.22.0 (from matplotlib)
  Downloading fonttools-4.61.1-cp310-cp310-win_amd64.whl.metadata (116 kB)
Collecting kiwisolver>=1.3.1 (from matplotlib)
  Downloading kiwisolver-1.4.9-cp310-cp310-win_amd64.whl.metadata (6.4 kB)
Using cached nltk-3.9.2-py3-none-any.whl (1.5 MB)
Downloading matplotlib-3.10.8-cp310-cp310-win_amd64.whl (8.1 MB)
   ---------------------------------------- 0.0/8.1 MB ? eta -:-

In [None]:
# Import Libraries
import pandas as pd
import nltk
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
import matplotlib.pyplot as plt
from wordcloud import WordCloud
import seaborn as sns

# Fetch the NLTK stop-word corpus that stopwords.words() reads later on.
nltk.download("stopwords")

In [None]:
# Load Dataset
# The path is relative: flipkart_data.csv must sit next to this notebook.
file_path = "flipkart_data.csv"
df = pd.read_csv(filepath_or_buffer=file_path)

# Preview the first five rows.
df.head(5)

In [None]:
# Preprocessing
# English stop words from NLTK, stored as a set for fast membership tests.
stop_words = {*stopwords.words("english")}

def preprocess_reviews(df, stop_word_set=None):
    """Return a cleaned copy of ``df`` with a binary ``sentiment`` column.

    The ``review`` text is lower-cased and stripped of stop words, and
    ``sentiment`` is set to 1 when ``rating`` is at least 4, otherwise 0.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``review`` column (text) and a ``rating`` column
        (numeric).
    stop_word_set : collection of str, optional
        Lower-case words to drop from each review. Defaults to the
        module-level ``stop_words`` set.

    Returns
    -------
    pandas.DataFrame
        A new frame; the input ``df`` is left untouched, so re-running the
        cell or inspecting the raw data afterwards gives consistent results.
    """
    if stop_word_set is None:
        stop_word_set = stop_words

    cleaned = df.copy()

    # Missing reviews would otherwise reach the lambda as float NaN and crash
    # on .split(); treat them as empty text instead.
    lowered = cleaned["review"].fillna("").astype(str).str.lower()
    cleaned["review"] = lowered.apply(
        lambda text: " ".join(w for w in text.split() if w not in stop_word_set)
    )

    # Ratings of 4 and above are labelled positive (1); everything else,
    # including missing ratings, is labelled negative (0).
    cleaned["sentiment"] = (cleaned["rating"] >= 4).astype("int64")
    return cleaned

# Run the cleaning step on the loaded reviews and preview the result.
df_cleaned = df.pipe(preprocess_reviews)
df_cleaned.head(5)

In [None]:
# Sentiment Distribution
# value_counts() orders its result by frequency, not by label. Re-index by
# label so the first bar is always 0 (negative, red) and the second is always
# 1 (positive, green), whichever class happens to be the majority.
sentiment_counts = (
    df_cleaned["sentiment"].value_counts().reindex([0, 1], fill_value=0)
)
plt.figure(figsize=(6,4))
sentiment_counts.plot(kind="bar", color=["red", "green"])
plt.title("Sentiment Distribution (0 = Negative, 1 = Positive)")
plt.xlabel("Sentiment")
plt.ylabel("Count")
plt.show()

In [None]:
# Word Cloud for Positive Reviews
# Concatenate every positive review (sentiment == 1) into one text blob.
positive_reviews = df_cleaned.loc[df_cleaned["sentiment"] == 1, "review"]
positive_text = " ".join(positive_reviews)
wordcloud = WordCloud(width=800, height=400).generate(positive_text)

# Render the cloud as an image with the axes hidden.
fig, ax = plt.subplots(figsize=(8, 6))
ax.imshow(wordcloud, interpolation="bilinear")
ax.axis("off")
ax.set_title("Word Cloud for Positive Reviews")
plt.show()

In [None]:
# TF-IDF Vectorization
vectorizer = TfidfVectorizer(max_features=5000)
X = vectorizer.fit_transform(df_cleaned["review"])
y = df_cleaned["sentiment"]

In [None]:
# Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [None]:
# Train Decision Tree Model
# fit() returns the estimator itself, so construction and training can be chained.
model = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)
model

In [None]:
# Evaluate Model
# Score the held-out split: overall accuracy plus the confusion matrix.
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)

print("Accuracy:", accuracy)

# Heatmap of the confusion matrix: rows are actual labels, columns are predictions.
fig, ax = plt.subplots(figsize=(6, 5))
sns.heatmap(conf_matrix, annot=True, fmt="d", cmap="Blues", ax=ax)
ax.set(title="Confusion Matrix", xlabel="Predicted", ylabel="Actual")
plt.show()

: 