<a href="https://colab.research.google.com/github/Neetu24/Flipkart-Reviews-Sentiment-Analysis/blob/main/Flipkart_Reviews_Sentiment_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [7]:
# üì¶ Import Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
import re, string, warnings
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

warnings.filterwarnings('ignore')
sns.set(style='whitegrid')

# üì• Load Dataset
df = pd.read_csv('/content/flipkart_data.csv')
print("üßæ Dataset columns:", df.columns)

# üîÑ Rename likely columns
for col in df.columns:
    if 'review' in col.lower():
        df.rename(columns={col: 'review'}, inplace=True)
    if 'sentiment' in col.lower() or 'label' in col.lower() or 'rating' in col.lower():
        df.rename(columns={col: 'sentiment'}, inplace=True)

# üßπ Clean and Filter Data
df.dropna(subset=['review', 'sentiment'], inplace=True)
df.drop_duplicates(inplace=True)
df['review'] = df['review'].astype(str)

# üßº Text Cleaning Function
def clean_text(text):
    text = text.lower()
    text = re.sub(r"http\S+|www\S+|https\S+", '', text)
    text = re.sub(r'\@\w+|\#','', text)
    text = re.sub(f"[{re.escape(string.punctuation)}]", "", text)
    text = re.sub(r'\d+', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

df['clean_review'] = df['review'].apply(clean_text)

# üß† Sentiment Encoding
df['sentiment'] = df['sentiment'].astype(str).str.strip().str.lower()
df['sentiment'] = df['sentiment'].map({
    'positive': 1, 'pos': 1, '1': 1, 'yes': 1,
    'negative': 0, 'neg': 0, '0': 0, 'no': 0
})
df.dropna(subset=['sentiment'], inplace=True)
df['sentiment'] = df['sentiment'].astype(int)

# ‚úÖ Show class distribution
print("\nüìä Sentiment Value Counts:")
print(df['sentiment'].value_counts())

# Check for at least two classes
if df['sentiment'].nunique() < 2:
    print("‚ùå ERROR: Dataset contains only one class. Please provide both positive and negative reviews.")
else:
    # üß† TF-IDF Vectorization
    tfidf = TfidfVectorizer(max_features=5000)
    X = tfidf.fit_transform(df['clean_review']).toarray()
    y = df['sentiment'].values

    # üîÄ Train-Test Split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # üìä Sentiment Distribution Plot
    plt.figure(figsize=(6,4))
    sns.countplot(x=df['sentiment'])
    plt.title('Sentiment Distribution')
    plt.xticks([0,1], ['Negative', 'Positive'])
    plt.show()

    # ‚òÅÔ∏è Word Cloud Generator
    def show_wordcloud(data, title=None):
        text = ' '.join(data)
        if len(text.strip()) == 0:
            print(f"‚ö†Ô∏è No text available to generate word cloud for: {title}")
            return
        wordcloud = WordCloud(width=800, height=400, background_color='white').generate(text)
        plt.figure(figsize=(10, 5))
        plt.imshow(wordcloud, interpolation='bilinear')
        plt.axis('off')
        plt.title(title)
        plt.show()

    # ‚òÅÔ∏è Generate Word Clouds
    if df[df['sentiment']==1].shape[0] > 0:
        show_wordcloud(df[df['sentiment']==1]['clean_review'], "Positive Reviews")
    if df[df['sentiment']==0].shape[0] > 0:
        show_wordcloud(df[df['sentiment']==0]['clean_review'], "Negative Reviews")

    # üìè Review Length Analysis
    df['review_len'] = df['clean_review'].apply(lambda x: len(x.split()))
    plt.figure(figsize=(6,4))
    sns.boxplot(x='sentiment', y='review_len', data=df)
    plt.title('Review Length vs Sentiment')
    plt.xticks([0,1], ['Negative', 'Positive'])
    plt.show()

    # ü§ñ Train Models
    models = {
        "Logistic Regression": LogisticRegression(),
        "Naive Bayes": MultinomialNB(),
        "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
        "SVM": SVC()
    }

    for name, model in models.items():
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        print(f"\nüîç {name} Results:")
        print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
        print(f"Precision: {precision_score(y_test, y_pred):.4f}")
        print(f"Recall: {recall_score(y_test, y_pred):.4f}")
        print(f"F1 Score: {f1_score(y_test, y_pred):.4f}")
        print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

    # üß™ Predict Sentiment for New Reviews
    final_model = LogisticRegression()
    final_model.fit(X_train, y_train)

    test_reviews = [
        "Worst product ever! Waste of money!",
        "Absolutely loved it! Great quality and delivery.",
        "Not worth the price. Disappointed.",
        "Amazing features and value for money!"
    ]

    cleaned = [clean_text(review) for review in test_reviews]
    vec = tfidf.transform(cleaned).toarray()
    preds = final_model.predict(vec)

    for review, pred in zip(test_reviews, preds):
        sentiment = 'Positive ‚úÖ' if pred == 1 else 'Negative ‚ùå'
        print(f"\nReview: {review}\nPredicted Sentiment: {sentiment}")


üßæ Dataset columns: Index(['review', 'rating'], dtype='object')

üìä Sentiment Value Counts:
sentiment
1    631
Name: count, dtype: int64
‚ùå ERROR: Dataset contains only one class. Please provide both positive and negative reviews.
