In [1]:
import pandas as pd

# Load all files one by one
df1 = pd.read_csv("C:\\Users\\LENOVO\\Downloads\\reviews_data_dump\\reviews_badminton\\data.csv")
df2 = pd.read_csv("C:\\Users\\LENOVO\\Downloads\\reviews_data_dump\\reviews_tawa\data.csv")
df3 = pd.read_csv("C:\\Users\\LENOVO\\Downloads\\reviews_data_dump\\reviews_tea\data.csv")

for i, df in enumerate([df1, df2, df3], start=1):
    print(f"\nFile {i}")
    print(df.shape)
    print(df.columns)



File 1
(8518, 8)
Index(['Reviewer Name', 'Review Title', 'Place of Review', 'Up Votes',
       'Down Votes', 'Month', 'Review text', 'Ratings'],
      dtype='object')

File 2
(2531, 8)
Index(['Reviewer_Name', 'Reviewer_Rating', 'Review_Title', 'Review_Text',
       'Place_of_Review', 'Date_of_Review', 'Up_Votes', 'Down_Votes'],
      dtype='object')

File 3
(9170, 8)
Index(['reviewer_name', 'reviewer_rating', 'review_title', 'review_text',
       'place_of_review', 'Date_of_review', 'up_votes', 'Down_votes'],
      dtype='object')


In [2]:
# Doing analysis on df1
df = df1.copy()

In [3]:
def rating_to_sentiment(ratings):
    if ratings >= 4:
        return 1   # Positive
    elif ratings <= 2:
        return 0   # Negative
    else:
        return None  # Neutral

df["sentiment"] = df["Ratings"].apply(rating_to_sentiment)

# Drop neutral reviews
df = df.dropna(subset=["sentiment"])
df["sentiment"] = df["sentiment"].astype(int)

In [4]:
df["text"] = df["Review Title"].fillna("") + " " + df["Review text"].fillna("")
df = df[["text", "sentiment"]]

In [5]:
import contractions
import re
import nltk
import string
from nltk import word_tokenize
from nltk.corpus import stopwords, wordnet
from nltk.stem import SnowballStemmer, WordNetLemmatizer

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

stop_words = set(stopwords.words('english'))
stemmer = SnowballStemmer('english')
lemmatizer = WordNetLemmatizer()

def get_wordnet_pos(word):
    """Map POS tag to WordNet POS tag"""
    return wordnet.NOUN

def clean(doc, stem=True):
    # 1. Expand contractions
    doc = contractions.fix(doc)

    # 2. Remove non-alphabetic characters
    doc = re.sub(r'[^a-zA-Z]', ' ', doc)

    # 3. Convert to lowercase
    doc = doc.lower()

    # 4. Tokenization
    tokens = word_tokenize(doc)

    # 5. Remove punctuation & stopwords
    tokens = [t for t in tokens if t not in string.punctuation]
    tokens = [t for t in tokens if t not in stop_words]

    # 6. Stemming or Lemmatization
    if stem:
        tokens = [stemmer.stem(t) for t in tokens]
    else:
        tokens = [lemmatizer.lemmatize(t, get_wordnet_pos(t)) for t in tokens]

    return ' '.join(tokens)
df["clean_text"] = df["text"].apply(clean)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\LENOVO\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\LENOVO\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\LENOVO\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [6]:
from sklearn.model_selection import train_test_split

X = df["clean_text"]
y = df["sentiment"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(
    max_features=5000,
    ngram_range=(1, 2),
    min_df=5
)

X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)


In [8]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression(max_iter=1000)
model.fit(X_train_tfidf, y_train)


In [9]:
from sklearn.metrics import classification_report, f1_score, confusion_matrix

y_pred = model.predict(X_test_tfidf)

print("F1 Score:", f1_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))


F1 Score: 0.9542715349166963

Classification Report:
               precision    recall  f1-score   support

           0       0.84      0.49      0.62       215
           1       0.93      0.99      0.95      1366

    accuracy                           0.92      1581
   macro avg       0.88      0.74      0.79      1581
weighted avg       0.91      0.92      0.91      1581


Confusion Matrix:
 [[ 106  109]
 [  20 1346]]


In [10]:
negative_reviews = df[df["sentiment"] == 0]

from collections import Counter

words = " ".join(negative_reviews["clean_text"]).split()
common_words = Counter(words).most_common(20)

common_words


[('product', 424),
 ('read', 328),
 ('shuttl', 321),
 ('qualiti', 317),
 ('good', 231),
 ('bad', 204),
 ('poor', 129),
 ('worst', 124),
 ('money', 102),
 ('buy', 94),
 ('qualityread', 94),
 ('wast', 89),
 ('purchas', 86),
 ('one', 81),
 ('goodread', 77),
 ('day', 73),
 ('expect', 67),
 ('productread', 63),
 ('disappoint', 62),
 ('last', 60)]

In [11]:
import pickle

pickle.dump(model, open("sentiment_model.pkl", "wb"))
pickle.dump(tfidf, open("tfidf_vectorizer.pkl", "wb"))

In [12]:
def predict_sentiment(review):
    review_clean = clean(review)
    review_vec = tfidf.transform([review_clean])
    return "Positive" if model.predict(review_vec)[0] == 1 else "Negative"

predict_sentiment("Very bad quality, not worth the money")


'Negative'

In [13]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import f1_score, classification_report

In [14]:
pipeline = Pipeline([
    ("tfidf", TfidfVectorizer()),
    ("clf", LogisticRegression())
])


In [15]:
param_grid = [
    # Logistic Regression
    {
        "tfidf__max_features": [3000, 5000],
        "tfidf__ngram_range": [(1,1), (1,2)],
        "clf": [LogisticRegression(max_iter=1000)],
        "clf__C": [0.1, 1, 10]
    },

    # Linear SVM
    {
        "tfidf__max_features": [3000, 5000],
        "tfidf__ngram_range": [(1,1), (1,2)],
        "clf": [LinearSVC()],
        "clf__C": [0.1, 1, 10]
    },

    # Naive Bayes
    {
        "tfidf__max_features": [3000, 5000],
        "tfidf__ngram_range": [(1,1), (1,2)],
        "clf": [MultinomialNB()],
        "clf__alpha": [0.1, 0.5, 1.0]
    }
]

In [16]:
grid = GridSearchCV(
    pipeline,
    param_grid,
    scoring="f1",
    cv=5,
    n_jobs=-1,
    verbose=2
)

grid.fit(X_train, y_train)


Fitting 5 folds for each of 36 candidates, totalling 180 fits


In [21]:
print("Best Model:", grid.best_estimator_)
print("Best Parameters:", grid.best_params_)
print("Best CV F1 Score:", grid.best_score_)

Best Model: Pipeline(steps=[('tfidf',
                 TfidfVectorizer(max_features=5000, ngram_range=(1, 2))),
                ('clf', LinearSVC(C=1))])
Best Parameters: {'clf': LinearSVC(), 'clf__C': 1, 'tfidf__max_features': 5000, 'tfidf__ngram_range': (1, 2)}
Best CV F1 Score: 0.9600599628228845


In [18]:
best_model = grid.best_estimator_

y_pred = best_model.predict(X_test)

print("Test F1 Score:", f1_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))


Test F1 Score: 0.9579590370104204

Classification Report:
               precision    recall  f1-score   support

           0       0.80      0.61      0.69       215
           1       0.94      0.98      0.96      1366

    accuracy                           0.93      1581
   macro avg       0.87      0.79      0.82      1581
weighted avg       0.92      0.93      0.92      1581



In [19]:
import pickle

pickle.dump(best_model, open("best_sentiment_model.pkl", "wb"))

In [20]:
def predict_sentiment(review):
    review_clean = clean(review)
    return "Positive" if best_model.predict([review_clean])[0] == 1 else "Negative"

predict_sentiment("Very poor quality and not worth the price")


'Negative'

In [22]:
%%writefile app.py
import streamlit as st
import pickle
import re
import nltk
import string
import contractions
from nltk import word_tokenize
from nltk.corpus import stopwords, wordnet
from nltk.stem import SnowballStemmer, WordNetLemmatizer

# -----------------------------
# NLTK Downloads (run once)
# -----------------------------
nltk.download("punkt")
nltk.download("stopwords")
nltk.download("wordnet")

# -----------------------------
# Load model (Pipeline)
# -----------------------------
model = pickle.load(open("best_sentiment_model.pkl", "rb"))

# -----------------------------
# Text Cleaning Setup
# -----------------------------
stop_words = set(stopwords.words("english"))
stemmer = SnowballStemmer("english")
lemmatizer = WordNetLemmatizer()

def get_wordnet_pos(word):
    return wordnet.NOUN

def clean(doc, stem=True):
    doc = contractions.fix(doc)
    doc = re.sub(r"[^a-zA-Z]", " ", doc)
    doc = doc.lower()
    tokens = word_tokenize(doc)
    tokens = [t for t in tokens if t not in string.punctuation]
    tokens = [t for t in tokens if t not in stop_words]

    if stem:
        tokens = [stemmer.stem(t) for t in tokens]
    else:
        tokens = [lemmatizer.lemmatize(t, get_wordnet_pos(t)) for t in tokens]

    return " ".join(tokens)

# -----------------------------
# Streamlit UI
# -----------------------------
st.set_page_config(page_title="Flipkart Sentiment Analysis", layout="centered")

st.title("üõí Flipkart Review Sentiment Analysis")
st.write("Enter a product review to predict whether it is **Positive or Negative**.")

review = st.text_area("‚úçÔ∏è Enter Review Text")

if st.button("Predict Sentiment"):
    if review.strip() == "":
        st.warning("Please enter a review.")
    else:
        clean_review = clean(review)
        prediction = model.predict([clean_review])[0]

        if prediction == 1:
            st.success("‚úÖ Positive Review")
        else:
            st.error("‚ùå Negative Review")

import os

if __name__ == "__main__":
    port = int(os.environ.get("PORT", 8501))
    st.run(server_address="0.0.0.0", server_port=port)
    


Writing app.py


In [23]:
%%writefile requirements.txt
streamlit
scikit-learn
nltk
contractions

Writing requirements.txt
