In [30]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.metrics import classification_report, confusion_matrix
from imblearn.over_sampling import SMOTE
import joblib


In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
df = pd.read_csv("/content/drive/MyDrive/datasets/MYDATASET.csv")
print(df.head())       # show first 5 rows
print(df.columns)


  review_id product_id                  product_name  username  rating  \
0     R0001       P041  Bose Electronics Blender4112  user_058       4   
1     R0002       P006        Samsung Sports Book588  user_024       3   
2     R0003       P036       Adidas Coffee Cream3711  user_115       1   
3     R0004       P022    Philips Wireless Phone5614  user_027       4   
4     R0005       P003          Apple Home Cream6301  user_021       5   

                           review_text sentiment          review_date  \
0  Excellent quality and fast delivery  Positive  2024-11-23 00:00:00   
1               Decent product overall   Neutral  2023-08-27 00:00:00   
2           Poor quality, disappointed  Negative  2025-02-16 00:00:00   
3  Excellent quality and fast delivery  Positive  2024-01-03 00:00:00   
4                      Would buy again  Positive  2025-06-25 00:00:00   

   helpful_votes  verified_purchase  
0             94              False  
1             64               True  
2 

In [4]:
print(df['sentiment'].value_counts())

sentiment
Positive    5495
Negative    2414
Neutral     1591
Name: count, dtype: int64


In [5]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [6]:
import nltk
nltk.download('punkt')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

linear svc


# ------------------------------------
# Step 1: Data Augmentation (extra negatives)
# ------------------------------------
extra_negatives = [
    "This is the worst thing ever",
    "Worst product I have bought",
    "Absolutely horrible experience",
    "Awful quality, very disappointed",
    "Terrible and pathetic service",
    "The worst purchase in my life",
    "Extremely dissatisfied and angry",
    "Completely useless and horrible",
    "Utterly appalling, one of the worst experiences ever.",
    "Disastrous beyond belief, completely intolerable.",
    "A catastrophe, nothing redeemable about it.",
    "Excruciatingly bad, I regret even trying it.",
    "Absolutely horrendous, a complete failure.",
    "Pathetic attempt, falls flat on every level.",
    "A dreadful mess, impossible to recommend.",
    "Painfully disappointing, worse than I imagined.",
    "Worthless and frustrating, a waste of time.",
    "An insult to quality, utterly unbearable.",
    "bad thing."
]
extra_positives = [
    "Absolutely phenomenal experience, beyond expectations",
    "A masterpiece, radiating brilliance in every aspect",
    "Pure excellence, I couldn’t have asked for more",
    "Incredibly uplifting, a once-in-a-lifetime experience",
    "Heartwarming and empowering — truly remarkable",
    "good experience",
    "good thing",
    "nice",
    "I love this product, it is amazing!",
    "love",
    "Absolutely phenomenal experience, beyond expectations.",
    "This left me speechless in the best way possible.",
    "A masterpiece, radiating brilliance in every aspect.",
    "A rare gem, truly delightful and inspiring.",
    "Outstanding performance, unmatched and flawless.",
    "Pure excellence, I couldn’t have asked for more.",
    "This fills me with immense joy and admiration.",
    "Incredibly uplifting, a once-in-a-lifetime experience.",
    "Radiates positivity and brilliance all around.",
    "Heartwarming and empowering — truly remarkable."
]
extra_neutrals = [
    "It was okay, nothing particularly special.",
    "Neither good nor bad, just average.",
    "It served its purpose, nothing more to add.",
    "An ordinary outcome, as expected.",
    "Not disappointing, but not impressive either.",
    "The experience was passable, fairly standard.",
    "It worked as intended, no surprises.",
    "Just another day, quite routine and plain.",
    "Neither thrilling nor dull, simply neutral.",
    "Met the minimum expectations, no complaints.",
    "It's okay , not bad",
    "hi", "hii", "hiii", "hello", "hey", "yo", "sup", "good morning", "good evening", "good afternoon"
]
df_extra = pd.DataFrame({
    'review_text': extra_negatives + extra_positives + extra_neutrals,
    'sentiment': (['Negative'] * len(extra_negatives)) +
                 (['Positive'] * len(extra_positives)) +
                 (['Neutral'] * len(extra_neutrals))

})

df = pd.concat([df, df_extra], ignore_index=True)


# ------------------------------------
# Step 2: Preprocessing (same as before)
# ------------------------------------
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re

stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()
import nltk
nltk.download('wordnet')
nltk.download('omw-1.4')   # optional but helps with lemmatization
nltk.download('stopwords') # also needed since you’re using stopwords


def preprocess(text):
    text = re.sub(r'[^a-zA-Z]', ' ', text).lower().split()
    text = [lemmatizer.lemmatize(word) for word in text if word not in stop_words]
    return " ".join(text)

df["clean_text"] = df["review_text"].apply(preprocess)


# ------------------------------------
# Step 3: Vectorizer with word + char n-grams
# ------------------------------------
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import FeatureUnion

word_vectorizer = TfidfVectorizer(ngram_range=(1,2), max_features=20000)
char_vectorizer = TfidfVectorizer(analyzer="char", ngram_range=(3,5), max_features=20000)

vectorizer = FeatureUnion([("word", word_vectorizer), ("char", char_vectorizer)])


# ------------------------------------
# Step 4: Train-test split + Oversampling
# ------------------------------------
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import RandomOverSampler

X = df["clean_text"]
y = df["sentiment"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

ros = RandomOverSampler(random_state=42)
X_train_res, y_train_res = ros.fit_resample(X_train_vec, y_train)


# ------------------------------------
# Step 5: Train Linear SVM
# ------------------------------------
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report, confusion_matrix

model = LinearSVC(class_weight="balanced", random_state=42)
model.fit(X_train_res, y_train_res)

y_pred = model.predict(X_test_vec)
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))


# ------------------------------------
# Step 6: Prediction Function
# ------------------------------------
def predict_sentiment(text):
    clean = preprocess(text)
    vec = vectorizer.transform([clean])
    return model.predict(vec)[0]


# ------------------------------------
# Step 7: Custom Tests
# ------------------------------------
print(predict_sentiment("I love this product, it is amazing!"))   # Positive
print(predict_sentiment("This is the worst thing ever"))          # Negative
print(predict_sentiment("It was okay, not too bad"))              # Neutral
print(predict_sentiment("Terrible experience, very bad"))         # Negative
print(predict_sentiment("Absolutely fantastic, loved it"))
print(predict_sentiment("Hii"))

def predict_sentiment(text):
    clean = preprocess(text)
    vec = vectorizer.transform([clean])
    return model.predict(vec)[0]

# Test again
print(predict_sentiment("I love this product, it is amazing!"))   # Positive
print(predict_sentiment("This is the nasty thing ever"))          # Negative
print(predict_sentiment("It was okay, not too bad"))
print(predict_sentiment("hii"))
print(predict_sentiment("nice"))
import joblib

# Save model & vectorizer
joblib.dump(model, "senticore_model.pkl")
joblib.dump(vectorizer, "senticore_vectorizer.pkl")

print("✅ Model and vectorizer saved!")
# Load model & vectorizer
model = joblib.load("senticore_model.pkl")
vectorizer = joblib.load("senticore_vectorizer.pkl")

print("✅ Model and vectorizer loaded!")
from google.colab import files
files.download("senticore_model.pkl")
files.download("senticore_vectorizer.pkl")

Random forest

In [52]:
# ------------------------------------
# Step 1: Data Augmentation (extra negatives)
# ------------------------------------
extra_negatives = [
    "This is the worst thing ever",
    "Worst product I have bought",
    "Absolutely horrible experience",
    "Awful quality, very disappointed",
    "Terrible and pathetic service",
    "The worst purchase in my life",
    "Extremely dissatisfied and angry",
    "false",
    "Completely useless and horrible",
    "Utterly appalling, one of the worst experiences ever.",
    "Disastrous beyond belief, completely intolerable.",
    "A catastrophe, nothing redeemable about it.",
    "Excruciatingly bad, I regret even trying it.",
    "Absolutely horrendous, a complete failure.",
    "Pathetic attempt, falls flat on every level.",
    "A dreadful mess, impossible to recommend.",
    "Painfully disappointing, worse than I imagined.",
    "Worthless and frustrating, a waste of time.",
    "An insult to quality, utterly unbearable.",
    "bad thing.",
    "nasty.",
    "poor."
]
extra_positives = [
    "Absolutely phenomenal experience, beyond expectations",
    "A masterpiece, radiating brilliance in every aspect",
    "Pure excellence, I couldn’t have asked for more",
    "Incredibly uplifting, a once-in-a-lifetime experience",
    "Heartwarming and empowering — truly remarkable",
    "good experience",
    "best thing",
    "true",
    "I love this product, it is amazing!",
    "love",
    "great",
    "nice",
    "amazing",
    "negative",
    "Absolutely phenomenal experience, beyond expectations.",
    "This left me speechless in the best way possible.",
    "A masterpiece, radiating brilliance in every aspect.",
    "A rare gem, truly delightful and inspiring.",
    "Outstanding performance, unmatched and flawless.",
    "Pure excellence, I couldn’t have asked for more.",
    "This fills me with immense joy and admiration.",
    "Incredibly uplifting, a once-in-a-lifetime experience.",
    "Radiates positivity and brilliance all around.",
    "Heartwarming and empowering — truly remarkable."
]
extra_neutrals = [
    "It was okay, nothing particularly special.",
    "Neither good nor bad, just average.",
    "It served its purpose, nothing more to add.",
    "An ordinary outcome, as expected.",
    "Not disappointing, but not impressive either.",
    "The experience was passable, fairly standard.",
    "It worked as intended, no surprises.",
    "Just another day, quite routine and plain.",
    "Neither thrilling nor dull, simply neutral.",
    "Met the minimum expectations, no complaints.",
    "It's okay , not bad",
    "hi", "hii", "hiii", "hello", "hey", "yo", "sup", "good morning", "good evening", "good afternoon"
]
df_extra = pd.DataFrame({
    'review_text': extra_negatives + extra_positives + extra_neutrals,
    'sentiment': (['Negative'] * len(extra_negatives)) +
                 (['Positive'] * len(extra_positives)) +
                 (['Neutral'] * len(extra_neutrals))

})

df = pd.concat([df, df_extra], ignore_index=True)


# ------------------------------------
# Step 2: Preprocessing (same as before)
# ------------------------------------
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re

stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()
import nltk
nltk.download('wordnet')
nltk.download('omw-1.4')   # optional but helps with lemmatization
nltk.download('stopwords') # also needed since you’re using stopwords


def preprocess(text):
    text = re.sub(r'[^a-zA-Z]', ' ', text).lower().split()
    text = [lemmatizer.lemmatize(word) for word in text if word not in stop_words]
    return " ".join(text)

df["clean_text"] = df["review_text"].apply(preprocess)


# ------------------------------------
# Step 3: Vectorizer with word + char n-grams
# ------------------------------------
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import FeatureUnion

word_vectorizer = TfidfVectorizer(ngram_range=(1,2), max_features=20000)
char_vectorizer = TfidfVectorizer(analyzer="char", ngram_range=(3,5), max_features=20000)

vectorizer = FeatureUnion([("word", word_vectorizer), ("char", char_vectorizer)])


# ------------------------------------
# Step 4: Train-test split + Oversampling
# ------------------------------------
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import RandomOverSampler

X = df["clean_text"]
y = df["sentiment"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

ros = RandomOverSampler(random_state=42)
X_train_res, y_train_res = ros.fit_resample(X_train_vec, y_train)


# ------------------------------------
# Step 5: Train Linear SVM
# ------------------------------------
# Step 5: Train Decision Tree
# ------------------------------------
# ------------------------------------
# Step 5: Train Random Forest
# ------------------------------------
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix

model = RandomForestClassifier(
    n_estimators=300,        # number of trees
    max_depth=50,            # prevent overfitting
    class_weight="balanced", # handle class imbalance
    random_state=42,
    n_jobs=-1                # use all CPU cores
)

model.fit(X_train_res, y_train_res)

y_pred = model.predict(X_test_vec)
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))


# ------------------------------------
# Step 6: Prediction Function
# ------------------------------------
def predict_sentiment(text):
    clean = preprocess(text)
    vec = vectorizer.transform([clean])

    pred = model.predict(vec)[0]
    probs = model.predict_proba(vec)[0]

    # map probabilities to class labels
    prob_dict = {
        cls.lower(): round(float(p * 100), 2)
        for cls, p in zip(model.classes_, probs)
    }

    return pred, prob_dict



# ------------------------------------
# Step 7: Custom Tests
# ------------------------------------
print(predict_sentiment("I love this product, it is amazing!"))   # Positive
print(predict_sentiment("This is the worst thing ever"))          # Negative
print(predict_sentiment("It was okay, not too bad"))              # Neutral
print(predict_sentiment("Terrible experience, very bad"))         # Negative
print(predict_sentiment("Absolutely fantastic, loved it"))
print(predict_sentiment("Hii"))


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


              precision    recall  f1-score   support

    Negative       1.00      0.92      0.96       547
     Neutral       0.90      1.00      0.94       386
    Positive       1.00      1.00      1.00      1164

    accuracy                           0.98      2097
   macro avg       0.96      0.97      0.97      2097
weighted avg       0.98      0.98      0.98      2097

[[ 502   45    0]
 [   1  385    0]
 [   0    0 1164]]
('Positive', {'negative': 0.0, 'neutral': 0.0, 'positive': 100.0})
('Negative', {'negative': 99.68, 'neutral': 0.27, 'positive': 0.05})
('Neutral', {'negative': 0.35, 'neutral': 99.07, 'positive': 0.57})
('Negative', {'negative': 90.17, 'neutral': 3.46, 'positive': 6.36})
('Positive', {'negative': 32.28, 'neutral': 14.05, 'positive': 53.67})
('Neutral', {'negative': 2.31, 'neutral': 95.63, 'positive': 2.07})


In [53]:
print(predict_sentiment("I love this product, it is amazing!"))



('Positive', {'negative': 0.0, 'neutral': 0.0, 'positive': 100.0})


In [54]:
def predict_sentiment_with_proba(text):
    clean = preprocess(text)
    vec = vectorizer.transform([clean])

    # prediction
    pred = model.predict(vec)[0]

    # probabilities
    probs = model.predict_proba(vec)[0]
    prob_dict = {
        cls.lower(): round(float(p * 100), 2)
        for cls, p in zip(model.classes_, probs)
    }

    return pred, prob_dict
print(predict_sentiment("I love this product, it is amazing!"))



('Positive', {'negative': 0.0, 'neutral': 0.0, 'positive': 100.0})


In [56]:
print(predict_sentiment("nasty"))
print(predict_sentiment("great"))
print(predict_sentiment("amazing"))


('Negative', {'negative': 86.21, 'neutral': 12.98, 'positive': 0.81})
('Positive', {'negative': 1.14, 'neutral': 9.97, 'positive': 88.88})
('Positive', {'negative': 0.7, 'neutral': 6.74, 'positive': 92.56})


In [57]:
import joblib

# Save model & vectorizer
joblib.dump(model, "senticore_model.pkl")
joblib.dump(vectorizer, "senticore_vectorizer.pkl")

print("✅ Model and vectorizer saved!")

✅ Model and vectorizer saved!


In [58]:
model = joblib.load("senticore_model.pkl")
vectorizer = joblib.load("senticore_vectorizer.pkl")

print("✅ Model and vectorizer loaded!")

✅ Model and vectorizer loaded!


In [59]:
from google.colab import files
files.download("senticore_model.pkl")
files.download("senticore_vectorizer.pkl")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>