# ============================
# NLP MODELS COMPARISON SCRIPT
# BoW vs TF-IDF vs Word2Vec
# - All three are methods for turning text into numeric features (embeddings).
# - GridSearchCV is used to tune C (LogisticRegression) and alpha (MultinomialNB):
#     - C → hyperparameter of LogisticRegression (inverse regularization strength)
#     - alpha → hyperparameter of MultinomialNB (additive smoothing)
#     - GridSearchCV → tries every candidate value and keeps the one with the best cross-validated score.
# ============================

In [1]:
# ============================
# IMPORTING REQUIRED PACKAGES
# ============================

import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# For Word2Vec
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize   # if not installed: pip install nltk
import nltk
nltk.download('punkt', quiet=True)   #punkt is NLTK’s pretrained sentence and word tokenizer.

True

In [2]:
# ===================
# 1. LOAD  DATASET 
# ===================

# Read the review dataset and preview its first rows.
# NOTE(review): the path below is an absolute path on the original author's
# machine; it has to be pointed at a local copy of the CSV to run elsewhere.
df = pd.read_csv(r"E:\3rd YEAR CSE(AI-ML)\5thsem\Machine Learning\ISMAIL\APPLE_iPhone_SE.csv")  
print(df.head())

# Model inputs are the free-text "Reviews" column; targets are the "Ratings" column.
# NOTE(review): depending on the pandas version, astype(str) may turn missing
# reviews into the literal text "nan" — confirm whether the CSV has missing values.
texts = df["Reviews"].astype(str).values
labels = df["Ratings"].values

   Ratings         Comment                                            Reviews
0        5          Super!  Great camera for pics and videos Battery life ...
1        5       Must buy!  Great device. Let me tell the Pros..1. Superb ...
2        5   Great product  Who all loves older size i.e., 4.7 inch type s...
3        5  Simply awesome  This iPhone SE is the best phone ever you get....
4        5  Classy product  This is my second iphone after iphone 4s. I’ve...


In [3]:
# ======================
# 2. TRAIN / TEST SPLIT
# ======================

# Hold out 20% of the reviews for testing. Stratifying on the labels keeps the
# rating distribution the same in both splits; the fixed seed makes the split
# repeatable.
X_train_text, X_test_text, y_train, y_test = train_test_split(
    texts, labels, test_size=0.2, random_state=42, stratify=labels
)

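# Text: "I love NLP"
#     - ngram_range (1,1): "I", "love", "NLP" → 3 features
#     - ngram_range (1,3): "I", "love", "NLP", "I love", "love NLP", "I love NLP" → 6 features
#       (conceptual count; CountVectorizer's default tokenizer lowercases the text and
#        drops one-character tokens such as "I", so the real vocabulary would be smaller)
#
# More n-grams = more features = bigger model = higher risk of overfitting.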

In [4]:
# =====================================
# 3. FEATURE EXPLOSION: ngram_range=(1,1) vs (1,3)
# =====================================

print("\n=== Feature Explosion Demo with CountVectorizer ===")

# Build a unigram-only vocabulary and a unigram-to-trigram vocabulary from the
# same training texts so their feature counts can be compared side by side.
bow_11 = CountVectorizer(ngram_range=(1, 1))
bow_13 = CountVectorizer(ngram_range=(1, 3))
X_train_bow_11 = bow_11.fit_transform(X_train_text)
X_train_bow_13 = bow_13.fit_transform(X_train_text)

# The row count (documents) is identical; only the column count (features) grows.
print("BoW ngram_range=(1,1) -> shape:", X_train_bow_11.shape)
print("BoW ngram_range=(1,3) -> shape:", X_train_bow_13.shape)
print("Note: Same samples, many more features -> feature explosion, risk of overfitting.\n")



=== Feature Explosion Demo with CountVectorizer ===
BoW ngram_range=(1,1) -> shape: (7770, 6808)
BoW ngram_range=(1,3) -> shape: (7770, 142178)
Note: Same samples, many more features -> feature explosion, risk of overfitting.



In [5]:
# =====================================
# 4. GRIDSEARCHCV: LOGISTIC REGRESSION & MULTINOMIAL NB
#    WITH BoW AND TF-IDF
#    BoW / TF-IDF → simple statistical embeddings
# =====================================

# --------- Shared driver for experiments (A)-(D) ---------
def _grid_search_and_report(title, tag, pipeline, param_grid,
                            X_train, y_train, X_test, y_test,
                            leading_newline=True):
    """Tune ``pipeline`` with GridSearchCV and print CV and test-set scores.

    The four experiments below differ only in the pipeline, the parameter grid
    and the labels used in the printed report, so the common search/evaluate/
    print sequence lives here.

    Parameters
    ----------
    title : str
        Long experiment name shown in the ``=== GridSearchCV: ... ===`` banner.
    tag : str
        Short experiment name shown in parentheses on each result line.
    pipeline : sklearn.pipeline.Pipeline
        Vectorizer + classifier pipeline to tune.
    param_grid : dict
        Candidate values, keyed by ``<step>__<parameter>``.
    X_train, y_train
        Raw training texts and their labels (used for the cross-validated search).
    X_test, y_test
        Held-out texts and labels (used only for the final report).
    leading_newline : bool, default True
        Whether the banner is preceded by a blank line.

    Returns
    -------
    tuple
        ``(fitted GridSearchCV, best_estimator_, test-set predictions)``.
    """
    grid = GridSearchCV(
        estimator=pipeline,
        param_grid=param_grid,
        cv=3,                 # 3-fold cross-validation on the training split
        scoring="accuracy",
        n_jobs=-1,            # use all available cores
        verbose=1,
    )

    banner = f"=== GridSearchCV: {title} ==="
    print("\n" + banner if leading_newline else banner)

    grid.fit(X_train, y_train)
    print(f"Best params ({tag}):", grid.best_params_)
    print("Best CV accuracy:", grid.best_score_)

    # best_estimator_ is the pipeline refit on the full training split with the
    # winning parameters; it is evaluated once on the held-out test split.
    best = grid.best_estimator_
    y_pred = best.predict(X_test)
    print(f"Test accuracy ({tag}):", accuracy_score(y_test, y_pred))
    print(f"Classification report ({tag}):\n", classification_report(y_test, y_pred))
    return grid, best, y_pred


# NOTE(review): both LogisticRegression pipelines use solver="liblinear" on a
# five-class target (ratings 1-5). Recent scikit-learn releases reportedly
# deprecate liblinear for multiclass problems — confirm against the installed
# version; warnings are silenced globally at the top of this file.

# --------- (A) Logistic Regression + CountVectorizer(BOW) ---------
pipe_lr_bow = Pipeline([
    ("vect", CountVectorizer()),   # default ngram_range=(1,1); overridden by the grid
    ("clf", LogisticRegression(max_iter=1000, solver="liblinear")),
])

param_grid_lr_bow = {
    "vect__ngram_range": [(1, 1), (1, 2), (1, 3)],   # to observe effect of higher n-grams
    "clf__C": [0.01, 0.1, 1, 10, 100],
}

grid_lr_bow, best_lr_bow, y_pred_lr_bow = _grid_search_and_report(
    "Logistic Regression + BoW", "LR + BoW",
    pipe_lr_bow, param_grid_lr_bow,
    X_train_text, y_train, X_test_text, y_test,
    leading_newline=False,   # first banner of the cell has no blank line before it
)


# --------- (B) Logistic Regression + TfidfVectorizer(TF-IDF) ---------
pipe_lr_tfidf = Pipeline([
    ("vect", TfidfVectorizer()),
    ("clf", LogisticRegression(max_iter=1000, solver="liblinear")),
])

param_grid_lr_tfidf = {
    "vect__ngram_range": [(1, 1), (1, 2), (1, 3)],
    "clf__C": [0.01, 0.1, 1, 10, 100],
}

grid_lr_tfidf, best_lr_tfidf, y_pred_lr_tfidf = _grid_search_and_report(
    "Logistic Regression + TF-IDF", "LR + TF-IDF",
    pipe_lr_tfidf, param_grid_lr_tfidf,
    X_train_text, y_train, X_test_text, y_test,
)


# --------- (C) MultinomialNB + CountVectorizer(BOW) ---------
pipe_nb_bow = Pipeline([
    ("vect", CountVectorizer()),
    ("clf", MultinomialNB()),
])

param_grid_nb_bow = {
    "vect__ngram_range": [(1, 1), (1, 2), (1, 3)],
    "clf__alpha": [0.1, 0.5, 1.0, 5.0, 10.0],
}

grid_nb_bow, best_nb_bow, y_pred_nb_bow = _grid_search_and_report(
    "MultinomialNB + BoW", "NB + BoW",
    pipe_nb_bow, param_grid_nb_bow,
    X_train_text, y_train, X_test_text, y_test,
)


# --------- (D) MultinomialNB + TfidfVectorizer(TF-IDF) ---------
pipe_nb_tfidf = Pipeline([
    ("vect", TfidfVectorizer()),
    ("clf", MultinomialNB()),
])

param_grid_nb_tfidf = {
    "vect__ngram_range": [(1, 1), (1, 2), (1, 3)],
    "clf__alpha": [0.1, 0.5, 1.0, 5.0, 10.0],
}

grid_nb_tfidf, best_nb_tfidf, y_pred_nb_tfidf = _grid_search_and_report(
    "MultinomialNB + TF-IDF", "NB + TF-IDF",
    pipe_nb_tfidf, param_grid_nb_tfidf,
    X_train_text, y_train, X_test_text, y_test,
)


=== GridSearchCV: Logistic Regression + BoW ===
Fitting 3 folds for each of 15 candidates, totalling 45 fits
Best params (LR + BoW): {'clf__C': 0.1, 'vect__ngram_range': (1, 3)}
Best CV accuracy: 0.7120978120978121
Test accuracy (LR + BoW): 0.7159032424086464
Classification report (LR + BoW):
               precision    recall  f1-score   support

           1       0.58      0.34      0.43        95
           2       0.50      0.03      0.05        40
           3       0.38      0.05      0.08       107
           4       0.34      0.12      0.17       343
           5       0.75      0.97      0.84      1358

    accuracy                           0.72      1943
   macro avg       0.51      0.30      0.31      1943
weighted avg       0.64      0.72      0.65      1943


=== GridSearchCV: Logistic Regression + TF-IDF ===
Fitting 3 folds for each of 15 candidates, totalling 45 fits
Best params (LR + TF-IDF): {'clf__C': 1, 'vect__ngram_range': (1, 1)}
Best CV accuracy: 0.7106821106821


# =====================================
# - CountVectorizer/TfidfVectorizer → scikit-learn transformers, so they can sit inside a Pipeline and be tuned with GridSearchCV.
# - Word2Vec → gensim model, not a scikit-learn transformer, so it cannot be placed in a GridSearchCV pipeline directly (it would need a custom wrapper).
#
# MultinomialNB requires:
#    - non-negative values
#    - word counts or frequencies
#
# But Word2Vec gives:
#    - dense vectors
#    - negative values
#    - continuous semantic embeddings
# =====================================

In [6]:
# =====================================
# 5. WORD2VEC AVERAGED EMBEDDINGS + LOGISTIC REGRESSION
#    Word2Vec → learned embeddings trained with CBOW (sg=0) or Skip-Gram (sg=1, used here)
#
#    Word2Vec already produces numeric vectors, so only the classifier (LR) needs hyperparameter tuning.
#    Naive Bayes (MultinomialNB) cannot be used on Word2Vec vectors because they contain negative values.
# =====================================


print("\n=== Word2Vec Averaged Embeddings + Logistic Regression ===")

# Lower-case every review and split it into word tokens with NLTK.
X_train_tokens = [word_tokenize(review.lower()) for review in X_train_text]
X_test_tokens = [word_tokenize(review.lower()) for review in X_test_text]

# Learn the word embeddings from the training tokens only, so the test split
# does not influence the vocabulary or the vectors.
w2v_dim = 100
w2v_model = Word2Vec(
    sentences=X_train_tokens,
    sg=1,                  # 1 = skip-gram
    vector_size=w2v_dim,   # length of each word vector
    window=5,
    min_count=2,
    workers=4,
)

# ---- Helper: sentence to averaged vector ----
def sentence_to_vec(tokens, model, dim):
    """Return the element-wise mean of the word vectors of ``tokens``.

    Tokens that are not present in ``model.wv`` are skipped. When no token is
    present (including an empty ``tokens``), a zero vector of length ``dim``
    is returned instead.
    """
    known_vectors = [model.wv[tok] for tok in tokens if tok in model.wv]
    if known_vectors:
        return np.mean(known_vectors, axis=0)
    # Nothing to average: fall back to an all-zero vector.
    return np.zeros(dim)

# Collapse every tokenized review into one averaged embedding row.
X_train_w2v = np.array(
    [sentence_to_vec(review_tokens, w2v_model, w2v_dim) for review_tokens in X_train_tokens]
)
X_test_w2v = np.array(
    [sentence_to_vec(review_tokens, w2v_model, w2v_dim) for review_tokens in X_test_tokens]
)

# ---- GridSearchCV for Logistic Regression on Word2Vec features ----
# The features are already numeric, so the classifier is searched on its own
# (no vectorizer step, hence the un-prefixed parameter name "C").
lr_w2v = LogisticRegression(solver="liblinear", max_iter=1000)
param_grid_lr_w2v = {"C": [0.01, 0.1, 1, 10, 100]}

grid_lr_w2v = GridSearchCV(
    lr_w2v,
    param_grid_lr_w2v,
    scoring="accuracy",
    cv=3,
    n_jobs=-1,
    verbose=1,
)
grid_lr_w2v.fit(X_train_w2v, y_train)

print("Best params (LR + Word2Vec):", grid_lr_w2v.best_params_)
print("Best CV accuracy:", grid_lr_w2v.best_score_)

# Evaluate the refit best model once on the held-out embeddings.
best_lr_w2v = grid_lr_w2v.best_estimator_
y_pred_w2v = best_lr_w2v.predict(X_test_w2v)
print("Test accuracy (LR + Word2Vec):", accuracy_score(y_test, y_pred_w2v))
print("Classification report (LR + Word2Vec):\n", classification_report(y_test, y_pred_w2v))


# =====================================
# 6. OPTIONAL: LINEARSVC FOR COMPARISON (BoW/TFIDF)
# =====================================

# Simple example: LinearSVC + TF-IDF (no grid search, just baseline)
# Untuned reference point: TF-IDF over unigrams and bigrams fed to a linear SVM.
pipe_svc_tfidf = Pipeline(steps=[
    ("vect", TfidfVectorizer(ngram_range=(1, 2))),
    ("clf", LinearSVC(max_iter=10000)),
])

# fit() returns the fitted pipeline, so prediction can be chained onto it.
y_pred_svc = pipe_svc_tfidf.fit(X_train_text, y_train).predict(X_test_text)

print("\n=== LinearSVC + TF-IDF (baseline) ===")
print("Test accuracy (LinearSVC + TF-IDF):", accuracy_score(y_test, y_pred_svc))
print("Classification report (LinearSVC + TF-IDF):\n", classification_report(y_test, y_pred_svc))


=== Word2Vec Averaged Embeddings + Logistic Regression ===
Fitting 3 folds for each of 5 candidates, totalling 15 fits
Best params (LR + Word2Vec): {'C': 10}
Best CV accuracy: 0.705920205920206
Test accuracy (LR + Word2Vec): 0.6999485331960885
Classification report (LR + Word2Vec):
               precision    recall  f1-score   support

           1       0.39      0.21      0.27        95
           2       0.00      0.00      0.00        40
           3       0.19      0.03      0.05       107
           4       0.25      0.04      0.07       343
           5       0.73      0.97      0.83      1358

    accuracy                           0.70      1943
   macro avg       0.31      0.25      0.25      1943
weighted avg       0.58      0.70      0.61      1943


=== LinearSVC + TF-IDF (baseline) ===
Test accuracy (LinearSVC + TF-IDF): 0.716932578486876
Classification report (LinearSVC + TF-IDF):
               precision    recall  f1-score   support

           1       0.60      0.56