In [2]:
# ------------------------
# Imports & NLTK Downloads
# ------------------------
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.svm import LinearSVC
import xgboost as xgb
from sklearn.metrics import f1_score
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from transformers import pipeline
import torch
import joblib

nltk.download("stopwords")
nltk.download("wordnet")
nltk.download("vader_lexicon")

# ------------------------
# Load Data
# ------------------------
courses = pd.read_csv("Coursera_courses.csv")
reviews = pd.read_csv("Coursera_reviews.csv")

print("Courses:", courses.shape)
print("Reviews:", reviews.shape)

# ------------------------
# Preprocess Reviews
# ------------------------
stop_words = set(stopwords.words("english"))
lemmatizer = WordNetLemmatizer()

def clean_text(text):
    if pd.isnull(text):
        return ""
    text = text.lower()
    text = re.sub(r"[^a-zA-Z\s]", "", text)
    tokens = [lemmatizer.lemmatize(w) for w in text.split() if w not in stop_words]
    return " ".join(tokens)

reviews["clean_review"] = reviews["reviews"].astype(str).apply(clean_text)

# ------------------------
# Labels from rating
# ------------------------
def label_from_rating(r):
    if r >= 4: return 1
    elif r <= 2: return -1
    else: return 0

reviews["label"] = reviews["rating"].apply(label_from_rating)

# ------------------------
# Train/Test Split (subset for speed)
# ------------------------
train_subset_size =5000  # adjust depending on your machine
X_train_full, X_test_full, y_train_full, y_test_full = train_test_split(
    reviews["clean_review"], reviews["label"], test_size=0.2, random_state=42
)

# Take subset for training
X_train, _, y_train, _ = train_test_split(
    X_train_full, y_train_full, train_size=min(train_subset_size, len(X_train_full)), random_state=42
)
X_test = X_test_full
y_test = y_test_full

# ------------------------
# TF-IDF Vectorization
# ------------------------
tfidf = TfidfVectorizer(max_features=10000, ngram_range=(1,2))
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

# ------------------------
# Train Traditional ML Models
# ------------------------
models = {
    "LogReg": LogisticRegression(max_iter=500),
    "NaiveBayes": MultinomialNB(),
    "RandomForest": RandomForestClassifier(n_estimators=50, n_jobs=-1),
    "SVM": LinearSVC(max_iter=1000)
}

f1_scores = {}

for name, model in models.items():
    print(f"\nTraining {name}...")
    model.fit(X_train_tfidf, y_train)
    y_pred = model.predict(X_test_tfidf)
    f1 = f1_score(y_test, y_pred, average="macro")
    f1_scores[name] = f1
    print(f"{name} F1-score: {f1:.4f}")

# ------------------------
# Train XGBoost (subset)
# ------------------------
y_train_xgb = y_train.map({-1: 0, 0: 1, 1: 2})
y_test_xgb = y_test.map({-1: 0, 0: 1, 1: 2})

xgb_model = xgb.XGBClassifier(eval_metric="mlogloss", use_label_encoder=False, n_jobs=-1)
print("\nTraining XGBoost...")
xgb_model.fit(X_train_tfidf, y_train_xgb)
y_pred_xgb = xgb_model.predict(X_test_tfidf)
y_pred_xgb_orig = pd.Series(y_pred_xgb).map({0: -1, 1: 0, 2: 1})
f1_scores["XGBoost"] = f1_score(y_test, y_pred_xgb_orig, average="macro")
print(f"XGBoost F1-score: {f1_scores['XGBoost']:.4f}")

# ------------------------
# Lexicon-Based (VADER)
# ------------------------
sid = SentimentIntensityAnalyzer()
def vader_label(text):
    score = sid.polarity_scores(text)["compound"]
    if score >= 0.05: return 1
    elif score <= -0.05: return -1
    else: return 0

y_pred_vader = X_test.apply(vader_label)
f1_scores["VADER"] = f1_score(y_test, y_pred_vader, average="macro")
print(f"VADER F1-score: {f1_scores['VADER']:.4f}")

# ------------------------
# Select Best Model
# ------------------------
best_model_name = max(f1_scores, key=f1_scores.get)

if best_model_name == "XGBoost":
    final_model = xgb_model
elif best_model_name == "VADER":
    final_model = "VADER"
else:
    final_model = models[best_model_name]

print(f"\nBest Model Selected: {best_model_name}")

# ------------------------
# Save trained model and TF-IDF for inference
# ------------------------
joblib.dump(tfidf, "tfidf_vectorizer.pkl")
if final_model != "VADER":
    joblib.dump(final_model, "best_model.pkl")


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\jothi\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\jothi\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\jothi\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


Courses: (623, 4)
Reviews: (1454711, 5)

Training LogReg...
LogReg F1-score: 0.3267

Training NaiveBayes...
NaiveBayes F1-score: 0.3237

Training RandomForest...
RandomForest F1-score: 0.3432

Training SVM...
SVM F1-score: 0.4000

Training XGBoost...


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


XGBoost F1-score: 0.3815
VADER F1-score: 0.4148

Best Model Selected: VADER
