In [2]:
# ========================
# 1. Imports & Data Loading
# ========================
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.svm import LinearSVC
import xgboost as xgb
from sklearn.metrics import f1_score, classification_report
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from transformers import pipeline
import torch

# ========================
# 2. NLTK Downloads
# ========================
nltk.download("stopwords")
nltk.download("wordnet")
nltk.download("vader_lexicon")

# ========================
# 3. Load Data
# ========================
courses = pd.read_csv("Coursera_courses.csv")
reviews = pd.read_csv("Coursera_reviews.csv")

print("Courses:", courses.shape)
print("Reviews:", reviews.shape)

# ========================
# 4. Preprocess Reviews
# ========================
stop_words = set(stopwords.words("english"))
lemmatizer = WordNetLemmatizer()

def clean_text(text):
    if pd.isnull(text):
        return ""
    text = text.lower()
    text = re.sub(r"[^a-zA-Z\s]", "", text)
    tokens = [lemmatizer.lemmatize(w) for w in text.split() if w not in stop_words]
    return " ".join(tokens)

reviews["clean_review"] = reviews["reviews"].astype(str).apply(clean_text)

# Label from rating
def label_from_rating(r):
    if r >= 4: return 1
    elif r <= 2: return -1
    else: return 0

reviews["label"] = reviews["rating"].apply(label_from_rating)

# ========================
# 5. Train/Test Split (smaller sample for testing)
# ========================
# Using a subset to avoid freezing
sample_size = 50000
X = reviews["clean_review"][:sample_size]
y = reviews["label"][:sample_size]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# ========================
# 6. TF-IDF Vectorization
# ========================
tfidf = TfidfVectorizer(max_features=20000, ngram_range=(1,2))
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

# ========================
# 7. Model Training (Traditional ML)
# ========================
models = {
    "LogReg": LogisticRegression(max_iter=1000),
    "NaiveBayes": MultinomialNB(),
    "RandomForest": RandomForestClassifier(n_estimators=100),  # reduced for speed
    "SVM": LinearSVC(),
    "XGBoost": xgb.XGBClassifier(eval_metric="mlogloss", use_label_encoder=False)
}

f1_scores = {}

for name, model in models.items():
    print(f"\nTraining {name}...")
    model.fit(X_train_tfidf, y_train)
    y_pred = model.predict(X_test_tfidf)
    f1 = f1_score(y_test, y_pred, average="macro")
    f1_scores[name] = f1
    print(f"{name} F1-score: {f1:.4f}")

# ========================
# 8. Lexicon-Based (VADER)
# ========================
sid = SentimentIntensityAnalyzer()

def vader_label(text):
    score = sid.polarity_scores(text)["compound"]
    if score >= 0.05: return 1
    elif score <= -0.05: return -1
    else: return 0

y_pred_vader = X_test.apply(vader_label)
f1_vader = f1_score(y_test, y_pred_vader, average="macro")
f1_scores["VADER"] = f1_vader
print(f"\nVADER F1-score: {f1_vader:.4f}")

# ========================
# 9. Transformer (BERT)
# ========================
device = 0 if torch.cuda.is_available() else -1
bert_classifier = pipeline(
    "sentiment-analysis",
    model="cardiffnlp/twitter-roberta-base-sentiment",
    device=device
)

label_map = {"negative": -1, "neutral": 0, "positive": 1}

# small subset for speed
sample_texts = list(X_test[:10])
bert_preds = []
for text in sample_texts:
    result = bert_classifier(text[:512])[0]
    bert_preds.append(label_map[result["label"]])

bert_f1 = f1_score(y_test[:10], bert_preds, average="macro")
f1_scores["BERT"] = bert_f1
print(f"\nBERT F1-score (10 samples): {bert_f1:.4f}")

# ========================
# 10. Ensemble (Voting Classifier)
# ========================
top_models = [
    ("LogReg", models["LogReg"]),
    ("NaiveBayes", models["NaiveBayes"]),
    ("RandomForest", models["RandomForest"]),
    ("SVM", models["SVM"])
]

voting_clf = VotingClassifier(estimators=top_models, voting="hard")
voting_clf.fit(X_train_tfidf, y_train)
y_pred_ensemble = voting_clf.predict(X_test_tfidf)
f1_ensemble = f1_score(y_test, y_pred_ensemble, average="macro")
f1_scores["Ensemble"] = f1_ensemble
print(f"\nEnsemble (Voting) F1-score: {f1_ensemble:.4f}")

# ========================
# 11. Compare All Models
# ========================
print("\n=== Final F1-score comparison ===")
for model, score in f1_scores.items():
    print(f"{model}: {score:.4f}")

best_model = max(f1_scores, key=f1_scores.get)
print(f"\nBest model based on F1-score: {best_model}")


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\jothi\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\jothi\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\jothi\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


Courses: (623, 4)
Reviews: (1246073, 5)

Training LogReg...
LogReg F1-score: 0.4310

Training NaiveBayes...
NaiveBayes F1-score: 0.3245

Training RandomForest...
RandomForest F1-score: 0.9136

Training SVM...
SVM F1-score: 0.8633

Training XGBoost...


ValueError: Invalid classes inferred from unique values of `y`.  Expected: [0 1 2], got [-1  0  1]

In [3]:
# ========================
# 1. Imports & Data Loading
# ========================
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.svm import LinearSVC
import xgboost as xgb
from sklearn.metrics import f1_score
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from transformers import pipeline
import torch

# ========================
# 2. NLTK Downloads
# ========================
nltk.download("stopwords")
nltk.download("wordnet")
nltk.download("vader_lexicon")

# ========================
# 3. Load Data
# ========================
courses = pd.read_csv("Coursera_courses.csv")
reviews = pd.read_csv("Coursera_reviews.csv")

print("Courses:", courses.shape)
print("Reviews:", reviews.shape)

# ========================
# 4. Preprocess Reviews
# ========================
stop_words = set(stopwords.words("english"))
lemmatizer = WordNetLemmatizer()

def clean_text(text):
    if pd.isnull(text):
        return ""
    text = text.lower()
    text = re.sub(r"[^a-zA-Z\s]", "", text)
    tokens = [lemmatizer.lemmatize(w) for w in text.split() if w not in stop_words]
    return " ".join(tokens)

reviews["clean_review"] = reviews["reviews"].astype(str).apply(clean_text)

# Label from rating
def label_from_rating(r):
    if r >= 4: return 1
    elif r <= 2: return -1
    else: return 0

reviews["label"] = reviews["rating"].apply(label_from_rating)

# ========================
# 5. Train/Test Split (only 100 reviews for testing)
# ========================
sample_size = 100
X = reviews["clean_review"][:sample_size]
y = reviews["label"][:sample_size]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# ========================
# 6. TF-IDF Vectorization
# ========================
tfidf = TfidfVectorizer(max_features=2000, ngram_range=(1,2))
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

# ========================
# 7. Traditional ML Models
# ========================
models = {
    "LogReg": LogisticRegression(max_iter=500),
    "NaiveBayes": MultinomialNB(),
    "RandomForest": RandomForestClassifier(n_estimators=50),
    "SVM": LinearSVC()
}

f1_scores = {}

for name, model in models.items():
    print(f"\nTraining {name}...")
    model.fit(X_train_tfidf, y_train)
    y_pred = model.predict(X_test_tfidf)
    f1 = f1_score(y_test, y_pred, average="macro")
    f1_scores[name] = f1
    print(f"{name} F1-score: {f1:.4f}")

# ========================
# 8. XGBoost (with label mapping)
# ========================
y_train_xgb = y_train.map({-1: 0, 0: 1, 1: 2})
y_test_xgb = y_test.map({-1: 0, 0: 1, 1: 2})

xgb_model = xgb.XGBClassifier(eval_metric="mlogloss", use_label_encoder=False)
print("\nTraining XGBoost...")
xgb_model.fit(X_train_tfidf, y_train_xgb)
y_pred_xgb = xgb_model.predict(X_test_tfidf)

# Map back to original labels
y_pred_xgb_orig = pd.Series(y_pred_xgb).map({0: -1, 1: 0, 2: 1})
f1_xgb = f1_score(y_test, y_pred_xgb_orig, average="macro")
f1_scores["XGBoost"] = f1_xgb
print(f"XGBoost F1-score: {f1_xgb:.4f}")

# ========================
# 9. VADER
# ========================
sid = SentimentIntensityAnalyzer()

def vader_label(text):
    score = sid.polarity_scores(text)["compound"]
    if score >= 0.05: return 1
    elif score <= -0.05: return -1
    else: return 0

y_pred_vader = X_test.apply(vader_label)
f1_vader = f1_score(y_test, y_pred_vader, average="macro")
f1_scores["VADER"] = f1_vader
print(f"\nVADER F1-score: {f1_vader:.4f}")

# ========================
# 10. BERT (very small subset)
# ========================
device = 0 if torch.cuda.is_available() else -1
bert_classifier = pipeline(
    "sentiment-analysis",
    model="cardiffnlp/twitter-roberta-base-sentiment",
    device=device
)

label_map = {"negative": -1, "neutral": 0, "positive": 1}
sample_texts = list(X_test[:5])   # only 5 texts
bert_preds = []
for text in sample_texts:
    result = bert_classifier(text[:512])[0]
    bert_preds.append(label_map[result["label"]])

bert_f1 = f1_score(y_test[:5], bert_preds, average="macro")
f1_scores["BERT"] = bert_f1
print(f"\nBERT F1-score (5 samples): {bert_f1:.4f}")

# ========================
# 11. Ensemble
# ========================
top_models = [
    ("LogReg", models["LogReg"]),
    ("NaiveBayes", models["NaiveBayes"]),
    ("RandomForest", models["RandomForest"]),
    ("SVM", models["SVM"])
]

voting_clf = VotingClassifier(estimators=top_models, voting="hard")
voting_clf.fit(X_train_tfidf, y_train)
y_pred_ensemble = voting_clf.predict(X_test_tfidf)
f1_ensemble = f1_score(y_test, y_pred_ensemble, average="macro")
f1_scores["Ensemble"] = f1_ensemble
print(f"\nEnsemble (Voting) F1-score: {f1_ensemble:.4f}")

# ========================
# 12. Compare All Models
# ========================
print("\n=== Final F1-score comparison ===")
for model, score in f1_scores.items():
    print(f"{model}: {score:.4f}")

best_model = max(f1_scores, key=f1_scores.get)
print(f"\nBest model based on F1-score: {best_model}")


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\jothi\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\jothi\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\jothi\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


Courses: (623, 4)
Reviews: (1454711, 5)

Training LogReg...
LogReg F1-score: 0.4872

Training NaiveBayes...
NaiveBayes F1-score: 0.4872

Training RandomForest...
RandomForest F1-score: 0.4872

Training SVM...
SVM F1-score: 0.4872

Training XGBoost...


ValueError: Invalid classes inferred from unique values of `y`.  Expected: [0 1], got [0 2]