In [None]:
# ========================
# 1. Imports & Data Loading
# ========================
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.svm import LinearSVC
import xgboost as xgb
from sklearn.metrics import f1_score, classification_report
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from transformers import pipeline
import torch

# ========================
# 2. NLTK Downloads
# ========================
nltk.download("stopwords")
nltk.download("wordnet")
nltk.download("vader_lexicon")

# ========================
# 3. Load Data
# ========================
courses = pd.read_csv("Coursera_courses.csv")
reviews = pd.read_csv("Coursera_reviews.csv")

print("Courses:", courses.shape)
print("Reviews:", reviews.shape)

# ========================
# 4. Preprocess Reviews
# ========================
stop_words = set(stopwords.words("english"))
lemmatizer = WordNetLemmatizer()

def clean_text(text):
    if pd.isnull(text):
        return ""
    text = text.lower()
    text = re.sub(r"[^a-zA-Z\s]", "", text)
    tokens = [lemmatizer.lemmatize(w) for w in text.split() if w not in stop_words]
    return " ".join(tokens)

reviews["clean_review"] = reviews["reviews"].astype(str).apply(clean_text)

# Label from rating
def label_from_rating(r):
    if r >= 4: return 1
    elif r <= 2: return -1
    else: return 0

reviews["label"] = reviews["rating"].apply(label_from_rating)

# ========================
# 5. Train/Test Split (Full dataset)
# ========================
X = reviews["clean_review"]
y = reviews["label"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)


# ========================
# 6. TF-IDF Vectorization (Full dataset)
# ========================
tfidf = TfidfVectorizer(max_features=20000, ngram_range=(1,2))
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)


# ========================
# 7. Model Training (Traditional ML)
# ========================
models = {
    "LogReg": LogisticRegression(max_iter=500),
    "NaiveBayes": MultinomialNB(),
    "RandomForest": RandomForestClassifier(n_estimators=100),
    "SVM": LinearSVC()
}

f1_scores = {}

# Train traditional ML models
for name, model in models.items():
    print(f"\nTraining {name}...")
    model.fit(X_train_tfidf, y_train)
    y_pred = model.predict(X_test_tfidf)
    f1 = f1_score(y_test, y_pred, average="macro")
    f1_scores[name] = f1
    print(f"{name} F1-score: {f1:.4f}")

# ========================
# 8. Train XGBoost separately with label mapping
# ========================
y_train_xgb = y_train.map({-1: 0, 0: 1, 1: 2})
y_test_xgb = y_test.map({-1: 0, 0: 1, 1: 2})

xgb_model = xgb.XGBClassifier(eval_metric="mlogloss", use_label_encoder=False)
print("\nTraining XGBoost...")
xgb_model.fit(X_train_tfidf, y_train_xgb)
y_pred_xgb = xgb_model.predict(X_test_tfidf)

# Map back to original labels
y_pred_xgb_orig = pd.Series(y_pred_xgb).map({0: -1, 1: 0, 2: 1})
f1_xgb = f1_score(y_test, y_pred_xgb_orig, average="macro")
f1_scores["XGBoost"] = f1_xgb
print(f"XGBoost F1-score: {f1_xgb:.4f}")

# ========================
# 9. Lexicon-Based (VADER)
# ========================
sid = SentimentIntensityAnalyzer()

def vader_label(text):
    score = sid.polarity_scores(text)["compound"]
    if score >= 0.05: return 1
    elif score <= -0.05: return -1
    else: return 0

y_pred_vader = X_test.apply(vader_label)
f1_vader = f1_score(y_test, y_pred_vader, average="macro")
f1_scores["VADER"] = f1_vader
print(f"\nVADER F1-score: {f1_vader:.4f}")

# ========================
# 10. Transformer (BERT) – small subset for speed
# ========================
# 10. Transformer (BERT) – fixed label mapping
# ========================
device = 0 if torch.cuda.is_available() else -1
bert_classifier = pipeline(
    "sentiment-analysis",
    model="cardiffnlp/twitter-roberta-base-sentiment",
    device=device
)

# Twitter Roberta labels: LABEL_0 = negative, LABEL_1 = neutral, LABEL_2 = positive
label_map = {"LABEL_0": -1, "LABEL_1": 0, "LABEL_2": 1}

sample_texts = list(X_test[:10])
bert_preds = []
for text in sample_texts:
    result = bert_classifier(text[:512])[0]
    bert_preds.append(label_map[result["label"]])

bert_f1 = f1_score(y_test[:10], bert_preds, average="macro")
f1_scores["BERT"] = bert_f1
print(f"\nBERT F1-score (10 samples): {bert_f1:.4f}")


# ========================
# 11. Ensemble (Voting Classifier)
# ========================
top_models = [
    ("LogReg", models["LogReg"]),
    ("NaiveBayes", models["NaiveBayes"]),
    ("RandomForest", models["RandomForest"]),
    ("SVM", models["SVM"])
]

voting_clf = VotingClassifier(estimators=top_models, voting="hard")
voting_clf.fit(X_train_tfidf, y_train)
y_pred_ensemble = voting_clf.predict(X_test_tfidf)
f1_ensemble = f1_score(y_test, y_pred_ensemble, average="macro")
f1_scores["Ensemble"] = f1_ensemble
print(f"\nEnsemble (Voting) F1-score: {f1_ensemble:.4f}")

# ========================
# 12. Compare All Models
# ========================
print("\n=== Final F1-score comparison ===")
for model, score in f1_scores.items():
    print(f"{model}: {score:.4f}")

best_model = max(f1_scores, key=f1_scores.get)
print(f"\nBest model based on F1-score: {best_model}")


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\jothi\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\jothi\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\jothi\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


Courses: (623, 4)
Reviews: (1454711, 5)


In [3]:
# ========================
# Cell 1: Choose Best Model
# ========================

# Print comparison again (for clarity)
print("\n=== Final F1-score comparison ===")
for model, score in f1_scores.items():
    print(f"{model}: {score:.4f}")

best_model_name = max(f1_scores, key=f1_scores.get)

# Select corresponding trained model
if best_model_name == "XGBoost":
    final_model = xgb_model
elif best_model_name == "VADER":
    final_model = "VADER"  # special case
elif best_model_name == "BERT":
    final_model = bert_classifier
elif best_model_name == "Ensemble":
    final_model = voting_clf
else:
    final_model = models[best_model_name]

print(f"\nBest Model Selected: {best_model_name}")



=== Final F1-score comparison ===
LogReg: 0.3424
NaiveBayes: 0.3424
RandomForest: 0.9067
SVM: 0.9252
XGBoost: 0.8989
VADER: 0.3963
BERT: 0.2667
Ensemble: 0.9252

Best Model Selected: SVM


In [5]:
# ========================
# Cell 2: Sentiment Demo & Course Ranking
# ========================

def predict_sentiment(text):
    text_clean = clean_text(text)

    if final_model == "VADER":
        return vader_label(text_clean)

    elif best_model_name == "BERT":
        result = final_model(text_clean[:512])[0]
        return label_map[result["label"]]

    elif best_model_name == "XGBoost":
        tfidf_vec = tfidf.transform([text_clean])
        pred = final_model.predict(tfidf_vec)[0]
        return {-1: -1, 0: 0, 1: 1, 2: 1}[pred]  # mapping safe

    else:  # ML/Ensemble
        tfidf_vec = tfidf.transform([text_clean])
        return final_model.predict(tfidf_vec)[0]


# ===== 1. User Review Sentiment =====
user_text = input("Enter a review text: ")
pred_label = predict_sentiment(user_text)
label_map_text = {1: "Positive", 0: "Neutral", -1: "Negative"}
print(f"\nSentiment: {label_map_text[pred_label]} ({pred_label})")


# ===== 2. Keyword → Course Ranking =====
keyword = input("\nEnter a course keyword (e.g., 'java'): ").lower()

# Compute sentiment for all reviews using best model
reviews["pred_label"] = reviews["clean_review"].apply(predict_sentiment)

# Aggregate sentiment score per course
course_sentiment = (
    reviews.groupby("course_id")["pred_label"].mean().reset_index()
)

# Merge with course info
course_sentiment = course_sentiment.merge(
    courses[["course_id", "course_title"]], on="course_id", how="left"
)

# Filter by keyword in title
filtered = course_sentiment[
    course_sentiment["course_title"].str.lower().str.contains(keyword)
]

# Sort by sentiment score descending
filtered_sorted = filtered.sort_values("pred_label", ascending=False)

print(f"\nTop courses for keyword '{keyword}' sorted by sentiment score:\n")
for _, row in filtered_sorted.iterrows():
    print(f"{row['course_title']}  --> Sentiment Score: {row['pred_label']:.2f}")


Enter a review text:  this is outdated data



Sentiment: Positive (1)



Enter a course keyword (e.g., 'java'):  python


KeyError: "['course_title'] not in index"