In [4]:
# ========================
# 1. Imports & Data Loading
# ========================
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.svm import LinearSVC
import xgboost as xgb
from sklearn.metrics import f1_score, classification_report
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from transformers import pipeline
import torch

# ========================
# 2. NLTK Downloads
# ========================
nltk.download("stopwords")
nltk.download("wordnet")
nltk.download("vader_lexicon")

# ========================
# 3. Load Data
# ========================
courses = pd.read_csv("Coursera_courses.csv")
reviews = pd.read_csv("Coursera_reviews.csv")
print("Courses:", courses.shape)
print("Reviews:", reviews.shape)

# ========================
# 4. Preprocess Reviews
# ========================
stop_words = set(stopwords.words("english"))
lemmatizer = WordNetLemmatizer()

def clean_text(text):
    if pd.isnull(text):
        return ""
    text = text.lower()
    text = re.sub(r"[^a-zA-Z\s]", "", text)
    tokens = [lemmatizer.lemmatize(w) for w in text.split() if w not in stop_words]
    return " ".join(tokens)

reviews["clean_review"] = reviews["reviews"].astype(str).apply(clean_text)

# Label from rating
def label_from_rating(r):
    if r >= 4: return 1
    elif r <= 2: return -1
    else: return 0

reviews["label"] = reviews["rating"].apply(label_from_rating)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\jothi\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\jothi\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\jothi\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


Courses: (623, 4)
Reviews: (1454711, 5)


In [None]:
# ========================
# Cell 1: Train Logistic Regression
# ========================
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

# Sample for speed
sample_size = 5000
X = reviews["clean_review"][:sample_size]
y = reviews["label"][:sample_size]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# TF-IDF
tfidf = TfidfVectorizer(max_features=20000, ngram_range=(1,2))
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

# Logistic Regression
logreg = LogisticRegression(max_iter=500)
logreg.fit(X_train_tfidf, y_train)

# Predict & F1
y_pred = logreg.predict(X_test_tfidf)
f1_logreg = f1_score(y_test, y_pred, average="macro")
print(f"Logistic Regression F1-score: {f1_logreg:.4f}")


In [16]:
# ========================
# Cell 2: Train Naive Bayes
# ========================
from sklearn.naive_bayes import MultinomialNB

nb = MultinomialNB()
nb.fit(X_train_tfidf, y_train)

y_pred = nb.predict(X_test_tfidf)
f1_nb = f1_score(y_test, y_pred, average="macro")
print(f"Naive Bayes F1-score: {f1_nb:.4f}")


Naive Bayes F1-score: 0.5171


In [None]:
# ========================
# Cell 3: Train Random Forest
# ========================
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(n_estimators=100)
rf.fit(X_train_tfidf, y_train)

y_pred = rf.predict(X_test_tfidf)
f1_rf = f1_score(y_test, y_pred, average="macro")
print(f"Random Forest F1-score: {f1_rf:.4f}")


In [None]:
# ========================
# Cell 4: Train SVM
# ========================
from sklearn.svm import LinearSVC

svm = LinearSVC()
svm.fit(X_train_tfidf, y_train)

y_pred = svm.predict(X_test_tfidf)
f1_svm = f1_score(y_test, y_pred, average="macro")
print(f"SVM F1-score: {f1_svm:.4f}")


In [None]:
# ========================
# Cell 5: Train XGBoost
# ========================
import xgboost as xgb

y_train_xgb = y_train.map({-1:0, 0:1, 1:2})
y_test_xgb = y_test.map({-1:0, 0:1, 1:2})

xgb_model = xgb.XGBClassifier(eval_metric="mlogloss", use_label_encoder=False)
xgb_model.fit(X_train_tfidf, y_train_xgb)

y_pred_xgb = xgb_model.predict(X_test_tfidf)
y_pred_xgb_orig = pd.Series(y_pred_xgb).map({0:-1,1:0,2:1})
f1_xgb = f1_score(y_test, y_pred_xgb_orig, average="macro")
print(f"XGBoost F1-score: {f1_xgb:.4f}")


In [None]:
# ========================
# Cell 6: VADER
# ========================
from nltk.sentiment.vader import SentimentIntensityAnalyzer
sid = SentimentIntensityAnalyzer()

def vader_label(text):
    score = sid.polarity_scores(text)["compound"]
    if score >= 0.05: return 1
    elif score <= -0.05: return -1
    else: return 0

y_pred_vader = X_test.apply(vader_label)
f1_vader = f1_score(y_test, y_pred_vader, average="macro")
print(f"VADER F1-score: {f1_vader:.4f}")


In [None]:
# ========================
# Cell 7: BERT
# ========================
from transformers import pipeline
import torch

device = 0 if torch.cuda.is_available() else -1
bert_classifier = pipeline(
    "sentiment-analysis",
    model="cardiffnlp/twitter-roberta-base-sentiment",
    device=device
)

label_map = {"LABEL_0":-1, "LABEL_1":0, "LABEL_2":1}

# Quick test on sample
sample_texts = list(X_test[:10])
bert_preds = [label_map[bert_classifier(t[:512])[0]["label"]] for t in sample_texts]
f1_bert = f1_score(y_test[:10], bert_preds, average="macro")
print(f"BERT F1-score (sample): {f1_bert:.4f}")


In [None]:
# ========================
# Cell 8: Choose Best Model
# ========================

f1_scores = {
    "LogReg": f1_logreg,
    "NaiveBayes": f1_nb,
    "RandomForest": f1_rf,
    "SVM": f1_svm,
    "XGBoost": f1_xgb,
    "VADER": f1_vader,
    "BERT": f1_bert
}

best_model_name = max(f1_scores, key=f1_scores.get)

if best_model_name == "XGBoost":
    final_model = xgb_model
elif best_model_name == "VADER":
    final_model = "VADER"
elif best_model_name == "BERT":
    final_model = bert_classifier
elif best_model_name in ["LogReg","NaiveBayes","RandomForest","SVM"]:
    final_model = {"LogReg":logreg,"NaiveBayes":nb,"RandomForest":rf,"SVM":svm}[best_model_name]

print(f" Best Model Selected: {best_model_name}")


In [None]:
# ========================
# Cell 9: Sentiment + Course Ranking
# ========================

def predict_sentiment(text):
    text_clean = clean_text(text)
    if final_model=="VADER":
        return vader_label(text_clean)
    elif best_model_name=="BERT":
        return label_map[final_model(text_clean[:512])[0]["label"]]
    elif best_model_name=="XGBoost":
        pred = final_model.predict(tfidf.transform([text_clean]))[0]
        return {-1:-1,0:0,1:1,2:1}[pred]
    else:
        return final_model.predict(tfidf.transform([text_clean]))[0]

# 1️⃣ Single Review
user_text = input("Enter a review text: ")
pred_label = predict_sentiment(user_text)
label_map_text = {1:"Positive",0:"Neutral",-1:"Negative"}
print(f"\nSentiment: {label_map_text[pred_label]} ({pred_label})")

# 2️⃣ Course Ranking
keyword = input("\nEnter a course keyword: ").lower()

# Predict sentiment for all reviews
reviews["pred_label"] = reviews["clean_review"].apply(predict_sentiment)

# Aggregate per course
course_summary = reviews.groupby("course_id")["pred_label"].agg(
    avg_score="mean",
    pos_pct=lambda x: (x==1).mean()*100,
    neu_pct=lambda x: (x==0).mean()*100,
    neg_pct=lambda x: (x==-1).mean()*100,
    polarity=lambda x: x.std()  # high std = polarized
).reset_index()

course_summary = course_summary.merge(
    courses[["course_id","course_title"]], on="course_id", how="left"
)

# Filter keyword
filtered = course_summary[course_summary["course_title"].str.lower().str.contains(keyword)]
filtered_sorted = filtered.sort_values("avg_score", ascending=False)

# Print with mixed review handling
print(f"\nTop courses for '{keyword}':\n")
for _, row in filtered_sorted.iterrows():
    polar_flag = "⚡ Polarized" if row["polarity"]>0.5 else ""
    print(f"{row['course_title']}  --> Avg: {row['avg_score']:.2f} | "
          f"Pos: {row['pos_pct']:.1f}%  Neu: {row['neu_pct']:.1f}%  Neg: {row['neg_pct']:.1f}% {polar_flag}")
