In [1]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import gradio as gr
import matplotlib.pyplot as plt


In [2]:
# Default path of the course catalogue CSV.
DATASET_FILE = "udemy_courses.csv"
# Minimum completion percentage for a course to count as "completed"
# when building a user's interest profile.
COMPLETED_THRESHOLD = 70.0
# Baseline user-risk / course-risk blend; the same values that
# get_dynamic_weights returns for a moderate-risk learner.
WEIGHT_USER = 0.6
WEIGHT_COURSE = 0.4
# recommend_course_with_risk ranks a candidate pool of
# N_RECOMMENDATIONS * 5 most-similar courses.
N_RECOMMENDATIONS = 10
# Number of synthetic users generated by init_data (IDs 1..N_USERS).
N_USERS = 200


In [3]:
# Module-level state: populated by init_data() and read by
# recommend_course_with_risk(). All three stay None until init_data() runs.
df_courses = None          # course catalogue DataFrame
course_embeddings = None   # sentence embeddings of each course's text
df_user_history = None     # simulated per-user completion history


In [4]:
# Colab-only cell: prompts for a file upload. `uploaded` is keyed by the
# uploaded filename, and init_data() reads the first key as the dataset path.
from google.colab import files
uploaded = files.upload()


Saving udemy_course_data.csv to udemy_course_data.csv


In [5]:
def _simulate_user_history(course_ids, n_users, seed=42):
    """Generate a synthetic completion history for ``n_users`` learners.

    Each user is assigned 6-17 distinct courses and one learner type
    ("strong", "average" or "weak", drawn with probabilities 0.3/0.4/0.3).
    The learner type fixes the range from which every completion
    percentage for that user is drawn uniformly.

    Args:
        course_ids: 1-D array of unique course identifiers to sample from.
        n_users: Number of users to simulate; IDs run from 1 to ``n_users``.
        seed: Seed for the private random generator.

    Returns:
        DataFrame with columns ``User_ID``, ``Course_ID`` and
        ``Actual_Completion (%)``.
    """
    # A private RandomState produces the same stream as np.random.seed(seed)
    # followed by the np.random.* functions, without clobbering the
    # notebook-wide global RNG state.
    rng = np.random.RandomState(seed)

    completion_ranges = {
        "strong": (70, 100),
        "average": (40, 90),
        "weak": (10, 60),
    }

    records = []
    for user_id in range(1, n_users + 1):
        # Never request more distinct courses than the catalogue contains,
        # otherwise choice(..., replace=False) raises.
        n_courses = min(rng.randint(6, 18), len(course_ids))
        courses = rng.choice(course_ids, n_courses, replace=False)

        learner_type = rng.choice(
            ["strong", "average", "weak"],
            p=[0.3, 0.4, 0.3]
        )
        low, high = completion_ranges[learner_type]

        for course_id in courses:
            records.append({
                "User_ID": user_id,
                "Course_ID": course_id,
                "Actual_Completion (%)": rng.uniform(low, high)
            })

    # Explicit columns keep the frame well-formed even when n_users == 0.
    return pd.DataFrame(
        records,
        columns=["User_ID", "Course_ID", "Actual_Completion (%)"]
    )


def init_data():
    """Load the course catalogue, embed it and simulate user histories.

    Populates the module-level ``df_courses``, ``course_embeddings`` and
    ``df_user_history``. The globals are only assigned once every step has
    succeeded, so a failure does not leave them half-initialised.

    The dataset path is the first file uploaded through Colab when an
    ``uploaded`` mapping exists and is non-empty; otherwise ``DATASET_FILE``.

    Returns:
        True once all global state has been populated.

    Raises:
        ValueError: If a required column is missing or the dataset is empty.
    """
    global df_courses, course_embeddings, df_user_history

    # `uploaded` only exists when the Colab upload cell ran and the user
    # actually picked a file; fall back to the configured path otherwise.
    uploaded_files = globals().get("uploaded") or {}
    dataset_file = next(iter(uploaded_files), DATASET_FILE)
    courses = pd.read_csv(dataset_file)

    required_cols = ["course_id", "course_title", "subject", "level", "num_subscribers"]
    missing = [col for col in required_cols if col not in courses.columns]
    if missing:
        raise ValueError(f"Missing column: {', '.join(missing)}")

    # Duplicate course_id rows would let one user receive the same course
    # twice and make ID-based lookups ambiguous; keep the first occurrence.
    # Resetting the index keeps row positions aligned with the embeddings.
    courses = courses.drop_duplicates(subset="course_id").reset_index(drop=True)
    if courses.empty:
        raise ValueError("Dataset contains no courses")

    # Course text for BERT
    courses["text"] = (
        courses["course_title"].astype(str) + " " +
        courses["subject"].astype(str) + " " +
        courses["level"].astype(str)
    )

    model = SentenceTransformer("all-MiniLM-L6-v2")
    embeddings = np.asarray(model.encode(courses["text"].tolist()))

    # Course difficulty proxy: fewer subscribers -> higher assumed dropout.
    # Unparseable or missing subscriber counts are treated as zero.
    subscribers = pd.to_numeric(courses["num_subscribers"], errors="coerce").fillna(0)
    max_sub = subscribers.max()
    if max_sub > 0:
        courses["Intrinsic_Dropout_Rate"] = 100 - (subscribers / max_sub) * 100
    else:
        # Avoid 0/0 -> NaN when no course has any subscribers.
        courses["Intrinsic_Dropout_Rate"] = 100.0

    history = _simulate_user_history(courses["course_id"].to_numpy(), N_USERS)
    history["Dropout (%)"] = 100 - history["Actual_Completion (%)"]

    df_courses = courses
    course_embeddings = embeddings
    df_user_history = history

    print("Dataset Loaded Successfully")
    print("Courses:", len(df_courses))
    print("Users:", N_USERS)

    return True


In [6]:
def classify_learner(user_avg_risk):
    """Return the learner-profile label for an average dropout percentage.

    Below 30 is "Low Risk Learner", below 60 is "Moderate Risk Learner",
    and anything else is "High Risk Learner".
    """
    # (exclusive upper bound, label) pairs, checked in ascending order.
    risk_bands = (
        (30, "Low Risk Learner"),
        (60, "Moderate Risk Learner"),
    )
    for upper_bound, label in risk_bands:
        if user_avg_risk < upper_bound:
            return label
    return "High Risk Learner"


In [7]:
def get_dynamic_weights(user_avg_risk):
    """Return ``(user_weight, course_weight)`` for blending dropout risk.

    The bands use the same cut-offs as ``classify_learner`` so that a
    learner's label and weights always agree:

    * risk >= 60 (high risk): ``(0.8, 0.2)``, lean on the user's own history.
    * risk < 30 (low risk): ``(0.4, 0.6)``, allow more challenging courses.
    * otherwise (moderate): ``(0.6, 0.4)``.

    Args:
        user_avg_risk: The user's mean historical dropout percentage.

    Returns:
        Tuple of two floats summing to 1.0.
    """
    # ">=" rather than ">" so a risk of exactly 60, which classify_learner
    # labels high risk, also gets the conservative weights.
    if user_avg_risk >= 60:
        return 0.8, 0.2   # be conservative
    if user_avg_risk < 30:
        return 0.4, 0.6   # allow challenge
    return 0.6, 0.4


In [8]:
def recommend_course_with_risk(user_id):
    """Recommend courses for a user, penalising likely dropouts.

    The user's interest vector is the mean embedding of the courses they
    completed (completion >= ``COMPLETED_THRESHOLD``). Courses not yet in
    the user's history are ranked by cosine similarity to that vector, the
    top ``N_RECOMMENDATIONS * 5`` form the candidate pool, and the pool is
    re-ranked by ``similarity - predicted_dropout_risk / 100``.

    All lookups into ``course_embeddings`` use row positions of
    ``df_courses``, so the function works with duplicate ``course_id``
    values and with a non-default DataFrame index.

    Args:
        user_id: Integer user identifier as stored in ``df_user_history``.

    Returns:
        Tuple ``(message, predicted_risk, user_avg_risk, history, recs)``:
        a Markdown summary, the predicted dropout risk of the best course
        (the user's own average when no course could be scored), the
        user's mean historical dropout, a DataFrame of the user's history
        and a DataFrame of recommended courses. For an unknown user the
        tuple is ``("User not found", 0, 0, <empty>, <empty>)``.
    """
    user_history = df_user_history[df_user_history["User_ID"] == user_id]

    if user_history.empty:
        return "User not found",0,0,pd.DataFrame(),pd.DataFrame()

    user_avg_risk = float(user_history["Dropout (%)"].mean())
    learner_type = classify_learner(user_avg_risk)
    wu, wc = get_dynamic_weights(user_avg_risk)

    # Same history view on every return path.
    history_display = user_history[["Course_ID","Dropout (%)"]]

    completed = user_history[
        user_history["Actual_Completion (%)"] >= COMPLETED_THRESHOLD
    ]

    # Map each course id to the position of its first row; positions (not
    # index labels) are what line up with the embedding matrix.
    course_ids = df_courses["course_id"].to_numpy()
    first_position = {}
    for position, course_id in enumerate(course_ids):
        first_position.setdefault(course_id, position)

    completed_positions = [
        first_position[cid]
        for cid in completed["Course_ID"]
        if cid in first_position
    ]

    # Cold-start handling: nothing completed (or nothing we can embed).
    if not completed_positions:
        recs = df_courses.sort_values("Intrinsic_Dropout_Rate").head(N_RECOMMENDATIONS)
        message = f"""
### 🥇 Safe Starter Courses

**Learner Profile:** {learner_type}
(No completed courses found – cold start strategy applied)
"""
        return message, user_avg_risk, user_avg_risk, history_display, recs

    embeddings = np.asarray(course_embeddings)
    user_vector = embeddings[completed_positions].mean(axis=0).reshape(1,-1)
    sims = cosine_similarity(user_vector, embeddings)[0]

    # Exclude every course already present in the user's history.
    taken_mask = np.isin(course_ids, user_history["Course_ID"].to_numpy())
    candidate_positions = np.flatnonzero(~taken_mask)

    if candidate_positions.size == 0:
        # Without this guard recs would be empty and recs.iloc[0] would raise.
        message = f"""
### No New Courses Available

**Learner Profile:** {learner_type}
(Every course in the catalogue is already in this user's history)
"""
        return message, user_avg_risk, user_avg_risk, history_display, pd.DataFrame()

    # Candidate pool: the most similar unseen courses, kept in catalogue order.
    pool_size = N_RECOMMENDATIONS * 5
    by_similarity = candidate_positions[np.argsort(sims[candidate_positions])[::-1]]
    top_positions = np.sort(by_similarity[:pool_size])

    recs = df_courses.iloc[top_positions].copy()
    recs["Similarity_Score"] = sims[top_positions]

    recs["Predicted_Dropout_Risk (%)"] = (
        wu * user_avg_risk + wc * recs["Intrinsic_Dropout_Rate"]
    )

    # Risk is a percentage; divide by 100 to put it on the similarity scale.
    recs["Recommendation_Score"] = (
        recs["Similarity_Score"] -
        recs["Predicted_Dropout_Risk (%)"]/100
    )

    recs = recs.sort_values("Recommendation_Score", ascending=False)
    best = recs.iloc[0]

    message = f"""
### 🥇 Best Personalized Course: {best['course_title']}

**Learner Profile:** {learner_type}
**Dynamic Risk Weights:** User={wu}, Course={wc}

| Metric | Value |
|------|------|
| Subject | {best['subject']} |
| Level | {best['level']} |
| Similarity | {best['Similarity_Score']:.4f} |
| Predicted Risk | {best['Predicted_Dropout_Risk (%)']:.2f}% |
"""

    top_recs = recs.head(N_RECOMMENDATIONS)[
        ["course_title","subject","level",
         "Similarity_Score","Predicted_Dropout_Risk (%)"]
    ]

    return (
        message,
        float(best["Predicted_Dropout_Risk (%)"]),
        user_avg_risk,
        history_display,
        top_recs,
    )


In [9]:
def create_risk_meter(predicted,user_risk):
    """Draw a horizontal 0-100 gauge showing the predicted dropout risk.

    Args:
        predicted: Predicted dropout risk in percent. The bar is clamped
            to the 0-100 track; the label shows the value as given.
        user_risk: Accepted for interface compatibility; not drawn.

    Returns:
        The matplotlib Figure. It is detached from pyplot before being
        returned, so repeated calls do not accumulate open figures.
    """
    # Keep the bar and its label inside the 0-100 track.
    level = min(max(float(predicted), 0.0), 100.0)

    fig,ax = plt.subplots(figsize=(4,1))
    ax.barh([0],[100],color="lightgray")
    ax.barh([0],[level],color="orange")
    ax.set_xlim(0,100)
    ax.set_yticks([])
    ax.set_title("Predicted Dropout Risk")
    # Near the right edge, anchor the label on its right side so the text
    # is not clipped by the axes boundary.
    ax.text(level,0,f"{predicted:.1f}%",
            ha="right" if level > 85 else "left",
            va="center")

    # pyplot keeps every figure alive until it is closed. Closing here only
    # drops pyplot's reference; the Figure object can still be rendered.
    plt.close(fig)
    return fig


In [10]:
def handle(uid):
    """Gradio callback: run the recommender for the ID typed by the user.

    Args:
        uid: User ID as received from the UI (normally a string).

    Returns:
        Tuple ``(message, user_risk, history, top_courses, figure)`` in
        the order expected by the click handler's outputs. When ``uid``
        is not a whole number, the message explains the problem and the
        remaining outputs are empty or zero instead of raising.
    """
    try:
        # Tolerate surrounding whitespace from the textbox.
        user_id = int(uid.strip()) if isinstance(uid, str) else int(uid)
    except (TypeError, ValueError):
        msg = f"Invalid User ID: {uid!r}. Please enter a whole number."
        return msg, 0, pd.DataFrame(), pd.DataFrame(), create_risk_meter(0, 0)

    msg,pr,ur,hist,top10 = recommend_course_with_risk(user_id)
    fig = create_risk_meter(pr,ur)
    return msg,ur,hist,top10,fig


In [11]:
# Build and launch the UI only once the data and embeddings are ready.
if init_data():

    with gr.Blocks(theme=gr.themes.Soft(),
                   title="Risk-Adjusted Recommender") as demo:

        # The advertised ID range follows N_USERS so the hint cannot go
        # stale when the number of simulated users is changed.
        gr.Markdown(f"""
# Risk-Adjusted Course Recommendation Engine
### Adaptive & Risk-Aware MOOC Recommendation System
Enter User ID (1–{N_USERS})
""")

        with gr.Row():
            uid = gr.Textbox(value="1",label="Enter User ID")
            btn = gr.Button("Analyze User & Recommend Course")

        gr.HTML("<hr>")

        with gr.Tabs():
            with gr.TabItem("🥇 Recommendation"):
                main_out = gr.Markdown()
                risk_plot = gr.Plot()
                user_prop = gr.Number(label="User Historical Dropout (%)")

            with gr.TabItem("📜 User History"):
                hist_out = gr.DataFrame()

            with gr.TabItem("📊 Top Courses"):
                top10_out = gr.DataFrame()

        # Output order must match the tuple returned by handle().
        btn.click(
            fn=handle,
            inputs=uid,
            outputs=[main_out,user_prop,hist_out,top10_out,risk_plot]
        )

    # share=True exposes the app through a temporary public URL.
    demo.launch(share=True)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Loading weights:   0%|          | 0/103 [00:00<?, ?it/s]

BertModel LOAD REPORT from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.


tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Dataset Loaded Successfully
Courses: 3683
Users: 200


  with gr.Blocks(theme=gr.themes.Soft(),


Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://c7a250f8ccbfc86d83.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)
