IMPORTING LIBRARIES


In [298]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, roc_auc_score

import joblib

IMPORTING DATA SET

In [299]:
# Load the raw student-activity table from the working directory and
# preview its first five rows as the cell output.
df = pd.read_csv(filepath_or_buffer="xAPI-Edu-Data.csv")
df.head(n=5)

Unnamed: 0,gender,NationalITy,PlaceofBirth,StageID,GradeID,SectionID,Topic,Semester,Relation,raisedhands,VisITedResources,AnnouncementsView,Discussion,ParentAnsweringSurvey,ParentschoolSatisfaction,StudentAbsenceDays,Class
0,M,KW,KuwaIT,lowerlevel,G-04,A,IT,F,Father,15,16,2,20,Yes,Good,Under-7,M
1,M,KW,KuwaIT,lowerlevel,G-04,A,IT,F,Father,20,20,3,25,Yes,Good,Under-7,M
2,M,KW,KuwaIT,lowerlevel,G-04,A,IT,F,Father,10,7,0,30,No,Bad,Above-7,L
3,M,KW,KuwaIT,lowerlevel,G-04,A,IT,F,Father,30,25,5,35,No,Bad,Above-7,L
4,M,KW,KuwaIT,lowerlevel,G-04,A,IT,F,Father,40,50,12,50,No,Bad,Above-7,M


EXPLORING THE DATA AND DEFINING X AND y

In [300]:
# (rows, columns) of the freshly loaded frame.
df.shape

(480, 17)

In [301]:
# Per-column count of missing values in the raw frame.
df.isna().sum()

Unnamed: 0,0
gender,0
NationalITy,0
PlaceofBirth,0
StageID,0
GradeID,0
SectionID,0
Topic,0
Semester,0
Relation,0
raisedhands,0


In [302]:
# Column names, non-null counts and dtypes (object vs int64) of the raw frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 480 entries, 0 to 479
Data columns (total 17 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   gender                    480 non-null    object
 1   NationalITy               480 non-null    object
 2   PlaceofBirth              480 non-null    object
 3   StageID                   480 non-null    object
 4   GradeID                   480 non-null    object
 5   SectionID                 480 non-null    object
 6   Topic                     480 non-null    object
 7   Semester                  480 non-null    object
 8   Relation                  480 non-null    object
 9   raisedhands               480 non-null    int64 
 10  VisITedResources          480 non-null    int64 
 11  AnnouncementsView         480 non-null    int64 
 12  Discussion                480 non-null    int64 
 13  ParentAnsweringSurvey     480 non-null    object
 14  ParentschoolSatisfaction  

In [303]:
# Summary statistics for the numeric columns only (pandas' default for describe()).
df.describe()

Unnamed: 0,raisedhands,VisITedResources,AnnouncementsView,Discussion
count,480.0,480.0,480.0,480.0
mean,46.775,54.797917,37.91875,43.283333
std,30.779223,33.080007,26.611244,27.637735
min,0.0,0.0,0.0,1.0
25%,15.75,20.0,14.0,20.0
50%,50.0,65.0,33.0,39.0
75%,75.0,84.0,58.0,70.0
max,100.0,99.0,98.0,99.0


In [304]:
# Binary label: 1 when Class is 'L', 0 for every other class value.
df['target'] = df['Class'].eq('L').astype('int64')

In [305]:
# 'Class' is now encoded in 'target'; the two geographic columns are
# excluded from the feature set as well.
df = df.drop(columns=['Class', 'NationalITy', 'PlaceofBirth'])

In [306]:
# Re-check missing values after adding 'target' and dropping columns.
df.isna().sum()

Unnamed: 0,0
gender,0
StageID,0
GradeID,0
SectionID,0
Topic,0
Semester,0
Relation,0
raisedhands,0
VisITedResources,0
AnnouncementsView,0


In [307]:
# Separate the label from the predictors; X is every column except 'target'.
y = df['target']
X = df.drop(columns=['target'])

In [308]:
# Every object-typed predictor is treated as categorical.
cat_cols = X.select_dtypes(include=['object']).columns

# One-hot encode the categorical columns (first level dropped, unseen
# levels ignored at transform time); all other columns pass through unchanged.
ct = ColumnTransformer(
    remainder='passthrough',
    transformers=[
        ("encoder", OneHotEncoder(handle_unknown='ignore', drop='first'), cat_cols),
    ],
)

In [309]:
# 75/25 split, stratified on the label so both parts keep the same class
# ratio; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, test_size=0.25, random_state=42
)

In [310]:
# Preprocessing and classifier chained together so the fitted encoder
# always travels with the model.
pipeline = Pipeline([
    ('preprocessing', ct),
    ('model', LogisticRegression(max_iter=1000)),
])

In [311]:
# Fit the encoder and the logistic regression on the training split only.
pipeline.fit(X=X_train, y=y_train)

The format of the columns of the 'remainder' transformer in ColumnTransformer.transformers_ will change in version 1.7 to match the format of the other transformers.
At the moment the remainder columns are stored as indices (of type int). With the same ColumnTransformer configuration, in the future they will be stored as column names (of type str).



In [312]:
# Hard class predictions and positive-class probabilities for the held-out split.
y_pred = pipeline.predict(X_test)
y_prob = pipeline.predict_proba(X_test)[:, 1]

# Side-by-side view of predicted vs. true labels, indexed like y_test.
compare = pd.DataFrame(dict(y_pred=y_pred, y_test=y_test))
compare

Unnamed: 0,y_pred,y_test
305,0,0
1,0,0
149,0,0
332,0,0
360,0,0
...,...,...
334,1,1
260,1,1
354,0,0
407,1,1


In [313]:
# Per-class precision/recall/F1 from the hard predictions, then the
# threshold-independent ROC-AUC from the predicted probabilities.
print(classification_report(y_true=y_test, y_pred=y_pred))
print("ROC-AUC:", roc_auc_score(y_true=y_test, y_score=y_prob))

              precision    recall  f1-score   support

           0       0.93      0.99      0.96        88
           1       0.96      0.78      0.86        32

    accuracy                           0.93       120
   macro avg       0.94      0.88      0.91       120
weighted avg       0.94      0.93      0.93       120

ROC-AUC: 0.9879261363636365


In [314]:
# Positive-class probability for every student in X (training and test
# rows alike), stored next to the original columns.
df = df.assign(risk_score=pipeline.predict_proba(X)[:, 1])

In [315]:
def risk_label(score, low_threshold=0.3, high_threshold=0.6):
    """Map a risk score to a coarse risk band.

    Parameters
    ----------
    score : float
        Risk score to classify (here: a predicted probability).
    low_threshold : float, default 0.3
        Scores strictly below this value are labelled "Low".
    high_threshold : float, default 0.6
        Scores at or above this value are labelled "High".

    Returns
    -------
    str
        "Low", "Medium" or "High".

    Raises
    ------
    ValueError
        If ``low_threshold`` is greater than ``high_threshold``.
    """
    # Guard against swapped cut-offs, which would silently remove the
    # "Medium" band and mislabel scores.
    if low_threshold > high_threshold:
        raise ValueError("low_threshold must not exceed high_threshold")

    if score < low_threshold:
        return "Low"
    if score < high_threshold:
        return "Medium"
    return "High"

# Band each student's score into Low / Medium / High.
df['risk_label'] = df['risk_score'].map(risk_label)

In [316]:
def risk_reason(row):
    """Build a human-readable explanation of a student's risk factors.

    ``row`` is any mapping-like object with a ``.get`` method (a dict or a
    DataFrame row). A missing numeric field defaults to 0 and a missing
    ``StudentAbsenceDays`` defaults to "Under-7".

    Returns the triggered reasons joined with " | ", or a generic
    fallback message when no rule fires.
    """
    # (column, exclusive lower limit, message) — evaluated in this order.
    engagement_rules = (
        ("raisedhands", 5, "Very low class participation"),
        ("VisITedResources", 10, "Rarely uses learning resources"),
        ("AnnouncementsView", 5, "Not checking course announcements"),
    )
    found = [
        message
        for column, limit, message in engagement_rules
        if row.get(column, 0) < limit
    ]

    if row.get("StudentAbsenceDays", "Under-7") == "Above-7":
        found.append("High number of absences")

    return " | ".join(found or ["General low academic engagement"])


def advisor_suggestions(row):
    """Build advisor actions matching a student's risk factors.

    ``row`` is any mapping-like object with a ``.get`` method (a dict or a
    DataFrame row). A missing numeric field defaults to 0 and a missing
    ``StudentAbsenceDays`` defaults to "Under-7".

    Returns the recommended actions joined with " | ", or a generic
    mentoring recommendation when no rule fires.
    """
    # Each entry: (column, exclusive lower limit, recommended action).
    thresholds = [
        ("raisedhands", 5, "Encourage active participation in class."),
        ("VisITedResources", 10, "Guide student to use online materials regularly."),
        ("AnnouncementsView", 5, "Ask student to check announcements daily."),
    ]

    recommended = []
    for field, minimum, advice in thresholds:
        if row.get(field, 0) < minimum:
            recommended.append(advice)

    absence_band = row.get("StudentAbsenceDays", "Under-7")
    if absence_band == "Above-7":
        recommended.append("Discuss attendance issues and provide support.")

    if len(recommended) == 0:
        recommended.append("Provide regular mentoring and monitor progress.")

    return " | ".join(recommended)


In [317]:
# Hard dropout flag: 1 for scores of at least 0.6, the same cut-off that
# risk_label uses for the "High" band.
df['predicted_dropout'] = df['risk_score'].ge(0.6).astype(int)

# Row-wise explanations and matching advisor actions.
df["risk_reason"] = df.apply(risk_reason, axis=1)
df["advisor_action"] = df.apply(advisor_suggestions, axis=1)

# The row index serves as the student identifier in the exported file.
output = (
    df.reset_index()[['index', 'risk_score', 'risk_label', 'predicted_dropout']]
    .rename(columns={'index': 'student_id'})
)
output.to_csv("student_dropout_predictions.csv", index=False)

In [318]:
# Keep only the students banded as "High" risk.
high_risk_students = df.loc[df["risk_label"] == "High"]

# Advisor-facing export: identifier, score, band, flag, and the textual
# reason/action columns.
advisor_columns = [
    "index", "risk_score", "risk_label", "predicted_dropout",
    "risk_reason", "advisor_action",
]
advisors_output = (
    high_risk_students.reset_index()[advisor_columns]
    .rename(columns={"index": "student_id"})
)
advisors_output.to_csv("high_risk_students.csv", index=False)


In [319]:
# Names of the transformed features, in the same order as the model's
# coefficient vector.
feature_names = pipeline['preprocessing'].get_feature_names_out()
coefficients = pipeline['model'].coef_[0]

# Signed logistic-regression coefficients, largest positive first
# (positive values push the prediction towards target == 1).
feature_importance = pd.DataFrame(
    {'feature': feature_names, 'importance': coefficients}
)
feature_importance = feature_importance.sort_values('importance', ascending=False)

print(feature_importance.shape, feature_importance.head())

(34, 2)                      feature  importance
15  encoder__Topic_Chemistry    1.380849
7      encoder__GradeID_G-08    0.849588
0          encoder__gender_M    0.600299
22      encoder__Topic_Quran    0.487176
4      encoder__GradeID_G-05    0.385134


In [320]:
# Persist the coefficient table; app.py reads this file at start-up.
feature_importance.to_csv(path_or_buf="feature_importance.csv", index=False)

In [321]:
# Persist the full fitted pipeline (loaded by app.py) and, separately,
# the preprocessing step on its own.
joblib.dump(value=pipeline, filename="dropout_pipeline.pkl")
joblib.dump(value=ct, filename="preprocessing_pipeline.pkl")

['preprocessing_pipeline.pkl']

In [322]:
%%writefile app.py
import streamlit as st
import pandas as pd
import joblib

st.set_page_config(page_title="Student Dropout Early Warning System", layout="wide")

# Load model and feature importance
model = joblib.load("dropout_pipeline.pkl")
feature_importance = pd.read_csv("feature_importance.csv")

st.title("üéì Student Dropout Early Warning System")
st.write("Upload a student CSV file to see dropout risk and advisor suggestions.")

uploaded_file = st.file_uploader("Upload Student CSV File", type=["csv"])

if uploaded_file is not None:
    df = pd.read_csv(uploaded_file)

    st.subheader("üìÑ Uploaded Data Preview")
    st.dataframe(df.head())

    # Predict risk
    probs = model.predict_proba(df)[:, 1]

    def risk_label(score):
        if score < 0.3:
            return "Low"
        elif score < 0.6:
            return "Medium"
        else:
            return "High"

    df["risk_score"] = probs
    df["risk_label"] = df["risk_score"].apply(risk_label)
    df["predicted_dropout"] = (df["risk_score"] >= 0.6).astype(int)

    st.subheader("üî¥ Top 20 High-Risk Students")
    top_risk = df.sort_values("risk_score", ascending=False).head(20)
    st.dataframe(top_risk)

    st.subheader("üîç Select a Student")
    idx = st.selectbox("Select student row index:", df.index)

    st.write("### üéØ Risk Details")
    st.write(df.loc[idx, ["risk_score", "risk_label", "predicted_dropout"]])

    st.subheader("üìä Top Reasons (Feature Importance)")
    st.dataframe(feature_importance.head(10))




Overwriting app.py


In [323]:
%%writefile requirements.txt
# Runtime dependencies of app.py.
# NOTE(review): versions are unpinned. app.py loads a pickled scikit-learn
# pipeline with joblib, so consider pinning scikit-learn (and joblib) to the
# versions used for training — TODO confirm the training versions.
streamlit
pandas
scikit-learn
joblib


Writing requirements.txt
