In [49]:
import pandas as pd

In [50]:
data = pd.read_csv("/content/student-combined.csv", sep=';')

In [51]:
data.head()

Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,...,famrel,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3
0,GP,F,18,U,GT3,A,4,4,at_home,teacher,...,4,3,4,1,1,3,6,5,6,6
1,GP,F,17,U,GT3,T,1,1,at_home,other,...,5,3,3,1,1,3,4,5,5,6
2,GP,F,15,U,LE3,T,1,1,at_home,other,...,4,3,2,2,3,3,10,7,8,10
3,GP,F,15,U,GT3,T,4,2,health,services,...,3,2,2,1,1,5,2,15,14,15
4,GP,F,16,U,GT3,T,3,3,other,other,...,4,3,2,1,2,5,4,6,10,10


In [52]:
data.columns

Index(['school', 'sex', 'age', 'address', 'famsize', 'Pstatus', 'Medu', 'Fedu',
       'Mjob', 'Fjob', 'reason', 'guardian', 'traveltime', 'studytime',
       'failures', 'schoolsup', 'famsup', 'paid', 'activities', 'nursery',
       'higher', 'internet', 'romantic', 'famrel', 'freetime', 'goout', 'Dalc',
       'Walc', 'health', 'absences', 'G1', 'G2', 'G3'],
      dtype='object')

In [53]:
data["final_grade"] = (data["G1"] + data["G2"] + data["G3"]) / 3

In [54]:
data["final_grade"] = data["final_grade"].round(2)

In [55]:
data.dtypes

Unnamed: 0,0
school,object
sex,object
age,int64
address,object
famsize,object
Pstatus,object
Medu,int64
Fedu,int64
Mjob,object
Fjob,object


In [56]:
numerical_cols = [feature for feature in data.columns if data[feature].dtype != "object"]
numerical_cols

['age',
 'Medu',
 'Fedu',
 'traveltime',
 'studytime',
 'failures',
 'famrel',
 'freetime',
 'goout',
 'Dalc',
 'Walc',
 'health',
 'absences',
 'G1',
 'G2',
 'G3',
 'final_grade']

In [57]:
categorical_cols = [feature for feature in data.columns if data[feature].dtype == 'object']
categorical_cols

['school',
 'sex',
 'address',
 'famsize',
 'Pstatus',
 'Mjob',
 'Fjob',
 'reason',
 'guardian',
 'schoolsup',
 'famsup',
 'paid',
 'activities',
 'nursery',
 'higher',
 'internet',
 'romantic']

In [58]:
data['at_risk'] = data['G3'].apply(lambda x: 1 if x <= 10 else 0)

print("Distribution of at_risk:")
print(data['at_risk'].value_counts())

Distribution of at_risk:
at_risk
0    661
1    383
Name: count, dtype: int64


In [59]:
#Alchohol Comsumption
data['alcohol_index'] = data['Dalc'] + data['Walc']

In [60]:
#Parent Edu
data['parents_education'] = data['Medu'] + data['Fedu']

In [61]:
#Grade Progress
data['progress_G1_G2'] = data['G2'] - data['G1']
data['grade_trajectory'] = data['progress_G1_G2'].apply(lambda x: 1 if x > 0 else 0)

In [62]:
# Attendance
data['attendance_risk'] = data['absences'].apply(lambda x: 1 if x > data['absences'].mean() else 0)

In [63]:
#Study Hours
data['study_intensity'] = data['studytime'] * (5 - data['traveltime'])

In [64]:
data["Leisure Time"] = data['freetime'] / data['studytime'].replace(0, 0.5)

In [65]:
# List of engineered features
newly_added_features = [
    'study_intensity', 'alcohol_index', 'parents_education',
    'progress_G1_G2', 'grade_trajectory', 'attendance_risk', 'Leisure Time'
]

missing_in_engineered = data[newly_added_features].isnull().sum()
print("Missing values in engineered features:")
print(missing_in_engineered)

Missing values in engineered features:
study_intensity      0
alcohol_index        0
parents_education    0
progress_G1_G2       0
grade_trajectory     0
attendance_risk      0
Leisure Time         0
dtype: int64


In [66]:
#Saving the data
data.to_csv('student-combined-final.csv', index=False, sep=';')

print("Updated dataset saved as 'student-combined-final.csv'")

Updated dataset saved as 'student-combined-final.csv'


In [67]:
data["at_risk"]

Unnamed: 0,at_risk
0,1
1,1
2,1
3,0
4,1
...,...
1039,1
1040,0
1041,1
1042,1


# **Model**

In [68]:
from sklearn.model_selection import train_test_split

# Define features and target
X = data.drop(columns=['G3', 'at_risk'])  # Features
y = data['at_risk']                      # Target variable

In [69]:
#splitting the data in train and test set
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.2, random_state = 42)

In [70]:
X.columns

Index(['school', 'sex', 'age', 'address', 'famsize', 'Pstatus', 'Medu', 'Fedu',
       'Mjob', 'Fjob', 'reason', 'guardian', 'traveltime', 'studytime',
       'failures', 'schoolsup', 'famsup', 'paid', 'activities', 'nursery',
       'higher', 'internet', 'romantic', 'famrel', 'freetime', 'goout', 'Dalc',
       'Walc', 'health', 'absences', 'G1', 'G2', 'final_grade',
       'alcohol_index', 'parents_education', 'progress_G1_G2',
       'grade_trajectory', 'attendance_risk', 'study_intensity',
       'Leisure Time'],
      dtype='object')

In [71]:
numerical_cols = [
    'age', 'Medu', 'Fedu', 'traveltime', 'studytime', 'failures',
    'famrel', 'freetime', 'goout', 'Dalc', 'Walc', 'health', 'absences',
    'G1', 'G2','study_intensity', 'alcohol_index', 'parents_education',
    'progress_G1_G2', 'attendance_risk', 'Leisure Time'
]

categorical_cols = [
    'school', 'sex', 'famsize', 'Pstatus', 'Mjob', 'Fjob',
    'reason', 'guardian', 'schoolsup', 'famsup', 'paid',
    'activities', 'nursery', 'higher', 'internet', 'romantic'
]

In [72]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder

scaler = StandardScaler()
encoder = OneHotEncoder(drop='first', sparse_output=False)

X_train_numerical_scaled = scaler.fit_transform(X_train[numerical_cols])
X_train_numerical_scaled = pd.DataFrame(X_train_numerical_scaled, columns=numerical_cols)

X_test_numerical_scaled = scaler.transform(X_test[numerical_cols])
X_test_numerical_scaled = pd.DataFrame(X_test_numerical_scaled, columns=numerical_cols)

X_train_numerical_scaled.head()

Unnamed: 0,age,Medu,Fedu,traveltime,studytime,failures,famrel,freetime,goout,Dalc,...,health,absences,G1,G2,study_intensity,alcohol_index,parents_education,progress_G1_G2,attendance_risk,Leisure Time
0,-0.585112,-0.568254,-1.282101,3.310075,-1.146393,-0.378483,-1.020383,-1.17959,-1.882027,-0.555748,...,-1.073894,-0.070919,-0.750328,-0.70244,-1.733848,-0.919745,-1.018912,-0.037583,-0.685617,-0.003674
1,0.234634,1.225668,-0.374852,-0.715041,2.416529,-0.378483,0.05676,-1.17959,-0.150355,-0.555748,...,0.320665,0.244909,0.923974,0.212119,2.708961,-0.919745,0.474578,-1.221812,1.458541,-1.258619
2,1.874125,-1.465216,-1.282101,1.96837,0.041247,4.31508,1.133902,0.766966,0.715481,1.659285,...,-1.073894,0.560737,-1.085188,-1.312146,-0.845287,1.118707,-1.516743,-0.629698,1.458541,-0.003674
3,-0.585112,1.225668,1.439648,-0.715041,0.041247,-0.378483,-2.097525,0.766966,0.715481,0.551769,...,0.320665,0.244909,-0.415467,-0.092734,0.339463,0.609094,1.470239,0.554531,1.458541,-0.003674
4,-1.404857,-1.465216,-0.374852,1.96837,0.041247,-0.378483,1.133902,1.740245,1.581317,-0.555748,...,1.017944,1.034479,-0.750328,-0.092734,-0.845287,0.099481,-1.018912,1.146645,1.458541,0.414641


In [73]:
X_train_categorical_encoded = encoder.fit_transform(X_train[categorical_cols])
X_train_categorical_encoded = pd.DataFrame(X_train_categorical_encoded, columns=encoder.get_feature_names_out(categorical_cols))

X_test_categorical_encoded = encoder.transform(X_test[categorical_cols])
X_test_categorical_encoded = pd.DataFrame(X_test_categorical_encoded, columns=encoder.get_feature_names_out(categorical_cols))
X_train_categorical_encoded.head()

Unnamed: 0,school_MS,sex_M,famsize_LE3,Pstatus_T,Mjob_health,Mjob_other,Mjob_services,Mjob_teacher,Fjob_health,Fjob_other,...,guardian_mother,guardian_other,schoolsup_yes,famsup_yes,paid_yes,activities_yes,nursery_yes,higher_yes,internet_yes,romantic_yes
0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
1,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,...,1.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0
2,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0
3,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0
4,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0


In [74]:
#combining both back after scaling individually
X_train_preprocessed = pd.concat([X_train_numerical_scaled, X_train_categorical_encoded], axis=1)
X_test_preprocessed = pd.concat([X_test_numerical_scaled, X_test_categorical_encoded], axis=1)

print("\nTraining set shape after preprocessing:", X_train_preprocessed.shape)
print("Testing set shape after preprocessing:", X_test_preprocessed.shape)

print("\nPreprocessed Training Data:")
print(X_train_preprocessed.head())


Training set shape after preprocessing: (835, 46)
Testing set shape after preprocessing: (209, 46)

Preprocessed Training Data:
        age      Medu      Fedu  traveltime  studytime  failures    famrel  \
0 -0.585112 -0.568254 -1.282101    3.310075  -1.146393 -0.378483 -1.020383   
1  0.234634  1.225668 -0.374852   -0.715041   2.416529 -0.378483  0.056760   
2  1.874125 -1.465216 -1.282101    1.968370   0.041247  4.315080  1.133902   
3 -0.585112  1.225668  1.439648   -0.715041   0.041247 -0.378483 -2.097525   
4 -1.404857 -1.465216 -0.374852    1.968370   0.041247 -0.378483  1.133902   

   freetime     goout      Dalc  ...  guardian_mother  guardian_other  \
0 -1.179590 -1.882027 -0.555748  ...              1.0             0.0   
1 -1.179590 -0.150355 -0.555748  ...              1.0             0.0   
2  0.766966  0.715481  1.659285  ...              0.0             1.0   
3  0.766966  0.715481  0.551769  ...              1.0             0.0   
4  1.740245  1.581317 -0.555748  ... 

## **Model dependencies**

In [75]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

## **Random forest**

In [76]:
# Initialize Random Forest Classifier
rf_model = RandomForestClassifier(
    n_estimators=100,  # Number of trees
    max_depth=10,      # Maximum depth of each tree
    class_weight='balanced',  # Handle class imbalance
    random_state=42
)

# Train the model
rf_model.fit(X_train_preprocessed, y_train)

In [77]:
# Predictions
y_pred_rf = rf_model.predict(X_test_preprocessed)
y_pred_proba_rf = rf_model.predict_proba(X_test_preprocessed)[:, 1]

In [78]:
# Evaluation metrics
accuracy_rf = accuracy_score(y_test, y_pred_rf)
precision_rf = precision_score(y_test, y_pred_rf)
recall_rf = recall_score(y_test, y_pred_rf)
f1_rf = f1_score(y_test, y_pred_rf)
roc_auc_rf = roc_auc_score(y_test, y_pred_proba_rf)

print("Random Forest Performance:")
print(f"Accuracy: {accuracy_rf:.3f}")
print(f"Precision: {precision_rf:.3f}")
print(f"Recall: {recall_rf:.3f}")
print(f"F1-Score: {f1_rf:.3f}")
print(f"ROC-AUC: {roc_auc_rf:.3f}")

Random Forest Performance:
Accuracy: 0.938
Precision: 0.920
Recall: 0.931
F1-Score: 0.926
ROC-AUC: 0.974


## **Logistic Regression**

In [79]:
lr_model = LogisticRegression(
    penalty='l2',
    C=1.0,
    solver='liblinear',
    class_weight='balanced',
    random_state=42
)

lr_model.fit(X_train_preprocessed, y_train)

In [80]:
# Predictions
y_pred_lr = lr_model.predict(X_test_preprocessed)
y_pred_proba_lr = lr_model.predict_proba(X_test_preprocessed)[:, 1]

In [81]:
# Evaluation metrics
accuracy_lr = accuracy_score(y_test, y_pred_lr)
precision_lr = precision_score(y_test, y_pred_lr)
recall_lr = recall_score(y_test, y_pred_lr)
f1_lr = f1_score(y_test, y_pred_lr)
roc_auc_lr = roc_auc_score(y_test, y_pred_proba_lr)

print("\nLogistic Regression Performance:")
print(f"Accuracy: {accuracy_lr:.3f}")
print(f"Precision: {precision_lr:.3f}")
print(f"Recall: {recall_lr:.3f}")
print(f"F1-Score: {f1_lr:.3f}")
print(f"ROC-AUC: {roc_auc_lr:.3f}")


Logistic Regression Performance:
Accuracy: 0.923
Precision: 0.882
Recall: 0.943
F1-Score: 0.911
ROC-AUC: 0.980


In [82]:
feature_names = X_train_preprocessed.columns
importances = rf_model.feature_importances_

feature_importance_df = pd.DataFrame({
    'Feature': feature_names,
    'Importance': importances
}).sort_values(by='Importance', ascending=False)

print("\nMost Important contributing feature: ")
print(feature_importance_df.head(1))


Most Important contributing feature: 
   Feature  Importance
14      G2    0.347609


In [83]:
import joblib

# Saving the Random Forest model
joblib.dump(rf_model, 'random_forest_model.joblib')

joblib.dump(lr_model, 'logistic_regression_model.joblib')

print("Models saved successfully!")

Models saved successfully!
