In [5]:
import pandas as pd

df = pd.read_csv('../data/student-scores.csv')
print(df['gender'].unique())
print(df['career_aspiration'].unique())
print("Dataset shape: ",df.shape)
df.head()

from sklearn.preprocessing import LabelEncoder
import joblib

le_gender = LabelEncoder()
le_gender.fit(df['gender'])  # refit using your dataset

le_aspiration = LabelEncoder()
le_aspiration.fit(df['career_aspiration'])

joblib.dump(le_gender, '../model/le_gender.pkl')
joblib.dump(le_aspiration, '../model/le_aspiration.pkl')

print(le_aspiration.classes_)
print(le_gender.classes_)


['male' 'female']
['Lawyer' 'Doctor' 'Government Officer' 'Artist' 'Unknown'
 'Software Engineer' 'Teacher' 'Business Owner' 'Scientist' 'Banker'
 'Writer' 'Accountant' 'Designer' 'Construction Engineer' 'Game Developer'
 'Stock Investor' 'Real Estate Developer']
Dataset shape:  (2000, 17)
['Accountant' 'Artist' 'Banker' 'Business Owner' 'Construction Engineer'
 'Designer' 'Doctor' 'Game Developer' 'Government Officer' 'Lawyer'
 'Real Estate Developer' 'Scientist' 'Software Engineer' 'Stock Investor'
 'Teacher' 'Unknown' 'Writer']
['female' 'male']


In [6]:
df.columns

Index(['id', 'first_name', 'last_name', 'email', 'gender', 'part_time_job',
       'absence_days', 'extracurricular_activities', 'weekly_self_study_hours',
       'career_aspiration', 'math_score', 'history_score', 'physics_score',
       'chemistry_score', 'biology_score', 'english_score', 'geography_score'],
      dtype='object')

In [7]:
df.isnull().sum()

id                            0
first_name                    0
last_name                     0
email                         0
gender                        0
part_time_job                 0
absence_days                  0
extracurricular_activities    0
weekly_self_study_hours       0
career_aspiration             0
math_score                    0
history_score                 0
physics_score                 0
chemistry_score               0
biology_score                 0
english_score                 0
geography_score               0
dtype: int64

In [8]:
df.dtypes

id                             int64
first_name                    object
last_name                     object
email                         object
gender                        object
part_time_job                   bool
absence_days                   int64
extracurricular_activities      bool
weekly_self_study_hours        int64
career_aspiration             object
math_score                     int64
history_score                  int64
physics_score                  int64
chemistry_score                int64
biology_score                  int64
english_score                  int64
geography_score                int64
dtype: object

In [9]:
df.describe(include='all')

Unnamed: 0,id,first_name,last_name,email,gender,part_time_job,absence_days,extracurricular_activities,weekly_self_study_hours,career_aspiration,math_score,history_score,physics_score,chemistry_score,biology_score,english_score,geography_score
count,2000.0,2000,2000,2000,2000,2000,2000.0,2000,2000.0,2000,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0
unique,,453,707,2000,2,2,,2,,17,,,,,,,
top,,Michael,Smith,paul.casey.1@gslingacademy.com,female,False,,False,,Software Engineer,,,,,,,
freq,,40,41,1,1002,1684,,1592,,315,,,,,,,
mean,1000.5,,,,,,3.6655,,17.7555,,83.452,80.332,81.3365,79.995,79.5815,81.2775,80.888
std,577.494589,,,,,,2.629271,,12.129604,,13.224906,12.736046,12.539453,12.777895,13.72219,12.027087,11.637705
min,1.0,,,,,,0.0,,0.0,,40.0,50.0,50.0,50.0,30.0,50.0,60.0
25%,500.75,,,,,,2.0,,5.0,,77.0,69.75,71.0,69.0,69.0,72.0,71.0
50%,1000.5,,,,,,3.0,,18.0,,87.0,82.0,83.0,81.0,81.0,83.0,81.0
75%,1500.25,,,,,,5.0,,28.0,,93.0,91.0,92.0,91.0,91.0,91.0,91.0


In [10]:
df.sample(10)

Unnamed: 0,id,first_name,last_name,email,gender,part_time_job,absence_days,extracurricular_activities,weekly_self_study_hours,career_aspiration,math_score,history_score,physics_score,chemistry_score,biology_score,english_score,geography_score
1746,1747,Bonnie,Brown,bonnie.brown.1747@gslingacademy.com,female,False,1,True,9,Unknown,64,89,91,99,60,98,93
1452,1453,Richard,Castillo,richard.castillo.1453@gslingacademy.com,male,False,3,False,15,Government Officer,91,68,62,82,70,92,73
598,599,Barbara,Schneider,barbara.schneider.599@gslingacademy.com,female,False,3,True,23,Accountant,86,73,83,93,46,91,70
204,205,April,Ryan,april.ryan.205@gslingacademy.com,female,False,1,False,1,Artist,98,95,90,81,82,86,98
1168,1169,Nicholas,Wallace,nicholas.wallace.1169@gslingacademy.com,male,False,3,False,21,Construction Engineer,94,81,88,81,71,92,73
787,788,Tyler,Calderon,tyler.calderon.788@gslingacademy.com,male,False,7,False,1,Business Owner,72,58,73,57,67,88,70
34,35,Lisa,Burns,lisa.burns.35@gslingacademy.com,female,False,3,True,31,Unknown,90,93,91,80,86,69,62
1806,1807,Mary,Malone,mary.malone.1807@gslingacademy.com,female,False,3,False,6,Real Estate Developer,74,63,74,63,78,90,83
1051,1052,Jeffrey,Knight,jeffrey.knight.1052@gslingacademy.com,male,False,2,False,15,Banker,93,84,86,82,62,87,84
1957,1958,Cindy,Bradley,cindy.bradley.1958@gslingacademy.com,female,False,2,False,3,Artist,62,67,85,97,98,62,73


In [11]:
df['average_score'] = df[['math_score','history_score','physics_score','chemistry_score','biology_score','english_score','geography_score']].mean(axis=1)

In [12]:
def custom_dropout_label(row):
    score_risk = row['average_score']<40
    absence_risk = row['absence_days']>7
    low_study_risk = row['weekly_self_study_hours']<5
    no_activities = row['extracurricular_activities']==False
    no_aspiration = str(row['career_aspiration']).lower() in ['unknown']
    job_risk = row['part_time_job']==True
    
    risk_factors = sum([score_risk, absence_risk,low_study_risk,no_activities,no_aspiration,job_risk])
    
    return 1 if risk_factors>=3 else 0

df['risk_label'] = df.apply(custom_dropout_label, axis=1)



In [13]:
df['risk_label'].value_counts()

risk_label
0    1775
1     225
Name: count, dtype: int64

In [14]:
df = df.drop(columns=['id','first_name','last_name','email'])

In [15]:
df['gender'] = le_gender.transform(df['gender'])
df['career_aspiration'] = le_aspiration.transform(df['career_aspiration'])

In [16]:
x = df.drop(columns=['risk_label'])
y = df['risk_label']

In [17]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.2,random_state=42)

In [18]:
from sklearn.tree import DecisionTreeClassifier

model = DecisionTreeClassifier(random_state=42)

model.fit(x_train, y_train)

0,1,2
,criterion,'gini'
,splitter,'best'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,
,random_state,42
,max_leaf_nodes,
,min_impurity_decrease,0.0


In [19]:
y_pred = model.predict(x_test)

In [20]:
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report

print("Accuracy: ", accuracy_score(y_test,y_pred))

print("Confusion Matrix:\n", confusion_matrix(y_test,y_pred))

print("Classification Report:\n",(y_test,y_pred))

Accuracy:  0.995
Confusion Matrix:
 [[352   0]
 [  2  46]]
Classification Report:
 (1860    0
353     0
1333    0
905     0
1289    0
       ..
965     0
1284    0
1739    0
261     0
535     0
Name: risk_label, Length: 400, dtype: int64, array([0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 

In [21]:
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier(random_state=42)
model.fit(x_train,y_train)

y_pred = model.predict(x_test)

from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

print("Accuracy: ", accuracy_score(y_test,y_pred))

print("Confusion Matrix:\n ", confusion_matrix(y_test,y_pred))

print("Classifiaction report:\n ", classification_report(y_test,y_pred))

Accuracy:  0.98
Confusion Matrix:
  [[349   3]
 [  5  43]]
Classifiaction report:
                precision    recall  f1-score   support

           0       0.99      0.99      0.99       352
           1       0.93      0.90      0.91        48

    accuracy                           0.98       400
   macro avg       0.96      0.94      0.95       400
weighted avg       0.98      0.98      0.98       400



In [24]:
joblib.dump(model,'../model/trained_model.pkl')

['../model/trained_model.pkl']