In [2]:
import pandas as pd
data=pd.read_csv('student-scores.csv')
df=data.copy()
data.head()

Unnamed: 0,id,first_name,last_name,email,gender,part_time_job,absence_days,extracurricular_activities,weekly_self_study_hours,career_aspiration,math_score,history_score,physics_score,chemistry_score,biology_score,english_score,geography_score
0,1,Paul,Casey,paul.casey.1@gslingacademy.com,male,False,3,False,27,Lawyer,73,81,93,97,63,80,87
1,2,Danielle,Sandoval,danielle.sandoval.2@gslingacademy.com,female,False,2,False,47,Doctor,90,86,96,100,90,88,90
2,3,Tina,Andrews,tina.andrews.3@gslingacademy.com,female,False,9,True,13,Government Officer,81,97,95,96,65,77,94
3,4,Tara,Clark,tara.clark.4@gslingacademy.com,female,False,5,False,3,Artist,71,74,88,80,89,63,86
4,5,Anthony,Campos,anthony.campos.5@gslingacademy.com,male,False,5,False,10,Unknown,84,77,65,65,80,74,76


In [3]:
df.drop(columns=['id','first_name','last_name','email'],axis=1,inplace=True)

In [4]:
df["total_score"] = df["math_score"] + df["history_score"] + df["physics_score"] + df["chemistry_score"] + df["biology_score"] + df["english_score"] + df["geography_score"]
df["average_score"] = df["total_score"] / 7
df.head()

Unnamed: 0,gender,part_time_job,absence_days,extracurricular_activities,weekly_self_study_hours,career_aspiration,math_score,history_score,physics_score,chemistry_score,biology_score,english_score,geography_score,total_score,average_score
0,male,False,3,False,27,Lawyer,73,81,93,97,63,80,87,574,82.0
1,female,False,2,False,47,Doctor,90,86,96,100,90,88,90,640,91.428571
2,female,False,9,True,13,Government Officer,81,97,95,96,65,77,94,605,86.428571
3,female,False,5,False,3,Artist,71,74,88,80,89,63,86,551,78.714286
4,male,False,5,False,10,Unknown,84,77,65,65,80,74,76,521,74.428571


Encoding columns target\

In [5]:
gender_map = {'male': 0, 'female': 1}
part_time_job_map = {False: 0, True: 1}
extracurricular_activities_map = {False: 0, True: 1}
career_aspiration_map = {
        'Lawyer': 0, 'Doctor': 1, 'Government Officer': 2, 'Artist': 3, 'Unknown': 4,
        'Software Engineer': 5, 'Teacher': 6, 'Business Owner': 7, 'Scientist': 8,
        'Banker': 9, 'Writer': 10, 'Accountant': 11, 'Designer': 12,
        'Construction Engineer': 13, 'Game Developer': 14, 'Stock Investor': 15,
        'Real Estate Developer': 16
    }
# Apply mapping to the DataFrame
df['gender'] = df['gender'].map(gender_map)
df['part_time_job'] = df['part_time_job'].map(part_time_job_map)
df['extracurricular_activities'] = df['extracurricular_activities'].map(extracurricular_activities_map)
df['career_aspiration'] = df['career_aspiration'].map(career_aspiration_map)

In [6]:
df['career_aspiration'].value_counts()

career_aspiration
5     315
7     309
4     223
9     169
0     138
11    126
1     119
16     83
15     73
13     68
3      67
14     63
2      61
6      59
12     56
8      39
10     32
Name: count, dtype: int64

balance data

In [7]:
from imblearn.over_sampling import SMOTE

smote=SMOTE(random_state=42)

x=df.drop('career_aspiration',axis=1)
y=df['career_aspiration']

In [8]:
x_resampled,y_resampled=smote.fit_resample(x,y)

In [9]:
y_resampled.value_counts()

career_aspiration
0     315
9     315
15    315
14    315
13    315
12    315
11    315
10    315
8     315
1     315
7     315
6     315
5     315
4     315
3     315
2     315
16    315
Name: count, dtype: int64

In [10]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test=train_test_split(x_resampled,y_resampled,random_state=33,test_size=0.2)

Feature scaling

In [11]:
from sklearn.preprocessing import StandardScaler

sc=StandardScaler()
x_train_scaled=sc.fit_transform(x_train)
x_test_scaled=sc.fit_transform(x_test)

In [12]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score,classification_report,confusion_matrix
import warnings
warnings.filterwarnings("ignore")

In [13]:
models = {
    "Logistic Regression": LogisticRegression(),
    "Support Vector Classifier": SVC(),
    "Random Forest Classifier": RandomForestClassifier(),
    "K Nearest Neighbors": KNeighborsClassifier(),
    "Decision Tree Classifier": DecisionTreeClassifier(),
    "Gaussian Naive Bayes": GaussianNB(),
    "AdaBoost Classifier": AdaBoostClassifier(),
    "Gradient Boosting Classifier": GradientBoostingClassifier(),
}

In [14]:
for name,model in models.items():
    print("*"*50)
    print("Model",name)
    model.fit(x_train_scaled,y_train)
    y_pred=model.predict(x_test_scaled)
    score=accuracy_score(y_test,y_pred)
    conf_matrix=confusion_matrix(y_test,y_pred)

    print("Accuracy",score)
    print("Classification Report",classification_report(y_test,y_pred))
    print("Confusion Matrix:\n",conf_matrix)

**************************************************
Model Logistic Regression
Accuracy 0.49486461251167135
Classification Report               precision    recall  f1-score   support

           0       0.37      0.48      0.42        56
           1       0.50      0.60      0.55        65
           2       0.55      0.36      0.43        67
           3       0.59      0.51      0.55        68
           4       0.43      0.22      0.30        58
           5       0.30      0.28      0.29        64
           6       0.60      0.83      0.69        64
           7       0.81      0.89      0.85        65
           8       0.56      0.50      0.53        60
           9       0.25      0.11      0.15        64
          10       0.48      0.73      0.58        52
          11       0.56      0.50      0.53        62
          12       0.30      0.31      0.30        61
          13       0.44      0.56      0.49        68
          14       0.68      0.92      0.78        62
       

Random forest selected

In [15]:
rfc=RandomForestClassifier()
rfc.fit(x_train_scaled,y_train)

In [16]:
import pickle
pickle.dump(sc,open('models/sclaer.pkl','wb'))
pickle.dump(rfc,open('models/model.pkl','wb'))

In [22]:
sc=pickle.load(open('models/sclaer.pkl','rb'))
rfc=pickle.load(open('models/model.pkl','rb'))

In [20]:
import numpy as np
class_names = ['Lawyer', 'Doctor', 'Government Officer', 'Artist', 'Unknown',
               'Software Engineer', 'Teacher', 'Business Owner', 'Scientist',
               'Banker', 'Writer', 'Accountant', 'Designer',
               'Construction Engineer', 'Game Developer', 'Stock Investor',
               'Real Estate Developer']
def Recommendations(gender, part_time_job, absence_days, extracurricular_activities,
                    weekly_self_study_hours, math_score, history_score, physics_score,
                    chemistry_score, biology_score, english_score, geography_score,
                    total_score,average_score):
    # encode categorical variables
    gender_encoded=1 if gender.lower()=='female' else 0
    part_time_job_encoded=1 if part_time_job else 0
    extracurricular_activities_encoded=1 if extracurricular_activities else 0
    feature_array=np.array([[gender_encoded,part_time_job_encoded,absence_days,extracurricular_activities_encoded,weekly_self_study_hours,math_score,history_score,physics_score,chemistry_score,biology_score,english_score,geography_score,total_score,average_score]])
    scaled_features=sc.transform(feature_array)
    probablities=rfc.predict_proba(scaled_features)
    top_classes_idx=np.argsort(-probablities[0])[:5]
    top_classes_names_probs=[(class_names[idx],probablities[0][idx]) for idx in top_classes_idx]
    return top_classes_names_probs

In [21]:
# Example usage 1
final_recommendations = Recommendations(gender='female',
                                        part_time_job=False,
                                        absence_days=2,
                                        extracurricular_activities=False,
                                        weekly_self_study_hours=7,
                                        math_score=65,
                                        history_score=60,
                                        physics_score=97,
                                        chemistry_score=94,
                                        biology_score=71,
                                        english_score=81,
                                        geography_score=66,
                                        total_score=534,
                                        average_score=76.285714)

print("Top recommended studies with probabilities:")
print("="*50)
for class_name, probability in final_recommendations:
    print(f"{class_name} with probability {probability}")

Top recommended studies with probabilities:
Teacher with probability 0.6565384490334935
Unknown with probability 0.25305391009004186
Real Estate Developer with probability 0.0451683389421105
Designer with probability 0.01613327778549678
Government Officer with probability 0.01492161354735335
