## Student preformance prediction

### Importing necessary libraries

In [68]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

### Loading the Dataset

In [69]:
df = pd.read_csv("student_data.csv")
df

Unnamed: 0,id,first_name,last_name,email,gender,part_time_job,absence_days,extracurricular_activities,weekly_self_study_hours,career_aspiration,math_score,history_score,physics_score,chemistry_score,biology_score,english_score,geography_score
0,1,Paul,Casey,paul.casey.1@gslingacademy.com,male,False,3,False,27,Lawyer,73,81,93,97,63,80,87
1,2,Danielle,Sandoval,danielle.sandoval.2@gslingacademy.com,female,False,2,False,47,Doctor,90,86,96,100,90,88,90
2,3,Tina,Andrews,tina.andrews.3@gslingacademy.com,female,False,9,True,13,Government Officer,81,97,95,96,65,77,94
3,4,Tara,Clark,tara.clark.4@gslingacademy.com,female,False,5,False,3,Artist,71,74,88,80,89,63,86
4,5,Anthony,Campos,anthony.campos.5@gslingacademy.com,male,False,5,False,10,Unknown,84,77,65,65,80,74,76
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,1996,Alan,Reynolds,alan.reynolds.1996@gslingacademy.com,male,False,2,False,30,Construction Engineer,83,77,84,73,75,84,82
1996,1997,Thomas,Gilbert,thomas.gilbert.1997@gslingacademy.com,male,False,2,False,20,Software Engineer,89,65,73,80,87,67,73
1997,1998,Madison,Cross,madison.cross.1998@gslingacademy.com,female,False,5,False,14,Software Engineer,97,85,63,93,68,94,78
1998,1999,Brittany,Compton,brittany.compton.1999@gslingacademy.com,female,True,10,True,5,Business Owner,51,96,72,89,95,88,75


### Removing Unnecessary Data

In [70]:
df = df.drop(["id","first_name","last_name","email"], axis=1)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 13 columns):
 #   Column                      Non-Null Count  Dtype 
---  ------                      --------------  ----- 
 0   gender                      2000 non-null   object
 1   part_time_job               2000 non-null   bool  
 2   absence_days                2000 non-null   int64 
 3   extracurricular_activities  2000 non-null   bool  
 4   weekly_self_study_hours     2000 non-null   int64 
 5   career_aspiration           2000 non-null   object
 6   math_score                  2000 non-null   int64 
 7   history_score               2000 non-null   int64 
 8   physics_score               2000 non-null   int64 
 9   chemistry_score             2000 non-null   int64 
 10  biology_score               2000 non-null   int64 
 11  english_score               2000 non-null   int64 
 12  geography_score             2000 non-null   int64 
dtypes: bool(2), int64(9), object(2)
memory usage: 17

### Changing (part_time_job , extracurricular_activities) to 1,0

In [71]:
df["part_time_job"] = df["part_time_job"].map({True: 1, False: 0})
df["extracurricular_activities"] = df["extracurricular_activities"].map({True: 1, False: 0})
df["gender"] = df["gender"].map({"male": 1, "female": 0})
df

Unnamed: 0,gender,part_time_job,absence_days,extracurricular_activities,weekly_self_study_hours,career_aspiration,math_score,history_score,physics_score,chemistry_score,biology_score,english_score,geography_score
0,1,0,3,0,27,Lawyer,73,81,93,97,63,80,87
1,0,0,2,0,47,Doctor,90,86,96,100,90,88,90
2,0,0,9,1,13,Government Officer,81,97,95,96,65,77,94
3,0,0,5,0,3,Artist,71,74,88,80,89,63,86
4,1,0,5,0,10,Unknown,84,77,65,65,80,74,76
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,1,0,2,0,30,Construction Engineer,83,77,84,73,75,84,82
1996,1,0,2,0,20,Software Engineer,89,65,73,80,87,67,73
1997,0,0,5,0,14,Software Engineer,97,85,63,93,68,94,78
1998,0,1,10,1,5,Business Owner,51,96,72,89,95,88,75


### One Hot Encoding (career_aspiration)

In [72]:
ohe = OneHotEncoder(handle_unknown='ignore', sparse_output=False).set_output(transform='pandas')
data = ohe.fit_transform((df[["career_aspiration"]]))
df = df.sort_values(by=["career_aspiration"],ascending=1)
df = pd.concat([df, data], axis=1 ).drop(columns=['career_aspiration'])
df
# this cell made the code 60% more acurate!

Unnamed: 0,gender,part_time_job,absence_days,extracurricular_activities,weekly_self_study_hours,math_score,history_score,physics_score,chemistry_score,biology_score,...,career_aspiration_Game Developer,career_aspiration_Government Officer,career_aspiration_Lawyer,career_aspiration_Real Estate Developer,career_aspiration_Scientist,career_aspiration_Software Engineer,career_aspiration_Stock Investor,career_aspiration_Teacher,career_aspiration_Unknown,career_aspiration_Writer
1999,0,0,5,0,27,82,99,91,69,83,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
416,0,0,4,0,30,95,78,93,85,48,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
422,0,0,4,0,15,79,79,97,78,31,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1635,1,0,3,0,27,96,64,76,97,81,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
444,0,0,3,0,32,83,69,90,78,39,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1366,1,0,1,0,18,83,96,72,81,86,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1568,1,0,5,0,25,73,94,74,72,80,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1559,1,0,1,0,32,65,89,70,71,74,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1260,1,0,2,0,16,70,75,79,77,65,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


### Converting students grades to GPA

In [73]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2000 entries, 1999 to 1497
Data columns (total 29 columns):
 #   Column                                   Non-Null Count  Dtype  
---  ------                                   --------------  -----  
 0   gender                                   2000 non-null   int64  
 1   part_time_job                            2000 non-null   int64  
 2   absence_days                             2000 non-null   int64  
 3   extracurricular_activities               2000 non-null   int64  
 4   weekly_self_study_hours                  2000 non-null   int64  
 5   math_score                               2000 non-null   int64  
 6   history_score                            2000 non-null   int64  
 7   physics_score                            2000 non-null   int64  
 8   chemistry_score                          2000 non-null   int64  
 9   biology_score                            2000 non-null   int64  
 10  english_score                            2000 non-

In [74]:

math = df["math_score"]
history = df["history_score"]
physics = df["physics_score"]
chemistery = df["chemistry_score"]
biology = df["biology_score"]
english = df["english_score"]
geography = df["geography_score"]
GPA = []
for i in range(0,2000):
    average = (math[i]+history[i]+physics[i]+chemistery[i]+biology[i]+english[i]+geography[i]) / 7
    GPA.append(round(average, 0))
    average=0
df["GPA"] = GPA

### Preparing for AI model

In [75]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2000 entries, 1999 to 1497
Data columns (total 30 columns):
 #   Column                                   Non-Null Count  Dtype  
---  ------                                   --------------  -----  
 0   gender                                   2000 non-null   int64  
 1   part_time_job                            2000 non-null   int64  
 2   absence_days                             2000 non-null   int64  
 3   extracurricular_activities               2000 non-null   int64  
 4   weekly_self_study_hours                  2000 non-null   int64  
 5   math_score                               2000 non-null   int64  
 6   history_score                            2000 non-null   int64  
 7   physics_score                            2000 non-null   int64  
 8   chemistry_score                          2000 non-null   int64  
 9   biology_score                            2000 non-null   int64  
 10  english_score                            2000 non-

In [76]:
X = df.drop(["GPA","math_score","history_score","physics_score","biology_score","chemistry_score","english_score","geography_score"],axis=1)
Y = df.drop(["part_time_job","absence_days","extracurricular_activities","weekly_self_study_hours"],axis=1)
X_train,X_test,Y_train,Y_test = train_test_split(X,Y,test_size=0.2,random_state=45)

### The AI Model

In [77]:
model = LinearRegression()
model.fit(X_train,Y_train)
predicted_data = model.predict(X_test)

### MAE,MSE,R2

In [78]:
MSE = round(mean_squared_error(Y_test,predicted_data))
MAE = round(mean_absolute_error(Y_test,predicted_data))
R2 = round(r2_score(Y_test,predicted_data) * 100)
print(f"MAE: {MAE}")
print(f"MSE: {MSE}")
print(f"R2: {R2}%")

MAE: 3
MSE: 40
R2: 72%
