## Student preformance prediction

### Importing necessary libraries

In [58]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

### Loading the Dataset

In [59]:
df = pd.read_csv("student_data.csv")
df

Unnamed: 0,id,first_name,last_name,email,gender,part_time_job,absence_days,extracurricular_activities,weekly_self_study_hours,career_aspiration,math_score,history_score,physics_score,chemistry_score,biology_score,english_score,geography_score
0,1,Paul,Casey,paul.casey.1@gslingacademy.com,male,False,3,False,27,Lawyer,73,81,93,97,63,80,87
1,2,Danielle,Sandoval,danielle.sandoval.2@gslingacademy.com,female,False,2,False,47,Doctor,90,86,96,100,90,88,90
2,3,Tina,Andrews,tina.andrews.3@gslingacademy.com,female,False,9,True,13,Government Officer,81,97,95,96,65,77,94
3,4,Tara,Clark,tara.clark.4@gslingacademy.com,female,False,5,False,3,Artist,71,74,88,80,89,63,86
4,5,Anthony,Campos,anthony.campos.5@gslingacademy.com,male,False,5,False,10,Unknown,84,77,65,65,80,74,76
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,1996,Alan,Reynolds,alan.reynolds.1996@gslingacademy.com,male,False,2,False,30,Construction Engineer,83,77,84,73,75,84,82
1996,1997,Thomas,Gilbert,thomas.gilbert.1997@gslingacademy.com,male,False,2,False,20,Software Engineer,89,65,73,80,87,67,73
1997,1998,Madison,Cross,madison.cross.1998@gslingacademy.com,female,False,5,False,14,Software Engineer,97,85,63,93,68,94,78
1998,1999,Brittany,Compton,brittany.compton.1999@gslingacademy.com,female,True,10,True,5,Business Owner,51,96,72,89,95,88,75


### Removing Unnecessary Data

In [60]:
df = df.drop(["id","first_name","last_name","email","gender","career_aspiration"], axis=1)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 11 columns):
 #   Column                      Non-Null Count  Dtype
---  ------                      --------------  -----
 0   part_time_job               2000 non-null   bool 
 1   absence_days                2000 non-null   int64
 2   extracurricular_activities  2000 non-null   bool 
 3   weekly_self_study_hours     2000 non-null   int64
 4   math_score                  2000 non-null   int64
 5   history_score               2000 non-null   int64
 6   physics_score               2000 non-null   int64
 7   chemistry_score             2000 non-null   int64
 8   biology_score               2000 non-null   int64
 9   english_score               2000 non-null   int64
 10  geography_score             2000 non-null   int64
dtypes: bool(2), int64(9)
memory usage: 144.7 KB


### Changing (part_time_job , extracurricular_activities) to 1,0

In [61]:
df["part_time_job"] = df["part_time_job"].map({True: 1, False: 0})
df["extracurricular_activities"] = df["extracurricular_activities"].map({True: 1, False: 0})
df

Unnamed: 0,part_time_job,absence_days,extracurricular_activities,weekly_self_study_hours,math_score,history_score,physics_score,chemistry_score,biology_score,english_score,geography_score
0,0,3,0,27,73,81,93,97,63,80,87
1,0,2,0,47,90,86,96,100,90,88,90
2,0,9,1,13,81,97,95,96,65,77,94
3,0,5,0,3,71,74,88,80,89,63,86
4,0,5,0,10,84,77,65,65,80,74,76
...,...,...,...,...,...,...,...,...,...,...,...
1995,0,2,0,30,83,77,84,73,75,84,82
1996,0,2,0,20,89,65,73,80,87,67,73
1997,0,5,0,14,97,85,63,93,68,94,78
1998,1,10,1,5,51,96,72,89,95,88,75


### Converting students grades to GPA

In [62]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 11 columns):
 #   Column                      Non-Null Count  Dtype
---  ------                      --------------  -----
 0   part_time_job               2000 non-null   int64
 1   absence_days                2000 non-null   int64
 2   extracurricular_activities  2000 non-null   int64
 3   weekly_self_study_hours     2000 non-null   int64
 4   math_score                  2000 non-null   int64
 5   history_score               2000 non-null   int64
 6   physics_score               2000 non-null   int64
 7   chemistry_score             2000 non-null   int64
 8   biology_score               2000 non-null   int64
 9   english_score               2000 non-null   int64
 10  geography_score             2000 non-null   int64
dtypes: int64(11)
memory usage: 172.0 KB


In [63]:

math = df["math_score"]
history = df["history_score"]
physics = df["physics_score"]
chemistery = df["chemistry_score"]
biology = df["biology_score"]
english = df["english_score"]
geography = df["geography_score"]
GPA = []
for i in range(0,2000):
    average = (math[i]+history[i]+physics[i]+chemistery[i]+biology[i]+english[i]+geography[i]) / 7
    GPA.append(round(average, 0))
    average=0
df["GPA"] = GPA

### Preparing for AI model

In [64]:
X = df.drop(["GPA"],axis=1)
Y = df["GPA"]
X_train,X_test,Y_train,Y_test = train_test_split(X,Y,test_size=0.25,random_state=22)

### The AI Model

In [65]:
model = LinearRegression()
model.fit(X_train,Y_train)
predicted_data = model.predict(X_test)

### MAE,MSE,R2

In [66]:
MSE = round(mean_squared_error(Y_test,predicted_data))
MAE = round(mean_absolute_error(Y_test,predicted_data))
R2 = round(r2_score(Y_test,predicted_data) * 100)
print(f"MAE: {MAE}")
print(f"MSE: {MSE}")
print(f"R2: {R2}%")

MAE: 0
MSE: 0
R2: 100%
