## Data source
https://www.kaggle.com/datasets/whenamancodes/student-performance

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
math_df=pd.read_csv('student-mat.csv',sep=';')

In [3]:
port_df=pd.read_csv('student-por.csv',sep=';')

In [4]:
math_df.head()

Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,...,famrel,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3
0,GP,F,18,U,GT3,A,4,4,at_home,teacher,...,4,3,4,1,1,3,6,5,6,6
1,GP,F,17,U,GT3,T,1,1,at_home,other,...,5,3,3,1,1,3,4,5,5,6
2,GP,F,15,U,LE3,T,1,1,at_home,other,...,4,3,2,2,3,3,10,7,8,10
3,GP,F,15,U,GT3,T,4,2,health,services,...,3,2,2,1,1,5,2,15,14,15
4,GP,F,16,U,GT3,T,3,3,other,other,...,4,3,2,1,2,5,4,6,10,10


In [5]:
port_df.head()

Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,...,famrel,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3
0,GP,F,18,U,GT3,A,4,4,at_home,teacher,...,4,3,4,1,1,3,4,0,11,11
1,GP,F,17,U,GT3,T,1,1,at_home,other,...,5,3,3,1,1,3,2,9,11,11
2,GP,F,15,U,LE3,T,1,1,at_home,other,...,4,3,2,2,3,3,6,12,13,12
3,GP,F,15,U,GT3,T,4,2,health,services,...,3,2,2,1,1,5,0,14,14,14
4,GP,F,16,U,GT3,T,3,3,other,other,...,4,3,2,1,2,5,0,11,13,13


We can treat both dataframes as separate and solve the feature relevance and prediction problem for both. Comparing the results can give us good insights on any differences/similarities between math and language scores.

## Math grades - feature importance
There are 3 grades given in the dataset. We will create an average overall grade as the target variable.

In [6]:
math_df['Grade'] =math_df[['G1','G2','G3']].mean(axis = 1)

In [7]:
math_df = math_df.drop(['G1', 'G2', 'G3'], axis=1)

In [8]:
math_df.isna().sum()

school        0
sex           0
age           0
address       0
famsize       0
Pstatus       0
Medu          0
Fedu          0
Mjob          0
Fjob          0
reason        0
guardian      0
traveltime    0
studytime     0
failures      0
schoolsup     0
famsup        0
paid          0
activities    0
nursery       0
higher        0
internet      0
romantic      0
famrel        0
freetime      0
goout         0
Dalc          0
Walc          0
health        0
absences      0
Grade         0
dtype: int64

No missing information

In [9]:
math_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 395 entries, 0 to 394
Data columns (total 31 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   school      395 non-null    object 
 1   sex         395 non-null    object 
 2   age         395 non-null    int64  
 3   address     395 non-null    object 
 4   famsize     395 non-null    object 
 5   Pstatus     395 non-null    object 
 6   Medu        395 non-null    int64  
 7   Fedu        395 non-null    int64  
 8   Mjob        395 non-null    object 
 9   Fjob        395 non-null    object 
 10  reason      395 non-null    object 
 11  guardian    395 non-null    object 
 12  traveltime  395 non-null    int64  
 13  studytime   395 non-null    int64  
 14  failures    395 non-null    int64  
 15  schoolsup   395 non-null    object 
 16  famsup      395 non-null    object 
 17  paid        395 non-null    object 
 18  activities  395 non-null    object 
 19  nursery     395 non-null    o

In [10]:
#seperate categorical columns and numerical columns
cat_cols = math_df.select_dtypes(include=['object'])
cat_cols

Unnamed: 0,school,sex,address,famsize,Pstatus,Mjob,Fjob,reason,guardian,schoolsup,famsup,paid,activities,nursery,higher,internet,romantic
0,GP,F,U,GT3,A,at_home,teacher,course,mother,yes,no,no,no,yes,yes,no,no
1,GP,F,U,GT3,T,at_home,other,course,father,no,yes,no,no,no,yes,yes,no
2,GP,F,U,LE3,T,at_home,other,other,mother,yes,no,yes,no,yes,yes,yes,no
3,GP,F,U,GT3,T,health,services,home,mother,no,yes,yes,yes,yes,yes,yes,yes
4,GP,F,U,GT3,T,other,other,home,father,no,yes,yes,no,yes,yes,no,no
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
390,MS,M,U,LE3,A,services,services,course,other,no,yes,yes,no,yes,yes,no,no
391,MS,M,U,LE3,T,services,services,course,mother,no,no,no,no,no,yes,yes,no
392,MS,M,R,GT3,T,other,other,course,other,no,no,no,no,no,yes,no,no
393,MS,M,R,LE3,T,services,other,course,mother,no,no,no,no,no,yes,yes,no


In [11]:
cat_cols['Mjob'].value_counts()

other       141
services    103
at_home      59
teacher      58
health       34
Name: Mjob, dtype: int64

In [12]:
one_hot_encoded_data = pd.get_dummies(cat_cols)
one_hot_encoded_data

Unnamed: 0,school_GP,school_MS,sex_F,sex_M,address_R,address_U,famsize_GT3,famsize_LE3,Pstatus_A,Pstatus_T,...,activities_no,activities_yes,nursery_no,nursery_yes,higher_no,higher_yes,internet_no,internet_yes,romantic_no,romantic_yes
0,1,0,1,0,0,1,1,0,1,0,...,1,0,0,1,0,1,1,0,1,0
1,1,0,1,0,0,1,1,0,0,1,...,1,0,1,0,0,1,0,1,1,0
2,1,0,1,0,0,1,0,1,0,1,...,1,0,0,1,0,1,0,1,1,0
3,1,0,1,0,0,1,1,0,0,1,...,0,1,0,1,0,1,0,1,0,1
4,1,0,1,0,0,1,1,0,0,1,...,1,0,0,1,0,1,1,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
390,0,1,0,1,0,1,0,1,1,0,...,1,0,0,1,0,1,1,0,1,0
391,0,1,0,1,0,1,0,1,0,1,...,1,0,1,0,0,1,0,1,1,0
392,0,1,0,1,1,0,1,0,0,1,...,1,0,1,0,0,1,1,0,1,0
393,0,1,0,1,1,0,0,1,0,1,...,1,0,1,0,0,1,0,1,1,0


In [13]:
num_cols = math_df.select_dtypes(include = ['int64'])
num_cols

Unnamed: 0,age,Medu,Fedu,traveltime,studytime,failures,famrel,freetime,goout,Dalc,Walc,health,absences
0,18,4,4,2,2,0,4,3,4,1,1,3,6
1,17,1,1,1,2,0,5,3,3,1,1,3,4
2,15,1,1,1,2,3,4,3,2,2,3,3,10
3,15,4,2,1,3,0,3,2,2,1,1,5,2
4,16,3,3,1,2,0,4,3,2,1,2,5,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...
390,20,2,2,1,2,2,5,5,4,4,5,4,11
391,17,3,1,2,1,0,2,4,5,3,4,2,3
392,21,1,1,1,1,3,5,5,3,3,3,3,3
393,18,3,2,3,1,0,4,4,1,3,4,5,0


In [14]:
cols1=num_cols.columns
cols1

Index(['age', 'Medu', 'Fedu', 'traveltime', 'studytime', 'failures', 'famrel',
       'freetime', 'goout', 'Dalc', 'Walc', 'health', 'absences'],
      dtype='object')

In [15]:
from sklearn.preprocessing import MinMaxScaler
scaler=MinMaxScaler().fit(num_cols[cols1])
num_cols[cols1]=scaler.transform(num_cols[cols1])

In [16]:
num_cols

Unnamed: 0,age,Medu,Fedu,traveltime,studytime,failures,famrel,freetime,goout,Dalc,Walc,health,absences
0,0.428571,1.00,1.00,0.333333,0.333333,0.000000,0.75,0.50,0.75,0.00,0.00,0.50,0.080000
1,0.285714,0.25,0.25,0.000000,0.333333,0.000000,1.00,0.50,0.50,0.00,0.00,0.50,0.053333
2,0.000000,0.25,0.25,0.000000,0.333333,1.000000,0.75,0.50,0.25,0.25,0.50,0.50,0.133333
3,0.000000,1.00,0.50,0.000000,0.666667,0.000000,0.50,0.25,0.25,0.00,0.00,1.00,0.026667
4,0.142857,0.75,0.75,0.000000,0.333333,0.000000,0.75,0.50,0.25,0.00,0.25,1.00,0.053333
...,...,...,...,...,...,...,...,...,...,...,...,...,...
390,0.714286,0.50,0.50,0.000000,0.333333,0.666667,1.00,1.00,0.75,0.75,1.00,0.75,0.146667
391,0.285714,0.75,0.25,0.333333,0.000000,0.000000,0.25,0.75,1.00,0.50,0.75,0.25,0.040000
392,0.857143,0.25,0.25,0.000000,0.000000,1.000000,1.00,1.00,0.50,0.50,0.50,0.50,0.040000
393,0.428571,0.75,0.50,0.666667,0.000000,0.000000,0.75,0.75,0.00,0.50,0.75,1.00,0.000000


In [17]:
target = math_df['Grade']

In [18]:
final_df = pd.concat([num_cols,one_hot_encoded_data,target], axis=1)

In [19]:
final_df.head()

Unnamed: 0,age,Medu,Fedu,traveltime,studytime,failures,famrel,freetime,goout,Dalc,...,activities_yes,nursery_no,nursery_yes,higher_no,higher_yes,internet_no,internet_yes,romantic_no,romantic_yes,Grade
0,0.428571,1.0,1.0,0.333333,0.333333,0.0,0.75,0.5,0.75,0.0,...,0,0,1,0,1,1,0,1,0,5.666667
1,0.285714,0.25,0.25,0.0,0.333333,0.0,1.0,0.5,0.5,0.0,...,0,1,0,0,1,0,1,1,0,5.333333
2,0.0,0.25,0.25,0.0,0.333333,1.0,0.75,0.5,0.25,0.25,...,0,0,1,0,1,0,1,1,0,8.333333
3,0.0,1.0,0.5,0.0,0.666667,0.0,0.5,0.25,0.25,0.0,...,1,0,1,0,1,0,1,0,1,14.666667
4,0.142857,0.75,0.75,0.0,0.333333,0.0,0.75,0.5,0.25,0.0,...,0,0,1,0,1,1,0,1,0,8.666667


In [28]:
to_drop=pd.DataFrame(final_df.corr()['Grade'][final_df.corr()['Grade']>=0.01])



Unnamed: 0,Grade
Medu,0.22426
Fedu,0.175852
studytime,0.134565
famrel,0.021653
school_GP,0.043285
sex_M,0.101122
address_U,0.107297
famsize_LE3,0.08256
Pstatus_A,0.043048
Mjob_health,0.129334


In [32]:
data = final_df[list(to_drop.index)]
data.head()

Unnamed: 0,Medu,Fedu,studytime,famrel,school_GP,sex_M,address_U,famsize_LE3,Pstatus_A,Mjob_health,...,guardian_father,schoolsup_no,famsup_no,paid_yes,activities_yes,nursery_yes,higher_yes,internet_yes,romantic_no,Grade
0,1.0,1.0,0.333333,0.75,1,0,1,0,1,0,...,0,0,1,0,0,1,1,0,1,5.666667
1,0.25,0.25,0.333333,1.0,1,0,1,0,0,0,...,1,1,0,0,0,0,1,1,1,5.333333
2,0.25,0.25,0.333333,0.75,1,0,1,1,0,0,...,0,0,1,1,0,1,1,1,1,8.333333
3,1.0,0.5,0.666667,0.5,1,0,1,0,0,1,...,0,1,0,1,1,1,1,1,0,14.666667
4,0.75,0.75,0.333333,0.75,1,0,1,0,0,0,...,1,1,0,1,0,1,1,0,1,8.666667


Some of the features have little to no correlation with grades. Anything below 0.01 can be dropped.

In [33]:
data.to_csv('Math_preprocessed.csv')