In [82]:
## ************ Importing libraries ******************
import pandas as pd 
import numpy as np 

from sklearn.feature_selection import chi2
from sklearn.feature_selection import SelectKBest

In [83]:
# Load the semicolon-separated student dataset, then display it transposed
# (one row per column of the file) so that all columns are visible at once.
df = pd.read_csv('student-mat.csv', sep=';')
df.T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,385,386,387,388,389,390,391,392,393,394
school,GP,GP,GP,GP,GP,GP,GP,GP,GP,GP,...,MS,MS,MS,MS,MS,MS,MS,MS,MS,MS
sex,F,F,F,F,F,M,M,F,M,M,...,F,F,F,F,F,M,M,M,M,M
age,18,17,15,15,16,16,16,17,15,15,...,18,18,19,18,18,20,17,21,18,19
address,U,U,U,U,U,U,U,U,U,U,...,R,R,R,U,U,U,U,R,R,U
famsize,GT3,GT3,LE3,GT3,GT3,LE3,LE3,GT3,LE3,GT3,...,GT3,GT3,GT3,LE3,GT3,LE3,LE3,GT3,LE3,LE3
Pstatus,A,T,T,T,T,T,T,A,A,T,...,T,T,T,T,T,A,T,T,T,T
Medu,4,1,1,4,3,4,2,4,3,3,...,2,4,2,3,1,2,3,1,3,1
Fedu,4,1,1,2,3,3,2,4,2,4,...,2,4,3,1,1,2,1,1,2,1
Mjob,at_home,at_home,at_home,health,other,services,other,other,services,other,...,at_home,teacher,services,teacher,other,services,services,other,services,other
Fjob,teacher,other,other,services,other,other,other,teacher,other,other,...,other,at_home,other,services,other,services,services,other,other,at_home


In [84]:
# Dimensions of the loaded frame as (number of rows, number of columns)
df.shape

(395, 33)

In [85]:
# Summary of every column: non-null count and dtype
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 395 entries, 0 to 394
Data columns (total 33 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   school      395 non-null    object
 1   sex         395 non-null    object
 2   age         395 non-null    int64 
 3   address     395 non-null    object
 4   famsize     395 non-null    object
 5   Pstatus     395 non-null    object
 6   Medu        395 non-null    int64 
 7   Fedu        395 non-null    int64 
 8   Mjob        395 non-null    object
 9   Fjob        395 non-null    object
 10  reason      395 non-null    object
 11  guardian    395 non-null    object
 12  traveltime  395 non-null    int64 
 13  studytime   395 non-null    int64 
 14  failures    395 non-null    int64 
 15  schoolsup   395 non-null    object
 16  famsup      395 non-null    object
 17  paid        395 non-null    object
 18  activities  395 non-null    object
 19  nursery     395 non-null    object
 20  higher    

In [86]:
# Names of all columns in the dataset (the candidate features, plus G1, G2 and G3)
df.columns

Index(['school', 'sex', 'age', 'address', 'famsize', 'Pstatus', 'Medu', 'Fedu',
       'Mjob', 'Fjob', 'reason', 'guardian', 'traveltime', 'studytime',
       'failures', 'schoolsup', 'famsup', 'paid', 'activities', 'nursery',
       'higher', 'internet', 'romantic', 'famrel', 'freetime', 'goout', 'Dalc',
       'Walc', 'health', 'absences', 'G1', 'G2', 'G3'],
      dtype='object')

In [87]:
## Correlation matrix to evaluate dependency between G3 and the other features
# `matrix_corr` has to be built here: no earlier cell defines it, so on a fresh
# kernel the lookup below would raise NameError.
# Only the numeric columns are correlated; at this point the categorical columns
# still hold strings and cannot take part in a Pearson correlation.
matrix_corr = df.select_dtypes(include='number').corr()
# Correlation of every numeric column with G3
matrix_corr.G3

age          -0.161579
Medu          0.217147
Fedu          0.152457
traveltime   -0.117142
studytime     0.097820
failures     -0.360415
famrel        0.051363
freetime      0.011307
goout        -0.132791
Dalc         -0.054660
Walc         -0.051939
health       -0.061335
absences      0.034247
G1            0.801468
G2            0.904868
G3            1.000000
Name: G3, dtype: float64

In [88]:
# Partition the column names by dtype: int64/float64 columns are treated as
# numerical features, object columns as categorical features.
features_num = [name for name, kind in df.dtypes.items() if kind in ['int64', 'float64']]
features_categ = [name for name, kind in df.dtypes.items() if kind == 'object']

In [89]:
# Columns whose dtype is int64 or float64
features_num

['age',
 'Medu',
 'Fedu',
 'traveltime',
 'studytime',
 'failures',
 'famrel',
 'freetime',
 'goout',
 'Dalc',
 'Walc',
 'health',
 'absences',
 'G1',
 'G2',
 'G3']

In [90]:
# Columns whose dtype is object
features_categ

['school',
 'sex',
 'address',
 'famsize',
 'Pstatus',
 'Mjob',
 'Fjob',
 'reason',
 'guardian',
 'schoolsup',
 'famsup',
 'paid',
 'activities',
 'nursery',
 'higher',
 'internet',
 'romantic']

In [91]:
## list the distinct values taken by each categorical feature
for categ in features_categ:
    categories = df[categ].unique()
    print(categ, " ==> ", categories)

school  ==>  ['GP' 'MS']
sex  ==>  ['F' 'M']
address  ==>  ['U' 'R']
famsize  ==>  ['GT3' 'LE3']
Pstatus  ==>  ['A' 'T']
Mjob  ==>  ['at_home' 'health' 'other' 'services' 'teacher']
Fjob  ==>  ['teacher' 'other' 'services' 'health' 'at_home']
reason  ==>  ['course' 'other' 'home' 'reputation']
guardian  ==>  ['mother' 'father' 'other']
schoolsup  ==>  ['yes' 'no']
famsup  ==>  ['no' 'yes']
paid  ==>  ['no' 'yes']
activities  ==>  ['no' 'yes']
nursery  ==>  ['yes' 'no']
higher  ==>  ['yes' 'no']
internet  ==>  ['no' 'yes']
romantic  ==>  ['no' 'yes']


# 1 - Encoding

In [92]:
# Integer codes for every categorical column. Two-valued columns are coded 0/1;
# for the multi-valued columns (Mjob, Fjob, reason, guardian) the integers are
# plain labels and their numeric order carries no meaning.
yes_no = {'no': 0, 'yes': 1}
job_codes = {'at_home': 0, 'services': 1, 'teacher': 2, 'health': 3, 'other': 4}
encodings = {
    'school': {'GP': 0, 'MS': 1},
    'sex': {'M': 0, 'F': 1},
    'address': {'R': 0, 'U': 1},
    'famsize': {'LE3': 0, 'GT3': 1},
    'Pstatus': {'A': 0, 'T': 1},
    'Mjob': job_codes,
    'Fjob': job_codes,
    'famsup': yes_no,
    'reason': {'course': 0, 'home': 1, 'reputation': 2, 'other': 3},
    'guardian': {'mother': 0, 'father': 1, 'other': 2},
    'schoolsup': yes_no,
    'paid': yes_no,
    'activities': yes_no,
    'nursery': yes_no,
    'higher': yes_no,
    'internet': yes_no,
    'romantic': yes_no,
}

for column, mapping in encodings.items():
    # A column that already contains only codes has been encoded by a previous
    # run of this cell. Mapping it again would look the integers up in the
    # string-keyed dict and turn the whole column into NaN, so leave it alone.
    if df[column].isin(list(mapping.values())).all():
        continue

    # Fail loudly on labels missing from the mapping instead of letting
    # Series.map silently replace them with NaN.
    unknown = set(df[column].unique()) - set(mapping)
    if unknown:
        raise ValueError(
            f"Unexpected values in column {column!r}: {sorted(map(str, unknown))}"
        )

    df[column] = df[column].map(mapping)

In [93]:
## show the dataset after encoding, transposed so each original column is a row
df.T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,385,386,387,388,389,390,391,392,393,394
school,0,0,0,0,0,0,0,0,0,0,...,1,1,1,1,1,1,1,1,1,1
sex,1,1,1,1,1,0,0,1,0,0,...,1,1,1,1,1,0,0,0,0,0
age,18,17,15,15,16,16,16,17,15,15,...,18,18,19,18,18,20,17,21,18,19
address,1,1,1,1,1,1,1,1,1,1,...,0,0,0,1,1,1,1,0,0,1
famsize,1,1,0,1,1,0,0,1,0,1,...,1,1,1,0,1,0,0,1,0,0
Pstatus,0,1,1,1,1,1,1,0,0,1,...,1,1,1,1,1,0,1,1,1,1
Medu,4,1,1,4,3,4,2,4,3,3,...,2,4,2,3,1,2,3,1,3,1
Fedu,4,1,1,2,3,3,2,4,2,4,...,2,4,3,1,1,2,1,1,2,1
Mjob,0,0,0,3,4,1,4,4,1,4,...,0,2,1,2,4,1,1,4,1,4
Fjob,2,4,4,1,4,4,4,2,4,4,...,4,0,4,1,4,1,1,4,4,0


# 2 - Feature Selection

### Feature Selection for G3 

In [94]:
# Candidate features: every column except G2 and the target G3.
# `pass` is derived from G3 in a later cell; it is dropped as well (when present)
# so that re-running this cell after `pass` exists does not leak the target
# into the feature matrix.
df_features = df.drop(columns=['G2', 'G3', 'pass'], errors='ignore')
## our target is G3
target = df['G3']
# chi2 requires non-negative feature values and uses the target as class labels.
# k=10 only governs what `test.transform` / `test.get_support` would keep; the
# ranking below reads the raw scores of all candidate features.
test = SelectKBest(score_func=chi2, k=10)
fit = test.fit(df_features, target)
## feature choice: rank every candidate feature by decreasing chi2 score
feat_importances = pd.Series(fit.scores_, index=df_features.columns)
# The original asked for the 50 largest scores, which is more than the number of
# candidate columns and therefore returned the full ranking; request exactly
# that instead of relying on a magic number.
topFeatures = feat_importances.nlargest(len(feat_importances)).index.values

In [95]:
# Feature names ordered from highest to lowest chi2 score
topFeatures

array(['absences', 'G1', 'failures', 'Walc', 'schoolsup', 'reason',
       'romantic', 'Dalc', 'Fjob', 'Mjob', 'paid', 'Medu', 'guardian',
       'goout', 'Fedu', 'school', 'health', 'sex', 'activities',
       'traveltime', 'studytime', 'address', 'famsup', 'famsize',
       'freetime', 'nursery', 'internet', 'age', 'famrel', 'Pstatus',
       'higher'], dtype=object)

In [96]:
# Display the dataset after encoding
df

Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,...,famrel,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3
0,0,1,18,1,1,0,4,4,0,2,...,4,3,4,1,1,3,6,5,6,6
1,0,1,17,1,1,1,1,1,0,4,...,5,3,3,1,1,3,4,5,5,6
2,0,1,15,1,0,1,1,1,0,4,...,4,3,2,2,3,3,10,7,8,10
3,0,1,15,1,1,1,4,2,3,1,...,3,2,2,1,1,5,2,15,14,15
4,0,1,16,1,1,1,3,3,4,4,...,4,3,2,1,2,5,4,6,10,10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
390,1,0,20,1,0,0,2,2,1,1,...,5,5,4,4,5,4,11,9,9,9
391,1,0,17,1,0,1,3,1,1,1,...,2,4,5,3,4,2,3,14,16,16
392,1,0,21,0,1,1,1,1,4,4,...,5,5,3,3,3,3,3,10,8,7
393,1,0,18,0,0,1,3,2,1,4,...,4,4,1,3,4,5,0,11,12,10


In [97]:
## binary classification label "pass": 0 when G3 is below 10, 1 in every other case
below_threshold = df['G3'] < 10
df['pass'] = np.where(below_threshold, 0, 1)
df

Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,...,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3,pass
0,0,1,18,1,1,0,4,4,0,2,...,3,4,1,1,3,6,5,6,6,0
1,0,1,17,1,1,1,1,1,0,4,...,3,3,1,1,3,4,5,5,6,0
2,0,1,15,1,0,1,1,1,0,4,...,3,2,2,3,3,10,7,8,10,1
3,0,1,15,1,1,1,4,2,3,1,...,2,2,1,1,5,2,15,14,15,1
4,0,1,16,1,1,1,3,3,4,4,...,3,2,1,2,5,4,6,10,10,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
390,1,0,20,1,0,0,2,2,1,1,...,5,4,4,5,4,11,9,9,9,0
391,1,0,17,1,0,1,3,1,1,1,...,4,5,3,4,2,3,14,16,16,1
392,1,0,21,0,1,1,1,1,4,4,...,5,3,3,3,3,3,10,8,7,0
393,1,0,18,0,0,1,3,2,1,4,...,4,1,3,4,5,0,11,12,10,1


In [98]:
## Saving the dataset that we are going to use for classification
# NOTE: to_csv is called with its default index handling, so the row index is
# written as the first, unnamed column of the file; readers of this CSV should
# load it with index_col=0 (or drop that column) to avoid a spurious feature.
df.to_csv('dataset_classification.csv')