In [1]:

import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score


# Categorize

In [14]:
# Read the semicolon-separated student-mat.csv file into a DataFrame,
# report how many rows it has, and preview the first three.
student_data = pd.read_csv(
    "../../InputData/StudentDataset/original/student-mat.csv",
    sep=';',
)
print(student_data.shape[0])
student_data.head(3)

395


Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,...,famrel,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3
0,GP,F,18,U,GT3,A,4,4,at_home,teacher,...,4,3,4,1,1,3,6,5,6,6
1,GP,F,17,U,GT3,T,1,1,at_home,other,...,5,3,3,1,1,3,4,5,5,6
2,GP,F,15,U,LE3,T,1,1,at_home,other,...,4,3,2,2,3,3,10,7,8,10


In [15]:
# Column names of the loaded frame, as a plain Python list.
all_attributes = list(student_data.columns)
print(all_attributes)

['school', 'sex', 'age', 'address', 'famsize', 'Pstatus', 'Medu', 'Fedu', 'Mjob', 'Fjob', 'reason', 'guardian', 'traveltime', 'studytime', 'failures', 'schoolsup', 'famsup', 'paid', 'activities', 'nursery', 'higher', 'internet', 'romantic', 'famrel', 'freetime', 'goout', 'Dalc', 'Walc', 'health', 'absences', 'G1', 'G2', 'G3']


In [16]:
# Create a parallel "<name>_C" column for every original attribute,
# initialised with that attribute's current values.
cat_attributes = [a + '_C' for a in all_attributes]
for a, newa in zip(all_attributes, cat_attributes):
    student_data[newa] = student_data[a]
print(cat_attributes)
student_data.head(4)



['school_C', 'sex_C', 'age_C', 'address_C', 'famsize_C', 'Pstatus_C', 'Medu_C', 'Fedu_C', 'Mjob_C', 'Fjob_C', 'reason_C', 'guardian_C', 'traveltime_C', 'studytime_C', 'failures_C', 'schoolsup_C', 'famsup_C', 'paid_C', 'activities_C', 'nursery_C', 'higher_C', 'internet_C', 'romantic_C', 'famrel_C', 'freetime_C', 'goout_C', 'Dalc_C', 'Walc_C', 'health_C', 'absences_C', 'G1_C', 'G2_C', 'G3_C']


Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,...,famrel_C,freetime_C,goout_C,Dalc_C,Walc_C,health_C,absences_C,G1_C,G2_C,G3_C
0,GP,F,18,U,GT3,A,4,4,at_home,teacher,...,4,3,4,1,1,3,6,5,6,6
1,GP,F,17,U,GT3,T,1,1,at_home,other,...,5,3,3,1,1,3,4,5,5,6
2,GP,F,15,U,LE3,T,1,1,at_home,other,...,4,3,2,2,3,3,10,7,8,10
3,GP,F,15,U,GT3,T,4,2,health,services,...,3,2,2,1,1,5,2,15,14,15


In [17]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
student_data.describe()

Unnamed: 0,age,Medu,Fedu,traveltime,studytime,failures,famrel,freetime,goout,Dalc,...,famrel_C,freetime_C,goout_C,Dalc_C,Walc_C,health_C,absences_C,G1_C,G2_C,G3_C
count,395.0,395.0,395.0,395.0,395.0,395.0,395.0,395.0,395.0,395.0,...,395.0,395.0,395.0,395.0,395.0,395.0,395.0,395.0,395.0,395.0
mean,16.696203,2.749367,2.521519,1.448101,2.035443,0.334177,3.944304,3.235443,3.108861,1.481013,...,3.944304,3.235443,3.108861,1.481013,2.291139,3.55443,5.708861,10.908861,10.713924,10.41519
std,1.276043,1.094735,1.088201,0.697505,0.83924,0.743651,0.896659,0.998862,1.113278,0.890741,...,0.896659,0.998862,1.113278,0.890741,1.287897,1.390303,8.003096,3.319195,3.761505,4.581443
min,15.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,0.0,3.0,0.0,0.0
25%,16.0,2.0,2.0,1.0,1.0,0.0,4.0,3.0,2.0,1.0,...,4.0,3.0,2.0,1.0,1.0,3.0,0.0,8.0,9.0,8.0
50%,17.0,3.0,2.0,1.0,2.0,0.0,4.0,3.0,3.0,1.0,...,4.0,3.0,3.0,1.0,2.0,4.0,4.0,11.0,11.0,11.0
75%,18.0,4.0,3.0,2.0,2.0,0.0,5.0,4.0,4.0,2.0,...,5.0,4.0,4.0,2.0,3.0,5.0,8.0,13.0,13.0,14.0
max,22.0,4.0,4.0,4.0,4.0,3.0,5.0,5.0,5.0,5.0,...,5.0,5.0,5.0,5.0,5.0,5.0,75.0,19.0,19.0,20.0


In [18]:
# Encode the categorical XXX_C columns as small integer codes.
#
# Every result is assigned back to the frame explicitly.  Calling
# student_data[col].replace(..., inplace=True) operates on a Series selected
# out of the frame; recent pandas versions warn about that pattern and, with
# Copy-on-Write enabled, student_data itself is never updated, so the _C
# columns would silently keep their original string values.
#
# Values that do not appear in a mapping are left unchanged.

yes_no_codes = {'yes': 0, 'no': 1}  # note the direction: 'yes' -> 0, 'no' -> 1
job_codes = {'teacher': 0, 'health': 1, 'services': 2, 'at_home': 3, 'other': 4}

category_codes = {
    'school_C': {'GP': 0, 'MS': 1},
    'sex_C': {'M': 0, 'F': 1},
    # age is collapsed into two bands: 15-18 -> 0, 19-22 -> 1
    'age_C': {15: 0, 16: 0, 17: 0, 18: 0, 19: 1, 20: 1, 21: 1, 22: 1},
    'address_C': {'U': 0, 'R': 1},
    'famsize_C': {'LE3': 0, 'GT3': 1},
    'Pstatus_C': {'T': 0, 'A': 1},
    'Mjob_C': job_codes,
    'Fjob_C': job_codes,
    'reason_C': {'home': 0, 'reputation': 1, 'course': 2, 'other': 3},
    'guardian_C': {'mother': 0, 'father': 1, 'other': 2},
    'schoolsup_C': yes_no_codes,
    'famsup_C': yes_no_codes,
    'paid_C': yes_no_codes,
    'activities_C': yes_no_codes,
    'nursery_C': yes_no_codes,
    'higher_C': yes_no_codes,
    'internet_C': yes_no_codes,
    'romantic_C': yes_no_codes,
}

for cat_column, codes in category_codes.items():
    student_data[cat_column] = student_data[cat_column].replace(codes)

# The remaining attributes are already numeric and keep their copied values:
#    7 Medu / 8 Fedu - 0-4
#   13 traveltime    - 1-4
#   14 studytime     - 1-4
#   15 failures      - n if 1<=n<3, else 4
#   24 famrel        - from 1 - very bad to 5 - excellent
#   25 freetime      - from 1 - very low to 5 - very high
#   26 goout  - going out with friends (numeric: from 1 - very low to 5 - very high)
#   27 Dalc   - workday alcohol consumption (numeric: from 1 - very low to 5 - very high)
#   28 Walc   - weekend alcohol consumption (numeric: from 1 - very low to 5 - very high)
#   29 health - current health status (numeric: from 1 - very bad to 5 - very good)


In [19]:
# Preview the first three rows of the frame.
student_data.head(3)

Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,...,famrel_C,freetime_C,goout_C,Dalc_C,Walc_C,health_C,absences_C,G1_C,G2_C,G3_C
0,GP,F,18,U,GT3,A,4,4,at_home,teacher,...,4,3,4,1,1,3,6,5,6,6
1,GP,F,17,U,GT3,T,1,1,at_home,other,...,5,3,3,1,1,3,4,5,5,6
2,GP,F,15,U,LE3,T,1,1,at_home,other,...,4,3,2,2,3,3,10,7,8,10


In [20]:
# student_data_copy = student_data.copy(deep=True)


In [21]:
# 30 absences - number of school absences (numeric: from 0 to 93)
def absences(x):
    """Return the absence band for a count ``x``.

    0 when ``x <= 30``, 1 when ``x <= 60``, and 2 for anything else.
    """
    if x <= 30:
        return 0
    return 1 if x <= 60 else 2


# Band every raw absence count with absences() and store it in absences_C.
student_data['absences_C'] = student_data['absences'].map(absences)


In [23]:

# Print the rows whose absences fall above the lowest band (absences_C of 1 or 2).
above_lowest_band = student_data['absences_C'] > 0
print(student_data[above_lowest_band])


    school sex  age address famsize Pstatus  Medu  Fedu     Mjob      Fjob  \
74      GP   F   16       U     GT3       T     3     3    other  services   
183     GP   F   17       U     LE3       T     3     3    other     other   
276     GP   F   18       R     GT3       A     3     2    other  services   
307     GP   M   19       U     GT3       T     4     4  teacher  services   
315     GP   F   19       R     GT3       T     2     3    other     other   

     ... famrel_C freetime_C  goout_C  Dalc_C  Walc_C health_C absences_C  \
74   ...        4          3        3       2       4        5          1   
183  ...        5          3        3       2       3        1          1   
276  ...        4          1        1       1       1        5          2   
307  ...        4          3        4       1       1        4          1   
315  ...        4          1        2       1       1        3          1   

    G1_C G2_C G3_C  
74    11   12   11  
183    9    9    8  
276  

In [24]:

# these grades are related with the course subject, Math or Portuguese:
# 31 G1 - first period grade (numeric: from 0 to 20)
# 32 G2 - second period grade (numeric: from 0 to 20)
# 33 G3 - final grade (numeric: from 0 to 20, output target)
#


def grades(x):
    """Return the grade band for a mark ``x``.

    0 when ``x <= 5``, 1 when ``x <= 10``, 2 when ``x <= 15``, otherwise 3.
    """
    # Walk the inclusive upper bounds in ascending order; the first one that
    # is not exceeded gives the band.
    for band, upper_bound in enumerate((5, 10, 15)):
        if x <= upper_bound:
            return band
    return 3


# Band each period grade (G1, G2, G3) with grades() into its matching *_C column.
for grade_column in ('G1', 'G2', 'G3'):
    student_data[grade_column + '_C'] = student_data[grade_column].map(grades)





In [25]:

# Preview the first four rows of the frame.
student_data.head(4)



Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,...,famrel_C,freetime_C,goout_C,Dalc_C,Walc_C,health_C,absences_C,G1_C,G2_C,G3_C
0,GP,F,18,U,GT3,A,4,4,at_home,teacher,...,4,3,4,1,1,3,0,0,1,1
1,GP,F,17,U,GT3,T,1,1,at_home,other,...,5,3,3,1,1,3,0,0,0,1
2,GP,F,15,U,LE3,T,1,1,at_home,other,...,4,3,2,2,3,3,0,1,1,1
3,GP,F,15,U,GT3,T,4,2,health,services,...,3,2,2,1,1,5,0,2,2,2


Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,...,famrel_C,freetime_C,goout_C,Dalc_C,Walc_C,health_C,absences_C,G1_C,G2_C,G3_C
0,GP,F,18,U,GT3,A,4,4,at_home,teacher,...,4,3,4,1,1,3,0,0,0,0
1,GP,F,17,U,GT3,T,1,1,at_home,other,...,5,3,3,1,1,3,0,0,0,0
2,GP,F,15,U,LE3,T,1,1,at_home,other,...,4,3,2,2,3,3,0,0,0,1


In [26]:

## this is the categorized file!!!
# Write the whole frame (original and *_C columns) to CSV, omitting the index.
categorized_csv_path = "../../InputData/StudentDataset/original/student-mat_cat.csv"
student_data.to_csv(categorized_csv_path, index=False)

