In [6]:

import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score


# Categorize

In [7]:
# Load the raw, semicolon-delimited student table, report its row count,
# and preview the first three rows.
student_data = pd.read_csv(
    "../../InputData/StudentDataset/original/student-mat.csv",
    sep=';',
)
print(student_data.shape[0])
student_data.head(3)

395


Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,...,famrel,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3
0,GP,F,18,U,GT3,A,4,4,at_home,teacher,...,4,3,4,1,1,3,6,5,6,6
1,GP,F,17,U,GT3,T,1,1,at_home,other,...,5,3,3,1,1,3,4,5,5,6
2,GP,F,15,U,LE3,T,1,1,at_home,other,...,4,3,2,2,3,3,10,7,8,10


In [8]:
# Snapshot the column names as a plain list; the cell that adds the
# descriptive-name columns pairs them with this list by position.
all_attributes = list(student_data.columns)
print(all_attributes)

['school', 'sex', 'age', 'address', 'famsize', 'Pstatus', 'Medu', 'Fedu', 'Mjob', 'Fjob', 'reason', 'guardian', 'traveltime', 'studytime', 'failures', 'schoolsup', 'famsup', 'paid', 'activities', 'nursery', 'higher', 'internet', 'romantic', 'famrel', 'freetime', 'goout', 'Dalc', 'Walc', 'health', 'absences', 'G1', 'G2', 'G3']


In [9]:
# Add one descriptive-name column per original attribute, initialised with a
# copy of the original values. The names below are matched to `all_attributes`
# purely by position, so both lists must stay in the same order.
cat_attributes = [
    "student's school",
    "student's sex",
    "student's age",
    "student's home address type",
    "family size",
    "parent's cohabitation status",
    "mother's education",
    "father's education",
    "mother's job",
    "father's job",
    "reason to choose this school",
    "student's guardian",
    "home to school travel time",
    "weekly study time",
    "number of past class failures",
    "extra educational support",
    "family educational support",
    "extra paid classes within the course subject",
    "extra-curricular activities",
    "attended nursery school",
    "wants to take higher education",
    "Internet access at home",
    "with a romantic relationship",
    "quality of family relationships",
    "free time after school",
    "going out with friends",
    "workday alcohol consumption",
    "weekend alcohol consumption",
    "current health status",
    "number of school absences",
    "first period grade",
    "second period grade",
    "final grade",
]

# Number of descriptive columns that get appended to the frame.
num_att = len(cat_attributes)
for position, descriptive_name in enumerate(cat_attributes):
    student_data[descriptive_name] = student_data[all_attributes[position]]

student_data.head(4)



Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,...,quality of family relationships,free time after school,going out with friends,workday alcohol consumption,weekend alcohol consumption,current health status,number of school absences,first period grade,second period grade,final grade
0,GP,F,18,U,GT3,A,4,4,at_home,teacher,...,4,3,4,1,1,3,6,5,6,6
1,GP,F,17,U,GT3,T,1,1,at_home,other,...,5,3,3,1,1,3,4,5,5,6
2,GP,F,15,U,LE3,T,1,1,at_home,other,...,4,3,2,2,3,3,10,7,8,10
3,GP,F,15,U,GT3,T,4,2,health,services,...,3,2,2,1,1,5,2,15,14,15


In [10]:
# Summary statistics of the frame; at this point the descriptive-name columns
# are still plain copies, so their statistics repeat those of the originals.
student_data.describe()

Unnamed: 0,age,Medu,Fedu,traveltime,studytime,failures,famrel,freetime,goout,Dalc,...,quality of family relationships,free time after school,going out with friends,workday alcohol consumption,weekend alcohol consumption,current health status,number of school absences,first period grade,second period grade,final grade
count,395.0,395.0,395.0,395.0,395.0,395.0,395.0,395.0,395.0,395.0,...,395.0,395.0,395.0,395.0,395.0,395.0,395.0,395.0,395.0,395.0
mean,16.696203,2.749367,2.521519,1.448101,2.035443,0.334177,3.944304,3.235443,3.108861,1.481013,...,3.944304,3.235443,3.108861,1.481013,2.291139,3.55443,5.708861,10.908861,10.713924,10.41519
std,1.276043,1.094735,1.088201,0.697505,0.83924,0.743651,0.896659,0.998862,1.113278,0.890741,...,0.896659,0.998862,1.113278,0.890741,1.287897,1.390303,8.003096,3.319195,3.761505,4.581443
min,15.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,0.0,3.0,0.0,0.0
25%,16.0,2.0,2.0,1.0,1.0,0.0,4.0,3.0,2.0,1.0,...,4.0,3.0,2.0,1.0,1.0,3.0,0.0,8.0,9.0,8.0
50%,17.0,3.0,2.0,1.0,2.0,0.0,4.0,3.0,3.0,1.0,...,4.0,3.0,3.0,1.0,2.0,4.0,4.0,11.0,11.0,11.0
75%,18.0,4.0,3.0,2.0,2.0,0.0,5.0,4.0,4.0,2.0,...,5.0,4.0,4.0,2.0,3.0,5.0,8.0,13.0,13.0,14.0
max,22.0,4.0,4.0,4.0,4.0,3.0,5.0,5.0,5.0,5.0,...,5.0,5.0,5.0,5.0,5.0,5.0,75.0,19.0,19.0,20.0


In [11]:
# Translate the coded values held in the descriptive-name columns into
# human-readable labels.
#
# Each column is reassigned with the result of `Series.replace` instead of
# calling `student_data[col].replace(..., inplace=True)`. The in-place form
# acts on a temporary Series: pandas 2.x warns about it, and with
# Copy-on-Write (the default from pandas 3.0) it no longer updates the
# DataFrame at all, so the relabeling would be silently lost.
#
# `replace` leaves values that are not listed in a mapping untouched, so
# identity pairs (e.g. 'teacher' -> 'teacher', 'other' -> 'other') are omitted.

# Label sets shared by several columns that use the same coding.
education_labels = {
    0: 'none',
    1: 'primary education (4th grade)',
    2: '5th to 9th grade',
    3: 'secondary education',
    4: 'higher education',
}
job_labels = {
    'health': 'healthcare related',
    'services': 'civil services',  # label was previously misspelled 'civial services'
    'at_home': 'at home',
}
low_to_high_labels = {
    1: 'very low',
    2: 'below average',
    3: 'average',
    4: 'above average',
    5: 'very high',
}

# Descriptive column name -> {coded value: label}.
value_labels = {
    "student's sex": {'M': 'male', 'F': 'female'},
    "student's age": {
        15: '15-17', 16: '15-17', 17: '15-17',
        18: '18-20', 19: '18-20', 20: '18-20',
        21: '21-22', 22: '21-22',
    },
    "student's home address type": {'U': 'urban', 'R': 'rural'},
    "family size": {'LE3': 'less or equal to 3', 'GT3': 'greater than 3'},
    # 'A' means the parents live apart; the label was previously the truncated 'part'.
    "parent's cohabitation status": {'T': 'living together', 'A': 'apart'},
    # Medu/Fedu are numeric codes 0-4.
    "mother's education": education_labels,
    "father's education": education_labels,
    "mother's job": job_labels,
    "father's job": job_labels,
    "reason to choose this school": {
        'home': 'close to home',
        'reputation': 'school reputation',
        'course': 'course preference',
    },
    # 12 guardian - student's guardian (nominal: "mother", "father" or "other")
    # is already readable and is left as-is.
    # Travel time and study time are numeric codes 1-4.
    "home to school travel time": {
        1: '< 15 min', 2: '15 to 30 min', 3: '30 min to 1 hour', 4: '> 1 hour',
    },
    "weekly study time": {
        1: '< 2 hours', 2: '2 to 5 hours', 3: '5 to 10 hours', 4: '> 10 hours',
    },
    # failures is numeric: n if 1<=n<3, else 4. A value of 0 is not listed and
    # therefore stays the integer 0.
    # NOTE(review): the '> 4' label does not match the "else 4" coding described
    # above - confirm the intended wording before relying on it.
    "number of past class failures": {1: '1', 2: '2', 3: '3', 4: '> 4'},
    # famrel: numeric, from 1 - very bad to 5 - excellent.
    "quality of family relationships": {
        1: 'very bad', 2: 'bad', 3: 'average', 4: 'good', 5: 'excellent',
    },
    # freetime, goout, Dalc, Walc: numeric, from 1 - very low to 5 - very high.
    "free time after school": low_to_high_labels,
    "going out with friends": low_to_high_labels,
    "workday alcohol consumption": low_to_high_labels,
    "weekend alcohol consumption": low_to_high_labels,
    # 29 health - current health status (numeric: from 1 - very bad to 5 - very good)
    "current health status": {
        1: 'very bad', 2: 'bad', 3: 'average', 4: 'good', 5: 'very good',
    },
}

for column, labels in value_labels.items():
    # Assign back explicitly so the DataFrame itself is updated.
    student_data[column] = student_data[column].replace(labels)



In [12]:
# Preview the first three rows after relabeling.
student_data.head(3)

Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,...,quality of family relationships,free time after school,going out with friends,workday alcohol consumption,weekend alcohol consumption,current health status,number of school absences,first period grade,second period grade,final grade
0,GP,F,18,U,GT3,A,4,4,at_home,teacher,...,good,average,above average,very low,very low,average,6,5,6,6
1,GP,F,17,U,GT3,T,1,1,at_home,other,...,excellent,average,average,very low,very low,average,4,5,5,6
2,GP,F,15,U,LE3,T,1,1,at_home,other,...,good,average,below average,below average,average,average,10,7,8,10


In [13]:
# student_data_copy = student_data.copy(deep=True)


In [14]:
# 30 absences - number of school absences (numeric: from 0 to 93)
def absences(x):
    """Return the absence bucket label for an absence count.

    Below 30 gives '< 30', 30 through 60 inclusive gives '30 to 60', and
    anything else (every value for which both comparisons are false) gives
    '> 60'.
    """
    if x < 30:
        return '< 30'
    return '30 to 60' if x <= 60 else '> 60'


# Bucket the raw absence counts into the descriptive column.
student_data['number of school absences'] = student_data['absences'].map(absences)


In [15]:

# Print the rows whose absence bucket is '< 30'.
print(student_data.loc[student_data['number of school absences'].eq('< 30')])


    school sex  age address famsize Pstatus  Medu  Fedu      Mjob      Fjob  \
0       GP   F   18       U     GT3       A     4     4   at_home   teacher   
1       GP   F   17       U     GT3       T     1     1   at_home     other   
2       GP   F   15       U     LE3       T     1     1   at_home     other   
3       GP   F   15       U     GT3       T     4     2    health  services   
4       GP   F   16       U     GT3       T     3     3     other     other   
..     ...  ..  ...     ...     ...     ...   ...   ...       ...       ...   
390     MS   M   20       U     LE3       A     2     2  services  services   
391     MS   M   17       U     LE3       T     3     1  services  services   
392     MS   M   21       R     GT3       T     1     1     other     other   
393     MS   M   18       R     LE3       T     3     2  services     other   
394     MS   M   19       U     LE3       T     1     1     other   at_home   

     ... quality of family relationships free time 

In [16]:

# These grades are related to the course subject, Math or Portuguese:
# 31 G1 - first period grade (numeric: from 0 to 20)
# 32 G2 - second period grade (numeric: from 0 to 20)
# 33 G3 - final grade (numeric: from 0 to 20, output target)


def grades(x):
    """Return the grade bucket label for a grade value.

    Below 5 gives '< 5', up to and including 10 gives '5 to 10', up to and
    including 15 gives '11 to 15', and anything else gives '> 15'.
    """
    if x < 5:
        return '< 5'
    # Remaining buckets as (inclusive upper bound, label), checked in order.
    for upper_bound, label in ((10, '5 to 10'), (15, '11 to 15')):
        if x <= upper_bound:
            return label
    return '> 15'


# Bucket each raw grade column into its descriptive counterpart.
for grade_column, label_column in (
    ('G1', 'first period grade'),
    ('G2', 'second period grade'),
    ('G3', 'final grade'),
):
    student_data[label_column] = student_data[grade_column].map(grades)





In [17]:

# Preview the first four rows of the fully categorized frame.
student_data.head(4)



Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,...,quality of family relationships,free time after school,going out with friends,workday alcohol consumption,weekend alcohol consumption,current health status,number of school absences,first period grade,second period grade,final grade
0,GP,F,18,U,GT3,A,4,4,at_home,teacher,...,good,average,above average,very low,very low,average,< 30,5 to 10,5 to 10,5 to 10
1,GP,F,17,U,GT3,T,1,1,at_home,other,...,excellent,average,average,very low,very low,average,< 30,5 to 10,5 to 10,5 to 10
2,GP,F,15,U,LE3,T,1,1,at_home,other,...,good,average,below average,below average,average,average,< 30,5 to 10,5 to 10,5 to 10
3,GP,F,15,U,GT3,T,4,2,health,services,...,average,below average,below average,very low,very low,very good,< 30,11 to 15,11 to 15,11 to 15


In [18]:

# Write the categorized frame to disk. The file keeps both the original
# columns and the descriptive-name columns; index=False omits the row index.
student_data.to_csv("../../InputData/StudentDataset/original/student-mat_cat.csv", index=False)

