In [1]:

import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score


# Categorize

In [12]:
student_data = pd.read_csv("../../InputData/StudentDataset/original/student-mat.csv", sep=';')
print(len(student_data))
student_data[:3]

395


Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,...,famrel,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3
0,GP,F,18,U,GT3,A,4,4,at_home,teacher,...,4,3,4,1,1,3,6,5,6,6
1,GP,F,17,U,GT3,T,1,1,at_home,other,...,5,3,3,1,1,3,4,5,5,6
2,GP,F,15,U,LE3,T,1,1,at_home,other,...,4,3,2,2,3,3,10,7,8,10


In [29]:
all_attributes = student_data.columns.tolist()
print(all_attributes)

['school', 'sex', 'age', 'address', 'famsize', 'Pstatus', 'Medu', 'Fedu', 'Mjob', 'Fjob', 'reason', 'guardian', 'traveltime', 'studytime', 'failures', 'schoolsup', 'famsup', 'paid', 'activities', 'nursery', 'higher', 'internet', 'romantic', 'famrel', 'freetime', 'goout', 'Dalc', 'Walc', 'health', 'absences', 'G1', 'G2', 'G3', 'school_C', 'sex_C', 'age_C', 'address_C', 'famsize_C', 'Pstatus_C', 'Medu_C', 'Fedu_C', 'Mjob_C', 'Fjob_C', 'reason_C', 'guardian_C', 'traveltime_C', 'studytime_C', 'failures_C', 'schoolsup_C', 'famsup_C', 'paid_C', 'activities_C', 'nursery_C', 'higher_C', 'internet_C', 'romantic_C', 'famrel_C', 'freetime_C', 'goout_C', 'Dalc_C', 'Walc_C', 'health_C', 'absences_C', 'G1_C', 'G2_C', 'G3_C']


In [14]:
# build a new set of attributes: xxx_C, and copy old values
cat_attributes = list()
for a in all_attributes:
    newa = a+'_C'
    cat_attributes.append(newa)
    student_data[newa] = student_data[a]
print(cat_attributes)
student_data[:4]



['school_C', 'sex_C', 'age_C', 'address_C', 'famsize_C', 'Pstatus_C', 'Medu_C', 'Fedu_C', 'Mjob_C', 'Fjob_C', 'reason_C', 'guardian_C', 'traveltime_C', 'studytime_C', 'failures_C', 'schoolsup_C', 'famsup_C', 'paid_C', 'activities_C', 'nursery_C', 'higher_C', 'internet_C', 'romantic_C', 'famrel_C', 'freetime_C', 'goout_C', 'Dalc_C', 'Walc_C', 'health_C', 'absences_C', 'G1_C', 'G2_C', 'G3_C']


Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,...,famrel_C,freetime_C,goout_C,Dalc_C,Walc_C,health_C,absences_C,G1_C,G2_C,G3_C
0,GP,F,18,U,GT3,A,4,4,at_home,teacher,...,4,3,4,1,1,3,6,5,6,6
1,GP,F,17,U,GT3,T,1,1,at_home,other,...,5,3,3,1,1,3,4,5,5,6
2,GP,F,15,U,LE3,T,1,1,at_home,other,...,4,3,2,2,3,3,10,7,8,10
3,GP,F,15,U,GT3,T,4,2,health,services,...,3,2,2,1,1,5,2,15,14,15


In [15]:
# categorize XXX_C values

student_data['school_C'].replace(to_replace=['GP', 'MS'], value=[0, 1], inplace=True)

student_data['sex_C'].replace(to_replace=['M', 'F'], value=[0, 1], inplace=True)

student_data['age_C'].replace(to_replace=[15, 16, 17, 18, 19, 20, 21, 22],
                              value=[0, 0,0, 0,  1, 1, 1, 1], inplace=True)

student_data['address_C'].replace(to_replace=['U', 'R'], value=[0, 1], inplace=True)

student_data['famsize_C'].replace(to_replace=['LE3', 'GT3'], value=[0, 1], inplace=True)

student_data['Pstatus_C'].replace(to_replace=['T', 'A'], value=[0, 1], inplace=True)

# Medu/Fedu is already numeric: 0-4
# student_data['Medu_C'].replace(to_replace=['U', 'R'], value=[0, 1], inplace=True)

# student_data['Fedu_C'].replace(to_replace=['LE3', 'GT3'], value=[0, 1], inplace=True)

student_data['Mjob_C'].replace(to_replace=['teacher', 'health', 'services', 'at_home', 'other'],
                               value=[0, 1, 2, 3, 4], inplace=True)

student_data['Fjob_C'].replace(to_replace=['teacher', 'health', 'services', 'at_home', 'other'],
                               value=[0, 1, 2, 3, 4], inplace=True)



student_data['reason_C'].replace(to_replace=['home', 'reputation', 'course', 'other'],
                                 value=[0, 1, 2, 3], inplace=True)

student_data['guardian_C'].replace(to_replace=['mother', 'father', 'other'], value=[0, 1, 2], inplace=True)

# travel time is numeric: 1-4
#student_data['traveltime_C'].replace(to_replace=['mother', 'father', 'other'], value=[0, 1, 2], inplace=True)

# study time is numeric: 1-4
# student_data['studytime_C'].replace(to_replace=['mother', 'father', 'other'], value=[0, 1, 2], inplace=True)

# numeric: n if 1<=n<3, else 4
# student_data['failures_C'].replace(to_replace=['mother', 'father', 'other'], value=[0, 1, 2], inplace=True)

student_data['schoolsup_C'].replace(to_replace=['yes', 'no'], value=[0, 1], inplace=True)

student_data['famsup_C'].replace(to_replace=['yes', 'no'], value=[0, 1], inplace=True)

student_data['paid_C'].replace(to_replace=['yes', 'no'], value=[0, 1], inplace=True)

student_data['activities_C'].replace(to_replace=['yes', 'no'], value=[0, 1], inplace=True)

student_data['nursery_C'].replace(to_replace=['yes', 'no'], value=[0, 1], inplace=True)

student_data['higher_C'].replace(to_replace=['yes', 'no'], value=[0, 1], inplace=True)

student_data['internet_C'].replace(to_replace=['yes', 'no'], value=[0, 1], inplace=True)

student_data['romantic_C'].replace(to_replace=['yes', 'no'], value=[0, 1], inplace=True)
#
# (numeric: from 1 - very bad to 5 - excellent)
# student_data['famrel_C'].replace(to_replace=['yes', 'no'], value=[0, 1], inplace=True)
#
# numeric: from 1 - very low to 5 - very high
# student_data['freetime_C'].replace(to_replace=['yes', 'no'], value=[0, 1], inplace=True)
#
#
# 26 goout - going out with friends (numeric: from 1 - very low to 5 - very high)
# 27 Dalc - workday alcohol consumption (numeric: from 1 - very low to 5 - very high)
# 28 Walc - weekend alcohol consumption (numeric: from 1 - very low to 5 - very high)
# 29 health - current health status (numeric: from 1 - very bad to 5 - very good)


In [16]:
student_data[:3]

Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,...,famrel_C,freetime_C,goout_C,Dalc_C,Walc_C,health_C,absences_C,G1_C,G2_C,G3_C
0,GP,F,18,U,GT3,A,4,4,at_home,teacher,...,4,3,4,1,1,3,6,5,6,6
1,GP,F,17,U,GT3,T,1,1,at_home,other,...,5,3,3,1,1,3,4,5,5,6
2,GP,F,15,U,LE3,T,1,1,at_home,other,...,4,3,2,2,3,3,10,7,8,10


In [17]:
student_data_copy = student_data.copy(deep=True)

In [21]:
# 30 absences - number of school absences (numeric: from 0 to 93)
idx0_30 = list(range(0, 30))
print(idx0_30)
values0 = [0]*30
print(values0)

idx30_60 = list(range(30, 60))
print(idx30_60)
values1 = [1]*30
print(values1)

idx60_ = list(range(60, 94))
print(idx60_)
values2 = [2]*30
print(values2)



[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29]
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59]
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
[60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93]
[2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]


In [22]:
student_data_copy['absences_C'].replace(to_replace=idx0_30, value=values0, inplace=True)

student_data_copy['absences_C'].replace(to_replace=idx30_60, value=values1, inplace=True)

student_data_copy['absences_C'].replace(to_replace=idx60_, value=values2, inplace=True)


ValueError: Replacement lists must match in length. Expecting 34 got 30 

In [23]:

# these grades are related with the course subject, Math or Portuguese:
# 31 G1 - first period grade (numeric: from 0 to 20)
# 31 G2 - second period grade (numeric: from 0 to 20)
# 32 G3 - final grade (numeric: from 0 to 20, output target)
#

idx0_10 = list(range(0, 10))
print(idx0_10)
values0 = [0]*10
print(values0)

idx10_21 = list(range(10, 21))
print(idx10_21)
values1 = [1]*11
print(values1)


[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20]
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]


In [24]:

student_data_copy['G1_C'].replace(to_replace=idx0_10, value=values0, inplace=True)

student_data_copy['G1_C'].replace(to_replace=idx10_21, value=values1, inplace=True)

In [25]:
student_data_copy['G2_C'].replace(to_replace=idx0_10, value=values0, inplace=True)

student_data_copy['G2_C'].replace(to_replace=idx10_21, value=values1, inplace=True)



In [26]:

student_data_copy['G3_C'].replace(to_replace=idx0_10, value=values0, inplace=True)

student_data_copy['G3_C'].replace(to_replace=idx10_21, value=values1, inplace=True)


In [27]:
student_data_copy[:3]


Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,...,famrel_C,freetime_C,goout_C,Dalc_C,Walc_C,health_C,absences_C,G1_C,G2_C,G3_C
0,GP,F,18,U,GT3,A,4,4,at_home,teacher,...,4,3,4,1,1,3,0,0,0,0
1,GP,F,17,U,GT3,T,1,1,at_home,other,...,5,3,3,1,1,3,0,0,0,0
2,GP,F,15,U,LE3,T,1,1,at_home,other,...,4,3,2,2,3,3,0,0,0,1


In [28]:
## this is the categorized file!!!
student_data_copy.to_csv("../../InputData/StudentDataset/original/student-mat_cat.csv", index=False)

