In [1]:
from sklearn import tree
from sklearn.tree import export_text
import pandas as pd
import numpy as np

In [2]:
stu_data = pd.read_csv('combined_csv.csv')
stu_data.head()

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,male,group B,some college,free/reduced,completed,65,62,61
1,male,group D,bachelor's degree,standard,none,61,59,54
2,male,group A,some high school,standard,none,57,50,50
3,male,group C,bachelor's degree,free/reduced,none,63,64,56
4,female,group B,associate's degree,free/reduced,completed,57,87,82


In [3]:
stu_data = stu_data.drop(columns=['parental level of education'])
stu_data

Unnamed: 0,gender,race/ethnicity,lunch,test preparation course,math score,reading score,writing score
0,male,group B,free/reduced,completed,65,62,61
1,male,group D,standard,none,61,59,54
2,male,group A,standard,none,57,50,50
3,male,group C,free/reduced,none,63,64,56
4,female,group B,free/reduced,completed,57,87,82
...,...,...,...,...,...,...,...
4995,female,group E,standard,completed,88,99,95
4996,male,group C,free/reduced,none,62,55,55
4997,female,group C,free/reduced,completed,59,71,65
4998,female,group D,standard,completed,68,78,77


In [4]:
# # Pass = 1
# # Fail = 0

# def parse_values(x):
#     if x > 90:
#        return "A"
#     elif x > 80:
#        return "B"
#     elif x > 70:
#        return "C"
#     elif x > 64:
#         return "D"
#     else:
#         return "F"

# stu_data['math grade'] = stu_data['math score'].apply(parse_values)
# stu_data['reading grade'] = stu_data['reading score'].apply(parse_values)
# stu_data['writing grade'] = stu_data['writing score'].apply(parse_values)
stu_data['pass_math'] = stu_data['math score'].apply(lambda x: 1 if x>64 else 0)
stu_data['pass_reading'] = stu_data['reading score'].apply(lambda x: 1 if x>64 else 0)
stu_data['pass_writing'] = stu_data['reading score'].apply(lambda x: 1 if x>64 else 0)
stu_data

Unnamed: 0,gender,race/ethnicity,lunch,test preparation course,math score,reading score,writing score,pass_math,pass_reading,pass_writing
0,male,group B,free/reduced,completed,65,62,61,1,0,0
1,male,group D,standard,none,61,59,54,0,0,0
2,male,group A,standard,none,57,50,50,0,0,0
3,male,group C,free/reduced,none,63,64,56,0,0,0
4,female,group B,free/reduced,completed,57,87,82,0,1,1
...,...,...,...,...,...,...,...,...,...,...
4995,female,group E,standard,completed,88,99,95,1,1,1
4996,male,group C,free/reduced,none,62,55,55,0,0,0
4997,female,group C,free/reduced,completed,59,71,65,0,1,1
4998,female,group D,standard,completed,68,78,77,1,1,1


In [5]:
stu_data = pd.get_dummies(stu_data)
stu_data = stu_data.drop(columns=['gender_male', 'lunch_standard', 'test preparation course_none'])
stu_data.head()

Unnamed: 0,math score,reading score,writing score,pass_math,pass_reading,pass_writing,gender_female,race/ethnicity_group A,race/ethnicity_group B,race/ethnicity_group C,race/ethnicity_group D,race/ethnicity_group E,lunch_free/reduced,test preparation course_completed
0,65,62,61,1,0,0,0,0,1,0,0,0,1,1
1,61,59,54,0,0,0,0,0,0,0,1,0,0,0
2,57,50,50,0,0,0,0,1,0,0,0,0,0,0
3,63,64,56,0,0,0,0,0,0,1,0,0,1,0
4,57,87,82,0,1,1,1,0,1,0,0,0,1,1


In [6]:
stu_data = stu_data.drop(columns=['math score', 'reading score', 'writing score'])
stu_data.head()

Unnamed: 0,pass_math,pass_reading,pass_writing,gender_female,race/ethnicity_group A,race/ethnicity_group B,race/ethnicity_group C,race/ethnicity_group D,race/ethnicity_group E,lunch_free/reduced,test preparation course_completed
0,1,0,0,0,0,1,0,0,0,1,1
1,0,0,0,0,0,0,0,1,0,0,0
2,0,0,0,0,1,0,0,0,0,0,0
3,0,0,0,0,0,0,1,0,0,1,0
4,0,1,1,1,0,1,0,0,0,1,1


In [7]:
target = stu_data['pass_math']
target_names = ['no', 'yes']

In [8]:
data = stu_data.drop('pass_math', axis=1)
feature_names = data.columns
data.head()

Unnamed: 0,pass_reading,pass_writing,gender_female,race/ethnicity_group A,race/ethnicity_group B,race/ethnicity_group C,race/ethnicity_group D,race/ethnicity_group E,lunch_free/reduced,test preparation course_completed
0,0,0,0,0,1,0,0,0,1,1
1,0,0,0,0,0,0,1,0,0,0
2,0,0,0,1,0,0,0,0,0,0
3,0,0,0,0,0,1,0,0,1,0
4,1,1,1,0,1,0,0,0,1,1


In [9]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(data, target, random_state=42)

In [10]:
# Create and score a decision tree classifier
clf = tree.DecisionTreeClassifier()
clf = clf.fit(data, target)
r = export_text(clf)
clf.score(data, target)

0.83

In [11]:
sorted(zip(clf.feature_importances_, feature_names), reverse=True)

[(0.7692135448969066, 'pass_writing'),
 (0.10348973691756515, 'gender_female'),
 (0.07076668940413103, 'lunch_free/reduced'),
 (0.0333258799905831, 'race/ethnicity_group E'),
 (0.010310719473577378, 'race/ethnicity_group D'),
 (0.008555627999778631, 'test preparation course_completed'),
 (0.001991362309155244, 'race/ethnicity_group B'),
 (0.0015164973608640638, 'race/ethnicity_group A'),
 (0.0008299416474387407, 'race/ethnicity_group C'),
 (0.0, 'pass_reading')]

In [12]:
print(r)

|--- feature_1 <= 0.50
|   |--- feature_2 <= 0.50
|   |   |--- feature_8 <= 0.50
|   |   |   |--- feature_7 <= 0.50
|   |   |   |   |--- feature_6 <= 0.50
|   |   |   |   |   |--- feature_3 <= 0.50
|   |   |   |   |   |   |--- feature_9 <= 0.50
|   |   |   |   |   |   |   |--- feature_5 <= 0.50
|   |   |   |   |   |   |   |   |--- class: 0
|   |   |   |   |   |   |   |--- feature_5 >  0.50
|   |   |   |   |   |   |   |   |--- class: 0
|   |   |   |   |   |   |--- feature_9 >  0.50
|   |   |   |   |   |   |   |--- feature_4 <= 0.50
|   |   |   |   |   |   |   |   |--- class: 0
|   |   |   |   |   |   |   |--- feature_4 >  0.50
|   |   |   |   |   |   |   |   |--- class: 0
|   |   |   |   |   |--- feature_3 >  0.50
|   |   |   |   |   |   |--- feature_9 <= 0.50
|   |   |   |   |   |   |   |--- class: 0
|   |   |   |   |   |   |--- feature_9 >  0.50
|   |   |   |   |   |   |   |--- class: 0
|   |   |   |   |--- feature_6 >  0.50
|   |   |   |   |   |--- feature_9 <= 0.50
|   |   |   |   |