In [1]:
import numpy as np
import pandas as pd

# https://archive.ics.uci.edu/ml/datasets/Student+Performance

In [2]:
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor, export_graphviz
from sklearn.naive_bayes import BernoulliNB, GaussianNB, MultinomialNB
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.linear_model import LinearRegression

In [3]:
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split, cross_val_score

In [4]:
dataset = pd.read_csv("input/student-por.csv")

In [5]:
dataset.head()

Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,...,famrel,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3
0,GP,F,18,U,GT3,A,4,4,at_home,teacher,...,4,3,4,1,1,3,4,0,11,11
1,GP,F,17,U,GT3,T,1,1,at_home,other,...,5,3,3,1,1,3,2,9,11,11
2,GP,F,15,U,LE3,T,1,1,at_home,other,...,4,3,2,2,3,3,6,12,13,12
3,GP,F,15,U,GT3,T,4,2,health,services,...,3,2,2,1,1,5,0,14,14,14
4,GP,F,16,U,GT3,T,3,3,other,other,...,4,3,2,1,2,5,0,11,13,13


In [6]:
meanG1 = dataset["G1"].mean()
meanG2 = dataset["G2"].mean()
meanG3 = dataset["G3"].mean()
meanG1G2G3 = round(sum([meanG1, meanG2, meanG3]))

print(
    "Mean of\nG1= {},\nG2= {},\nG3= {}\nRounded G1+G2+G3= {}".format(
        meanG1, meanG2, meanG3, meanG1G2G3
    )
)

Mean of
G1= 11.399075500770415,
G2= 11.570107858243452,
G3= 11.906009244992296
Rounded G1+G2+G3= 35


In [7]:
dataset["pass"] = dataset["G1"] + dataset["G2"] + dataset["G3"] >= meanG1G2G3
# dataset["pass"] = dataset["G1"] * 0.3 + dataset["G2"] * 0.3 + dataset["G3"] * 0.4 >= 10
# dataset["pass"] = dataset["G3"] > 10

In [8]:
dataset.drop(["G1", "G2", "G3"], axis=1, inplace=True)
# dataset[dataset["pass"]].sort_values(by=["G3", "G2", "G1"], ascending=False)
dataset.head()

Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,...,internet,romantic,famrel,freetime,goout,Dalc,Walc,health,absences,pass
0,GP,F,18,U,GT3,A,4,4,at_home,teacher,...,no,no,4,3,4,1,1,3,4,False
1,GP,F,17,U,GT3,T,1,1,at_home,other,...,yes,no,5,3,3,1,1,3,2,False
2,GP,F,15,U,LE3,T,1,1,at_home,other,...,yes,no,4,3,2,2,3,3,6,True
3,GP,F,15,U,GT3,T,4,2,health,services,...,yes,yes,3,2,2,1,1,5,0,True
4,GP,F,16,U,GT3,T,3,3,other,other,...,no,no,4,3,2,1,2,5,0,True


In [9]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 649 entries, 0 to 648
Data columns (total 31 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   school      649 non-null    object
 1   sex         649 non-null    object
 2   age         649 non-null    int64 
 3   address     649 non-null    object
 4   famsize     649 non-null    object
 5   Pstatus     649 non-null    object
 6   Medu        649 non-null    int64 
 7   Fedu        649 non-null    int64 
 8   Mjob        649 non-null    object
 9   Fjob        649 non-null    object
 10  reason      649 non-null    object
 11  guardian    649 non-null    object
 12  traveltime  649 non-null    int64 
 13  studytime   649 non-null    int64 
 14  failures    649 non-null    int64 
 15  schoolsup   649 non-null    object
 16  famsup      649 non-null    object
 17  paid        649 non-null    object
 18  activities  649 non-null    object
 19  nursery     649 non-null    object
 20  higher    

In [10]:
dummies_col = dataset.select_dtypes(exclude=["int64", "bool"]).columns
dummies_col
# reason traveltime guardian nursery romantic

Index(['school', 'sex', 'address', 'famsize', 'Pstatus', 'Mjob', 'Fjob',
       'reason', 'guardian', 'schoolsup', 'famsup', 'paid', 'activities',
       'nursery', 'higher', 'internet', 'romantic'],
      dtype='object')

In [11]:
dataset = pd.get_dummies(dataset, columns=dummies_col)
dataset.head()

Unnamed: 0,age,Medu,Fedu,traveltime,studytime,failures,famrel,freetime,goout,Dalc,...,activities_no,activities_yes,nursery_no,nursery_yes,higher_no,higher_yes,internet_no,internet_yes,romantic_no,romantic_yes
0,18,4,4,2,2,0,4,3,4,1,...,1,0,0,1,0,1,1,0,1,0
1,17,1,1,1,2,0,5,3,3,1,...,1,0,1,0,0,1,0,1,1,0
2,15,1,1,1,2,0,4,3,2,2,...,1,0,0,1,0,1,0,1,1,0
3,15,4,2,1,3,0,3,2,2,1,...,0,1,0,1,0,1,0,1,0,1
4,16,3,3,1,2,0,4,3,2,1,...,1,0,0,1,0,1,1,0,1,0


In [12]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 649 entries, 0 to 648
Data columns (total 57 columns):
 #   Column             Non-Null Count  Dtype
---  ------             --------------  -----
 0   age                649 non-null    int64
 1   Medu               649 non-null    int64
 2   Fedu               649 non-null    int64
 3   traveltime         649 non-null    int64
 4   studytime          649 non-null    int64
 5   failures           649 non-null    int64
 6   famrel             649 non-null    int64
 7   freetime           649 non-null    int64
 8   goout              649 non-null    int64
 9   Dalc               649 non-null    int64
 10  Walc               649 non-null    int64
 11  health             649 non-null    int64
 12  absences           649 non-null    int64
 13  pass               649 non-null    bool 
 14  school_GP          649 non-null    uint8
 15  school_MS          649 non-null    uint8
 16  sex_F              649 non-null    uint8
 17  sex_M           

In [13]:
dataset = dataset.sample(frac=1, random_state=13)
x = dataset.drop("pass", axis=1)
y = dataset["pass"]

In [14]:
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.3, random_state=13
)

In [19]:
def solve_algo(algo_name, algo):
#     algo.fit(x_train, y_train)
    #     y_pred = algo.predict(x_test)
    scores = cross_val_score(algo, x_train, y_train, cv=3)
    print(
        algo_name
        + " Accuracy: {} (+/- {}) ".format(
            round(scores.mean(), 2), round(scores.std() * 2, 2)
        )
    )

In [20]:
bnb = BernoulliNB()
gnb = GaussianNB()
mnb = MultinomialNB()

dtc = DecisionTreeClassifier(criterion="entropy")
knn = KNeighborsClassifier()
rfc = RandomForestClassifier()

dtr = DecisionTreeRegressor()
knr = KNeighborsRegressor()
rfr = RandomForestRegressor()

lr = LinearRegression()

In [21]:
solve_algo("BernoulliNB", bnb)
solve_algo("GaussianNB", gnb)
solve_algo("MultinomialNB", mnb)

solve_algo("DecisionTreeClassifier", dtc)
solve_algo("KNeighborsClassifier", knn)
solve_algo("RandomForestClassifier", rfc)

solve_algo("DecisionTreeRegressor", dtr)
solve_algo("KNeighborsRegressor", knr)
solve_algo("RandomForestRegressor", rfr)

solve_algo("LinearRegression", lr)

BernoulliNB Accuracy: 0.74 (+/- 0.09) 
GaussianNB Accuracy: 0.74 (+/- 0.08) 
MultinomialNB Accuracy: 0.75 (+/- 0.16) 
DecisionTreeClassifier Accuracy: 0.65 (+/- 0.09) 
KNeighborsClassifier Accuracy: 0.63 (+/- 0.11) 
RandomForestClassifier Accuracy: 0.73 (+/- 0.14) 
DecisionTreeRegressor Accuracy: -0.39 (+/- 0.37) 
KNeighborsRegressor Accuracy: 0.01 (+/- 0.29) 
RandomForestRegressor Accuracy: 0.25 (+/- 0.13) 
LinearRegression Accuracy: 0.22 (+/- 0.25) 


In [22]:
import graphviz

dot_data = export_graphviz(
    dtc,
    out_file=None,
    label="all",
    impurity=False,
    proportion=True,
    feature_names=list(x_train),
    class_names=["fail", "pass"],
    filled=True,
    rounded=True,
)
graph = graphviz.Source(dot_data)

NotFittedError: This DecisionTreeClassifier instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator.

In [None]:
graph.format = "png"
graph.view(filename="digraph", directory="./")