# Trabalho de Aprendizado de Máquina 1



Inicialmente, importamos as bibliotecas necessárias.

In [67]:
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.neural_network import MLPClassifier
import csv

In [68]:
ranking: list = [];

def add_to_rank(entry: map):
    ranking.append(entry)

    # Ordena o ranking com base na chave 'score'
    ranking.sort(key=lambda x: x['score'], reverse=True)

Carrega o dataset e divide-o em treinamento e teste.

In [69]:
file = open('datasets/student_data_por.csv')
csvreader = csv.reader(file, delimiter =';')
header = next(csvreader)
data = []

for row in csvreader:
  data.append(row)

X = []
Y = []

# Pega os dados crus e os divide em atributos e classes.
for i in range(len(data)):
    l: list = list(data[i])

    # Os 3 últimos atributos são as notas.
    X.append(l[:-3])

    # Nota final.
    score = l[-1]

    numericScore = int(score)

    # Obs.: As notas vão até 20.
    if(numericScore < 10):
      gradeRange = 0
    else:
      gradeRange = 1
      
    Y.append(gradeRange)

Usa um `OneHotEncoder` para transformar as variáveis categóricas em variáveis numéricas.

In [70]:
enc = OneHotEncoder(handle_unknown='ignore');
X = enc.fit_transform(X)

Faz a padronização dos dados.

:warning: Ainda que a padronização não provoque alteração alguma no Grid Search do Decision Tree, ela faz com que a acurácia do kNN seja igual à acurácia do Decision Tree.

In [71]:
scaler = StandardScaler(with_mean=False).fit(X)
X = scaler.transform(X)

Divide os dados em conjuntos de treinamento e teste.

In [72]:
x_tr, x_te, y_tr, y_te = train_test_split(X, Y, test_size = 0.2, random_state=42, stratify=Y)

# Grid Search para Decision Tree

In [73]:
classifier = DecisionTreeClassifier()
param_grid = {'criterion': ['entropy', 'gini'],
              'max_depth': range(2,20,2),
              'min_samples_leaf': range(2,10,2),
              'min_impurity_decrease': np.linspace(0,0.5,10)}
gs = GridSearchCV(classifier, param_grid=param_grid)

gs.fit(x_tr, y_tr)
y_pred = gs.predict(x_te)

gs_dt_acc = accuracy_score(y_te, y_pred)
add_to_rank({'classifier': 'Decision Tree', 'score': gs_dt_acc})

print('Acurácia obtida: ', gs_dt_acc)


Acurácia obtida:  0.8153846153846154


# Faz um Grid Search para o kNN

In [74]:
classifier = KNeighborsClassifier()

classifier.fit(x_tr, y_tr)

y_pred = classifier.predict(x_te)
accuracy_score(y_te, y_pred)

# Grid Search with classifier
param_grid = {'weights': ['uniform', 'distance'],
              'n_neighbors': range(1,15,2),
              'metric': ['euclidean', 'manhattan']}
gs = GridSearchCV(classifier, param_grid=param_grid)

gs.fit(x_tr, y_tr)
y_pred = gs.predict(x_te)

gs_knn_acc = accuracy_score(y_te, y_pred)

print('Acurácia obtida: ', gs_knn_acc)
add_to_rank({'classifier': 'kNN', 'score': gs_knn_acc})

if gs_knn_acc > gs_dt_acc:
  print('Houve melhoria de ', gs_knn_acc - gs_dt_acc)
else:
    print('Houve piora de ', gs_dt_acc - gs_knn_acc)

Acurácia obtida:  0.7846153846153846
Houve piora de  0.03076923076923077


# Usando Random Forest

In [75]:
# Importa o random forest
from sklearn.ensemble import RandomForestClassifier

# Calcula o random forest
classifier = RandomForestClassifier()

# Usa o classifier para treinar
classifier.fit(x_tr, y_tr)

# Imprime a acurácia
y_pred = classifier.predict(x_te)

rf_acc = accuracy_score(y_te, y_pred)

# imprime os resultados
print('Acurácia obtida: ', rf_acc)
add_to_rank({'classifier': 'Random Forest', 'score': rf_acc})

Acurácia obtida:  0.8307692307692308


# Comparação dos métodos usados

In [76]:
print(ranking)

# Prints a ranking of classifiers
for i in range(len(ranking)):
    print(str(i+1) + ' - ' + ranking[i]['classifier'] + ': ' + str(ranking[i]['score']))

# Prints a ranking of classifiers showing difference to previous
for i in range(len(ranking)):
    if i == 0:
        print(str(i+1) + ' - ' + ranking[i]['classifier'] + ': ' + str(ranking[i]['score']))
    else:
        print(str(i+1) + ' - ' + ranking[i]['classifier'] + ': ' + str(ranking[i]['score'] - ranking[i-1]['score']))
    

[{'classifier': 'Random Forest', 'score': 0.8307692307692308}, {'classifier': 'Decision Tree', 'score': 0.8153846153846154}, {'classifier': 'kNN', 'score': 0.7846153846153846}]
1 - Random Forest: 0.8307692307692308
2 - Decision Tree: 0.8153846153846154
3 - kNN: 0.7846153846153846
1 - Random Forest: 0.8307692307692308
2 - Decision Tree: -0.015384615384615441
3 - kNN: -0.03076923076923077


# Tentando com o cross validation

In [77]:
# import cross validation
from sklearn.model_selection import cross_val_score

# create scores array
scores = cross_val_score(classifier, x_tr, y_tr, cv=2)

# create an array of all possible scoring methods
scoring_methods = ['accuracy', 'precision_macro', 'recall_macro', 'f1_macro', 'precision']

for scoring_method in scoring_methods:
    print("[START] ", scoring_method)
    for k in range(2, 15):
        scores = cross_val_score(classifier, x_tr, y_tr, cv=k, scoring=scoring_method)
        print("[", k, "]", "P: ", scores.mean(), " DP: ", scores.std())
    print("[END] ", scoring_method)

[START]  accuracy
[ 2 ] P:  0.8497178497178497  DP:  0.0035640035640035483
[ 3 ] P:  0.8612716763005781  DP:  0.0047196334157672
[ 4 ] P:  0.8516547406082289  DP:  0.005977973967767312
[ 5 ] P:  0.8670276325616131  DP:  0.014369439553251253
[ 6 ] P:  0.86327185244587  DP:  0.023971947977621495
[ 7 ] P:  0.8477477477477479  DP:  0.01896482754967755
[ 8 ] P:  0.8554387019230769  DP:  0.013824172871532257
[ 9 ] P:  0.8631780600927607  DP:  0.022431635924557394
[ 10 ] P:  0.872737556561086  DP:  0.02215722338807082
[ 11 ] P:  0.8632333978078659  DP:  0.025948746653470446
[ 12 ] P:  0.8613019732205779  DP:  0.02647235225176476
[ 13 ] P:  0.8612426035502959  DP:  0.027205264125433344
[ 14 ] P:  0.8709103840682789  DP:  0.02733113489630855
[END]  accuracy
[START]  precision_macro
[ 2 ] P:  0.7289308435924977  DP:  0.06698752375444106
[ 3 ] P:  0.7508322663531862  DP:  0.028512496927004884
[ 4 ] P:  0.7676237926006925  DP:  0.05160377559106306
[ 5 ] P:  0.7425596812574167  DP:  0.0507904310500

  _warn_prf(average, modifier, msg_start, len(result))


[ 10 ] P:  0.7565729559524657  DP:  0.1570939230603897
[ 11 ] P:  0.8129086071984063  DP:  0.1277470625729207


  _warn_prf(average, modifier, msg_start, len(result))


KeyboardInterrupt: 