O objetivo deste trabalho é comparar diversos métodos de classificação para a base de dados de qualidade de vinhos disponível em https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv.

Vocês devem encontrar um bom modelo preditivo, variando:
* o número e conjunto de features (atributos) utilizados
* o método utilizado
* a configuração do algoritmo correspondente (e.g.: número k para nearest neighbors, profundidade para árvore de decisão)

Vocês devem listar algumas métricas de qualidade, tais como: precision, recall, accuracy e f1_score, e utilizar accuracy como base para a avaliação final, considerando a accuracy média de 10 iterações para cada configuração.

Para assegurar que eu obterei os mesmos resultados de vocês, vocês devem estabelecer a semente para a geração dos números aleatórios (utilizados para separar os conjuntos de treinamento e teste, por exemplo), utilizando os seguintes comandos no início do seu código (podem utilizar uma outra semente):
```
import random
random.seed(1001001)
```

In [71]:
import random
random.seed(1001001)
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, recall_score, precision_score, f1_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn import neighbors
%matplotlib inline

In [109]:
df_vinho_vermelho = pd.read_csv('winequality-red.csv', sep = ';')
qtd_media_acc = 10
# Data source description:
# https://archive.ics.uci.edu/ml/datasets/Wine+Quality
#url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv'
# comment the next line if Internet access is available
#url = 'in.data/winequality-red.csv'
#df_vinho_vermelho = pd.read_csv(url)
df_vinho_vermelho.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


In [73]:
print(df_vinho_vermelho.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1599 entries, 0 to 1598
Data columns (total 12 columns):
fixed acidity           1599 non-null float64
volatile acidity        1599 non-null float64
citric acid             1599 non-null float64
residual sugar          1599 non-null float64
chlorides               1599 non-null float64
free sulfur dioxide     1599 non-null float64
total sulfur dioxide    1599 non-null float64
density                 1599 non-null float64
pH                      1599 non-null float64
sulphates               1599 non-null float64
alcohol                 1599 non-null float64
quality                 1599 non-null int64
dtypes: float64(11), int64(1)
memory usage: 150.0 KB
None


In [74]:
#Now seperate the dataset as response variable and feature variabes
X = df_vinho_vermelho.drop('quality', axis = 1)
sc = StandardScaler()
X = sc.fit_transform(X)
Y = df_vinho_vermelho['quality']
outcome_labels = Y.unique()
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size = 0.25)
print(outcome_labels)

[5 6 7 4 8 3]


In [110]:
def extrai_elemetos_matrix_confusao(confm):
    TP = 0
    FP = 0
    TN = 0
    FN = 0
    total = 0
    for i in range(len(confm)): 
        for j in range(len(confm[i])):
            total = total + confm[i, j]
            if i == j:
                TP = TP + confm[i, j]
            elif j > i:
                FP = FP + confm[i, j]
            elif j < i:
                FN = FN + confm[i, j]
        

        print(confm[i])
    TN = total - TP + FP + FN
    return TP, FP, TN, FN
#c_report, confm = faz_aprendizado_rfc(X_train, X_test, y_train, y_test, 300)
extrai_elemetos_matrix_confusao(confm)

[0 0 1 0 0 0]
[ 0  0 12  5  0  0]
[  0   1 152  32   2   0]
[ 0  0 36 99 10  0]
[ 0  0  3 17 28  0]
[0 0 0 1 1 0]


(279, 62, 242, 59)

In [93]:
def faz_aprendizado_knc(X_train, X_test, y_train, y_test, n):
    #Create an instance of K-nearest neighbor classifier
    knn_model = neighbors.KNeighborsClassifier(n_neighbors=n)
    KNeighborsClassifier_teste = knn_model.fit(X_train, y_train)
    KNeighborsClassifier_resultado = knn_model.predict(X_test)
    return classification_report(y_test, KNeighborsClassifier_resultado), confusion_matrix(y_test, KNeighborsClassifier_resultado), knn_model.score(X_train, y_train), knn_model.score(X_test, y_test)
#c_report, confm, accuracy_train, accuracy_test = faz_aprendizado_knc(X_train, X_test, y_train, y_test, 2)

def roda_knc(qtd_media_acc):
    n = 0
    avg_accuracy_train = 0
    avg_accuracy_test = 0
    melhor_c_report = melhor_confm = melhor_accuracy_train = melhor_accuracy_test = 0
    for n_neighbors in range(2, 7): 
        for i in range(qtd_media_acc):
            c_report, confm, accuracy_train, accuracy_test = faz_aprendizado_knc(X_train, X_test, y_train, y_test, n_neighbors)
            avg_accuracy_train = avg_accuracy_train + accuracy_train
            avg_accuracy_test = avg_accuracy_train + accuracy_test
            if melhor_accuracy_test < accuracy_test:
                n = n_neighbors
                melhor_c_report = c_report
                melhor_confm = confm
                melhor_accuracy_train = accuracy_train
                melhor_accuracy_test = accuracy_test
        avg_accuracy_train = avg_accuracy_train/qtd_media_acc 
        avg_accuracy_test = avg_accuracy_test/qtd_media_acc
    print('Melhor classificacao de teste tem %s centroids'%n)
    return melhor_c_report, melhor_confm, melhor_accuracy_train, melhor_accuracy_test, avg_accuracy_train/qtd_media_acc, avg_accuracy_test/qtd_media_acc
c_report, confm, _,_, avg_accuracy_train, avg_accuracy_test = roda_knc(qtd_media_acc)
#print(c_report, '\n\n')


print('average accuracy in training data:', '{:6.4f}'.format(avg_accuracy_train))
print('average accuracy in test data:    ', '{:6.4f}'.format(avg_accuracy_test))
#onfmT = confm.T
dfConfusionMatrix = pd.DataFrame(confmT)
dfConfusionMatrix.columns = ['true ' + str(val) for val in outcome_labels]
dfConfusionMatrix.index   = ['pred ' + str(val) for val in outcome_labels]
dfConfusionMatrix

Melhor classificacao de teste tem 5 centroids
('average accuracy in training data:', '0.0775')
('average accuracy in test data:    ', '0.0834')


Unnamed: 0,true 5,true 6,true 7,true 4,true 8,true 3
pred 5,0,2,2,0,0,0
pred 6,1,1,8,8,1,0
pred 7,0,10,148,67,9,0
pred 4,0,4,24,65,24,1
pred 8,0,0,4,5,14,1
pred 3,0,0,1,0,0,0


In [114]:
def faz_aprendizado_rfc(X_train, X_test, y_train, y_test, qtd_estimators):
    RandomForestClassifier_teste = RandomForestClassifier(n_estimators = qtd_estimators)
    RandomForestClassifier_teste.fit(X_train, y_train)
    RandomForestClassifier_resultado = RandomForestClassifier_teste.predict(X_test)
    return classification_report(y_test, RandomForestClassifier_resultado) , confusion_matrix(y_test, RandomForestClassifier_resultado)
c_report, confm = faz_aprendizado_rfc(X_train, X_test, y_train, y_test, 300)
confmT = confm.T
dfConfusionMatrix = pd.DataFrame(confmT)
dfConfusionMatrix.columns = ['true ' + str(val) for val in outcome_labels]
dfConfusionMatrix.index   = ['pred ' + str(val) for val in outcome_labels]
dfConfusionMatrix

Unnamed: 0,true 5,true 6,true 7,true 4,true 8,true 3
pred 5,0,0,0,0,0,0
pred 6,0,0,1,0,0,0
pred 7,1,13,150,36,3,0
pred 4,0,4,34,96,16,2
pred 8,0,0,2,13,29,0
pred 3,0,0,0,0,0,0


In [115]:
TP, FP, TN, FN = extrai_elemetos_matrix_confusao(confm)
# precision aka positive predictive value (PPV)
# = what fraction of the cases that my model got are true positive?
precision = TP / (TP + FP)
print('precision   ', '{:7.4f}'.format(precision))
#print('precision   ', '{:7.4f}'.format(metrics.precision_score(Y_test, Yhat)), '(from sklearn.metrics)')

# recall aka sensitivity aka hit rate aka true positive rate (TPR) = TP / P
# = what fraction of the positive cases did my model get?
recall = TP / (TP + FN)
print('recall      ', '{:7.4f}'.format(recall))
#print('recall      ', '{:7.4f}'.format(metrics.recall_score(Y_test, Yhat)), '(from sklearn.metrics)')

accuracy = (TP + TN) / (TP + TN + FP + FN)
print('accuracy    ', '{:7.4f}'.format(accuracy))
#print('accuracy    ', '{:7.4f}'.format(metrics.accuracy_score(Y_test, Yhat)), '(from sklearn.metrics)')

F1_score = 2 * precision * recall / (precision + recall)
print('F1_score    ', '{:7.4f}'.format(F1_score))
#print('F1_score    ', '{:7.4f}'.format(metrics.f1_score(Y_test, Yhat)), '(from sklearn.metrics)')

# specificity aka true negative rate (TNR) = TN / N
specificity = TN / (TN + FP)
print('specificity ', '{:7.4f}'.format(specificity))

[0 0 1 0 0 0]
[ 0  0 13  4  0  0]
[  0   1 150  34   2   0]
[ 0  0 36 96 13  0]
[ 0  0  3 16 29  0]
[0 0 0 2 0 0]
('precision   ', ' 0.0000')
('recall      ', ' 0.0000')
('accuracy    ', ' 0.0000')
('F1_score    ', ' 0.0000')
('specificity ', ' 0.0000')




In [65]:
def faz_aprendizado_svm(X_train, X_test, y_train, y_test):
    svc = SVC()
    svc.fit(X_train, y_train)
    SupportVectorMachines_resultado = svc.predict(X_test)
    return classification_report(y_test, SupportVectorMachines_resultado), confusion_matrix(y_test, SupportVectorMachines_resultado)
faz_aprendizado_svm(X_train, X_test, y_train, y_test)


(u'             precision    recall  f1-score   support\n\n          3       0.00      0.00      0.00         3\n          4       0.00      0.00      0.00        16\n          5       0.61      0.79      0.69       155\n          6       0.58      0.61      0.60       166\n          7       0.70      0.30      0.42        53\n          8       0.00      0.00      0.00         7\n\navg / total       0.57      0.60      0.57       400\n',
 array([[  0,   0,   3,   0,   0,   0],
        [  0,   0,  14,   2,   0,   0],
        [  0,   0, 123,  32,   0,   0],
        [  0,   0,  59, 102,   5,   0],
        [  0,   0,   3,  34,  16,   0],
        [  0,   0,   0,   5,   2,   0]]))

In [50]:
def faz_aprendizado_lr(X_train, X_test, y_train, y_test):
    LogisticRegression_teste = LogisticRegression()
    LogisticRegression_teste.fit(X_train, y_train)
    LogisticRegression_resultado = LogisticRegression_teste.predict(X_test)
    return accuracy_score(y_test, LogisticRegression_resultado)
faz_aprendizado_lr(X_train, X_test, y_train, y_test)

0.5975

In [51]:
def faz_aprendizado_dt(X_train, X_test, y_train, y_test):
    DecisionTreeClassifier_teste = DecisionTreeClassifier()
    DecisionTreeClassifier_teste.fit(X_train,y_train)
    DecisionTreeClassifier_resultado = DecisionTreeClassifier_teste.predict(X_train)
    return confusion_matrix(y_test, DecisionTreeClassifier_resultado), accuracy_score(y_test, DecisionTreeClassifier_resultado)
#faz_aprendizado_dt(X_train, X_test, y_train, y_test)

In [52]:
def faz_aprendizado_nb(X_train, X_test, y_train, y_test):
    nb = GaussianNB()
    nb.fit(X_train,y_train)
    GaussianNB_resultado = nb.predict(X_test)
    return confusion_matrix(y_test, GaussianNB_resultado), accuracy_score(y_test, GaussianNB_resultado)
faz_aprendizado_nb(X_train, X_test, y_train, y_test)

(array([[  0,   2,   2,   1,   0,   0],
        [  0,   1,   8,   2,   1,   0],
        [  1,  10, 111,  37,   8,   0],
        [  0,   7,  37,  80,  30,   1],
        [  0,   0,   5,  18,  29,   1],
        [  0,   0,   0,   2,   6,   0]]), 0.5525)