In [29]:
# Bibliotecas de uso geral
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
from sklearn.model_selection import StratifiedKFold

In [30]:
# Load the UCI Wine dataset. The remote file has no header row, so column
# names are assigned explicitly below.
df_wine = pd.read_csv('https://archive.ics.uci.edu/'
                      'ml/machine-learning-databases/'
                      'wine/wine.data', header=None)
df_wine.columns = ['Class label', 'Alcohol',
                   'Malic acid', 'Ash',
                   'Alcalinity of ash', 'Magnesium',
                   'Total phenols', 'Flavanoids',
                   'Nonflavanoid phenols',
                   'Proanthocyanins',
                   'Color intensity', 'Hue',
                   'OD280/OD315 of diluted wines',
                   'Proline']

# Build (X, y): the first column is the target, the remaining columns are the features.
X, y = df_wine.iloc[:, 1:].values, df_wine.iloc[:, 0].values

# Hold out 15% of the samples as a test set, stratified by class label.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=0, stratify=y)

# Z-score normalization for the final model: statistics are fit on the
# training split only and then applied to the test split.
zscore = StandardScaler()
Xz_train = zscore.fit_transform(X_train)
Xz_test  = zscore.transform(X_test)

# Classifier under evaluation.
# NOTE(review): recent scikit-learn releases deprecate the `multi_class`
# argument -- confirm against the installed version before upgrading.
clf = LogisticRegression(solver='liblinear', multi_class='ovr')

# Stratified k-fold splitter. `random_state` is intentionally not passed:
# without shuffle=True it has no effect, and newer scikit-learn versions
# raise ValueError for that combination. The folds stay deterministic
# because the splitter does not shuffle.
k = 10
kfold = StratifiedKFold(n_splits=k).split(X_train, y_train)

# Per-fold accuracy scores (the E_i measurements).
scores = []

# Run the cross-validation. The scaler is re-fit inside every fold so the
# validation samples never contribute to the normalization statistics
# (avoids leaking validation data into preprocessing). The loop no longer
# rebinds `k`, so it keeps holding the number of folds.
for train, test in kfold:
    fold_scaler = StandardScaler()
    X_fold_train = fold_scaler.fit_transform(X_train[train])
    X_fold_valid = fold_scaler.transform(X_train[test])
    clf.fit(X_fold_train, y_train[train])
    score = clf.score(X_fold_valid, y_train[test])
    scores.append(score)

# CV estimate: mean and standard deviation of the fold accuracies, in percent.
print('\nAcurácia do CV: %.3f (média) +/- %.3f (desvio)' % (np.mean(scores)*100, np.std(scores)*100))

# Re-train on the full (scaled) training split now that CV has been computed.
clf.fit(Xz_train, y_train)

# Final evaluation; `score` already returns a scalar accuracy, so no averaging is needed.
print('\nAcurácia de treinamento: %.3f' % (clf.score(Xz_train, y_train)*100))
print('\nAcurácia de teste: %.3f' % (clf.score(Xz_test, y_test)*100))


Acurácia do CV: 98.667 (média) +/- 2.667 (desvio)

Acurácia de treinamento: 100.000

Acurácia de teste: 100.000
