In [1]:
import numpy as np
import pandas as pd
from numpy import genfromtxt
import sklearn
from sklearn import preprocessing
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, LeaveOneOut, KFold
from sklearn.metrics import classification_report, mean_squared_error
from sklearn.metrics import confusion_matrix


In [10]:
wine=pd.read_csv('data/strongdrink.txt')
print (wine.shape)
wine.head()

(176, 14)


Unnamed: 0,cultivar,alco,malic,ash,alk,magn,tot_phen,flav,nonfl_phen,proanth,color_int,hue,OD280rat,proline
0,1,14.23,1.71,2.43,15.6,127,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065
1,1,13.2,1.78,2.14,11.2,100,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050
2,1,13.16,2.36,2.67,18.6,101,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185
3,1,14.37,1.95,2.5,16.8,113,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480
4,1,13.24,2.59,2.87,21.0,118,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735


In [11]:
#Split data
X = wine[['alco','malic','tot_phen','color_int']]
y = wine['cultivar']

In [12]:
X_train, X_test, y_train, y_test = \
    train_test_split(X, y, test_size = 0.25,
       random_state=20)

#Count number of each cultivar in test set

print('Number of each Cultivar in original data:' ,np.unique(y, return_counts=True))
print('Number of each Cultivar in test data:' ,np.unique(y_train, return_counts=True))
print('Number of each Cultivar in test data:' ,np.unique(y_test, return_counts=True))


Number of each Cultivar in original data: (array([1, 2, 3]), array([59, 71, 46]))
Number of each Cultivar in test data: (array([1, 2, 3]), array([46, 50, 36]))
Number of each Cultivar in test data: (array([1, 2, 3]), array([13, 21, 10]))


In [13]:
LogReg = LogisticRegression(random_state=0, multi_class='multinomial',solver='newton-cg')
LogReg.fit(X_train, y_train)
y_pred = LogReg.predict(X_test)

print('The coefficients for Cultivar 1 are:', LogReg.coef_[1])
print('The coefficients for Cultivar 2 are:', LogReg.coef_[2])

The coefficients for Cultivar 1 are: [-1.46798523 -0.33305092  0.66400603 -0.92270882]
The coefficients for Cultivar 2 are: [-0.2324475   0.59866064 -1.8879004   0.89996106]


In [14]:
from sklearn.metrics import confusion_matrix
confusion_matrix = confusion_matrix(y_test, y_pred)
print(confusion_matrix)

[[13  0  0]
 [ 2 19  0]
 [ 0  0 10]]


In [15]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           1       0.87      1.00      0.93        13
           2       1.00      0.90      0.95        21
           3       1.00      1.00      1.00        10

   micro avg       0.95      0.95      0.95        44
   macro avg       0.96      0.97      0.96        44
weighted avg       0.96      0.95      0.96        44



In [16]:
print('Cultivar 1 error rate = 0.13')
print('Cultivar 2 error rate = 0.0')
print('Cultivar 3 error rate = 0.0')

Cultivar 1 error rate = 0.13
Cultivar 2 error rate = 0.0
Cultivar 3 error rate = 0.0


In [17]:
#Because the 
mse_test = np.sum(y_test != y_pred)/(y_pred.shape[0])

print('MSE:', mse_test)

MSE: 0.045454545454545456


##### LOOCV

In [18]:
Xvals = X.values
yvals = y.values
N_loo = Xvals.shape[0]
loo = LeaveOneOut()
loo.get_n_splits(Xvals)

y_test_vec = np.zeros(N_loo)
y_pred_vec = np.zeros(N_loo)

for train_index, test_index in loo.split(Xvals):
    X_trainb, X_testb = Xvals[train_index], Xvals[test_index]
    y_trainb, y_testb = yvals[train_index], yvals[test_index]
    LogReg = LogisticRegression(random_state=0, multi_class='multinomial',solver='newton-cg')
    LogReg.fit(X_trainb, y_trainb)
    y_predb = LogReg.predict(X_testb)
    y_test_vec[test_index]=y_testb
    y_pred_vec[test_index]=y_predb


mse_test_loo = np.sum(y_test_vec != y_pred_vec)/(y_pred_vec.shape[0])

print('test estimate MSE loocv=', mse_test_loo)


test estimate MSE loocv= 0.07954545454545454


In [19]:
from sklearn.metrics import confusion_matrix
confusion_matrixloo = confusion_matrix(y_test_vec, y_pred_vec)
confusion_matrixloo

array([[55,  4,  0],
       [ 5, 64,  2],
       [ 1,  2, 43]])

In [20]:
print(classification_report(y_test_vec, y_pred_vec))

              precision    recall  f1-score   support

         1.0       0.90      0.93      0.92        59
         2.0       0.91      0.90      0.91        71
         3.0       0.96      0.93      0.95        46

   micro avg       0.92      0.92      0.92       176
   macro avg       0.92      0.92      0.92       176
weighted avg       0.92      0.92      0.92       176



##### K-fold CV

In [21]:
k = 4
kf = KFold(n_splits=k, random_state=10, shuffle=True)
kf.get_n_splits(Xvals)

y_test_veck = np.zeros(N_loo)
y_pred_veck = np.zeros(N_loo)

k_ind = int(0)

for train_index, test_index in kf.split(Xvals):
    X_traink, X_testk = Xvals[train_index], Xvals[test_index]
    y_traink, y_testk = yvals[train_index], yvals[test_index]
    LogReg = LogisticRegression(random_state=0, multi_class='multinomial',solver='newton-cg')
    LogReg.fit(X_traink, y_traink)
    y_predk = LogReg.predict(X_testk)
    y_test_veck[test_index] = y_testk
    y_pred_veck[test_index] = y_predk
    
    # print('MSE for test set', k_ind, ' is', MSE_vec_kf[k_ind])
    k_ind += 1

mse_test_k = np.sum(y_test_veck != y_pred_veck)/(y_pred_veck.shape[0])

print('test estimate MSE k-fold =', mse_test_k)

test estimate MSE k-fold = 0.09090909090909091


In [22]:
confusion_matrixk = confusion_matrix(y_test_veck, y_pred_veck)
confusion_matrixk

array([[55,  4,  0],
       [ 7, 62,  2],
       [ 1,  2, 43]])

In [23]:
print(classification_report(y_test_veck, y_pred_veck))

              precision    recall  f1-score   support

         1.0       0.87      0.93      0.90        59
         2.0       0.91      0.87      0.89        71
         3.0       0.96      0.93      0.95        46

   micro avg       0.91      0.91      0.91       176
   macro avg       0.91      0.91      0.91       176
weighted avg       0.91      0.91      0.91       176

