In [152]:
# Libraries
'''importing libraries'''
from ucimlrepo import fetch_ucirepo 
import wget
import pandas as pd
from sklearn import preprocessing
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, log_loss

In [153]:
# Dataset
'''importing dataset'''
# pip install ucimlrepo

# fetch dataset 
wine = fetch_ucirepo(id=109) 
  
# data (as pandas dataframes) 
X = wine.data.features 
y = wine.data.targets 
  
# metadata 
print(wine.metadata) 
  
# variable information 
print(wine.variables) 


{'uci_id': 109, 'name': 'Wine', 'repository_url': 'https://archive.ics.uci.edu/dataset/109/wine', 'data_url': 'https://archive.ics.uci.edu/static/public/109/data.csv', 'abstract': 'Using chemical analysis to determine the origin of wines', 'area': 'Physics and Chemistry', 'tasks': ['Classification'], 'characteristics': ['Tabular'], 'num_instances': 178, 'num_features': 13, 'feature_types': ['Integer', 'Real'], 'demographics': [], 'target_col': ['class'], 'index_col': None, 'has_missing_values': 'no', 'missing_values_symbol': None, 'year_of_dataset_creation': 1992, 'last_updated': 'Mon Aug 28 2023', 'dataset_doi': '10.24432/C5PC7J', 'creators': ['Stefan Aeberhard', 'M. Forina'], 'intro_paper': {'title': 'Comparative analysis of statistical pattern recognition methods in high dimensional settings', 'authors': 'S. Aeberhard, D. Coomans, O. Vel', 'published_in': 'Pattern Recognition', 'year': 1994, 'url': 'https://www.semanticscholar.org/paper/83dc3e4030d7b9fbdbb4bde03ce12ab70ca10528', 'do

In [9]:
# Dataset
'''dowloading dataset'''
# pip install wget
url = ('https://archive.ics.uci.edu/static/public/109/data.csv')
wget.download(url, 'wineData.csv')

'wineData.csv'

In [10]:
# Dataset 
'''loading dataset'''
df = pd.read_csv('wineData.csv')

In [20]:
df.head()

Unnamed: 0,Alcohol,Malicacid,Ash,Alcalinity_of_ash,Magnesium,Total_phenols,Flavanoids,Nonflavanoid_phenols,Proanthocyanins,Color_intensity,Hue,0D280_0D315_of_diluted_wines,Proline,class
0,14.23,1.71,2.43,15.6,127,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065,1
1,13.2,1.78,2.14,11.2,100,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050,1
2,13.16,2.36,2.67,18.6,101,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185,1
3,14.37,1.95,2.5,16.8,113,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480,1
4,13.24,2.59,2.87,21.0,118,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735,1


In [154]:
X.head()

Unnamed: 0,Alcohol,Malicacid,Ash,Alcalinity_of_ash,Magnesium,Total_phenols,Flavanoids,Nonflavanoid_phenols,Proanthocyanins,Color_intensity,Hue,0D280_0D315_of_diluted_wines,Proline
0,14.23,1.71,2.43,15.6,127,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065
1,13.2,1.78,2.14,11.2,100,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050
2,13.16,2.36,2.67,18.6,101,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185
3,14.37,1.95,2.5,16.8,113,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480
4,13.24,2.59,2.87,21.0,118,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735


In [155]:
y.head()

Unnamed: 0,class
0,1
1,1
2,1
3,1
4,1


In [23]:
df.shape

(178, 14)

In [111]:
X.shape

(178, 13)

In [112]:
y.shape

(178, 1)

In [194]:
# Data pre-processing
''' data normalization '''
X = preprocessing.StandardScaler().fit_transform(X)
y = df['class'].astype('int')

In [114]:
X[0:5]

array([[ 1.51861254, -0.5622498 ,  0.23205254, -1.16959318,  1.91390522,
         0.80899739,  1.03481896, -0.65956311,  1.22488398,  0.25171685,
         0.36217728,  1.84791957,  1.01300893],
       [ 0.24628963, -0.49941338, -0.82799632, -2.49084714,  0.01814502,
         0.56864766,  0.73362894, -0.82071924, -0.54472099, -0.29332133,
         0.40605066,  1.1134493 ,  0.96524152],
       [ 0.19687903,  0.02123125,  1.10933436, -0.2687382 ,  0.08835836,
         0.80899739,  1.21553297, -0.49840699,  2.13596773,  0.26901965,
         0.31830389,  0.78858745,  1.39514818],
       [ 1.69154964, -0.34681064,  0.4879264 , -0.80925118,  0.93091845,
         2.49144552,  1.46652465, -0.98187536,  1.03215473,  1.18606801,
        -0.42754369,  1.18407144,  2.33457383],
       [ 0.29570023,  0.22769377,  1.84040254,  0.45194578,  1.28198515,
         0.80899739,  0.66335127,  0.22679555,  0.40140444, -0.31927553,
         0.36217728,  0.44960118, -0.03787401]])

In [196]:
# Train/Test dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(142, 13) (36, 13) (142,) (36,)


In [173]:
# Hyperparameter
param_grid = {'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000], 
              'solver': ['lbfgs'],
              'multi_class': ['multinomial']}

In [174]:
LR = LogisticRegression()
grid_search = GridSearchCV(estimator=LR, param_grid=param_grid, scoring='accuracy', cv=5, n_jobs=-1, verbose=2)

In [197]:
grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 7 candidates, totalling 35 fits


In [176]:
print(grid_search.best_params_)

{'C': 0.1, 'multi_class': 'multinomial', 'solver': 'lbfgs'}


In [198]:
# Modelling
'''logistic regression'''
LR = LogisticRegression(C=0.1, solver='lbfgs', multi_class='multinomial').fit(X_train, y_train)
LR

In [199]:
'''prediction'''
yhat = LR.predict(X_test)
yhat

array([1, 1, 3, 1, 2, 1, 2, 3, 2, 3, 1, 3, 1, 2, 1, 2, 2, 2, 1, 2, 1, 2,
       2, 3, 3, 3, 2, 2, 2, 1, 1, 2, 3, 1, 1, 1])

In [200]:
'''probabilities'''
yhat_prob = LR.predict_proba(X_test)
yhat_prob

array([[0.94530805, 0.03773528, 0.01695666],
       [0.92518421, 0.02379679, 0.051019  ],
       [0.04098037, 0.15120213, 0.80781751],
       [0.92548887, 0.06205686, 0.01245427],
       [0.03848017, 0.94841159, 0.01310824],
       [0.96814836, 0.01309447, 0.01875717],
       [0.0222454 , 0.96056492, 0.01718969],
       [0.02441979, 0.02825367, 0.94732654],
       [0.12751808, 0.86604402, 0.00643789],
       [0.07066795, 0.09894603, 0.83038602],
       [0.71389411, 0.27172904, 0.01437685],
       [0.01615874, 0.01625307, 0.96758818],
       [0.70434111, 0.21721983, 0.07843905],
       [0.0357522 , 0.54890829, 0.41533951],
       [0.97524814, 0.0131    , 0.01165186],
       [0.02746776, 0.94924456, 0.02328768],
       [0.01109786, 0.97570951, 0.01319263],
       [0.01492887, 0.97383776, 0.01123337],
       [0.95920882, 0.03317546, 0.00761572],
       [0.02587983, 0.95385284, 0.02026733],
       [0.99654988, 0.00156096, 0.00188915],
       [0.20192639, 0.79359047, 0.00448314],
       [0.

In [201]:
# Evaluation
print (classification_report(y_test, yhat))

              precision    recall  f1-score   support

           1       1.00      1.00      1.00        14
           2       1.00      1.00      1.00        14
           3       1.00      1.00      1.00         8

    accuracy                           1.00        36
   macro avg       1.00      1.00      1.00        36
weighted avg       1.00      1.00      1.00        36



In [202]:
confusion_matrix(y_test, yhat, labels=[1, 2, 3])

array([[14,  0,  0],
       [ 0, 14,  0],
       [ 0,  0,  8]], dtype=int64)

In [203]:
log_loss(y_test, yhat_prob)

0.10847451737605941