In [1]:
#standard imports
import pandas as pd
import numpy as np
import matplotlib as plt
%matplotlib inline

## Loading Dataset

In [2]:
from sklearn.datasets import load_wine
wine = load_wine()
wine

{'data': array([[1.423e+01, 1.710e+00, 2.430e+00, ..., 1.040e+00, 3.920e+00,
         1.065e+03],
        [1.320e+01, 1.780e+00, 2.140e+00, ..., 1.050e+00, 3.400e+00,
         1.050e+03],
        [1.316e+01, 2.360e+00, 2.670e+00, ..., 1.030e+00, 3.170e+00,
         1.185e+03],
        ...,
        [1.327e+01, 4.280e+00, 2.260e+00, ..., 5.900e-01, 1.560e+00,
         8.350e+02],
        [1.317e+01, 2.590e+00, 2.370e+00, ..., 6.000e-01, 1.620e+00,
         8.400e+02],
        [1.413e+01, 4.100e+00, 2.740e+00, ..., 6.100e-01, 1.600e+00,
         5.600e+02]], shape=(178, 13)),
 'target': array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

In [3]:
#creating data frame 
wine_df = pd.DataFrame(data=wine["data"], columns=wine["feature_names"])
wine_df["target"] = wine["target"]
wine_df

Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline,target
0,14.23,1.71,2.43,15.6,127.0,2.80,3.06,0.28,2.29,5.64,1.04,3.92,1065.0,0
1,13.20,1.78,2.14,11.2,100.0,2.65,2.76,0.26,1.28,4.38,1.05,3.40,1050.0,0
2,13.16,2.36,2.67,18.6,101.0,2.80,3.24,0.30,2.81,5.68,1.03,3.17,1185.0,0
3,14.37,1.95,2.50,16.8,113.0,3.85,3.49,0.24,2.18,7.80,0.86,3.45,1480.0,0
4,13.24,2.59,2.87,21.0,118.0,2.80,2.69,0.39,1.82,4.32,1.04,2.93,735.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
173,13.71,5.65,2.45,20.5,95.0,1.68,0.61,0.52,1.06,7.70,0.64,1.74,740.0,2
174,13.40,3.91,2.48,23.0,102.0,1.80,0.75,0.43,1.41,7.30,0.70,1.56,750.0,2
175,13.27,4.28,2.26,20.0,120.0,1.59,0.69,0.43,1.35,10.20,0.59,1.56,835.0,2
176,13.17,2.59,2.37,20.0,120.0,1.65,0.68,0.53,1.46,9.30,0.60,1.62,840.0,2


In [4]:
#checking for missing values
wine_df.isna().sum()

alcohol                         0
malic_acid                      0
ash                             0
alcalinity_of_ash               0
magnesium                       0
total_phenols                   0
flavanoids                      0
nonflavanoid_phenols            0
proanthocyanins                 0
color_intensity                 0
hue                             0
od280/od315_of_diluted_wines    0
proline                         0
target                          0
dtype: int64

## Scaling Data and Removing Outliers

In [6]:
#Creating features and labels
X = wine_df.drop("target", axis=1)
y = wine_df["target"]

#splitting into training and test sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [7]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

## Applying the Z-score to remove outliers

In [8]:
from scipy.stats import zscore
zscore = np.abs(zscore(X_train_scaled))
threshold = 3

#Detection rule
non_outlier = (zscore < threshold).all(axis=1)

X_train_no_outliers = X_train_scaled[non_outlier]
y_train_no_outliers = y_train[non_outlier]

## Applying Recursive Feature Elimination to select the most important features

In [9]:
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFE

model = LogisticRegression(max_iter = 1000)

rfe = RFE(model, n_features_to_select=5, verbose=2)
X_train_rfe = rfe.fit_transform(X_train_no_outliers, y_train_no_outliers)
X_test_rfe = rfe.transform(X_test_scaled)

feature_names = wine.feature_names
selected_features = np.array(feature_names)[rfe.support_]
print(selected_features)

Fitting estimator with 13 features.
Fitting estimator with 12 features.
Fitting estimator with 11 features.
Fitting estimator with 10 features.
Fitting estimator with 9 features.
Fitting estimator with 8 features.
Fitting estimator with 7 features.
Fitting estimator with 6 features.
['alcohol' 'flavanoids' 'color_intensity' 'hue' 'proline']


## Hyperparameter optimization using Grid Search CV

## Wrapping base model (logistic regression) in OnevsRestClassifier model to allow multiclass classification

- Allows for use of the `liblinear` solver

In [11]:
from sklearn.multiclass import OneVsRestClassifier
base_model = LogisticRegression(max_iter = 1000)
model = OneVsRestClassifier(base_model)

In [12]:
#checking model parameters
model.get_params()

{'estimator__C': 1.0,
 'estimator__class_weight': None,
 'estimator__dual': False,
 'estimator__fit_intercept': True,
 'estimator__intercept_scaling': 1,
 'estimator__l1_ratio': None,
 'estimator__max_iter': 1000,
 'estimator__multi_class': 'deprecated',
 'estimator__n_jobs': None,
 'estimator__penalty': 'l2',
 'estimator__random_state': None,
 'estimator__solver': 'lbfgs',
 'estimator__tol': 0.0001,
 'estimator__verbose': 0,
 'estimator__warm_start': False,
 'estimator': LogisticRegression(max_iter=1000),
 'n_jobs': None,
 'verbose': 0}

In [13]:
from sklearn.model_selection import GridSearchCV

grid = {"estimator__C" : [0.1, 1, 2, 10],
         "estimator__solver" : ["lbfgs", "liblinear"]}

gs_model = GridSearchCV(model, grid, cv=5, verbose=2)
gs_model.fit(X_train_rfe, y_train_no_outliers)

best_model = gs_model.best_estimator_
print(gs_model.best_params_)

Fitting 5 folds for each of 8 candidates, totalling 40 fits
[CV] END ..........estimator__C=0.1, estimator__solver=lbfgs; total time=   0.0s
[CV] END ..........estimator__C=0.1, estimator__solver=lbfgs; total time=   0.0s
[CV] END ..........estimator__C=0.1, estimator__solver=lbfgs; total time=   0.0s
[CV] END ..........estimator__C=0.1, estimator__solver=lbfgs; total time=   0.0s
[CV] END ..........estimator__C=0.1, estimator__solver=lbfgs; total time=   0.0s
[CV] END ......estimator__C=0.1, estimator__solver=liblinear; total time=   0.0s
[CV] END ......estimator__C=0.1, estimator__solver=liblinear; total time=   0.0s
[CV] END ......estimator__C=0.1, estimator__solver=liblinear; total time=   0.0s
[CV] END ......estimator__C=0.1, estimator__solver=liblinear; total time=   0.0s
[CV] END ......estimator__C=0.1, estimator__solver=liblinear; total time=   0.0s
[CV] END ............estimator__C=1, estimator__solver=lbfgs; total time=   0.0s
[CV] END ............estimator__C=1, estimator__s

## Making predictions 

In [14]:
y_preds = best_model.predict(X_test_rfe)

## Evaluating Model

In [15]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import cross_val_score

best_model_accuracy = accuracy_score(y_test, y_preds)
best_model_precision = precision_score(y_test, y_preds, average="weighted")
best_model_recall = recall_score(y_test, y_preds, average="weighted")
best_model_f1 = f1_score(y_test, y_preds, average="weighted")


print(f"Best Model Accuracy : {best_model_accuracy}")
print(f"Best Model Precision : {best_model_precision}") 
print(f"Best Model Recall : {best_model_recall}")
print(f"Best Model F1 score : {best_model_f1}")

bm_cv_scores = cross_val_score(best_model, X_train, y_train, cv=5, scoring="accuracy")
print(f"Best Model Cross validation accuracy scores : {bm_cv_scores}")

Best Model Accuracy : 0.9722222222222222
Best Model Precision : 0.9737654320987654
Best Model Recall : 0.9722222222222222
Best Model F1 score : 0.9721046443268665
Best Model Cross validation accuracy scores : [1.         0.93103448 0.96428571 0.89285714 0.92857143]
