# Working with the Wine Dataset

## Import Libraries

In [82]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from sklearn import metrics
from sklearn.datasets import load_wine
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler


## Load Data

In [83]:
# Load the wine dataset bundled with scikit-learn and unpack the feature
# matrix (X) and the class labels (y). `data` is kept around because later
# cells read the feature names from it.
data = load_wine()
X, y = data.data, data.target

In [84]:
# Wrap the feature matrix in a DataFrame so each column carries the
# feature name supplied by the dataset, then preview the first rows.
df = pd.DataFrame(X, columns=data.feature_names)

df.head()

Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline
0,14.23,1.71,2.43,15.6,127.0,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065.0
1,13.2,1.78,2.14,11.2,100.0,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050.0
2,13.16,2.36,2.67,18.6,101.0,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185.0
3,14.37,1.95,2.5,16.8,113.0,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480.0
4,13.24,2.59,2.87,21.0,118.0,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735.0


In [85]:
# Number of rows and columns in the full feature table
df.shape

(178, 13)

In [86]:
# Per-column dtype and non-null count, to check for missing values
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 178 entries, 0 to 177
Data columns (total 13 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   alcohol                       178 non-null    float64
 1   malic_acid                    178 non-null    float64
 2   ash                           178 non-null    float64
 3   alcalinity_of_ash             178 non-null    float64
 4   magnesium                     178 non-null    float64
 5   total_phenols                 178 non-null    float64
 6   flavanoids                    178 non-null    float64
 7   nonflavanoid_phenols          178 non-null    float64
 8   proanthocyanins               178 non-null    float64
 9   color_intensity               178 non-null    float64
 10  hue                           178 non-null    float64
 11  od280/od315_of_diluted_wines  178 non-null    float64
 12  proline                       178 non-null    float64
dtypes: float64(13)

## Get Data - Select only the First 5 Features


In [87]:
# Keep only the first five feature columns of the raw matrix.
XX = X[:, 0:5]

# Label the subset with the dataset's own feature names instead of a
# hand-typed list, so the labels cannot drift from the sliced columns.
dff = pd.DataFrame(XX, columns=data.feature_names[:XX.shape[1]])

In [88]:
# Preview the five-feature subset
dff.head()

Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium
0,14.23,1.71,2.43,15.6,127.0
1,13.2,1.78,2.14,11.2,100.0
2,13.16,2.36,2.67,18.6,101.0
3,14.37,1.95,2.5,16.8,113.0
4,13.24,2.59,2.87,21.0,118.0


## Split Data

In [89]:
# Fixed seed so the split (and every score below) is reproducible.
RANDOM_STATE = 101

# Hold out 30% of the five-feature table for testing; the rest is used
# for training.
X_train, X_test, y_train, y_test = train_test_split(
    dff,
    y,
    test_size=0.30,
    random_state=RANDOM_STATE,
)

## Prediction with Gaussian Naive Bayes

In [90]:
# Baseline: Gaussian Naive Bayes trained directly on the raw features
# (no pipeline, no scaling).
unscaled_clf = GaussianNB().fit(X_train, y_train)
pred_test = unscaled_clf.predict(X_test)

# Same classifier, but with a StandardScaler step in front of it.
std_clf = make_pipeline(StandardScaler(), GaussianNB()).fit(X_train, y_train)
pred_test_std = std_clf.predict(X_test)

# Report test-set accuracy for the raw and the standardized variant.
# NOTE(review): the recorded run shows the same accuracy for both.
print('\nPrediction accuracy for the normal test dataset with GNB')
print('{:.2%}\n'.format(metrics.accuracy_score(y_test, pred_test)))

print('\nPrediction accuracy for the standardized test dataset with GNB')
print('{:.2%}\n'.format(metrics.accuracy_score(y_test, pred_test_std)))


Prediction accuracy for the normal test dataset with GNB
90.74%


Prediction accuracy for the standardized test dataset with GNB
90.74%



## Prediction with Logistic Regression

In [91]:
# Baseline: logistic regression fitted on the raw (unscaled) features.
# max_iter is set explicitly, presumably so the solver has enough
# iterations to converge on unscaled inputs.
unscaled_clf = LogisticRegression(max_iter=2000)
unscaled_clf.fit(X_train, y_train)
pred_test = unscaled_clf.predict(X_test)

# Same estimator settings behind a StandardScaler step, so that scaling is
# the only difference between the two models being compared.
std_clf = make_pipeline(StandardScaler(), LogisticRegression(max_iter=2000))
std_clf.fit(X_train, y_train)
pred_test_std = std_clf.predict(X_test)

# Show prediction accuracies in scaled and unscaled data.
print('\nPrediction accuracy for the normal test dataset with LR')
print('{:.2%}\n'.format(metrics.accuracy_score(y_test, pred_test)))

print('\nPrediction accuracy for the standardized test dataset with LR')
print('{:.2%}\n'.format(metrics.accuracy_score(y_test, pred_test_std)))


Prediction accuracy for the normal test dataset with LR
88.89%


Prediction accuracy for the standardized test dataset with LR
92.59%



## Feature Selection with Recursive Feature Elimination Algorithm

Select the three most important features (the ones RFE assigns rank 1) out of the five.

In [92]:
# Recursive feature elimination: drop one feature per round (step=1) until
# three remain, using logistic regression as the ranking estimator.
# NOTE(review): the estimator is fitted on unscaled features; if RFE ranks
# by coefficient magnitude, the ranking may partly reflect feature scale.
# Consider standardizing first -- confirm before relying on the selection.
estimator = LogisticRegression(max_iter=2000)
selector = RFE(estimator, n_features_to_select=3, step=1).fit(X_train, y_train)

print(selector.support_)   # boolean mask over the five columns: True = kept
print(selector.ranking_)   # rank per column; kept features have rank 1

[ True  True  True False False]
[1 1 1 2 3]


## Build a KNN model with k=5 with the top 3 features

In [93]:
# Train a 5-nearest-neighbours classifier on the columns RFE kept.
# The columns come from the selector's mask rather than a hand-copied
# list; per the recorded RFE output these are alcohol, malic_acid and ash.
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train.loc[:, selector.support_], y_train)


KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                     weights='uniform')

In [94]:
# Predict on the held-out rows using the RFE-selected columns
# (alcohol, malic_acid, ash per the recorded RFE output).
pred = knn.predict(X_test.loc[:, selector.support_])

# accuracy_score expects (y_true, y_pred); same order as the earlier cells.
acc = metrics.accuracy_score(y_test, pred)
print('{:.2%}\n'.format(acc))

83.33%



## Build a KNN model with k=3 using the rank 2 and rank 3 features out of the five

In [95]:
# Train a 3-nearest-neighbours classifier on the columns RFE eliminated
# (rank greater than 1). Per the recorded RFE output these are
# alcalinity_of_ash and magnesium.
knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(X_train.loc[:, selector.ranking_ > 1], y_train)



KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=3, p=2,
                     weights='uniform')

In [96]:
# Predict on the held-out rows using the columns RFE eliminated
# (alcalinity_of_ash, magnesium per the recorded RFE output).
pred = knn.predict(X_test.loc[:, selector.ranking_ > 1])

# accuracy_score expects (y_true, y_pred); same order as the earlier cells.
acc = metrics.accuracy_score(y_test, pred)
print('{:.2%}\n'.format(acc))

66.67%

