In [9]:
# baseline model performance on the wine dataset
import numpy as np
import pandas as pd

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression

from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import QuantileTransformer
from sklearn.preprocessing import KBinsDiscretizer

from sklearn.decomposition import PCA
from sklearn.decomposition import TruncatedSVD

from sklearn.pipeline import Pipeline
from sklearn.pipeline import FeatureUnion

from sklearn.feature_selection import RFE

In [2]:
# Remote CSV for the wine dataset.
url = 'https://raw.githubusercontent.com/jbrownlee/Datasets/master/wine.csv'
# header=None: the first line of the file is read as data, not as column
# names, so the resulting frame gets integer column labels.
df = pd.read_csv(filepath_or_buffer=url, header=None)

In [3]:
# Preview the first five rows to sanity-check the load.
df.head(n=5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
0,14.23,1.71,2.43,15.6,127,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065,1
1,13.2,1.78,2.14,11.2,100,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050,1
2,13.16,2.36,2.67,18.6,101,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185,1
3,14.37,1.95,2.5,16.8,113,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480,1
4,13.24,2.59,2.87,21.0,118,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735,1


In [5]:
# Baseline: logistic regression on the untransformed columns.
# Every column except the last is a feature; the last column is the label.
data = df.values
X = data[:, :-1]
y = data[:, -1]
print(X.shape, y.shape)

# Features as floats; labels are passed through their string form and
# encoded to integer class ids.
X = X.astype(float)
y = LabelEncoder().fit_transform(y.astype(str))

model = LogisticRegression(solver='liblinear')

# 10-fold stratified CV repeated 3 times; the fixed seed keeps the splits
# identical from run to run.
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=3)
scores = cross_val_score(model, X, y, scoring='accuracy', cv=cv, n_jobs=1)

print(f"Accuracy {np.mean(scores)}  Standard Deviation: {np.std(scores)}")

(178, 13) (178,)
Accuracy 0.952723311546841  Standard Deviation: 0.06336686606145052


In [11]:
# Feature-engineering pipeline:
#   1. FeatureUnion runs every transform below on the same input and
#      concatenates their outputs column-wise.
#   2. RFE keeps the 15 columns ranked most useful by a logistic regression.
#   3. The final classifier (`model`, the LogisticRegression created in the
#      baseline cell) is fitted on the selected columns.
transforms = [
    ('mms', MinMaxScaler()),
    ('ss', StandardScaler()),
    ('rs', RobustScaler()),
    ('qt', QuantileTransformer(n_quantiles=100, output_distribution='normal')),
    ('kbd', KBinsDiscretizer(n_bins=10, encode='ordinal', strategy='uniform')),
    ('pca', PCA(n_components=7)),
    # TruncatedSVD uses a randomized solver by default; without a seed the
    # projected features (and therefore the reported accuracy) can vary from
    # run to run even though the CV splits below are seeded.
    ('svd', TruncatedSVD(n_components=7, random_state=3)),
]

fu = FeatureUnion(transforms)
rfe = RFE(estimator=LogisticRegression(solver='liblinear'), n_features_to_select=15)

steps = [
    ('fu', fu),
    ('rfe', rfe),
    ('m', model),
]

pipeline = Pipeline(steps=steps)

# Same evaluation protocol and seed as the baseline so the two accuracy
# figures are comparable. Because the transforms live inside the pipeline,
# cross_val_score refits them on the training part of every split.
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=3)
scores = cross_val_score(pipeline, X, y, scoring='accuracy', cv=cv, n_jobs=1)

print(f"Accuracy {np.mean(scores)}  Standard Deviation: {np.std(scores)}")

Accuracy 0.9886710239651415  Standard Deviation: 0.02266842392500143
