Today's objective : classify wines into 3 categories with Logistic Regression and Decision Tree.

In [93]:
# Loading the data and check how many are there
import pandas as pd
from sklearn.datasets import load_wine

data = load_wine()
X = pd.DataFrame(data.data, columns = data.feature_names)
y = pd.Series(data.target)

print(X.shape)
print(y.value_counts())

(178, 13)
1    71
0    59
2    48
Name: count, dtype: int64


X and y has the same number of rows, nice.

In [95]:
# Take a look at the dataset
X.describe()

Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline
count,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0
mean,13.000618,2.336348,2.366517,19.494944,99.741573,2.295112,2.02927,0.361854,1.590899,5.05809,0.957449,2.611685,746.893258
std,0.811827,1.117146,0.274344,3.339564,14.282484,0.625851,0.998859,0.124453,0.572359,2.318286,0.228572,0.70999,314.907474
min,11.03,0.74,1.36,10.6,70.0,0.98,0.34,0.13,0.41,1.28,0.48,1.27,278.0
25%,12.3625,1.6025,2.21,17.2,88.0,1.7425,1.205,0.27,1.25,3.22,0.7825,1.9375,500.5
50%,13.05,1.865,2.36,19.5,98.0,2.355,2.135,0.34,1.555,4.69,0.965,2.78,673.5
75%,13.6775,3.0825,2.5575,21.5,107.0,2.8,2.875,0.4375,1.95,6.2,1.12,3.17,985.0
max,14.83,5.8,3.23,30.0,162.0,3.88,5.08,0.66,3.58,13.0,1.71,4.0,1680.0


X has no missing values, nice.
But the value of each features varies a lot, need to standardize (for Logistic Regression).

In [97]:
# Making pipelines (Logistic Regression and Decision Tree)
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

logreg_pipeline = Pipeline([('scaler', StandardScaler()), ('logreg', LogisticRegression(solver = 'liblinear', random_state = 42))])

tree_pipeline = Pipeline([('tree', DecisionTreeClassifier(random_state = 42))])

Now we train and test.

In [99]:
# Making training and testing sets
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

In [107]:
# Training the models
from sklearn.model_selection import GridSearchCV

logreg_parm_grid = {'logreg__C':[0.01, 0.1, 1, 10, 100]}
logreg_grid = GridSearchCV(logreg_pipeline, logreg_parm_grid, cv = 5)
logreg_grid.fit(X_train, y_train)

tree_parm_grid = {'tree__max_depth': [3, 5, 10, None], 'tree__min_samples_leaf': [1, 3, 5]}
tree_grid = GridSearchCV(tree_pipeline, tree_parm_grid, cv = 5)
tree_grid.fit(X_train, y_train)

In [128]:
# Checking which one is the best and how best is it for each model
logreg_cv_results = pd.DataFrame(logreg_grid.cv_results_)
logreg_cv_results[['param_logreg__C', 'mean_test_score', 'std_test_score', 'rank_test_score']]

Unnamed: 0,param_logreg__C,mean_test_score,std_test_score,rank_test_score
0,0.01,0.957635,0.026864,5
1,0.1,0.978818,0.017301,3
2,1.0,0.985961,0.017199,1
3,10.0,0.979064,0.0171,2
4,100.0,0.965025,0.021817,4


In [117]:
tree_cv_results = pd.DataFrame(tree_grid.cv_results_)
tree_cv_results[['param_tree__max_depth', 'param_tree__min_samples_leaf', 'mean_test_score', 'std_test_score', 'rank_test_score']]

Unnamed: 0,param_tree__max_depth,param_tree__min_samples_leaf,mean_test_score,std_test_score,rank_test_score
0,3.0,1,0.922414,0.014819,1
1,3.0,3,0.908374,0.017566,8
2,3.0,5,0.900985,0.042337,9
3,5.0,1,0.915271,0.018323,5
4,5.0,3,0.915517,0.017057,2
5,5.0,5,0.900985,0.042337,9
6,10.0,1,0.915271,0.018323,5
7,10.0,3,0.915517,0.017057,2
8,10.0,5,0.900985,0.042337,9
9,,1,0.915271,0.018323,5


Since the best params for Logistic Regression(0.985961) outperform the best of Decision Tree(0.922414), I decide to use Logisitic Regression for the final test set.

In [134]:
# Predicting and evaluating the accuracy
best_logreg = logreg_grid.best_estimator_
test_score = best_logreg.score(X_test, y_test)

print("Test set accuracy:", test_score)

Test set accuracy: 1.0


The accuracy is 1.0, very nice.