In [45]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import PolynomialFeatures
import warnings
warnings.filterwarnings("ignore")
np.random.seed(42)

In [46]:
df = pd.read_csv("http://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-white.csv", sep = ';')
y = df.pop('quality')

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4898 entries, 0 to 4897
Data columns (total 11 columns):
fixed acidity           4898 non-null float64
volatile acidity        4898 non-null float64
citric acid             4898 non-null float64
residual sugar          4898 non-null float64
chlorides               4898 non-null float64
free sulfur dioxide     4898 non-null float64
total sulfur dioxide    4898 non-null float64
density                 4898 non-null float64
pH                      4898 non-null float64
sulphates               4898 non-null float64
alcohol                 4898 non-null float64
dtypes: float64(11)
memory usage: 421.0 KB


In [3]:
df.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
0,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8
1,6.3,0.3,0.34,1.6,0.049,14.0,132.0,0.994,3.3,0.49,9.5
2,8.1,0.28,0.4,6.9,0.05,30.0,97.0,0.9951,3.26,0.44,10.1
3,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9
4,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9


In [6]:
y.value_counts()

6    2198
5    1457
7     880
8     175
4     163
3      20
9       5
Name: quality, dtype: int64

In [47]:
for i in df.columns:
    df[i] = df[i].fillna(np.mean(df[i]))
train, test, y_train, y_test = train_test_split(df, y, test_size = 0.2, random_state=101 )

In [43]:
lr = LogisticRegression()
lr.fit(train, y_train)
y_pred = lr.predict(test)
print('Accuracy score baseline:', accuracy_score(y_test, y_pred))

Accuracy score baseline: 0.5428571428571428


In [10]:
def fit_predict(train, test, y_train, y_test, scaler, max_depth, 
                criterion = 'entropy', max_features = 1, min_samples_split = 4):
    train_scaled = scaler.fit_transform(train)
    test_scaled = scaler.transform(test)        
    dt = DecisionTreeClassifier(criterion = criterion, max_depth=max_depth, 
                                random_state=42, max_features=max_features,
                               min_samples_split=min_samples_split)
    dt.fit(train_scaled, y_train)
    y_pred = dt.predict(test_scaled)
    print(accuracy_score(y_test, y_pred))

In [11]:
dt = DecisionTreeClassifier()
dt.fit(train, y_train)
y_pred = dt.predict(test)
print(accuracy_score(y_test, y_pred))

0.5989795918367347


### Max depth tuning

In [12]:
for i in range(1, 20):
    print('Accuracy score using max_depth =', i, end = ': ')
    fit_predict(train, test, y_train, y_test, StandardScaler(), i)

Accuracy score using max_depth = 1: 0.44081632653061226
Accuracy score using max_depth = 2: 0.44081632653061226
Accuracy score using max_depth = 3: 0.4530612244897959
Accuracy score using max_depth = 4: 0.4602040816326531
Accuracy score using max_depth = 5: 0.48673469387755103
Accuracy score using max_depth = 6: 0.45918367346938777
Accuracy score using max_depth = 7: 0.49795918367346936
Accuracy score using max_depth = 8: 0.503061224489796
Accuracy score using max_depth = 9: 0.5183673469387755
Accuracy score using max_depth = 10: 0.4969387755102041
Accuracy score using max_depth = 11: 0.5142857142857142
Accuracy score using max_depth = 12: 0.49183673469387756
Accuracy score using max_depth = 13: 0.576530612244898
Accuracy score using max_depth = 14: 0.5714285714285714
Accuracy score using max_depth = 15: 0.5336734693877551
Accuracy score using max_depth = 16: 0.5489795918367347
Accuracy score using max_depth = 17: 0.5520408163265306
Accuracy score using max_depth = 18: 0.57755102040816

### Max features tuning

In [13]:
for i in np.arange(0.1, 1.0, 0.1):
    print('Accuracy score using max features =', i, end = ': ')
    fit_predict(train, test, y_train, y_test, StandardScaler(), max_depth = 18, max_features=i)

Accuracy score using max features = 0.1: 0.5775510204081633
Accuracy score using max features = 0.2: 0.6112244897959184
Accuracy score using max features = 0.30000000000000004: 0.6071428571428571
Accuracy score using max features = 0.4: 0.5959183673469388
Accuracy score using max features = 0.5: 0.6030612244897959
Accuracy score using max features = 0.6: 0.5826530612244898
Accuracy score using max features = 0.7000000000000001: 0.6020408163265306
Accuracy score using max features = 0.8: 0.5724489795918367
Accuracy score using max features = 0.9: 0.573469387755102


### Min samples split tuning

In [14]:
for i in range(2, 10):
    print('Accuracy score using min samples split =', i, end = ': ')
    fit_predict(train, test, y_train, y_test, StandardScaler(), 18, max_features=0.3, min_samples_split=i)

Accuracy score using min samples split = 2: 0.6153061224489796
Accuracy score using min samples split = 3: 0.5928571428571429
Accuracy score using min samples split = 4: 0.6071428571428571
Accuracy score using min samples split = 5: 0.5612244897959183
Accuracy score using min samples split = 6: 0.5795918367346938
Accuracy score using min samples split = 7: 0.563265306122449
Accuracy score using min samples split = 8: 0.5938775510204082
Accuracy score using min samples split = 9: 0.5540816326530612


### Criterion tuning

In [15]:
for i in ['gini', 'entropy']:
    print('Accuracy score using criterion =', i, end = ': ')
    fit_predict(train, test, y_train, y_test, StandardScaler(), 18, 
                max_features=0.3, min_samples_split=2, criterion = i)

Accuracy score using criterion = gini: 0.6122448979591837
Accuracy score using criterion = entropy: 0.6153061224489796


In [16]:
def create_poly(train,test,degree):
    poly = PolynomialFeatures(degree=degree)
    train_poly = poly.fit_transform(train)
    test_poly = poly.fit_transform(test)
    return train_poly,test_poly

In [17]:
for degree in [1,2,3,4]:
    train_poly, test_poly = create_poly(train, test, degree)
    print('Polynomial degree',degree)
    fit_predict(train_poly, test_poly, y_train, y_test, StandardScaler(), 18, 
                max_features=0.3, min_samples_split=2, criterion = 'entropy')
    print(10*'-')
    
train_poly, test_poly = create_poly(train, test, 2) 

Polynomial degree 1
0.6061224489795919
----------
Polynomial degree 2
0.6265306122448979
----------
Polynomial degree 3
0.6255102040816326
----------
Polynomial degree 4
0.6091836734693877
----------


In [67]:
from sklearn.tree import export_text

ImportError: cannot import name 'export_text'

In [18]:
def feat_eng(df):
    df['eng1'] = df['fixed acidity'] * df['pH']
    df['eng2'] = df['total sulfur dioxide'] / df['free sulfur dioxide']
    df['eng3'] = df['sulphates'] / df['chlorides']
    df['eng4'] = df['chlorides'] / df['sulphates']
    return df

train = feat_eng(train)
test = feat_eng(test)

print('Additional feature engineering:')

fit_predict(train, test, y_train, y_test, StandardScaler(), 18, 
                max_features=0.3, min_samples_split=2, criterion = 'entropy')

train_poly, test_poly = create_poly(train, test, 2)

fit_predict(train_poly, test_poly, y_train, y_test, StandardScaler(), 18, 
                max_features=0.3, min_samples_split=2, criterion = 'entropy')


Additional feature engineering:
0.5938775510204082
0.6173469387755102


In [63]:
train = feat_eng(df)

In [64]:
train.columns

Index(['fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar',
       'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density',
       'pH', 'sulphates', 'alcohol', 'eng1', 'eng2', 'eng3', 'eng4'],
      dtype='object')

In [19]:
original_score = 0.514285714286
best_score = 0.625510204082
improvement = np.abs(np.round(100*(original_score - best_score)/original_score,2))
print('overall improvement is {} %'.format(improvement))

overall improvement is 21.63 %


In [48]:
from sklearn.ensemble import RandomForestClassifier

In [49]:
rf = RandomForestClassifier(criterion='gini')

In [50]:
rf.fit(train,y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [51]:
pred_rf = rf.predict(test)

In [52]:
from sklearn.metrics import accuracy_score
print(accuracy_score(y_test,pred_rf))

0.6306122448979592


In [54]:
from sklearn.model_selection import cross_val_score

In [55]:
cross_val_accuracy = cross_val_score(rf, df, y, scoring='accuracy', cv=10)

In [56]:
cross_val_accuracy

array([0.44918699, 0.48577236, 0.41869919, 0.46843177, 0.47454175,
       0.5603272 , 0.47034765, 0.54303279, 0.52772074, 0.46201232])

In [58]:
from sklearn.metrics import confusion_matrix, classification_report

In [60]:
print(confusion_matrix(y_test,pred_rf))

[[  0   0   3   2   0   0]
 [  0   7  19  14   0   1]
 [  0   5 215  81   5   0]
 [  0   2  85 292  51   3]
 [  0   1   6  61  87   3]
 [  0   0   1  10   9  17]]


In [61]:
print(classification_report(y_test,pred_rf))

              precision    recall  f1-score   support

           3       0.00      0.00      0.00         5
           4       0.47      0.17      0.25        41
           5       0.65      0.70      0.68       306
           6       0.63      0.67      0.65       433
           7       0.57      0.55      0.56       158
           8       0.71      0.46      0.56        37

   micro avg       0.63      0.63      0.63       980
   macro avg       0.51      0.43      0.45       980
weighted avg       0.62      0.63      0.62       980



In [57]:
cross_val_accuracy.mean()

0.48600727530662213

In [25]:
from sklearn.model_selection import GridSearchCV

In [26]:
params = {'n_estimators':[200,500,700],'max_depth':[10,15,18,20],
         'min_samples_leaf':[3,5,7]}

In [27]:
gs = GridSearchCV(rf,params,verbose=3)

In [28]:
gs.fit(train,y_train)

Fitting 3 folds for each of 36 candidates, totalling 108 fits
[CV] max_depth=10, min_samples_leaf=3, n_estimators=200 ..............


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  max_depth=10, min_samples_leaf=3, n_estimators=200, score=0.617737003058104, total=   1.6s
[CV] max_depth=10, min_samples_leaf=3, n_estimators=200 ..............


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.8s remaining:    0.0s


[CV]  max_depth=10, min_samples_leaf=3, n_estimators=200, score=0.6090283091048202, total=   1.6s
[CV] max_depth=10, min_samples_leaf=3, n_estimators=200 ..............


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    3.6s remaining:    0.0s


[CV]  max_depth=10, min_samples_leaf=3, n_estimators=200, score=0.6062931696085956, total=   1.6s
[CV] max_depth=10, min_samples_leaf=3, n_estimators=500 ..............
[CV]  max_depth=10, min_samples_leaf=3, n_estimators=500, score=0.6146788990825688, total=   4.1s
[CV] max_depth=10, min_samples_leaf=3, n_estimators=500 ..............
[CV]  max_depth=10, min_samples_leaf=3, n_estimators=500, score=0.6105585309869931, total=   4.1s
[CV] max_depth=10, min_samples_leaf=3, n_estimators=500 ..............
[CV]  max_depth=10, min_samples_leaf=3, n_estimators=500, score=0.6047582501918649, total=   4.1s
[CV] max_depth=10, min_samples_leaf=3, n_estimators=700 ..............
[CV]  max_depth=10, min_samples_leaf=3, n_estimators=700, score=0.6131498470948012, total=   5.8s
[CV] max_depth=10, min_samples_leaf=3, n_estimators=700 ..............
[CV]  max_depth=10, min_samples_leaf=3, n_estimators=700, score=0.6159143075745983, total=   5.8s
[CV] max_depth=10, min_samples_leaf=3, n_estimators=700 .

[CV]  max_depth=15, min_samples_leaf=7, n_estimators=700, score=0.6123853211009175, total=   5.7s
[CV] max_depth=15, min_samples_leaf=7, n_estimators=700 ..............
[CV]  max_depth=15, min_samples_leaf=7, n_estimators=700, score=0.6113236419280795, total=   5.6s
[CV] max_depth=15, min_samples_leaf=7, n_estimators=700 ..............
[CV]  max_depth=15, min_samples_leaf=7, n_estimators=700, score=0.6001534919416731, total=   5.7s
[CV] max_depth=18, min_samples_leaf=3, n_estimators=200 ..............
[CV]  max_depth=18, min_samples_leaf=3, n_estimators=200, score=0.6444954128440367, total=   1.9s
[CV] max_depth=18, min_samples_leaf=3, n_estimators=200 ..............
[CV]  max_depth=18, min_samples_leaf=3, n_estimators=200, score=0.6296863045141545, total=   1.8s
[CV] max_depth=18, min_samples_leaf=3, n_estimators=200 ..............
[CV]  max_depth=18, min_samples_leaf=3, n_estimators=200, score=0.6201074443591711, total=   1.8s
[CV] max_depth=18, min_samples_leaf=3, n_estimators=500 .

[CV]  max_depth=20, min_samples_leaf=7, n_estimators=200, score=0.6151491966335119, total=   1.5s
[CV] max_depth=20, min_samples_leaf=7, n_estimators=200 ..............
[CV]  max_depth=20, min_samples_leaf=7, n_estimators=200, score=0.6032233307751343, total=   1.6s
[CV] max_depth=20, min_samples_leaf=7, n_estimators=500 ..............
[CV]  max_depth=20, min_samples_leaf=7, n_estimators=500, score=0.6108562691131498, total=   4.1s
[CV] max_depth=20, min_samples_leaf=7, n_estimators=500 ..............
[CV]  max_depth=20, min_samples_leaf=7, n_estimators=500, score=0.6212700841622035, total=   4.1s
[CV] max_depth=20, min_samples_leaf=7, n_estimators=500 ..............
[CV]  max_depth=20, min_samples_leaf=7, n_estimators=500, score=0.6032233307751343, total=   4.1s
[CV] max_depth=20, min_samples_leaf=7, n_estimators=700 ..............
[CV]  max_depth=20, min_samples_leaf=7, n_estimators=700, score=0.6123853211009175, total=   5.7s
[CV] max_depth=20, min_samples_leaf=7, n_estimators=700 .

[Parallel(n_jobs=1)]: Done 108 out of 108 | elapsed:  8.2min finished


GridSearchCV(cv='warn', error_score='raise-deprecating',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'n_estimators': [200, 500, 700], 'max_depth': [10, 15, 18, 20], 'min_samples_leaf': [3, 5, 7]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=3)

In [29]:
gs.best_params_

{'max_depth': 18, 'min_samples_leaf': 3, 'n_estimators': 500}

In [30]:
gs.best_estimator_

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=18, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=3, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=500, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [31]:
rf1 = RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=18, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=3, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=200, n_jobs=None,
            oob_score=True, random_state=101, verbose=0,
            warm_start=False)

In [32]:
rf1.fit(train,y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=18, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=3, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=200, n_jobs=None,
            oob_score=True, random_state=101, verbose=0, warm_start=False)

In [33]:
rf1.oob_score_

0.6510974987238387

In [34]:
pred_rf1 = rf1.predict(test)

In [35]:
print(accuracy_score(y_test,pred_rf1))

0.6602040816326531


In [36]:
rf1.feature_importances_

array([0.04544561, 0.08698744, 0.06084217, 0.06661011, 0.05480626,
       0.0628811 , 0.06659388, 0.08465859, 0.06012223, 0.04804436,
       0.10510437, 0.06282237, 0.07250678, 0.06036186, 0.06221289])

In [65]:
sorted(list(zip(rf1.feature_importances_,train.columns)),reverse=True)

[(0.1051043658788977, 'alcohol'),
 (0.08698743637477381, 'volatile acidity'),
 (0.08465859448232203, 'density'),
 (0.07250677598508122, 'eng2'),
 (0.06661010905073897, 'residual sugar'),
 (0.06659388489892111, 'total sulfur dioxide'),
 (0.06288109519465297, 'free sulfur dioxide'),
 (0.06282236664633044, 'eng1'),
 (0.06221288536723693, 'eng4'),
 (0.060842174325342446, 'citric acid'),
 (0.06036185698627452, 'eng3'),
 (0.06012222926149238, 'pH'),
 (0.05480625829876283, 'chlorides'),
 (0.048044355216732615, 'sulphates'),
 (0.04544561203243984, 'fixed acidity')]

In [24]:
df.corr()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
fixed acidity,1.0,-0.022697,0.289181,0.089021,0.023086,-0.049396,0.09107,0.265331,-0.425858,-0.017143,-0.120881
volatile acidity,-0.022697,1.0,-0.149472,0.064286,0.070512,-0.097012,0.089261,0.027114,-0.031915,-0.035728,0.067718
citric acid,0.289181,-0.149472,1.0,0.094212,0.114364,0.094077,0.121131,0.149503,-0.163748,0.062331,-0.075729
residual sugar,0.089021,0.064286,0.094212,1.0,0.088685,0.299098,0.401439,0.838966,-0.194133,-0.026664,-0.450631
chlorides,0.023086,0.070512,0.114364,0.088685,1.0,0.101392,0.19891,0.257211,-0.090439,0.016763,-0.360189
free sulfur dioxide,-0.049396,-0.097012,0.094077,0.299098,0.101392,1.0,0.615501,0.29421,-0.000618,0.059217,-0.250104
total sulfur dioxide,0.09107,0.089261,0.121131,0.401439,0.19891,0.615501,1.0,0.529881,0.002321,0.134562,-0.448892
density,0.265331,0.027114,0.149503,0.838966,0.257211,0.29421,0.529881,1.0,-0.093591,0.074493,-0.780138
pH,-0.425858,-0.031915,-0.163748,-0.194133,-0.090439,-0.000618,0.002321,-0.093591,1.0,0.155951,0.121432
sulphates,-0.017143,-0.035728,0.062331,-0.026664,0.016763,0.059217,0.134562,0.074493,0.155951,1.0,-0.017433


In [25]:
from sklearn.feature_selection import RFE

In [26]:
rfe =RFE(rf1,n_features_to_select=5)

In [27]:
rfe.fit(train,y_train)

RFE(estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=18, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=3, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=200, n_jobs=None,
            oob_score=True, random_state=101, verbose=0, warm_start=False),
  n_features_to_select=5, step=1, verbose=0)

In [30]:
rfe.get_support()

array([False,  True, False,  True, False, False,  True,  True, False,
       False,  True])

In [32]:
df.columns

Index(['fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar',
       'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density',
       'pH', 'sulphates', 'alcohol'],
      dtype='object')