#### Data: https://archive.ics.uci.edu/ml/datasets/wine+quality

In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import scale
from sklearn.preprocessing import StandardScaler
from sklearn import metrics

In [2]:
# Load the white-wine samples; the file is semicolon-delimited.
df = pd.read_csv("winequality-white.csv", delimiter=';')
# Preview the first five rows (head() defaults to n=5).
df.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8,6
1,6.3,0.3,0.34,1.6,0.049,14.0,132.0,0.994,3.3,0.49,9.5,6
2,8.1,0.28,0.4,6.9,0.05,30.0,97.0,0.9951,3.26,0.44,10.1,6
3,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6
4,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6


In [3]:
# Number of samples per quality score, ordered by score.
df.quality.value_counts().sort_index()

3      20
4     163
5    1457
6    2198
7     880
8     175
9       5
Name: quality, dtype: int64

In [4]:
# Features are every column except the target; the target is the quality score.
X_data = df.drop(['quality'], axis=1)
y_data = df['quality']

In [5]:
def fit_model(X, y, model, test_size=0.5, random_state=None):
    """Split the data, standardize it, fit ``model`` and print its test accuracy.

    Parameters
    ----------
    X : array-like or DataFrame
        Feature matrix, one row per sample.
    y : array-like or Series
        Target labels aligned with the rows of ``X``.
    model : estimator
        Object exposing ``fit(X, y)`` and ``score(X, y)``; it is fitted in place.
    test_size : float or int, optional
        Forwarded to ``train_test_split``. Defaults to 0.5.
    random_state : int or None, optional
        Forwarded to ``train_test_split``; pass an integer to get the same
        split on every call. Defaults to None (a different split each call).

    Returns
    -------
    None
        The accuracy is printed, not returned.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state)
    # Learn the mean/std from the training split only and apply that same
    # transform to the test split, so both are expressed in the same units
    # and no test-set statistics are used during preprocessing.
    scaler = StandardScaler().fit(X_train)
    X_train_scale = scaler.transform(X_train)
    X_test_scale = scaler.transform(X_test)
    model.fit(X_train_scale, y_train)
    score = model.score(X_test_scale, y_test)
    print('Accuracy of logistic regression classifier on test set: {:.2f}'.format(score))
    return

In [6]:
# Baseline: L2-penalized logistic regression with C=0.01, default solver.
logreg = LogisticRegression(C=0.01, penalty='l2')
fit_model(X_data, y_data, logreg)

Accuracy of logistic regression classifier on test set: 0.53


#### Let's try changing some parameters to see if we can improve the accuracy
#### For multiclass problems, it seems the newton-cg solver can be used

In [7]:
# Same penalty and C as the baseline, but with the newton-cg solver.
logreg = LogisticRegression(
    C=0.01,
    penalty='l2',
    solver='newton-cg',
)
fit_model(X_data, y_data, logreg)

Accuracy of logistic regression classifier on test set: 0.53


#### set multi_class to multinomial

In [8]:
# newton-cg solver with the multinomial multi-class formulation.
logreg = LogisticRegression(
    C=0.01,
    penalty='l2',
    solver='newton-cg',
    multi_class='multinomial',
)
fit_model(X_data, y_data, logreg)

Accuracy of logistic regression classifier on test set: 0.51


#### set class_weight to balanced

In [9]:
# Multinomial newton-cg model with class_weight='balanced'.
logreg = LogisticRegression(
    C=0.01,
    penalty='l2',
    solver='newton-cg',
    multi_class='multinomial',
    class_weight='balanced',
)
fit_model(X_data, y_data, logreg)

Accuracy of logistic regression classifier on test set: 0.30


#### Try setting max_iter to 500

In [10]:
# Balanced multinomial newton-cg model, with max_iter set to 500.
logreg = LogisticRegression(
    C=0.01,
    penalty='l2',
    solver='newton-cg',
    multi_class='multinomial',
    class_weight='balanced',
    max_iter=500,
)
fit_model(X_data, y_data, logreg)

Accuracy of logistic regression classifier on test set: 0.29


#### Grid search with k-fold cross-validation

In [11]:
# Hyperparameter grid: both penalty types crossed with nine values of C
# spaced logarithmically from 10**0 to 10**3.
penalty = ['l1', 'l2']
C = np.logspace(0, 3, 9)
hyperparameters = dict(C=C, penalty=penalty)
# Pin the solver explicitly: the grid contains 'l1', which liblinear supports
# but the lbfgs default of newer scikit-learn releases rejects. Without this
# every l1 candidate fails to fit on those releases.
logreg = LogisticRegression(solver='liblinear')
# Exhaustive search over the grid, scored with 15-fold cross-validation.
clf = GridSearchCV(logreg, hyperparameters, cv=15, verbose=0)

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.5)

# Standardize with statistics learned from the training half only, then apply
# the same transform to the test half. Scaling each half independently would
# put them on different scales and use test-set statistics in preprocessing.
scaler = StandardScaler().fit(X_train)
X_train_scale = scaler.transform(X_train)
X_test_scale = scaler.transform(X_test)

# Run the cross-validated grid search on the training half.
best_model = clf.fit(X_train_scale, y_train)



In [13]:
# Report the winning hyperparameters. print() with str.format produces the
# same text on Python 2 and Python 3, unlike the print statement.
print('Best Penalty: {}'.format(best_model.best_estimator_.get_params()['penalty']))
print('Best C: {}'.format(best_model.best_estimator_.get_params()['C']))
# Evaluate the selected model on the held-out half.
y_pred = best_model.predict(X_test_scale)
score = best_model.score(X_test_scale, y_test)
print('Accuracy of logistic regression classifier on test set: {:.2f}'.format(score))

Best Penalty: l1
Best C: 2.371373705661655
Accuracy of logistic regression classifier on test set: 0.54
