In [1]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

In [2]:
red_df = pd.read_csv('data/wine_quality_dataset/winequality-red.csv', sep=';')
red_df['type'] = 'red'
red_df

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,type
0,7.4,0.700,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5,red
1,7.8,0.880,0.00,2.6,0.098,25.0,67.0,0.99680,3.20,0.68,9.8,5,red
2,7.8,0.760,0.04,2.3,0.092,15.0,54.0,0.99700,3.26,0.65,9.8,5,red
3,11.2,0.280,0.56,1.9,0.075,17.0,60.0,0.99800,3.16,0.58,9.8,6,red
4,7.4,0.700,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5,red
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1594,6.2,0.600,0.08,2.0,0.090,32.0,44.0,0.99490,3.45,0.58,10.5,5,red
1595,5.9,0.550,0.10,2.2,0.062,39.0,51.0,0.99512,3.52,0.76,11.2,6,red
1596,6.3,0.510,0.13,2.3,0.076,29.0,40.0,0.99574,3.42,0.75,11.0,6,red
1597,5.9,0.645,0.12,2.0,0.075,32.0,44.0,0.99547,3.57,0.71,10.2,5,red


In [3]:
white_df = pd.read_csv('data/wine_quality_dataset/winequality-white.csv', sep=';')
white_df['type'] = 'white'
white_df

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,type
0,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.00100,3.00,0.45,8.8,6,white
1,6.3,0.30,0.34,1.6,0.049,14.0,132.0,0.99400,3.30,0.49,9.5,6,white
2,8.1,0.28,0.40,6.9,0.050,30.0,97.0,0.99510,3.26,0.44,10.1,6,white
3,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.99560,3.19,0.40,9.9,6,white
4,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.99560,3.19,0.40,9.9,6,white
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4893,6.2,0.21,0.29,1.6,0.039,24.0,92.0,0.99114,3.27,0.50,11.2,6,white
4894,6.6,0.32,0.36,8.0,0.047,57.0,168.0,0.99490,3.15,0.46,9.6,5,white
4895,6.5,0.24,0.19,1.2,0.041,30.0,111.0,0.99254,2.99,0.46,9.4,6,white
4896,5.5,0.29,0.30,1.1,0.022,20.0,110.0,0.98869,3.34,0.38,12.8,7,white


In [4]:
df = pd.concat([red_df, white_df])
df

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,type
0,7.4,0.70,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5,red
1,7.8,0.88,0.00,2.6,0.098,25.0,67.0,0.99680,3.20,0.68,9.8,5,red
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.99700,3.26,0.65,9.8,5,red
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.99800,3.16,0.58,9.8,6,red
4,7.4,0.70,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5,red
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4893,6.2,0.21,0.29,1.6,0.039,24.0,92.0,0.99114,3.27,0.50,11.2,6,white
4894,6.6,0.32,0.36,8.0,0.047,57.0,168.0,0.99490,3.15,0.46,9.6,5,white
4895,6.5,0.24,0.19,1.2,0.041,30.0,111.0,0.99254,2.99,0.46,9.4,6,white
4896,5.5,0.29,0.30,1.1,0.022,20.0,110.0,0.98869,3.34,0.38,12.8,7,white


In [5]:
sampled_df = df.sample(frac=1)
sampled_df

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,type
3485,7.5,0.210,0.32,4.80,0.056,39.0,113.0,0.99393,3.11,0.52,10.2,7,white
4839,5.2,0.405,0.15,1.45,0.038,10.0,44.0,0.99125,3.52,0.40,11.6,4,white
3633,6.6,0.300,0.25,8.00,0.036,21.0,124.0,0.99362,3.06,0.38,10.8,6,white
189,7.9,0.490,0.32,1.90,0.082,17.0,144.0,0.99680,3.20,0.55,9.5,5,red
3197,7.9,0.340,0.44,6.50,0.027,47.0,126.0,0.99124,2.96,0.37,12.5,6,white
...,...,...,...,...,...,...,...,...,...,...,...,...,...
3689,7.0,0.220,0.26,9.20,0.027,37.0,122.0,0.99228,3.06,0.34,12.5,8,white
13,6.6,0.160,0.40,1.50,0.044,48.0,143.0,0.99120,3.54,0.52,12.4,7,white
961,8.0,0.420,0.36,5.00,0.037,34.0,101.0,0.99200,3.13,0.57,12.3,7,white
31,6.9,0.685,0.00,2.50,0.105,22.0,37.0,0.99660,3.46,0.57,10.6,6,red


In [6]:
X = sampled_df.drop(columns='type')
X

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
3485,7.5,0.210,0.32,4.80,0.056,39.0,113.0,0.99393,3.11,0.52,10.2,7
4839,5.2,0.405,0.15,1.45,0.038,10.0,44.0,0.99125,3.52,0.40,11.6,4
3633,6.6,0.300,0.25,8.00,0.036,21.0,124.0,0.99362,3.06,0.38,10.8,6
189,7.9,0.490,0.32,1.90,0.082,17.0,144.0,0.99680,3.20,0.55,9.5,5
3197,7.9,0.340,0.44,6.50,0.027,47.0,126.0,0.99124,2.96,0.37,12.5,6
...,...,...,...,...,...,...,...,...,...,...,...,...
3689,7.0,0.220,0.26,9.20,0.027,37.0,122.0,0.99228,3.06,0.34,12.5,8
13,6.6,0.160,0.40,1.50,0.044,48.0,143.0,0.99120,3.54,0.52,12.4,7
961,8.0,0.420,0.36,5.00,0.037,34.0,101.0,0.99200,3.13,0.57,12.3,7
31,6.9,0.685,0.00,2.50,0.105,22.0,37.0,0.99660,3.46,0.57,10.6,6


In [7]:
y = sampled_df['type']
y

3485    white
4839    white
3633    white
189       red
3197    white
        ...  
3689    white
13      white
961     white
31        red
1821    white
Name: type, Length: 6497, dtype: object

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1511)

In [9]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((5197, 12), (1300, 12), (5197,), (1300,))

In [10]:
logistic_regression_model = LogisticRegression()
logistic_regression_model

In [11]:
logistic_regression_model.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [12]:
y_pred = logistic_regression_model.predict(X_test)
y_pred

array(['red', 'white', 'white', ..., 'red', 'white', 'white'],
      dtype=object)

In [13]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

         red       0.95      0.96      0.95       327
       white       0.99      0.98      0.98       973

    accuracy                           0.98      1300
   macro avg       0.97      0.97      0.97      1300
weighted avg       0.98      0.98      0.98      1300

