In [14]:
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder

In [2]:
white_df = pd.read_csv(Path('winequality-white.csv'))
red_df = pd.read_csv(Path('winequality-red.csv'))

In [3]:
white_df

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.00100,3.00,0.45,8.8,6
1,6.3,0.30,0.34,1.6,0.049,14.0,132.0,0.99400,3.30,0.49,9.5,6
2,8.1,0.28,0.40,6.9,0.050,30.0,97.0,0.99510,3.26,0.44,10.1,6
3,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.99560,3.19,0.40,9.9,6
4,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.99560,3.19,0.40,9.9,6
...,...,...,...,...,...,...,...,...,...,...,...,...
4893,6.2,0.21,0.29,1.6,0.039,24.0,92.0,0.99114,3.27,0.50,11.2,6
4894,6.6,0.32,0.36,8.0,0.047,57.0,168.0,0.99490,3.15,0.46,9.6,5
4895,6.5,0.24,0.19,1.2,0.041,30.0,111.0,0.99254,2.99,0.46,9.4,6
4896,5.5,0.29,0.30,1.1,0.022,20.0,110.0,0.98869,3.34,0.38,12.8,7


In [4]:
red_df

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.700,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5
1,7.8,0.880,0.00,2.6,0.098,25.0,67.0,0.99680,3.20,0.68,9.8,5
2,7.8,0.760,0.04,2.3,0.092,15.0,54.0,0.99700,3.26,0.65,9.8,5
3,11.2,0.280,0.56,1.9,0.075,17.0,60.0,0.99800,3.16,0.58,9.8,6
4,7.4,0.700,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5
...,...,...,...,...,...,...,...,...,...,...,...,...
1594,6.2,0.600,0.08,2.0,0.090,32.0,44.0,0.99490,3.45,0.58,10.5,5
1595,5.9,0.550,0.10,2.2,0.062,39.0,51.0,0.99512,3.52,0.76,11.2,6
1596,6.3,0.510,0.13,2.3,0.076,29.0,40.0,0.99574,3.42,0.75,11.0,6
1597,5.9,0.645,0.12,2.0,0.075,32.0,44.0,0.99547,3.57,0.71,10.2,5


In [5]:
white_df.dtypes

fixed acidity           float64
volatile acidity        float64
citric acid             float64
residual sugar          float64
chlorides               float64
free sulfur dioxide     float64
total sulfur dioxide    float64
density                 float64
pH                      float64
sulphates               float64
alcohol                 float64
quality                   int64
dtype: object

In [6]:
red_df.dtypes

fixed acidity           float64
volatile acidity        float64
citric acid             float64
residual sugar          float64
chlorides               float64
free sulfur dioxide     float64
total sulfur dioxide    float64
density                 float64
pH                      float64
sulphates               float64
alcohol                 float64
quality                   int64
dtype: object

In [7]:
#begin our models 
#assigning X, y variables
#X= all the independent features
#y= quality 
# Note: Sklearn requires a two-dimensional array of values
# so we use reshape() to create this

X = white_df[['fixed acidity','volatile acidity','citric acid', 'residual sugar',
              'chlorides','free sulfur dioxide','total sulfur dioxide',
              'density','pH','sulphates','alcohol']]
y = white_df['quality']

print("Shape: ", X.shape, y.shape)

Shape:  (4898, 11) (4898,)


In [11]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [15]:
# Scaling the X data by using StandardScaler()
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_train_scaled

array([[-1.00997621e+00, -7.72060072e-01,  3.74520539e-01, ...,
        -5.08304853e-01,  2.60903897e-01,  1.79525057e+00],
       [ 4.13956982e-01, -1.16603422e+00,  1.27063453e-01, ...,
         9.43178713e-01, -1.32306307e+00, -4.90876537e-01],
       [-1.79348514e-01, -1.06754068e+00, -5.32822109e-01, ...,
         1.40501439e+00,  2.60903897e-01, -9.92157764e-04],
       ...,
       [ 1.83789017e+00, -8.70553608e-01,  7.04463320e-01, ...,
        -8.38187482e-01,  1.40488004e+00,  4.88892222e-01],
       [-8.91315110e-01, -5.75072999e-01, -6.97793499e-01, ...,
         1.51460404e-01, -7.95074080e-01,  1.62302635e-01],
       [ 4.13956982e-01, -4.76579463e-01,  3.74520539e-01, ...,
        -6.40257905e-01, -4.43081421e-01, -1.22570311e+00]])

In [16]:
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression()
classifier

LogisticRegression()

In [17]:
classifier.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression()

In [None]:
#drop the column that we wish to predict (Our Y)
white_wine_df= white_df.drop(['quality'], axis=1, inplace=True)
red_wine_df= red_df.drop(['quality'], axis=1, inplace=True)

In [None]:
#run statistical summary

In [None]:
#correlation analysis between dependent and independent variables from white_df and red_df 