In [363]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder, StandardScaler, MinMaxScaler, RobustScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

In [364]:
red = pd.read_csv("winequality-red.csv")
white = pd.read_csv("winequality-white.csv")

def define_quality_labels(v):
  if v <= 5:
    return 'low'
  if v <= 7:
    return 'medium'
  return 'high'

red.columns = red.columns.str.replace(" ", "_")
white.columns = white.columns.str.replace(" ", "_")

In [365]:
# red['type'] = 0
# white['type'] = 1

red['quality_label'] = red['quality'].apply(define_quality_labels)
white['quality_label'] = white['quality'].apply(define_quality_labels)

white['quality_label'] = pd.Categorical(white['quality_label'], categories=['low', 'medium', 'high'])
red['quality_label'] = pd.Categorical(red['quality_label'], categories=['low', 'medium', 'high'])

# white['bound_sulfur_dioxide'] = white['total_sulfur_dioxide'] - white['free_sulfur_dioxide']
# red['bound_sulfur_dioxide'] = red['total_sulfur_dioxide'] - red['free_sulfur_dioxide']

combined = pd.concat([red, white], ignore_index=True)


# dfml_red = red.copy()
dfml_white = white.copy()

dfml_white
# dfml_white.info()

# manual attempt at outlier removal 
# dfml_white = dfml_white[(dfml_white['total_sulfur_dioxide'] < 350) & (dfml_white['free_sulfur_dioxide'] < 200) & (dfml_white['residual_sugar'] < 50)]

# use interquartile range to remove outliers
# Q1 = dfml_white.quantile(0.25)
# Q3 = dfml_white.quantile(0.75)
# IQR = Q3 - Q1
# # print(IQR)
# new_df = dfml_white[~((dfml_white < (Q1 - 1.5 * IQR)) |(dfml_white > (Q3 + 1.5 * IQR))).any(axis=1)]
# new_df

dfml_white.drop(['quality', 'density'], axis=1, inplace=True)
# dfml_white


In [None]:
# encoding
# cat_col_encode = pd.get_dummies(dfml_white['quality_label'])
# dfml_white = pd.concat([dfml_white, cat_col_encode], axis=1)
# dfml_white.drop(columns=['free_sulfur_dioxide', 'quality_label', 'quality'], axis=1, inplace=True)
# dfml_white.head()

encoder = OrdinalEncoder()
encoder.fit(dfml_white[['quality_label']])
dfml_white[['quality_label']] = encoder.transform(dfml_white[['quality_label']])

dfml_white

Unnamed: 0,fixed_acidity,volatile_acidity,citric_acid,residual_sugar,chlorides,free_sulfur_dioxide,total_sulfur_dioxide,pH,sulphates,alcohol,quality_label
0,7.0,0.27,0.36,20.7,0.045,45.0,170.0,3.00,0.45,8.8,2.0
1,6.3,0.30,0.34,1.6,0.049,14.0,132.0,3.30,0.49,9.5,2.0
2,8.1,0.28,0.40,6.9,0.050,30.0,97.0,3.26,0.44,10.1,2.0
3,7.2,0.23,0.32,8.5,0.058,47.0,186.0,3.19,0.40,9.9,2.0
4,7.2,0.23,0.32,8.5,0.058,47.0,186.0,3.19,0.40,9.9,2.0
...,...,...,...,...,...,...,...,...,...,...,...
4893,6.2,0.21,0.29,1.6,0.039,24.0,92.0,3.27,0.50,11.2,2.0
4894,6.6,0.32,0.36,8.0,0.047,57.0,168.0,3.15,0.46,9.6,1.0
4895,6.5,0.24,0.19,1.2,0.041,30.0,111.0,2.99,0.46,9.4,2.0
4896,5.5,0.29,0.30,1.1,0.022,20.0,110.0,3.34,0.38,12.8,2.0


In [367]:
y = dfml_white['quality_label']
X = dfml_white.drop(['quality_label'], axis=1)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

In [368]:
# scaler = StandardScaler()
# scaler = MinMaxScaler()
from sklearn.preprocessing import RobustScaler


scaler = RobustScaler()
X_train_standard = scaler.fit_transform(X_train)
X_test_standard = scaler.transform(X_test)

In [369]:
# LogisticRegression model
# LR_model = LogisticRegression(solver='liblinear', random_state=42)
LR_model = LogisticRegression(random_state=42, max_iter=10000)

# fit model
LR_model.fit(X_train_standard, y_train)

In [370]:
preds = LR_model.predict(X_test_standard)

acc = accuracy_score(y_test, preds)
cm = confusion_matrix(y_test, preds)
print("accuracy: ", acc)
print("confusion_matrix: \n", cm)

# slightly improved by removing free + total SO2

accuracy:  0.7071428571428572
confusion_matrix: 
 [[  0   0  35]
 [  1 158 162]
 [  0  89 535]]


In [375]:
# Random Forest Classifier model
RF_clf = RandomForestClassifier(n_estimators=100, random_state=42)

# fit model
RF_clf.fit(X_train, y_train)

In [None]:
# predictions
preds = RF_clf.predict(X_test)

# accuracy
acc = accuracy_score(y_test, preds)
cm = confusion_matrix(y_test, preds)

print(acc)
print("confusion_matrix: \n", cm)

0.8295918367346938
confusion_matrix: 
 [[ 15   0  20]
 [  0 235  86]
 [  0  61 563]]
