In [3]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split


In [4]:
## PREPARE DATA ##
wine_df = pd.read_csv("../winequality.csv")

# Seeded generator so the random imputation below produces the same frame on every run
rng = np.random.default_rng(42)

# Fill missing data with either random data or a category corresponding to "Unknown"
for column in wine_df.columns:
    missing = wine_df[column].isna()
    if not missing.any():
        continue

    if pd.api.types.is_numeric_dtype(wine_df[column]):
        # Draw replacements uniformly from the observed [min, max] of the column.
        # Sampling from range(round(min), round(max)) would be empty (and raise) whenever
        # both bounds round to the same integer, and could only ever produce integers.
        low, high = wine_df[column].min(), wine_df[column].max()
        wine_df.loc[missing, column] = rng.uniform(low, high, size=missing.sum())
    elif isinstance(wine_df[column].dtype, pd.CategoricalDtype):
        # "Unknown" has to be a declared category before it can be used as a fill value
        if "Unknown" not in wine_df[column].cat.categories:
            wine_df[column] = wine_df[column].cat.add_categories("Unknown")
        wine_df[column] = wine_df[column].fillna("Unknown")
    elif pd.api.types.is_object_dtype(wine_df[column]):
        # fillna returns a new Series; it must be assigned back or the fill is lost
        wine_df[column] = wine_df[column].fillna("Unknown")

# One-hot encode wine type (every object/categorical column is encoded)
# The column list is fixed up front because wine_df is rebound inside the loop
categorical_columns = [
    column
    for column in wine_df.columns
    if pd.api.types.is_object_dtype(wine_df[column])
    or isinstance(wine_df[column].dtype, pd.CategoricalDtype)
]
for column in categorical_columns:
    one_hot = pd.get_dummies(wine_df[column], prefix=column)
    # Replace the original column with its indicator columns (appended on the right)
    wine_df = wine_df.drop(columns=column).join(one_hot)


In [5]:
# Display the prepared frame to inspect the result of the imputation and one-hot encoding
wine_df

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,type_red,type_white
0,7.0,0.270,0.36,20.7,0.045,45.0,170.0,1.00100,3.00,0.45,8.8,6,0,1
1,6.3,0.300,0.34,1.6,0.049,14.0,132.0,0.99400,3.30,0.49,9.5,6,0,1
2,8.1,0.280,0.40,6.9,0.050,30.0,97.0,0.99510,3.26,0.44,10.1,6,0,1
3,7.2,0.230,0.32,8.5,0.058,47.0,186.0,0.99560,3.19,0.40,9.9,6,0,1
4,7.2,0.230,0.32,8.5,0.058,47.0,186.0,0.99560,3.19,0.40,9.9,6,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6492,6.2,0.600,0.08,2.0,0.090,32.0,44.0,0.99490,3.45,0.58,10.5,5,1,0
6493,5.9,0.550,0.10,2.2,0.062,39.0,51.0,0.99512,3.52,1.00,11.2,6,1,0
6494,6.3,0.510,0.13,2.3,0.076,29.0,40.0,0.99574,3.42,0.75,11.0,6,1,0
6495,5.9,0.645,0.12,2.0,0.075,32.0,44.0,0.99547,3.57,0.71,10.2,5,1,0


In [15]:
def run_clf(X_train, X_test, y_train, y_test, random_state=42):
    """Fit a multi-class XGBoost model and a random forest, then print their test accuracy.

    Args:
        X_train: Training feature matrix.
        X_test: Held-out feature matrix.
        y_train: Training labels. The callers in this notebook pass class labels
            that start at 0.
        y_test: Held-out labels used for scoring.
        random_state: Seed handed to both estimators so that repeated runs print
            the same numbers. Pass ``None`` for unseeded behaviour.

    Returns:
        None. One "<model> accuracy: <value>" line is printed per model,
        followed by a blank line.
    """
    # Imported here so the function works even if no earlier cell imported it
    from sklearn.metrics import accuracy_score

    # Insertion order of the dict is the order in which the models are trained and reported
    classifiers = {
        "XGBoost": xgb.XGBClassifier(
            objective='multi:softprob',
            eval_metric='mlogloss',
            n_estimators=1000,
            random_state=random_state,
        ),
        "RF": RandomForestClassifier(random_state=random_state),
    }

    for label, clf in classifiers.items():
        # Train on the training split, then score the predictions for the test split
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)
        print(f"{label} accuracy:", accuracy_score(y_test, y_pred))

    print()

In [260]:
def run_bin_clf(X_train, X_test, y_train, y_test, random_state=42):
    """Fit an XGBoost model (library-default objective) and a random forest, then print their test accuracy.

    Args:
        X_train: Training feature matrix.
        X_test: Held-out feature matrix.
        y_train: Training labels. The caller in this notebook passes 0/1 labels.
        y_test: Held-out labels used for scoring.
        random_state: Seed handed to both estimators so that repeated runs print
            the same numbers. Pass ``None`` for unseeded behaviour.

    Returns:
        None. One "<model> accuracy: <value>" line is printed per model,
        followed by a blank line.
    """
    # Imported here so the function works even if no earlier cell imported it
    from sklearn.metrics import accuracy_score

    # Insertion order of the dict is the order in which the models are trained and reported
    classifiers = {
        "XGBoost": xgb.XGBClassifier(n_estimators=1000, random_state=random_state),
        "RF": RandomForestClassifier(random_state=random_state),
    }

    for label, clf in classifiers.items():
        # Train on the training split, then score the predictions for the test split
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)
        print(f"{label} accuracy:", accuracy_score(y_test, y_pred))

    print()

In [248]:
## CLASSIFY WITHOUT REMOVING DATA FEATURES ##
# Labels: the quality column shifted down by 3 (remaps 3-9 to 0-6)
y = wine_df["quality"] - 3
# Features: every column except the target
X = wine_df.drop(columns="quality")

# Stratified 80/20 split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

print()
print("Accuracies without dropping features:")
run_clf(X_train, X_test, y_train, y_test)


Accuracies without dropping features:
XGBoost accuracy: 0.6630769230769231
RF accuracy: 0.6876923076923077



In [249]:
## CLASSIFY with binary categories

In [253]:
# Define wine class [1 = 'Good Quality', 0 = 'Bad Quality']: a quality below 7 is 0, anything else is 1.
# The label lives in its own Series instead of being written into wine_df, so that a column
# computed from the target can never end up among the features used by a later cell.
y_binary = pd.Series(
    np.where(wine_df['quality'] < 7, 0, 1),
    index=wine_df.index,
    name='def_quality',
    dtype='int64',
)

# Separate feature variables and target variable
# errors='ignore' tolerates a 'def_quality' column left in wine_df by an earlier run
X_binary = wine_df.drop(columns=['quality', 'def_quality'], errors='ignore')

In [254]:
# Count how many wines fall into each class of the binary target
y_binary.value_counts()

0    5220
1    1277
Name: def_quality, dtype: int64

In [255]:
from sklearn.metrics import accuracy_score

# Stratified 80/20 split of the binary task, seeded so the split is repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X_binary,
    y_binary,
    test_size=0.2,
    random_state=42,
    stratify=y_binary,
)

print()
print("Accuracies without dropping features and binary classification:")
run_bin_clf(X_train, X_test, y_train, y_test)


Accuracies without dropping features and binary classification:
XGBoost accuracy: 0.88
RF accuracy: 0.8930769230769231



In [20]:
## Classification with five categories
# Define wine class from quality:
#   0 = below 4, 1 = exactly 4, 2 = exactly 5, 3 = below 8 (i.e. 6 or 7), 4 = everything else
# The variable names still say "triad" because later cells refer to X_triad / y_triad.
quality_scores = wine_df['quality']
# np.select takes the first matching condition, mirroring an if / elif chain
y_triad = pd.Series(
    np.select(
        [quality_scores < 4, quality_scores == 4, quality_scores == 5, quality_scores < 8],
        [0, 1, 2, 3],
        default=4,
    ),
    index=wine_df.index,
    name='def_quality',
    dtype='int64',
)

# Separate feature variables and target variable
# The label is not written into wine_df, so it cannot leak into a later feature matrix;
# errors='ignore' tolerates a 'def_quality' column left in wine_df by an earlier run
X_triad = wine_df.drop(columns=['quality', 'def_quality'], errors='ignore')

In [21]:
# Count how many wines fall into each class of y_triad
y_triad.value_counts()

3    3915
2    2138
1     216
4     198
0      30
Name: def_quality, dtype: int64

In [22]:
from sklearn.metrics import accuracy_score

# Stratified 80/20 split on the y_triad labels, seeded so the split is repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X_triad,
    y_triad,
    test_size=0.2,
    random_state=42,
    stratify=y_triad,
)

print()
print("Accuracies without dropping features and five class classification:")
run_clf(X_train, X_test, y_train, y_test)


Accuracies without dropping features and three class classification:
XGBoost accuracy: 0.7815384615384615
RF accuracy: 0.7984615384615384



## Feature Selection with Most Important Features

In [261]:
# Checking for correlation between the important features
# If features are highly intercorrelated, we should only keep one and drop the other
# we should probably drop either red or white and maybe density since it is highly correlated with alcohol

# 'def_quality' (when present) is computed from 'quality' by the classification cells.
# It is left out of the analysis so it can neither be selected as an "important feature"
# nor survive into the feature matrix built from wine_df afterwards.
candidates = wine_df.drop(columns="def_quality", errors="ignore")

# Correlation with target variable quality
cor = candidates.corr()
cor_quality = abs(cor["quality"])

threshold = 0.075

# Selecting only features with correlation coefficient > threshold
important_features = cor_quality[cor_quality > threshold].sort_values()
display(important_features)
print(f"Number of most important featuers: {len(important_features) - 1}")

# Pairwise correlation among the selected features; the target is removed by label
# rather than by position, so this does not rely on the sort order
selected_features = important_features.index.drop("quality")
feature_cor = candidates[list(selected_features)].corr().abs()

# Select upper triangle of correlation matrix
feature_cor_upper = feature_cor.where(np.triu(np.ones(feature_cor.shape), k=1).astype(bool))

# Find features with correlation greater than 0.95
features_to_exclude = [
    column for column in feature_cor_upper.columns
    if (feature_cor_upper[column] > 0.95).any()
]

# Find features to be kept
features_to_be_kept = [feature for feature in important_features.index.to_list() if feature not in features_to_exclude]

# Drop features: drop all features that show a low correlation with the target variable and that are highly intercorrelated
# One drop call that returns a new frame, instead of dropping in place while iterating
wine_df = wine_df.drop(
    columns=[column for column in wine_df.columns if column not in features_to_be_kept]
)

fixed acidity       0.078026
citric acid         0.085780
type_white          0.119323
type_red            0.119323
chlorides           0.200278
volatile acidity    0.264573
density             0.305858
alcohol             0.444319
quality             1.000000
Name: quality, dtype: float64

Number of most important featuers: 8


In [262]:
## CLASSIFY AFTER FEATURE SELECTION ##
# 'def_quality' is computed from 'quality' by the classification cells. If it is still in
# the frame it must not be used as a feature, so it is dropped together with the target.
X = wine_df.drop(columns=["quality", "def_quality"], errors="ignore")
y = wine_df["quality"]
y = y - 3 # remap labels from 3-9 to 0-6

# Stratified 80/20 split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=42)

print("Accuracies with feature subset:")
run_clf(X_train, X_test, y_train, y_test)

Accuracies with feature subset:
XGBoost accuracy: 0.6276923076923077
RF accuracy: 0.6584615384615384



In [263]:
## CLASSIFY with binary categories

In [267]:
# Define wine class [1 = 'Good Quality', 0 = 'Bad Quality']: a quality below 7 is 0, anything else is 1.
# The label lives in its own Series instead of being written into wine_df, so that a column
# computed from the target can never end up among the features used by a later cell.
y_binary = pd.Series(
    np.where(wine_df['quality'] < 7, 0, 1),
    index=wine_df.index,
    name='def_quality',
    dtype='int64',
)

# Separate feature variables and target variable
# errors='ignore' tolerates a 'def_quality' column left in wine_df by an earlier run
X_binary = wine_df.drop(columns=['quality', 'def_quality'], errors='ignore')

In [268]:
# Count how many wines fall into each class of the binary target
y_binary.value_counts()

0    5220
1    1277
Name: def_quality, dtype: int64

In [269]:
from sklearn.metrics import accuracy_score

# Stratified 80/20 split of the binary task, seeded so the split is repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X_binary,
    y_binary,
    test_size=0.2,
    random_state=42,
    stratify=y_binary,
)

print()
print("Accuracies with feature subset and binary classification:")
run_bin_clf(X_train, X_test, y_train, y_test)


Accuracies with feature subset and binary classification:
XGBoost accuracy: 0.8615384615384616
RF accuracy: 0.8746153846153846



In [1]:
## Classification with three categories
# Define wine class [2 = 'Good Quality', 1 = "Mediocre Quality", 0 = 'Bad Quality']:
#   0 = quality below 5, 1 = quality below 8, 2 = everything else
# NOTE(review): the value counts recorded for y_triad in this notebook (2384 / 3915 / 198)
# correspond to a cut at `< 6` rather than `< 5` -- confirm which threshold is intended.
quality_scores = wine_df['quality']
# np.select takes the first matching condition, mirroring an if / elif chain
y_triad = pd.Series(
    np.select([quality_scores < 5, quality_scores < 8], [0, 1], default=2),
    index=wine_df.index,
    name='def_quality',
    dtype='int64',
)

# Separate feature variables and target variable
# The label is not written into wine_df, so it cannot leak into a later feature matrix;
# errors='ignore' tolerates a 'def_quality' column left in wine_df by an earlier run
X_triad = wine_df.drop(columns=['quality', 'def_quality'], errors='ignore')

NameError: name 'wine_df' is not defined

In [271]:
# Count how many wines fall into each class of y_triad
y_triad.value_counts()

1    3915
0    2384
2     198
Name: def_quality, dtype: int64

In [273]:
from sklearn.metrics import accuracy_score

# Stratified 80/20 split on the y_triad labels, seeded so the split is repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X_triad,
    y_triad,
    test_size=0.2,
    random_state=42,
    stratify=y_triad,
)

print()
print("Accuracies with feature subset and three class classification:")
run_clf(X_train, X_test, y_train, y_test)


Accuracies with feature subset and three class classification:
XGBoost accuracy: 0.7707692307692308
RF accuracy: 0.8076923076923077



In [158]:
## Class distribution of the labels in the current test split
y_test.value_counts()

3    567
2    428
4    216
1     43
5     39
0      6
6      1
Name: quality, dtype: int64

In [224]:
## FEATURE SELECTION ##
# Absolute pairwise correlation between every column except the target
features = wine_df.loc[:, wine_df.columns != 'quality']
cor = features.corr().abs()

# Keep only the strict upper triangle so that each pair is inspected once
upper_mask = np.triu(np.ones(cor.shape), k=1).astype(bool)
feature_cor_upper = cor.where(upper_mask)
display(feature_cor_upper)

# A column is excluded when it correlates above 0.71 with any column that precedes it
features_to_exclude = [
    column
    for column in feature_cor_upper.columns
    if (feature_cor_upper[column] > 0.71).any()
]
print(f"features to exclude{features_to_exclude}")

# Find features to be kept
features_to_be_kept = [feature for feature in wine_df.columns if feature not in features_to_exclude]
print(f"features to be kept {features_to_be_kept}")

# Drop the highly intercorrelated columns from the frame (in place, in a single call)
wine_df.drop(columns=features_to_exclude, inplace=True)



Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,type_red,type_white
fixed acidity,,0.221976,0.315517,0.108969,0.2936,0.281001,0.327434,0.453518,0.249711,0.296119,0.09375,0.482515,0.482515
volatile acidity,,,0.373779,0.194648,0.375192,0.34943,0.411522,0.270852,0.25621,0.2228,0.038394,0.649296,0.649296
citric acid,,,,0.140527,0.038836,0.132757,0.194515,0.095693,0.32813,0.059221,0.009973,0.18707,0.18707
residual sugar,,,,,0.128407,0.403239,0.492978,0.544934,0.262771,0.18131,0.356924,0.346156,0.346156
chlorides,,,,,,0.194886,0.279562,0.362519,0.045527,0.393436,0.256871,0.512675,0.512675
free sulfur dioxide,,,,,,,0.720934,0.025717,0.144851,0.187733,0.179838,0.471644,0.471644
total sulfur dioxide,,,,,,,,0.032395,0.236932,0.273385,0.26574,0.700357,0.700357
density,,,,,,,,,0.012061,0.258008,0.686745,0.390645,0.390645
pH,,,,,,,,,,0.187171,0.121035,0.328199,0.328199
sulphates,,,,,,,,,,,0.003263,0.483473,0.483473


features to exclude['total sulfur dioxide', 'type_white']
features to be kept ['fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar', 'chlorides', 'free sulfur dioxide', 'density', 'pH', 'sulphates', 'alcohol', 'quality', 'type_red']
