In [8]:
# winequality-red.csv and winequality-white.csv from the Piazza Resources page were sourced from https://archive.ics.uci.edu/ml/datasets/wine+quality
# With these two CSV files, use multiple feature engineering approaches, multiple classifiers, and multiple regressors to predict the quality score of red and/or white wines

In [9]:
import pandas as pd

# The wine-quality CSV files use ';' as the field separator instead of ','.
red_df = pd.read_csv("winequality-red.csv", sep=";")
white_df = pd.read_csv("winequality-white.csv", sep=";")

In [10]:
# Preview the red-wine frame; the rendered table elides the middle rows.
red_df

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.700,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5
1,7.8,0.880,0.00,2.6,0.098,25.0,67.0,0.99680,3.20,0.68,9.8,5
2,7.8,0.760,0.04,2.3,0.092,15.0,54.0,0.99700,3.26,0.65,9.8,5
3,11.2,0.280,0.56,1.9,0.075,17.0,60.0,0.99800,3.16,0.58,9.8,6
4,7.4,0.700,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5
...,...,...,...,...,...,...,...,...,...,...,...,...
1594,6.2,0.600,0.08,2.0,0.090,32.0,44.0,0.99490,3.45,0.58,10.5,5
1595,5.9,0.550,0.10,2.2,0.062,39.0,51.0,0.99512,3.52,0.76,11.2,6
1596,6.3,0.510,0.13,2.3,0.076,29.0,40.0,0.99574,3.42,0.75,11.0,6
1597,5.9,0.645,0.12,2.0,0.075,32.0,44.0,0.99547,3.57,0.71,10.2,5


In [11]:
# Preview the white-wine frame; the rendered table elides the middle rows.
white_df

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.00100,3.00,0.45,8.8,6
1,6.3,0.30,0.34,1.6,0.049,14.0,132.0,0.99400,3.30,0.49,9.5,6
2,8.1,0.28,0.40,6.9,0.050,30.0,97.0,0.99510,3.26,0.44,10.1,6
3,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.99560,3.19,0.40,9.9,6
4,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.99560,3.19,0.40,9.9,6
...,...,...,...,...,...,...,...,...,...,...,...,...
4893,6.2,0.21,0.29,1.6,0.039,24.0,92.0,0.99114,3.27,0.50,11.2,6
4894,6.6,0.32,0.36,8.0,0.047,57.0,168.0,0.99490,3.15,0.46,9.6,5
4895,6.5,0.24,0.19,1.2,0.041,30.0,111.0,0.99254,2.99,0.46,9.4,6
4896,5.5,0.29,0.30,1.1,0.022,20.0,110.0,0.98869,3.34,0.38,12.8,7


In [12]:
# Number of red wines per quality score, most frequent score first.
red_df["quality"].value_counts()

5    681
6    638
7    199
4     53
8     18
3     10
Name: quality, dtype: int64

In [13]:
# Number of white wines per quality score, most frequent score first.
white_df["quality"].value_counts()

6    2198
5    1457
7     880
8     175
4     163
3      20
9       5
Name: quality, dtype: int64

In [14]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestRegressor, AdaBoostClassifier, RandomForestClassifier
from sklearn.linear_model import LinearRegression

from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier


def classify_regress(*, X, y_continuous, y_discrete, random_state=0):
    """Print mean 5-fold cross-validation scores for several regressors and classifiers.

    Regressors (linear regression, decision tree, random forest) are fitted on
    ``y_continuous`` and scored with each estimator's default scorer (R^2 for
    scikit-learn regressors). Classifiers (decision tree, AdaBoost, random
    forest) are fitted twice per metric: once on ``y_continuous`` treated as
    class labels and once on ``y_discrete``; they are scored with
    ``balanced_accuracy`` and ``f1_weighted``.

    Args:
        X: Feature matrix (DataFrame or array) accepted by scikit-learn.
        y_continuous: Original target values, one per row of ``X``.
        y_discrete: Binned target as a pandas Series, one value per row of
            ``X``; its ``value_counts()`` is echoed in the printed label.
        random_state: Seed passed to every estimator that accepts one, so that
            repeated runs print the same scores. Pass ``None`` to get the
            previous unseeded behaviour.

    Returns:
        None. All results are written to stdout.
    """
    regressors = (
        LinearRegression(),  # deterministic; takes no random_state
        DecisionTreeRegressor(random_state=random_state),
        RandomForestRegressor(random_state=random_state),
    )
    for regressor in regressors:
        print(" ", regressor.__class__.__name__, "score:", cross_val_score(regressor, X, y_continuous, cv=5).mean())

    # The bin sizes do not change between iterations, so compute them once.
    discrete_value_counts = y_discrete.value_counts().tolist()

    classifiers = (
        DecisionTreeClassifier(random_state=random_state),
        AdaBoostClassifier(random_state=random_state),
        RandomForestClassifier(random_state=random_state),
    )
    for classifier in classifiers:
        for scoring in ("balanced_accuracy", "f1_weighted"):
            print(" ", classifier.__class__.__name__, scoring, "without discretization:", cross_val_score(classifier, X, y_continuous, cv=5, scoring=scoring).mean())
            print(" ", classifier.__class__.__name__, scoring, f"with discretization (value counts={discrete_value_counts})", cross_val_score(classifier, X, y_discrete, cv=5, scoring=scoring).mean())


# Seed for every stochastic step in this cell (importance forest, undersampling,
# shuffling) so that re-running the cell reproduces the same rows and rankings.
RANDOM_STATE = 42
# How many of the highest-ranked features the feature-selection experiment keeps.
TOP_FEATURE_COUNT = 5

for wine_label, type_df in (("Red:", red_df), ("White:", white_df)):
    print(wine_label)

    X = type_df.drop(columns="quality")
    y_continuous = type_df["quality"]

    # Two quantile bins over the quality score: 0 = lower half, 1 = upper half.
    y_discrete = pd.qcut(y_continuous, 2, labels=[0, 1])
    print("Initial discretization:", y_discrete.value_counts().tolist())

    print("Classify+regress using data as-is:")
    classify_regress(X=X, y_continuous=y_continuous, y_discrete=y_discrete)
    print()

    # NOTE(review): the scaler (and the PCA below) is fitted on all rows before
    # cross-validation, so held-out folds influence the transform. Wrapping the
    # transform and the estimator in a Pipeline would remove that leakage.
    print("Scale X:")
    X_scaled = StandardScaler().fit_transform(X)
    classify_regress(X=X_scaled, y_continuous=y_continuous, y_discrete=y_discrete)
    print()

    pca = PCA(0.9).fit(X_scaled)
    X_pca = pca.transform(X_scaled)
    print(f"Classify+regress using data after PCA capturing 0.9 variance (n_components={pca.n_components_}):")
    classify_regress(X=X_pca, y_continuous=y_continuous, y_discrete=y_discrete)
    print()

    # Rank features with a random-forest regressor fitted on the full data set.
    # The name `rfc` is kept for any later cell that reads it, although the
    # estimator is a regressor, not a classifier.
    # NOTE(review): the ranking also sees every row, so the selection step has
    # the same leakage caveat as the scaling above.
    rfc = RandomForestRegressor(random_state=RANDOM_STATE).fit(X, y_continuous)
    feature_importances = dict(zip(X.columns, rfc.feature_importances_))
    print("Feature importances in descending order:")
    sorted_feature_importances = tuple(sorted(feature_importances.items(), key=lambda item: item[1], reverse=True))
    for feature, feature_importance in sorted_feature_importances:
        print(feature, "=", feature_importance)
    top_features = [feature for feature, _ in sorted_feature_importances[:TOP_FEATURE_COUNT]]
    X_important = X[top_features]
    print(f"Classify+regress using top {TOP_FEATURE_COUNT} features ({tuple(X_important.columns)}):")
    classify_regress(X=X_important, y_continuous=y_continuous, y_discrete=y_discrete)
    print()

    # Under-sample to balance the classes
    print("Undersampled to balance the classes:")
    type_df_undersampled = type_df.assign(quality_bin=y_discrete)
    type_df_quality_bin_0 = type_df_undersampled[type_df_undersampled["quality_bin"] == 0]
    type_df_quality_bin_1 = type_df_undersampled[type_df_undersampled["quality_bin"] == 1]
    # Shrink whichever bin is larger down to the size of the smaller one.
    # value_counts() orders by frequency, not by bin label, so the sizes are
    # taken from the bin frames themselves.
    minority_size = min(len(type_df_quality_bin_0), len(type_df_quality_bin_1))
    type_df_undersampled = pd.concat(
        [
            type_df_quality_bin_0.sample(minority_size, random_state=RANDOM_STATE),
            type_df_quality_bin_1.sample(minority_size, random_state=RANDOM_STATE),
        ],
        axis=0,
    )
    # After concat the rows are grouped by bin, and cross_val_score with an
    # integer cv does not shuffle. Shuffle here so the regressors' folds are not
    # drawn from a single bin.
    type_df_undersampled = type_df_undersampled.sample(frac=1, random_state=RANDOM_STATE)
    X_undersampled = type_df_undersampled.drop(columns=["quality", "quality_bin"])
    y_continuous_undersampled = type_df_undersampled["quality"]
    y_discrete_undersampled = type_df_undersampled["quality_bin"]
    classify_regress(X=X_undersampled, y_continuous=y_continuous_undersampled, y_discrete=y_discrete_undersampled)
    print()


Red:
Initial discretization: [1382, 217]
Classify+regress using data as-is:
  LinearRegression score: 0.2900416288421968
  DecisionTreeRegressor score: -0.34410493470485276
  RandomForestRegressor score: 0.307039237139175
  DummyClassifier accuracy without discretization: 0.4258914576802508
  DummyClassifier accuracy with discretization (value counts=[1382, 217]) 0.8642907523510971
  DummyClassifier balanced_accuracy without discretization: 0.16666666666666666
  DummyClassifier balanced_accuracy with discretization (value counts=[1382, 217]) 0.5
  DummyClassifier f1_weighted without discretization: 0.2544152663102543
  DummyClassifier f1_weighted with discretization (value counts=[1382, 217]) 0.8013762078250879
  DecisionTreeClassifier accuracy without discretization: 0.4915654388714733
  DecisionTreeClassifier accuracy with discretization (value counts=[1382, 217]) 0.8042456896551725
  DecisionTreeClassifier balanced_accuracy without discretization: 0.28499127181515327
  DecisionTreeC



  DecisionTreeClassifier accuracy with discretization (value counts=[1060, 1060]) 0.6495283018867924
  DecisionTreeClassifier balanced_accuracy without discretization: 0.2485105891355001
  DecisionTreeClassifier balanced_accuracy with discretization (value counts=[1060, 1060]) 0.6514150943396226




  DecisionTreeClassifier f1_weighted without discretization: 0.44542633095492085
  DecisionTreeClassifier f1_weighted with discretization (value counts=[1060, 1060]) 0.6423006107448094




  AdaBoostClassifier accuracy without discretization: 0.4735849056603773
  AdaBoostClassifier accuracy with discretization (value counts=[1060, 1060]) 0.7051886792452831




  AdaBoostClassifier balanced_accuracy without discretization: 0.23705642272317248
  AdaBoostClassifier balanced_accuracy with discretization (value counts=[1060, 1060]) 0.7051886792452831




  AdaBoostClassifier f1_weighted without discretization: 0.3771684650885025
  AdaBoostClassifier f1_weighted with discretization (value counts=[1060, 1060]) 0.703557355258328




  RandomForestClassifier accuracy without discretization: 0.5207547169811322
  RandomForestClassifier accuracy with discretization (value counts=[1060, 1060]) 0.7264150943396226




  RandomForestClassifier balanced_accuracy without discretization: 0.27677159804171625
  RandomForestClassifier balanced_accuracy with discretization (value counts=[1060, 1060]) 0.725943396226415




  RandomForestClassifier f1_weighted without discretization: 0.503178154802659
  RandomForestClassifier f1_weighted with discretization (value counts=[1060, 1060]) 0.7238384959521051

