In [11]:
import pandas as pd
from sklearn import tree
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.ensemble import RandomForestClassifier


In [12]:
# Read the preprocessed wine table (comma-separated, pandas' default delimiter)
# and preview its first rows as the cell output.
wine_df = pd.read_csv("preprocessed_wine_data.csv")
wine_df.head(20)

Unnamed: 0,fixed_acidity,volatile_acidity,citric_acid,residual_sugar,chlorides,free_sulfur_dioxide,total_sulfur_dioxide,density,pH,sulphates,alcohol,quality
0,7.5,0.33,0.32,11.1,0.036,25.0,119.0,0.9962,3.15,0.34,10.5,6
1,6.3,0.27,0.29,12.2,0.044,59.0,196.0,0.99782,3.14,0.4,8.8,6
2,7.0,0.3,0.51,13.6,0.05,40.0,168.0,0.9976,3.07,0.52,9.6,7
3,7.4,0.38,0.27,7.5,0.041,24.0,160.0,0.99535,3.17,0.43,10.0,5
4,8.1,0.12,0.38,0.9,0.034,36.0,86.0,0.99026,2.8,0.55,12.0,6
5,6.6,0.2,0.38,7.9,0.052,30.0,145.0,0.9947,3.32,0.56,11.0,7
6,7.3,0.26,0.36,5.2,0.04,31.0,141.0,0.9931,3.16,0.59,11.0,6
7,6.9,0.32,0.17,7.6,0.042,69.0,219.0,0.9959,3.13,0.4,8.9,5
8,8.5,0.18,0.3,1.1,0.028,34.0,95.0,0.99272,2.83,0.36,10.0,4
9,7.2,0.27,0.28,15.2,0.046,6.0,41.0,0.99665,3.17,0.39,10.9,6


In [13]:
# Features are every column except the target; `drop` returns a new frame,
# so `wine_df` itself is left untouched.
X = wine_df.drop(columns="quality")
# Target: the `quality` column as a plain NumPy array.
y = wine_df["quality"].to_numpy()

# Seeded split so the partition is repeatable.
# NOTE(review): test_size=.6 holds out 60% of the rows for testing, leaving
# only 40% for training — confirm this ratio is intentional.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=.6,
    random_state=78,
)

# Sanity-check the size of each partition.
for partition in (X_train, X_test, y_train, y_test):
    print(partition.shape)

(798, 11)
(1198, 11)
(798,)
(1198,)


In [14]:
# Fit the scaler on the training features only, then apply that same fitted
# transform to both splits so the test rows never influence the scaling.
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)
X_train_scaled, X_test_scaled = (
    X_scaler.transform(features) for features in (X_train, X_test)
)

In [15]:
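# Alternative scaling (currently disabled): uncomment to use MinMaxScaler
# instead of the StandardScaler fitted in the previous cell.
# scaler = MinMaxScaler()
# X_scaler = scaler.fit(X_train)
# X_train_scaled = X_scaler.transform(X_train)
# X_test_scaled = X_scaler.transform(X_test)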

In [16]:
# Random forest with 50 trees and a fixed seed; this model is reused below
# for the confusion matrix.
clf = RandomForestClassifier(n_estimators=50, random_state=1)
clf.fit(X_train_scaled, y_train)

# Score on the data the model was fit on, then on the held-out split.
train_accuracy = clf.score(X_train_scaled, y_train)
test_accuracy = clf.score(X_test_scaled, y_test)
print(f'Training Score: {train_accuracy}')
print(f'Testing Score: {test_accuracy}')

Training Score: 1.0
Testing Score: 0.5834724540901502


In [17]:
# Same setup with 500 trees, to see how a larger forest scores.
clf2 = RandomForestClassifier(n_estimators=500, random_state=1)
clf2.fit(X_train_scaled, y_train)

# Score on the data the model was fit on, then on the held-out split.
train_accuracy = clf2.score(X_train_scaled, y_train)
test_accuracy = clf2.score(X_test_scaled, y_test)
print(f'Training Score: {train_accuracy}')
print(f'Testing Score: {test_accuracy}')

Training Score: 1.0
Testing Score: 0.5893155258764607


In [18]:
# Same setup with 25 trees, to see how a smaller forest scores.
clf3 = RandomForestClassifier(n_estimators=25, random_state=1)
clf3.fit(X_train_scaled, y_train)

# Score on the data the model was fit on, then on the held-out split.
train_accuracy = clf3.score(X_train_scaled, y_train)
test_accuracy = clf3.score(X_test_scaled, y_test)
print(f'Training Score: {train_accuracy}')
print(f'Testing Score: {test_accuracy}')

Training Score: 1.0
Testing Score: 0.5742904841402338


In [19]:
# Same setup with 15 trees.
clf4 = RandomForestClassifier(n_estimators=15, random_state=1)
clf4.fit(X_train_scaled, y_train)

# Score on the data the model was fit on, then on the held-out split.
train_accuracy = clf4.score(X_train_scaled, y_train)
test_accuracy = clf4.score(X_test_scaled, y_test)
print(f'Training Score: {train_accuracy}')
print(f'Testing Score: {test_accuracy}')

Training Score: 0.9962406015037594
Testing Score: 0.5734557595993323


In [20]:
# Same setup with only 5 trees, the smallest forest tried here.
clf5 = RandomForestClassifier(n_estimators=5, random_state=1)
clf5.fit(X_train_scaled, y_train)

# Score on the data the model was fit on, then on the held-out split.
train_accuracy = clf5.score(X_train_scaled, y_train)
test_accuracy = clf5.score(X_test_scaled, y_test)
print(f'Training Score: {train_accuracy}')
print(f'Testing Score: {test_accuracy}')

Training Score: 0.9448621553884712
Testing Score: 0.5325542570951586


In [21]:
# Predict the held-out split with the 50-tree model (`clf`) and tabulate
# true labels (rows) against predicted labels (columns).
predictions = clf.predict(X_test_scaled)
cm = confusion_matrix(y_true=y_test, y_pred=predictions)
display(cm)

array([[  0,   0,   0,   2,   0,   0,   0],
       [  0,   0,  11,  12,   0,   0,   0],
       [  0,   1, 206, 142,   3,   0,   0],
       [  0,   0, 102, 404,  30,   0,   0],
       [  0,   0,   5, 141,  88,   1,   0],
       [  0,   0,   1,  26,  19,   1,   0],
       [  0,   0,   0,   2,   1,   0,   0]])