In [1]:
import pandas as pd
import numpy as np

In [2]:
df_train = pd.read_csv("w_red_train.csv")
df_test = pd.read_csv("w_red_test.csv")

#### Oversampling

In [3]:
np.unique(df_train['quality'], return_counts = True)

(array([3, 4, 5, 6, 7, 8], dtype=int64),
 array([  6,  34, 443, 415, 129,  12], dtype=int64))

In [4]:
columns = list(df_train.columns)

if 'quality' in columns:
  columns.remove('quality')
if 'type' in columns:
  columns.remove('type')

X_train = df_train[columns]
y_train = df_train['quality']

X_test = df_test[columns]
y_test = df_test['quality']

Using smapling strategy for generating minority class sufficiently

In [5]:
### Oversampling
up_target = {3: 1200, 4:1200, 7:1200, 8:1200}

# 1. SMOTE
from imblearn.over_sampling import SMOTE

sm = SMOTE(sampling_strategy = up_target, k_neighbors = 2, random_state = 0)
X_red_sm, y_red_sm = sm.fit_resample(X_train, y_train)
df_red_sm = X_red_sm.copy()
df_red_sm['quality'] = y_red_sm

# 2. ADASYN
from imblearn.over_sampling import ADASYN

ads = ADASYN(sampling_strategy = up_target, random_state = 0)
X_red_ads, y_red_ads = ads.fit_resample(X_train, y_train)
df_red_ads = X_red_ads.copy()
df_red_ads['quality'] = y_red_ads

# 3. SMOTETomek
from imblearn.combine import SMOTETomek

smt = SMOTETomek(smote = sm, random_state = 0)
X_red_smt, y_red_smt = smt.fit_resample(X_train, y_train)
df_red_smt = X_red_smt.copy()
df_red_smt['quality'] = y_red_smt

# 4. Borderline SMOTE
from imblearn.over_sampling import BorderlineSMOTE

bsm = BorderlineSMOTE(sampling_strategy = up_target, random_state = 0)
X_red_bsm, y_red_bsm = bsm.fit_resample(X_train, y_train)
df_red_bsm = X_red_bsm.copy()
df_red_bsm['quality'] = y_red_bsm



#### Extract Newly Oversampled Samples

In [6]:
def Extract_New_Samples(df_up, df_org):
  # Step1. Extract newly oversampled minorioty class samples from df_up
  X_up_list = []
  X_org_list = []

  for i in range(len(df_up)):
    X_up_list.append(df_up.iloc[i])
  
  for i in range(len(df_org)):
    X_org_list.append(df_org.iloc[i])

  columns = df_org.columns
  df_new = pd.DataFrame(columns = columns)
  print(df_new)
  idx = 0

  for i in X_up_list:
    trig = True
    #print(list(i))
    for j in X_org_list:
      if list(i) == list(j):
        trig = False
    
    if trig == True:
      df_new.loc[idx] = i
      idx += 1

  return df_new

In [7]:
list_df_up = [df_red_sm, df_red_ads, df_red_smt, df_red_bsm]
list_new_sample = []

for df in list_df_up:
  temp_df = Extract_New_Samples(df, df_train)
  list_new_sample.append(temp_df)

Empty DataFrame
Columns: [fixed acidity, volatile acidity, citric acid, residual sugar, chlorides, free sulfur dioxide, total sulfur dioxide, density, pH, sulphates, alcohol, quality]
Index: []
Empty DataFrame
Columns: [fixed acidity, volatile acidity, citric acid, residual sugar, chlorides, free sulfur dioxide, total sulfur dioxide, density, pH, sulphates, alcohol, quality]
Index: []
Empty DataFrame
Columns: [fixed acidity, volatile acidity, citric acid, residual sugar, chlorides, free sulfur dioxide, total sulfur dioxide, density, pH, sulphates, alcohol, quality]
Index: []
Empty DataFrame
Columns: [fixed acidity, volatile acidity, citric acid, residual sugar, chlorides, free sulfur dioxide, total sulfur dioxide, density, pH, sulphates, alcohol, quality]
Index: []


#### Classification and Result with SMOTE

Construct New training set by Combining newly oversampled

In [8]:
df_train_sm = pd.concat([df_train, list_new_sample[0].sample(n = 1479, random_state = 0)], axis = 0, ignore_index = True)

In [9]:
X_train = df_train_sm[columns]
y_train = df_train_sm['quality']

1. Random Forest

In [10]:
# RF
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier(random_state = 0)
clf.fit(X_train, y_train)

rf_y_pred = clf.predict(X_test)

Evaluate


In [11]:
from sklearn.metrics import confusion_matrix, classification_report

print(confusion_matrix(y_test, rf_y_pred))
print(classification_report(y_test, rf_y_pred))

[[  1   1   2   0   0   0]
 [  2   3   8   5   1   0]
 [  3  12 186  35   2   0]
 [  0   6  59 128  28   2]
 [  0   0   2  20  45   3]
 [  0   0   0   0   6   0]]
              precision    recall  f1-score   support

           3       0.17      0.25      0.20         4
           4       0.14      0.16      0.15        19
           5       0.72      0.78      0.75       238
           6       0.68      0.57      0.62       223
           7       0.55      0.64      0.59        70
           8       0.00      0.00      0.00         6

    accuracy                           0.65       560
   macro avg       0.38      0.40      0.39       560
weighted avg       0.65      0.65      0.65       560



2. GBM

In [12]:
# GBM
from sklearn.ensemble import GradientBoostingClassifier

clf = GradientBoostingClassifier(random_state = 0)
clf = clf.fit(X_train, y_train)

gbm_y_pred = clf.predict(X_test)

Evaluate


In [13]:
from sklearn.metrics import confusion_matrix, classification_report

print(confusion_matrix(y_test, gbm_y_pred))
print(classification_report(y_test, gbm_y_pred))

[[  1   2   0   0   1   0]
 [  1   2  11   3   2   0]
 [  2  12 169  49   6   0]
 [  0   9  54 119  38   3]
 [  0   0   0  22  43   5]
 [  0   0   0   0   6   0]]
              precision    recall  f1-score   support

           3       0.25      0.25      0.25         4
           4       0.08      0.11      0.09        19
           5       0.72      0.71      0.72       238
           6       0.62      0.53      0.57       223
           7       0.45      0.61      0.52        70
           8       0.00      0.00      0.00         6

    accuracy                           0.60       560
   macro avg       0.35      0.37      0.36       560
weighted avg       0.61      0.60      0.60       560



3. Logistic Regression

In [14]:
# Logistic Regression
from sklearn.linear_model import LogisticRegression

clf = LogisticRegression(random_state = 0)
clf = clf.fit(X_train, y_train)

lr_y_pred = clf.predict(X_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Evaluate


In [15]:
from sklearn.metrics import confusion_matrix, classification_report

print(confusion_matrix(y_test, lr_y_pred))
print(classification_report(y_test, lr_y_pred))

[[  1   1   1   1   0   0]
 [  4   5   4   1   4   1]
 [ 26  40 127  24  18   3]
 [ 24  30  48  41  54  26]
 [  4   0   4   9  38  15]
 [  0   0   0   0   6   0]]
              precision    recall  f1-score   support

           3       0.02      0.25      0.03         4
           4       0.07      0.26      0.11        19
           5       0.69      0.53      0.60       238
           6       0.54      0.18      0.27       223
           7       0.32      0.54      0.40        70
           8       0.00      0.00      0.00         6

    accuracy                           0.38       560
   macro avg       0.27      0.30      0.24       560
weighted avg       0.55      0.38      0.42       560



4. KNN

In [16]:
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(X_train, y_train)

knn_y_pred = knn.predict(X_test)

Evaluate


In [17]:
from sklearn.metrics import confusion_matrix, classification_report

print(confusion_matrix(y_test, knn_y_pred))
print(classification_report(y_test, knn_y_pred))

[[  1   1   0   1   1   0]
 [  3   4   5   4   3   0]
 [ 10  29 130  43  19   7]
 [  9  25  63  70  36  20]
 [  1   6   7  11  38   7]
 [  0   0   0   0   6   0]]
              precision    recall  f1-score   support

           3       0.04      0.25      0.07         4
           4       0.06      0.21      0.10        19
           5       0.63      0.55      0.59       238
           6       0.54      0.31      0.40       223
           7       0.37      0.54      0.44        70
           8       0.00      0.00      0.00         6

    accuracy                           0.43       560
   macro avg       0.27      0.31      0.27       560
weighted avg       0.53      0.43      0.47       560



#### Classification and Result with ADASYN

Construct New training set by Combining newly oversampled

In [18]:
df_train_ads = pd.concat([df_train, list_new_sample[1].sample(n = 1479, random_state = 0)], axis = 0, ignore_index = True)

In [19]:
X_train = df_train_ads[columns]
y_train = df_train_ads['quality']

1. Random Forest

In [20]:
# RF
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier(random_state = 0)
clf.fit(X_train, y_train)

rf_y_pred = clf.predict(X_test)

>  Evaluate


In [21]:
from sklearn.metrics import confusion_matrix, classification_report

print(confusion_matrix(y_test, rf_y_pred))
print(classification_report(y_test, rf_y_pred))

[[  1   2   0   1   0   0]
 [  1   3   9   4   2   0]
 [  3  15 176  41   3   0]
 [  2   6  55 127  29   4]
 [  0   0   1  19  45   5]
 [  0   0   0   1   5   0]]
              precision    recall  f1-score   support

           3       0.14      0.25      0.18         4
           4       0.12      0.16      0.13        19
           5       0.73      0.74      0.73       238
           6       0.66      0.57      0.61       223
           7       0.54      0.64      0.58        70
           8       0.00      0.00      0.00         6

    accuracy                           0.63       560
   macro avg       0.36      0.39      0.37       560
weighted avg       0.64      0.63      0.63       560



2. GBM

In [22]:
# GBM
from sklearn.ensemble import GradientBoostingClassifier

clf = GradientBoostingClassifier(random_state = 0)
clf = clf.fit(X_train, y_train)

gbm_y_pred = clf.predict(X_test)

>  Evaluate


In [23]:
from sklearn.metrics import confusion_matrix, classification_report

print(confusion_matrix(y_test, gbm_y_pred))
print(classification_report(y_test, gbm_y_pred))

[[  1   2   0   1   0   0]
 [  0   5   8   4   2   0]
 [  2  15 169  47   5   0]
 [  2   9  48 119  42   3]
 [  0   0   1  20  40   9]
 [  0   0   0   0   6   0]]
              precision    recall  f1-score   support

           3       0.20      0.25      0.22         4
           4       0.16      0.26      0.20        19
           5       0.75      0.71      0.73       238
           6       0.62      0.53      0.57       223
           7       0.42      0.57      0.48        70
           8       0.00      0.00      0.00         6

    accuracy                           0.60       560
   macro avg       0.36      0.39      0.37       560
weighted avg       0.63      0.60      0.61       560



3. Logistic Regression

In [24]:
# Logistic Regression
from sklearn.linear_model import LogisticRegression

clf = LogisticRegression(random_state = 0)
clf = clf.fit(X_train, y_train)

lr_y_pred = clf.predict(X_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


>  Evaluate


In [25]:
from sklearn.metrics import confusion_matrix, classification_report

print(confusion_matrix(y_test, lr_y_pred))
print(classification_report(y_test, lr_y_pred))

[[ 1  2  0  1  0  0]
 [ 5  6  2  2  2  2]
 [31 67 92 30 15  3]
 [25 46 28 55 43 26]
 [ 2  2  2 11 29 24]
 [ 0  0  0  0  5  1]]
              precision    recall  f1-score   support

           3       0.02      0.25      0.03         4
           4       0.05      0.32      0.08        19
           5       0.74      0.39      0.51       238
           6       0.56      0.25      0.34       223
           7       0.31      0.41      0.35        70
           8       0.02      0.17      0.03         6

    accuracy                           0.33       560
   macro avg       0.28      0.30      0.22       560
weighted avg       0.58      0.33      0.40       560



4. KNN

In [26]:
# KNN
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(X_train, y_train)

knn_y_pred = knn.predict(X_test)

>  Evaluate


In [27]:
from sklearn.metrics import confusion_matrix, classification_report

print(confusion_matrix(y_test, knn_y_pred))
print(classification_report(y_test, knn_y_pred))

[[  1   1   0   0   2   0]
 [  3   5   5   2   3   1]
 [ 14  51 115  39  13   6]
 [ 12  42  54  64  30  21]
 [  2   6   8  14  32   8]
 [  0   0   0   1   5   0]]
              precision    recall  f1-score   support

           3       0.03      0.25      0.06         4
           4       0.05      0.26      0.08        19
           5       0.63      0.48      0.55       238
           6       0.53      0.29      0.37       223
           7       0.38      0.46      0.41        70
           8       0.00      0.00      0.00         6

    accuracy                           0.39       560
   macro avg       0.27      0.29      0.24       560
weighted avg       0.53      0.39      0.44       560



#### Classification and Result with SMOTE Tomek

Construct New training set by Combining newly oversampled

In [28]:
df_train_smt = pd.concat([df_train, list_new_sample[2].sample(n = 1479, random_state = 0)], axis = 0, ignore_index = True)

In [29]:
X_train = df_train_smt[columns]
y_train = df_train_smt['quality']

1. Random Forest

In [30]:
# RF
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier(random_state = 0)
clf.fit(X_train, y_train)

rf_y_pred = clf.predict(X_test)

>  Evaluate


In [31]:
from sklearn.metrics import confusion_matrix, classification_report

print(confusion_matrix(y_test, rf_y_pred))
print(classification_report(y_test, rf_y_pred))

[[  1   1   2   0   0   0]
 [  2   3  10   3   1   0]
 [  3  12 178  43   2   0]
 [  0   4  60 129  28   2]
 [  0   0   1  20  46   3]
 [  0   0   0   1   5   0]]
              precision    recall  f1-score   support

           3       0.17      0.25      0.20         4
           4       0.15      0.16      0.15        19
           5       0.71      0.75      0.73       238
           6       0.66      0.58      0.62       223
           7       0.56      0.66      0.61        70
           8       0.00      0.00      0.00         6

    accuracy                           0.64       560
   macro avg       0.37      0.40      0.38       560
weighted avg       0.64      0.64      0.64       560



2. GBM

In [32]:
# GBM
from sklearn.ensemble import GradientBoostingClassifier

clf = GradientBoostingClassifier(random_state = 0)
clf = clf.fit(X_train, y_train)

gbm_y_pred = clf.predict(X_test)

>  Evaluate


In [33]:
from sklearn.metrics import confusion_matrix, classification_report

print(confusion_matrix(y_test, gbm_y_pred))
print(classification_report(y_test, gbm_y_pred))

[[  1   2   0   0   1   0]
 [  1   4  10   1   3   0]
 [  1  14 171  45   7   0]
 [  0   9  55 115  43   1]
 [  0   0   1  20  43   6]
 [  0   0   0   0   6   0]]
              precision    recall  f1-score   support

           3       0.33      0.25      0.29         4
           4       0.14      0.21      0.17        19
           5       0.72      0.72      0.72       238
           6       0.64      0.52      0.57       223
           7       0.42      0.61      0.50        70
           8       0.00      0.00      0.00         6

    accuracy                           0.60       560
   macro avg       0.37      0.38      0.37       560
weighted avg       0.62      0.60      0.60       560



3. Logistic Regression

In [34]:
# Logistic Regression
from sklearn.linear_model import LogisticRegression

clf = LogisticRegression(random_state = 0)
clf = clf.fit(X_train, y_train)

lr_y_pred = clf.predict(X_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


>  Evaluate


In [35]:
from sklearn.metrics import confusion_matrix, classification_report

print(confusion_matrix(y_test, lr_y_pred))
print(classification_report(y_test, lr_y_pred))

[[  1   2   0   1   0   0]
 [  3   8   3   1   3   1]
 [ 29  50 116  25  15   3]
 [ 27  34  40  50  47  25]
 [  4   1   2  10  37  16]
 [  0   0   0   0   6   0]]
              precision    recall  f1-score   support

           3       0.02      0.25      0.03         4
           4       0.08      0.42      0.14        19
           5       0.72      0.49      0.58       238
           6       0.57      0.22      0.32       223
           7       0.34      0.53      0.42        70
           8       0.00      0.00      0.00         6

    accuracy                           0.38       560
   macro avg       0.29      0.32      0.25       560
weighted avg       0.58      0.38      0.43       560



4. KNN

In [36]:
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(X_train, y_train)

knn_y_pred = knn.predict(X_test)

>  Evaluate


In [37]:
from sklearn.metrics import confusion_matrix, classification_report

print(confusion_matrix(y_test, knn_y_pred))
print(classification_report(y_test, knn_y_pred))

[[  1   1   0   0   2   0]
 [  3   4   5   4   2   1]
 [ 10  31 131  41  19   6]
 [ 10  27  63  68  35  20]
 [  1   6   6  15  35   7]
 [  0   0   0   0   6   0]]
              precision    recall  f1-score   support

           3       0.04      0.25      0.07         4
           4       0.06      0.21      0.09        19
           5       0.64      0.55      0.59       238
           6       0.53      0.30      0.39       223
           7       0.35      0.50      0.41        70
           8       0.00      0.00      0.00         6

    accuracy                           0.43       560
   macro avg       0.27      0.30      0.26       560
weighted avg       0.53      0.43      0.46       560



#### Classification and Result with Boderline SMOTE

Construct New training set by Combining newly oversampled

In [38]:
df_train_bsm = pd.concat([df_train, list_new_sample[3].sample(n = 1479, random_state = 0)], axis = 0, ignore_index = True)

In [39]:
X_train = df_train_bsm[columns]
y_train = df_train_bsm['quality']

1. Random Forest

In [40]:
# RF
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier(random_state = 0)
clf.fit(X_train, y_train)

rf_y_pred = clf.predict(X_test)

>  Evaluate


In [41]:
from sklearn.metrics import confusion_matrix, classification_report

print(confusion_matrix(y_test, rf_y_pred))
print(classification_report(y_test, rf_y_pred))

[[  0   1   2   1   0   0]
 [  0   3   8   7   1   0]
 [  0   9 190  33   6   0]
 [  0   6  57 120  40   0]
 [  0   0   2  17  51   0]
 [  0   0   0   0   6   0]]
              precision    recall  f1-score   support

           3       0.00      0.00      0.00         4
           4       0.16      0.16      0.16        19
           5       0.73      0.80      0.76       238
           6       0.67      0.54      0.60       223
           7       0.49      0.73      0.59        70
           8       0.00      0.00      0.00         6

    accuracy                           0.65       560
   macro avg       0.34      0.37      0.35       560
weighted avg       0.65      0.65      0.64       560



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


2. GBM

In [42]:
# GBM
from sklearn.ensemble import GradientBoostingClassifier

clf = GradientBoostingClassifier(random_state = 0)
clf = clf.fit(X_train, y_train)

gbm_y_pred = clf.predict(X_test)

>  Evaluate


In [43]:
from sklearn.metrics import confusion_matrix, classification_report

print(confusion_matrix(y_test, gbm_y_pred))
print(classification_report(y_test, gbm_y_pred))

[[  0   2   1   1   0   0]
 [  0   4   9   5   1   0]
 [  2  20 155  55   6   0]
 [  0  11  49 112  48   3]
 [  0   0   2  18  46   4]
 [  0   0   0   0   6   0]]
              precision    recall  f1-score   support

           3       0.00      0.00      0.00         4
           4       0.11      0.21      0.14        19
           5       0.72      0.65      0.68       238
           6       0.59      0.50      0.54       223
           7       0.43      0.66      0.52        70
           8       0.00      0.00      0.00         6

    accuracy                           0.57       560
   macro avg       0.31      0.34      0.31       560
weighted avg       0.60      0.57      0.58       560



3. Logistic Regression

In [44]:
# Logistic Regression
from sklearn.linear_model import LogisticRegression

clf = LogisticRegression(random_state = 0)
clf = clf.fit(X_train, y_train)

lr_y_pred = clf.predict(X_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


>  Evaluate


In [45]:
from sklearn.metrics import confusion_matrix, classification_report

print(confusion_matrix(y_test, lr_y_pred))
print(classification_report(y_test, lr_y_pred))

[[  0   3   0   1   0   0]
 [  0  13   2   1   3   0]
 [  0  87 108  14  29   0]
 [  0  63  36  24 100   0]
 [  0   3   4   2  61   0]
 [  0   0   0   0   6   0]]
              precision    recall  f1-score   support

           3       0.00      0.00      0.00         4
           4       0.08      0.68      0.14        19
           5       0.72      0.45      0.56       238
           6       0.57      0.11      0.18       223
           7       0.31      0.87      0.45        70
           8       0.00      0.00      0.00         6

    accuracy                           0.37       560
   macro avg       0.28      0.35      0.22       560
weighted avg       0.57      0.37      0.37       560



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


4. KNN

In [46]:
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(X_train, y_train)

knn_y_pred = knn.predict(X_test)

>  Evaluate


In [47]:
from sklearn.metrics import confusion_matrix, classification_report

print(confusion_matrix(y_test, knn_y_pred))
print(classification_report(y_test, knn_y_pred))

[[  0   1   1   0   2   0]
 [  0   4   7   5   3   0]
 [  0  39 128  43  28   0]
 [  0  36  65  74  48   0]
 [  0   5   9  11  45   0]
 [  0   0   0   2   4   0]]
              precision    recall  f1-score   support

           3       0.00      0.00      0.00         4
           4       0.05      0.21      0.08        19
           5       0.61      0.54      0.57       238
           6       0.55      0.33      0.41       223
           7       0.35      0.64      0.45        70
           8       0.00      0.00      0.00         6

    accuracy                           0.45       560
   macro avg       0.26      0.29      0.25       560
weighted avg       0.52      0.45      0.47       560



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
