In [None]:
import pandas as pd
import numpy as np

In [None]:
# Read the training CSV into the working DataFrame used by every later cell.
df_red = pd.read_csv(filepath_or_buffer = "w_red_train.csv")

#### Divide the dataset into Majority and Minority Classes
  - We assume that the full population cannot be obtained from the real world.
  - For this reason, we divide the dataset into majority and minority classes so that a subset of the majority class can be hidden.



In [None]:
columns = df_red.columns.tolist()

# Quality scores 5 and 6 make up the majority class; 3, 4, 7 and 8 make up the minority class.
is_major = df_red['quality'].isin([5, 6])
is_minor = df_red['quality'].isin([3, 4, 7, 8])

df_major_red = df_red[is_major]
df_minor_red = df_red[is_minor]

We varied the number of hidden examples (10%, 25%, and 50% of the majority examples) <br>
to see the effect of the number of hidden examples on the validation process.


#### Test Case1. 50% Majority class hidden


Hide 50% of Majority class

In [None]:
# Keep a random 50% of the majority class; the remaining 50% is the hidden part.
# random_state is fixed so that a fresh "Restart & Run All" hides the same rows
# every time and the validation results below stay reproducible.
df_major_50_red = df_major_red.sample(frac = 0.5, random_state = 0)

# Feature columns: every df_red column except the optional 'type' column
# and the 'quality' label.
columns = [col for col in df_red.columns if col not in ('type', 'quality')]

# Data handed to the oversamplers: visible majority rows plus all minority rows.
df_test1_50 = pd.concat([df_major_50_red, df_minor_red], axis = 0)

X_red_50 = df_test1_50[columns]
y_red_50 = df_test1_50['quality']

In [None]:
# Per-class sample counts of the labels that will be fed to the oversamplers.
np.unique(y_red_50.to_numpy(), return_counts = True)

(array([3, 4, 5, 6, 7, 8]), array([  6,  34, 214, 215, 129,  12]))

#### Oversampling the dataset

We used four oversampling methods:

1.   SMOTE
2.   ADASYN
3.   SMOTE Tomek
4.   Borderline SMOTE



In [None]:
### Oversampling
from imblearn.over_sampling import SMOTE, ADASYN, BorderlineSMOTE
from imblearn.combine import SMOTETomek

# Requested number of samples per minority class, passed as sampling_strategy
# so that enough minority samples are generated.
up_target = {3: 1200, 4: 1200, 7: 1200, 8: 1200}

# Each method below resamples (X_red_50, y_red_50) and then stores the result
# as one DataFrame: resampled features plus a 'quality' label column.

# 1. SMOTE
sm = SMOTE(sampling_strategy = up_target, k_neighbors = 2, random_state = 0)
X_red_50_sm, y_red_50_sm = sm.fit_resample(X_red_50, y_red_50)
df_red_50_sm = X_red_50_sm.assign(quality = y_red_50_sm)

# 2. ADASYN
ads = ADASYN(sampling_strategy = up_target, random_state = 0)
X_red_50_ads, y_red_50_ads = ads.fit_resample(X_red_50, y_red_50)
df_red_50_ads = X_red_50_ads.assign(quality = y_red_50_ads)

# 3. SMOTETomek (uses the SMOTE instance configured above)
smt = SMOTETomek(smote = sm, random_state = 0)
X_red_50_smt, y_red_50_smt = smt.fit_resample(X_red_50, y_red_50)
df_red_50_smt = X_red_50_smt.assign(quality = y_red_50_smt)

# 4. Borderline SMOTE
bsm = BorderlineSMOTE(sampling_strategy = up_target, random_state = 0)
X_red_50_bsm, y_red_50_bsm = bsm.fit_resample(X_red_50, y_red_50)
df_red_50_bsm = X_red_50_bsm.assign(quality = y_red_50_bsm)



#### Extract the newly created minority-class samples to avoid duplicating the original data

In [None]:
def Extract_New_Samples(df_up, df_org):
  """Return the rows of df_up that have no exact match among the rows of df_org.

  Rows are compared position by position on their values (the first value of a
  df_up row against the first value of a df_org row, and so on), so the two
  frames are expected to list their columns in the same order.

  Args:
    df_up: DataFrame holding the oversampled data (original + generated rows).
    df_org: DataFrame holding the data that was given to the oversampler.

  Returns:
    A DataFrame with df_org's columns and a fresh 0..k-1 index that contains,
    in df_up order, every df_up row that does not occur in df_org.
  """
  # Hash every original row once so each membership check is a set lookup
  # rather than a scan over all original rows.
  org_rows = set(df_org.itertuples(index = False, name = None))

  # True for each df_up row whose value tuple is absent from the original rows.
  is_new = pd.Series(
    [row not in org_rows for row in df_up.itertuples(index = False, name = None)],
    index = df_up.index,
    dtype = bool,
  )

  # Select all new rows in one step, align them to df_org's column labels and
  # renumber them from 0 so callers can address rows with .loc[0..k-1].
  df_new = df_up[is_new].reindex(columns = df_org.columns).reset_index(drop = True)

  return df_new

In [None]:
# Oversampled frames in the order SMOTE, ADASYN, SMOTETomek, Borderline SMOTE.
test1_df_up = [df_red_50_sm, df_red_50_ads, df_red_50_smt, df_red_50_bsm]

# For each method keep only the rows that were not already in df_test1_50.
test1_new_sample = [Extract_New_Samples(df_up, df_test1_50) for df_up in test1_df_up]

Empty DataFrame
Columns: [fixed acidity, volatile acidity, citric acid, residual sugar, chlorides, free sulfur dioxide, total sulfur dioxide, density, pH, sulphates, alcohol, quality]
Index: []
Empty DataFrame
Columns: [fixed acidity, volatile acidity, citric acid, residual sugar, chlorides, free sulfur dioxide, total sulfur dioxide, density, pH, sulphates, alcohol, quality]
Index: []
Empty DataFrame
Columns: [fixed acidity, volatile acidity, citric acid, residual sugar, chlorides, free sulfur dioxide, total sulfur dioxide, density, pH, sulphates, alcohol, quality]
Index: []
Empty DataFrame
Columns: [fixed acidity, volatile acidity, citric acid, residual sugar, chlorides, free sulfur dioxide, total sulfur dioxide, density, pH, sulphates, alcohol, quality]
Index: []


#### Define Validation System


Define our validation system, which checks similarity using the Hassanat distance. <br> 
Then generate a new column, 'closest_class', to compare each generated sample's class with the class of its nearest neighbor.

In [None]:
### Validate
from tqdm import tqdm

def HasD(x, y):
  """Hassanat distance between two equally ordered sequences of numbers.

  Each coordinate pair contributes 1 minus a similarity ratio:
    - (1 + lo) / (1 + hi)      when the smaller value lo is >= 0
    - 1 / (1 + hi + abs(lo))   otherwise
  where lo and hi are the smaller and larger value of the pair.
  Pairs are formed with zip, so extra items of the longer input are ignored.

  Returns:
    The sum of the per-coordinate contributions (0 for empty input).
  """
  distance = 0
  for a, b in zip(x, y):
    lo = min(a, b)
    hi = max(a, b)
    # In the negative case lo + abs(lo) is 0, which leaves 1 in the numerator.
    similarity = (1 + lo) / (1 + hi) if lo >= 0 else 1 / (1 + hi + abs(lo))
    distance += 1
    distance -= similarity
  return distance


# Reference set: every row of df_red, split once into feature values and label.
# As in the row slices [:-1] / [-1], all df_red columns but the last are the
# features and the last column is the class label. Selecting them by name keeps
# this cell correct when it is re-run after 'closest_class' has been appended
# to the sample frames.
feature_cols = list(df_red.columns[:-1])
label_col = df_red.columns[-1]
ref_features = df_red[feature_cols].values.tolist()
ref_labels = df_red[label_col].tolist()

for new_samples in test1_new_sample:
  closest_class = []

  # One pass per generated sample: find the df_red row with the smallest
  # Hassanat distance and remember that row's class.
  for sample in tqdm(new_samples[feature_cols].values.tolist()):
    temp_min = float("inf")
    temp_class = None

    for ref, ref_label in zip(ref_features, ref_labels):
      temp_dist = HasD(ref, sample)

      # Strict '>' keeps the first reference row when distances tie.
      if temp_min > temp_dist:
        temp_min = temp_dist
        temp_class = ref_label

    closest_class.append(temp_class)

  new_samples['closest_class'] = closest_class

100%|██████████| 4511/4511 [20:31<00:00,  3.66it/s]
100%|██████████| 4578/4578 [21:20<00:00,  3.58it/s]
100%|██████████| 4509/4509 [21:11<00:00,  3.55it/s]
100%|██████████| 2191/2191 [10:15<00:00,  3.56it/s]


 **Validation Result with 50% hidden majority** <br>



In [None]:
up_order = ['SMOTE', 'ADASYN', 'SMOTE Tomek', 'Borderline SMOTE']

# A generated sample counts as correct when its own class equals the class of
# its nearest df_red row, and as wrong otherwise.
for method_name, df in zip(up_order, test1_new_sample):
  is_match = df['quality'] == df['closest_class']
  new_correct = int(is_match.sum())
  new_wrong = int((~is_match).sum())

  print("{} oversampling's correct samples:{} wrong samples:{}".format(method_name, new_correct, new_wrong))


SMOTE oversampling's correct samples:3376 wrong samples:1135
ADASYN oversampling's correct samples:3091 wrong samples:1487
SMOTE Tomek oversampling's correct samples:3375 wrong samples:1134
Borderline SMOTE oversampling's correct samples:1962 wrong samples:229


In [None]:
import pickle

# Save the list of validated sample frames for the 50%-hidden case.
with open('red_50_hidden_221130.pkl', mode = 'wb') as pkl_file:
  pickle.dump(test1_new_sample, file = pkl_file)

#### Test Case2. 25% Majority class hidden

Hide 25% of Majority class

In [None]:
# Keep a random 75% of the majority class; the remaining 25% is the hidden part.
# random_state is fixed so that a fresh "Restart & Run All" hides the same rows
# every time and the validation results below stay reproducible.
df_major_25_red = df_major_red.sample(frac = 0.75, random_state = 0)

# Feature columns: every df_red column except the optional 'type' column
# and the 'quality' label.
columns = [col for col in df_red.columns if col not in ('type', 'quality')]

# Data handed to the oversamplers: visible majority rows plus all minority rows.
df_test2_25 = pd.concat([df_major_25_red, df_minor_red], axis = 0)

X_red_25 = df_test2_25[columns]
y_red_25 = df_test2_25['quality']

In [None]:
# Per-class sample counts of the labels that will be fed to the oversamplers.
np.unique(y_red_25.to_numpy(), return_counts = True)

(array([3, 4, 5, 6, 7, 8]), array([  6,  34, 332, 312, 129,  12]))

Oversample the dataset with the four methods

In [None]:
### Oversampling
from imblearn.over_sampling import SMOTE, ADASYN, BorderlineSMOTE
from imblearn.combine import SMOTETomek

# Requested number of samples per minority class, passed as sampling_strategy.
up_target = {3: 1200, 4: 1200, 7: 1200, 8: 1200}

# Each method below resamples (X_red_25, y_red_25) and then stores the result
# as one DataFrame: resampled features plus a 'quality' label column.

# 1. SMOTE
sm = SMOTE(sampling_strategy = up_target, k_neighbors = 2, random_state = 0)
X_red_25_sm, y_red_25_sm = sm.fit_resample(X_red_25, y_red_25)
df_red_25_sm = X_red_25_sm.assign(quality = y_red_25_sm)

# 2. ADASYN
ads = ADASYN(sampling_strategy = up_target, random_state = 0)
X_red_25_ads, y_red_25_ads = ads.fit_resample(X_red_25, y_red_25)
df_red_25_ads = X_red_25_ads.assign(quality = y_red_25_ads)

# 3. SMOTETomek (uses the SMOTE instance configured above)
smt = SMOTETomek(smote = sm, random_state = 0)
X_red_25_smt, y_red_25_smt = smt.fit_resample(X_red_25, y_red_25)
df_red_25_smt = X_red_25_smt.assign(quality = y_red_25_smt)

# 4. Borderline SMOTE
bsm = BorderlineSMOTE(sampling_strategy = up_target, random_state = 0)
X_red_25_bsm, y_red_25_bsm = bsm.fit_resample(X_red_25, y_red_25)
df_red_25_bsm = X_red_25_bsm.assign(quality = y_red_25_bsm)



Extract the newly created minority-class samples for the 25% hidden-majority case

In [None]:
# Oversampled frames in the order SMOTE, ADASYN, SMOTETomek, Borderline SMOTE.
test2_df_up = [df_red_25_sm, df_red_25_ads, df_red_25_smt, df_red_25_bsm]

# For each method keep only the rows that were not already in df_test2_25.
test2_new_sample = [Extract_New_Samples(df_up, df_test2_25) for df_up in test2_df_up]

Empty DataFrame
Columns: [fixed acidity, volatile acidity, citric acid, residual sugar, chlorides, free sulfur dioxide, total sulfur dioxide, density, pH, sulphates, alcohol, quality]
Index: []
Empty DataFrame
Columns: [fixed acidity, volatile acidity, citric acid, residual sugar, chlorides, free sulfur dioxide, total sulfur dioxide, density, pH, sulphates, alcohol, quality]
Index: []
Empty DataFrame
Columns: [fixed acidity, volatile acidity, citric acid, residual sugar, chlorides, free sulfur dioxide, total sulfur dioxide, density, pH, sulphates, alcohol, quality]
Index: []
Empty DataFrame
Columns: [fixed acidity, volatile acidity, citric acid, residual sugar, chlorides, free sulfur dioxide, total sulfur dioxide, density, pH, sulphates, alcohol, quality]
Index: []


Apply the new dataset to the validation system

In [None]:
# Reference set: every row of df_red, split once into feature values and label.
# As in the row slices [:-1] / [-1], all df_red columns but the last are the
# features and the last column is the class label. Selecting them by name keeps
# this cell correct when it is re-run after 'closest_class' has been appended
# to the sample frames.
feature_cols = list(df_red.columns[:-1])
label_col = df_red.columns[-1]
ref_features = df_red[feature_cols].values.tolist()
ref_labels = df_red[label_col].tolist()

for new_samples in test2_new_sample:
  closest_class = []

  # One pass per generated sample: find the df_red row with the smallest
  # Hassanat distance and remember that row's class.
  for sample in tqdm(new_samples[feature_cols].values.tolist()):
    temp_min = float("inf")
    temp_class = None

    for ref, ref_label in zip(ref_features, ref_labels):
      temp_dist = HasD(ref, sample)

      # Strict '>' keeps the first reference row when distances tie.
      if temp_min > temp_dist:
        temp_min = temp_dist
        temp_class = ref_label

    closest_class.append(temp_class)

  new_samples['closest_class'] = closest_class

100%|██████████| 4511/4511 [20:56<00:00,  3.59it/s]
100%|██████████| 4559/4559 [21:18<00:00,  3.56it/s]
100%|██████████| 4509/4509 [20:59<00:00,  3.58it/s]
100%|██████████| 2189/2189 [10:09<00:00,  3.59it/s]


**Validation Result with 25% hidden majority**

In [None]:
up_order = ['SMOTE', 'ADASYN', 'SMOTE Tomek', 'Borderline SMOTE']

# A generated sample counts as correct when its own class equals the class of
# its nearest df_red row, and as wrong otherwise.
for method_name, df in zip(up_order, test2_new_sample):
  is_match = df['quality'] == df['closest_class']
  new_correct = int(is_match.sum())
  new_wrong = int((~is_match).sum())

  print("{} oversampling's correct samples:{} wrong samples:{}".format(method_name, new_correct, new_wrong))


SMOTE oversampling's correct samples:3376 wrong samples:1135
ADASYN oversampling's correct samples:3062 wrong samples:1497
SMOTE Tomek oversampling's correct samples:3376 wrong samples:1133
Borderline SMOTE oversampling's correct samples:1971 wrong samples:218


In [None]:
import pickle

# Save the list of validated sample frames for the 25%-hidden case.
with open('red_25_hidden_221130.pkl', mode = 'wb') as pkl_file:
  pickle.dump(test2_new_sample, file = pkl_file)

#### Test Case3. 10% Majority class hidden

Hide 10% of Majority class

In [None]:
# Keep a random 90% of the majority class; the remaining 10% is the hidden part.
# random_state is fixed so that a fresh "Restart & Run All" hides the same rows
# every time and the validation results below stay reproducible.
df_major_10_red = df_major_red.sample(frac = 0.9, random_state = 0)

# Feature columns: every df_red column except the optional 'type' column
# and the 'quality' label.
columns = [col for col in df_red.columns if col not in ('type', 'quality')]

# Data handed to the oversamplers: visible majority rows plus all minority rows.
df_test3_10 = pd.concat([df_major_10_red, df_minor_red], axis = 0)

X_red_10 = df_test3_10[columns]
y_red_10 = df_test3_10['quality']

In [None]:
# Per-class sample counts of the labels that will be fed to the oversamplers.
np.unique(y_red_10.to_numpy(), return_counts = True)

(array([3, 4, 5, 6, 7, 8]), array([  6,  34, 399, 373, 129,  12]))

Oversample the dataset with the four methods

In [None]:
### Oversampling
from imblearn.over_sampling import SMOTE, ADASYN, BorderlineSMOTE
from imblearn.combine import SMOTETomek

# Requested number of samples per minority class, passed as sampling_strategy.
up_target = {3: 1200, 4: 1200, 7: 1200, 8: 1200}

# Each method below resamples (X_red_10, y_red_10) and then stores the result
# as one DataFrame: resampled features plus a 'quality' label column.

# 1. SMOTE
sm = SMOTE(sampling_strategy = up_target, k_neighbors = 2, random_state = 0)
X_red_10_sm, y_red_10_sm = sm.fit_resample(X_red_10, y_red_10)
df_red_10_sm = X_red_10_sm.assign(quality = y_red_10_sm)

# 2. ADASYN
ads = ADASYN(sampling_strategy = up_target, random_state = 0)
X_red_10_ads, y_red_10_ads = ads.fit_resample(X_red_10, y_red_10)
df_red_10_ads = X_red_10_ads.assign(quality = y_red_10_ads)

# 3. SMOTETomek (uses the SMOTE instance configured above)
smt = SMOTETomek(smote = sm, random_state = 0)
X_red_10_smt, y_red_10_smt = smt.fit_resample(X_red_10, y_red_10)
df_red_10_smt = X_red_10_smt.assign(quality = y_red_10_smt)

# 4. Borderline SMOTE
bsm = BorderlineSMOTE(sampling_strategy = up_target, random_state = 0)
X_red_10_bsm, y_red_10_bsm = bsm.fit_resample(X_red_10, y_red_10)
df_red_10_bsm = X_red_10_bsm.assign(quality = y_red_10_bsm)



Extract the newly created minority-class samples for the 10% hidden-majority case

In [None]:
# Oversampled frames in the order SMOTE, ADASYN, SMOTETomek, Borderline SMOTE.
test3_df_up = [df_red_10_sm, df_red_10_ads, df_red_10_smt, df_red_10_bsm]

# For each method keep only the rows that were not already in df_test3_10.
test3_new_sample = [Extract_New_Samples(df_up, df_test3_10) for df_up in test3_df_up]

Empty DataFrame
Columns: [fixed acidity, volatile acidity, citric acid, residual sugar, chlorides, free sulfur dioxide, total sulfur dioxide, density, pH, sulphates, alcohol, quality]
Index: []
Empty DataFrame
Columns: [fixed acidity, volatile acidity, citric acid, residual sugar, chlorides, free sulfur dioxide, total sulfur dioxide, density, pH, sulphates, alcohol, quality]
Index: []
Empty DataFrame
Columns: [fixed acidity, volatile acidity, citric acid, residual sugar, chlorides, free sulfur dioxide, total sulfur dioxide, density, pH, sulphates, alcohol, quality]
Index: []
Empty DataFrame
Columns: [fixed acidity, volatile acidity, citric acid, residual sugar, chlorides, free sulfur dioxide, total sulfur dioxide, density, pH, sulphates, alcohol, quality]
Index: []


Apply the new dataset to the validation system

In [None]:
# Reference set: every row of df_red, split once into feature values and label.
# As in the row slices [:-1] / [-1], all df_red columns but the last are the
# features and the last column is the class label. Selecting them by name keeps
# this cell correct when it is re-run after 'closest_class' has been appended
# to the sample frames.
feature_cols = list(df_red.columns[:-1])
label_col = df_red.columns[-1]
ref_features = df_red[feature_cols].values.tolist()
ref_labels = df_red[label_col].tolist()

for new_samples in test3_new_sample:
  closest_class = []

  # One pass per generated sample: find the df_red row with the smallest
  # Hassanat distance and remember that row's class.
  for sample in tqdm(new_samples[feature_cols].values.tolist()):
    temp_min = float("inf")
    temp_class = None

    for ref, ref_label in zip(ref_features, ref_labels):
      temp_dist = HasD(ref, sample)

      # Strict '>' keeps the first reference row when distances tie.
      if temp_min > temp_dist:
        temp_min = temp_dist
        temp_class = ref_label

    closest_class.append(temp_class)

  new_samples['closest_class'] = closest_class

100%|██████████| 4511/4511 [20:53<00:00,  3.60it/s]
100%|██████████| 4590/4590 [21:21<00:00,  3.58it/s]
100%|██████████| 4510/4510 [20:59<00:00,  3.58it/s]
100%|██████████| 2188/2188 [10:10<00:00,  3.58it/s]


**Validation Result with 10% hidden majority**

In [None]:
up_order = ['SMOTE', 'ADASYN', 'SMOTE Tomek', 'Borderline SMOTE']

# A generated sample counts as correct when its own class equals the class of
# its nearest df_red row, and as wrong otherwise.
for method_name, df in zip(up_order, test3_new_sample):
  is_match = df['quality'] == df['closest_class']
  new_correct = int(is_match.sum())
  new_wrong = int((~is_match).sum())

  print("{} oversampling's correct samples:{} wrong samples:{}".format(method_name, new_correct, new_wrong))


SMOTE oversampling's correct samples:3376 wrong samples:1135
ADASYN oversampling's correct samples:3104 wrong samples:1486
SMOTE Tomek oversampling's correct samples:3375 wrong samples:1135
Borderline SMOTE oversampling's correct samples:1991 wrong samples:197


In [None]:
import pickle

# Save the list of validated sample frames for the 10%-hidden case.
with open('red_10_hidden_221130.pkl', mode = 'wb') as pkl_file:
  pickle.dump(test3_new_sample, file = pkl_file)