In [1]:
import numpy as np
import pandas as pd

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score

In [3]:
from sklearn.ensemble import RandomForestClassifier

In [4]:
dataset = pd.read_csv('SmoteSample.csv')
dataset.shape

(100, 6)

In [5]:
dataset.head(3)

Unnamed: 0,Column1,Column2,Column3,Column4,Column5,Class
0,yes,11,14,no,40,A
1,yes,12,15,no,41,B
2,yes,13,16,yes,42,A


In [6]:
dataset.Class.unique()

array(['A', 'B', 'C', 'D'], dtype=object)

In [7]:
dataset.dtypes

Column1    object
Column2     int64
Column3     int64
Column4    object
Column5     int64
Class      object
dtype: object

In [8]:
sel_features = ['Column1', 'Column4']

In [9]:
print(dataset.shape)

# keeping the columns we need 
raw_data = dataset[sel_features]
print(raw_data.shape)

# Making categorical variables into numeric representation
new_raw_data = pd.get_dummies(raw_data, columns = sel_features)

# print the shape
print(new_raw_data.shape)
new_raw_data

(100, 6)
(100, 2)
(100, 4)


Unnamed: 0,Column1_no,Column1_yes,Column4_no,Column4_yes
0,0,1,1,0
1,0,1,1,0
2,0,1,0,1
3,1,0,1,0
4,1,0,0,1
...,...,...,...,...
95,0,1,1,0
96,0,1,0,1
97,1,0,1,0
98,1,0,0,1


In [10]:
result = pd.concat([dataset, new_raw_data], axis=1)
result

Unnamed: 0,Column1,Column2,Column3,Column4,Column5,Class,Column1_no,Column1_yes,Column4_no,Column4_yes
0,yes,11,14,no,40,A,0,1,1,0
1,yes,12,15,no,41,B,0,1,1,0
2,yes,13,16,yes,42,A,0,1,0,1
3,no,14,11,no,43,C,1,0,1,0
4,no,15,12,yes,44,D,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...
95,yes,12,15,no,41,C,0,1,1,0
96,yes,13,16,yes,42,D,0,1,0,1
97,no,16,13,no,50,C,1,0,1,0
98,no,17,14,yes,51,D,1,0,0,1


In [11]:
data1 = result.drop(['Column1', 'Column4'], axis=1)
data1

Unnamed: 0,Column2,Column3,Column5,Class,Column1_no,Column1_yes,Column4_no,Column4_yes
0,11,14,40,A,0,1,1,0
1,12,15,41,B,0,1,1,0
2,13,16,42,A,0,1,0,1
3,14,11,43,C,1,0,1,0
4,15,12,44,D,1,0,0,1
...,...,...,...,...,...,...,...,...
95,12,15,41,C,0,1,1,0
96,13,16,42,D,0,1,0,1
97,16,13,50,C,1,0,1,0
98,17,14,51,D,1,0,0,1


In [12]:
x = data1.drop(['Class'], axis=1)
x

Unnamed: 0,Column2,Column3,Column5,Column1_no,Column1_yes,Column4_no,Column4_yes
0,11,14,40,0,1,1,0
1,12,15,41,0,1,1,0
2,13,16,42,0,1,0,1
3,14,11,43,1,0,1,0
4,15,12,44,1,0,0,1
...,...,...,...,...,...,...,...
95,12,15,41,0,1,1,0
96,13,16,42,0,1,0,1
97,16,13,50,1,0,1,0
98,17,14,51,1,0,0,1


In [13]:
x.dtypes

Column2        int64
Column3        int64
Column5        int64
Column1_no     uint8
Column1_yes    uint8
Column4_no     uint8
Column4_yes    uint8
dtype: object

In [14]:
x = x.astype('int64')
x

Unnamed: 0,Column2,Column3,Column5,Column1_no,Column1_yes,Column4_no,Column4_yes
0,11,14,40,0,1,1,0
1,12,15,41,0,1,1,0
2,13,16,42,0,1,0,1
3,14,11,43,1,0,1,0
4,15,12,44,1,0,0,1
...,...,...,...,...,...,...,...
95,12,15,41,0,1,1,0
96,13,16,42,0,1,0,1
97,16,13,50,1,0,1,0
98,17,14,51,1,0,0,1


In [15]:
y = data1['Class']
y = y.values
y

array(['A', 'B', 'A', 'C', 'D', 'C', 'D', 'A', 'C', 'A', 'C', 'A', 'B',
       'C', 'A', 'C', 'A', 'C', 'A', 'C', 'A', 'A', 'C', 'A', 'C', 'A',
       'C', 'C', 'C', 'C', 'A', 'C', 'A', 'C', 'A', 'C', 'A', 'A', 'C',
       'A', 'A', 'C', 'A', 'C', 'A', 'C', 'C', 'C', 'C', 'A', 'C', 'C',
       'A', 'C', 'A', 'A', 'C', 'A', 'C', 'A', 'A', 'A', 'C', 'A', 'C',
       'A', 'A', 'C', 'A', 'C', 'A', 'C', 'A', 'A', 'C', 'A', 'C', 'A',
       'C', 'A', 'A', 'C', 'A', 'C', 'A', 'A', 'C', 'D', 'C', 'A', 'B',
       'A', 'C', 'D', 'A', 'C', 'D', 'C', 'D', 'C'], dtype=object)

In [16]:
print(x.shape)
print(y.shape)

(100, 7)
(100,)


In [17]:
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=0)

In [18]:
print(np.unique(y_train))
print(np.unique(y_test))

['A' 'B' 'C' 'D']
['A' 'C' 'D']


In [19]:
data1['Class'].value_counts()

A    46
C    45
D     6
B     3
Name: Class, dtype: int64

In [20]:
model = RandomForestClassifier()
model.fit(X_train, y_train)
pred = model.predict(X_test)
acc = accuracy_score(y_test, pred)
print('RF Accuracy: %.4f' % (acc))

RF Accuracy: 0.8333


In [21]:
print("RF " + accuracy_score(y_test, pred).astype(str))

RF 0.8333333333333334


In [22]:
print(confusion_matrix(y_test, pred))

[[ 8  1  0]
 [ 2 17  0]
 [ 2  0  0]]


In [23]:
print(classification_report(y_test, pred))

              precision    recall  f1-score   support

           A       0.67      0.89      0.76         9
           C       0.94      0.89      0.92        19
           D       0.00      0.00      0.00         2

    accuracy                           0.83        30
   macro avg       0.54      0.59      0.56        30
weighted avg       0.80      0.83      0.81        30



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [24]:
data2 = data1.copy()

In [25]:
print(data1['Class'].unique())
print(len(data1['Class'].unique()))

x1 = data1.drop(['Class'], axis=1)
x1 = x1.astype('int64')
x1 = x1.values
print(x1)

y1 = data1['Class']
y1 = y1.values
print(y1)

['A' 'B' 'C' 'D']
4
[[11 14 40  0  1  1  0]
 [12 15 41  0  1  1  0]
 [13 16 42  0  1  0  1]
 [14 11 43  1  0  1  0]
 [15 12 44  1  0  0  1]
 [16 13 50  1  0  1  0]
 [17 14 51  1  0  0  1]
 [11 14 40  0  1  1  0]
 [12 15 41  0  1  1  0]
 [13 16 42  0  1  0  1]
 [14 11 43  1  0  1  0]
 [15 12 44  1  0  0  1]
 [16 13 50  1  0  1  0]
 [17 14 51  1  0  0  1]
 [11 14 40  0  1  1  0]
 [12 15 41  0  1  1  0]
 [13 16 42  0  1  0  1]
 [14 11 43  1  0  1  0]
 [15 12 44  1  0  0  1]
 [16 13 50  1  0  1  0]
 [17 14 51  1  0  0  1]
 [11 14 40  0  1  1  0]
 [12 15 41  0  1  1  0]
 [13 16 42  0  1  0  1]
 [14 11 43  1  0  1  0]
 [15 12 44  1  0  0  1]
 [16 13 50  1  0  1  0]
 [17 14 51  1  0  0  1]
 [16 13 50  1  0  1  0]
 [17 14 51  1  0  0  1]
 [11 14 40  0  1  1  0]
 [12 15 41  0  1  1  0]
 [13 16 42  0  1  0  1]
 [14 11 43  1  0  1  0]
 [15 12 44  1  0  0  1]
 [16 13 50  1  0  1  0]
 [17 14 51  1  0  0  1]
 [11 14 40  0  1  1  0]
 [16 13 50  1  0  1  0]
 [17 14 51  1  0  0  1]
 [11 14 40  0  1  1 

In [26]:
print(data2['Class'].value_counts())
print(len(data2['Class'].value_counts()))

A    46
C    45
D     6
B     3
Name: Class, dtype: int64
4


In [27]:
data2.drop(data2[data2['Class'] == 'B'].index, inplace = True) 
#data2.drop(data2[data2['Class'] == 'D'].index, inplace = True) 
print(data2['Class'].unique())
print(len(data2['Class'].unique()))

x2 = data2.drop(['Class'], axis=1)
x2 = x2.astype('int64')
x2 = x2.values
print(x2)

y2 = data2['Class']
y2 = y2.values
print(y2)

['A' 'C' 'D']
3
[[11 14 40  0  1  1  0]
 [13 16 42  0  1  0  1]
 [14 11 43  1  0  1  0]
 [15 12 44  1  0  0  1]
 [16 13 50  1  0  1  0]
 [17 14 51  1  0  0  1]
 [11 14 40  0  1  1  0]
 [12 15 41  0  1  1  0]
 [13 16 42  0  1  0  1]
 [14 11 43  1  0  1  0]
 [15 12 44  1  0  0  1]
 [17 14 51  1  0  0  1]
 [11 14 40  0  1  1  0]
 [12 15 41  0  1  1  0]
 [13 16 42  0  1  0  1]
 [14 11 43  1  0  1  0]
 [15 12 44  1  0  0  1]
 [16 13 50  1  0  1  0]
 [17 14 51  1  0  0  1]
 [11 14 40  0  1  1  0]
 [12 15 41  0  1  1  0]
 [13 16 42  0  1  0  1]
 [14 11 43  1  0  1  0]
 [15 12 44  1  0  0  1]
 [16 13 50  1  0  1  0]
 [17 14 51  1  0  0  1]
 [16 13 50  1  0  1  0]
 [17 14 51  1  0  0  1]
 [11 14 40  0  1  1  0]
 [12 15 41  0  1  1  0]
 [13 16 42  0  1  0  1]
 [14 11 43  1  0  1  0]
 [15 12 44  1  0  0  1]
 [16 13 50  1  0  1  0]
 [17 14 51  1  0  0  1]
 [11 14 40  0  1  1  0]
 [16 13 50  1  0  1  0]
 [17 14 51  1  0  0  1]
 [11 14 40  0  1  1  0]
 [12 15 41  0  1  1  0]
 [13 16 42  0  1  0  1]


In [28]:
print(data1.shape)
print(data2.shape)

(100, 8)
(97, 8)


In [29]:
print(x1.shape)
print(y1.shape)

(100, 7)
(100,)


In [30]:
print(x2.shape)
print(y2.shape)

(97, 7)
(97,)


In [31]:
X_train1, X_test1, y_train1, y_test1 = train_test_split(x1, y1, test_size=0.3, random_state=0)

In [32]:
X_train2, X_test2, y_train2, y_test2 = train_test_split(x2, y2, test_size=0.3, random_state=0)

In [33]:
from imblearn.over_sampling import SMOTE 
smote = SMOTE()
from collections import Counter

In [34]:
#smote=SMOTE(sampling_strategy='not minority',random_state=10)

In [35]:
X_S1, y_S1 = smote.fit_resample(x1,y1)

ValueError: Expected n_neighbors <= n_samples,  but n_samples = 3, n_neighbors = 6

In [None]:
print("Before Smote :", Counter(y1))
print("After Smote :", Counter(y_S1))

In [36]:
X_S2, y_S2 = smote.fit_resample(x2,y2)

In [37]:
print("Before Smote :", Counter(y2))
print("After Smote :", Counter(y_S2))

Before Smote : Counter({'A': 46, 'C': 45, 'D': 6})
After Smote : Counter({'A': 46, 'C': 46, 'D': 46})


In [38]:
X_S11, y_S11 = smote.fit_resample(X_train1,y_train1)

ValueError: Expected n_neighbors <= n_samples,  but n_samples = 3, n_neighbors = 6

In [39]:
print("Before Smote :", Counter(y_train1))
print("After Smote :", Counter(y_S11))

Before Smote : Counter({'A': 37, 'C': 26, 'D': 4, 'B': 3})


NameError: name 'y_S11' is not defined

In [40]:
X_S22, y_S22 = smote.fit_resample(X_train2,y_train2)

ValueError: Expected n_neighbors <= n_samples,  but n_samples = 5, n_neighbors = 6

In [41]:
print("Before Smote :", Counter(y_train2))
print("After Smote :", Counter(y_S22))

Before Smote : Counter({'A': 34, 'C': 28, 'D': 5})


NameError: name 'y_S22' is not defined

In [42]:
print(X_S22.shape)
print(y_S22.shape)

NameError: name 'X_S22' is not defined

In [43]:
print(X_S2.shape)
print(y_S2.shape)

(138, 7)
(138,)


In [44]:
X_train3, X_test3, y_train3, y_test3 = train_test_split(X_S2, y_S2, test_size=0.3, random_state=0)

In [45]:
#Use GAN model to increase the instances of minority class. Increase there rows to atleast 6 instances.

In [46]:
data3 = data1.copy()

In [47]:
data3.drop(data3[data3['Class'] == 'A'].index, inplace = True) 
data3.drop(data3[data3['Class'] == 'C'].index, inplace = True) 
data3

Unnamed: 0,Column2,Column3,Column5,Class,Column1_no,Column1_yes,Column4_no,Column4_yes
1,12,15,41,B,0,1,1,0
4,15,12,44,D,1,0,0,1
6,17,14,51,D,1,0,0,1
12,16,13,50,B,1,0,1,0
87,13,16,42,D,0,1,0,1
90,16,13,50,B,1,0,1,0
93,17,14,51,D,1,0,0,1
96,13,16,42,D,0,1,0,1
98,17,14,51,D,1,0,0,1


In [48]:
new_row = {'Column2': 12, 'Column3':17, 'Column5':92, 'Class':'D', 'Column1_no' : 0,  'Column1_yes': 1, 'Column4_no':1, \
           "Column4_yes":0}
new_row

{'Column2': 12,
 'Column3': 17,
 'Column5': 92,
 'Class': 'D',
 'Column1_no': 0,
 'Column1_yes': 1,
 'Column4_no': 1,
 'Column4_yes': 0}

In [49]:
data3 = data3.append(new_row, ignore_index=True)
data3

Unnamed: 0,Column2,Column3,Column5,Class,Column1_no,Column1_yes,Column4_no,Column4_yes
0,12,15,41,B,0,1,1,0
1,15,12,44,D,1,0,0,1
2,17,14,51,D,1,0,0,1
3,16,13,50,B,1,0,1,0
4,13,16,42,D,0,1,0,1
5,16,13,50,B,1,0,1,0
6,17,14,51,D,1,0,0,1
7,13,16,42,D,0,1,0,1
8,17,14,51,D,1,0,0,1
9,12,17,92,D,0,1,1,0


In [50]:
discrete_columns = ['Class']

In [51]:
from ctgan import CTGANSynthesizer
ctgan = CTGANSynthesizer(epochs=1)
ctgan.fit(data3, discrete_columns)

  label = cluster.KMeans(n_clusters=self.n_components, n_init=1,
  label = cluster.KMeans(n_clusters=self.n_components, n_init=1,
  label = cluster.KMeans(n_clusters=self.n_components, n_init=1,
  label = cluster.KMeans(n_clusters=self.n_components, n_init=1,
  label = cluster.KMeans(n_clusters=self.n_components, n_init=1,
  label = cluster.KMeans(n_clusters=self.n_components, n_init=1,
  label = cluster.KMeans(n_clusters=self.n_components, n_init=1,


In [52]:
samples = ctgan.sample(400)
samples

Unnamed: 0,Column2,Column3,Column5,Class,Column1_no,Column1_yes,Column4_no,Column4_yes
0,16,16,37,B,1,0,0,0
1,11,11,107,D,0,0,0,1
2,18,13,54,B,0,1,0,1
3,18,11,53,B,0,0,0,0
4,10,14,49,D,0,0,1,0
...,...,...,...,...,...,...,...,...
395,14,12,102,B,0,0,0,0
396,17,11,55,D,0,0,1,1
397,19,13,48,D,0,0,0,0
398,12,12,98,D,0,0,0,0


In [53]:
samples['Class'].value_counts()

B    205
D    195
Name: Class, dtype: int64

In [54]:
data4 = data1.copy()

In [55]:
data4.drop(data4[data4['Class'] == 'B'].index, inplace = True) 
data4.drop(data4[data4['Class'] == 'D'].index, inplace = True) 
data4

Unnamed: 0,Column2,Column3,Column5,Class,Column1_no,Column1_yes,Column4_no,Column4_yes
0,11,14,40,A,0,1,1,0
2,13,16,42,A,0,1,0,1
3,14,11,43,C,1,0,1,0
5,16,13,50,C,1,0,1,0
7,11,14,40,A,0,1,1,0
...,...,...,...,...,...,...,...,...
92,16,13,50,C,1,0,1,0
94,11,14,40,A,0,1,1,0
95,12,15,41,C,0,1,1,0
97,16,13,50,C,1,0,1,0


In [56]:
data4['Class'].value_counts()

A    46
C    45
Name: Class, dtype: int64

In [57]:
frames = [samples, data4]

In [59]:
result = pd.concat(frames)
result = result.reset_index(drop=True)
result

Unnamed: 0,Column2,Column3,Column5,Class,Column1_no,Column1_yes,Column4_no,Column4_yes
0,16,16,37,B,1,0,0,0
1,11,11,107,D,0,0,0,1
2,18,13,54,B,0,1,0,1
3,18,11,53,B,0,0,0,0
4,10,14,49,D,0,0,1,0
...,...,...,...,...,...,...,...,...
486,16,13,50,C,1,0,1,0
487,11,14,40,A,0,1,1,0
488,12,15,41,C,0,1,1,0
489,16,13,50,C,1,0,1,0


In [60]:
result.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 491 entries, 0 to 490
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Column2      491 non-null    int64 
 1   Column3      491 non-null    int64 
 2   Column5      491 non-null    int64 
 3   Class        491 non-null    object
 4   Column1_no   491 non-null    int64 
 5   Column1_yes  491 non-null    int64 
 6   Column4_no   491 non-null    int64 
 7   Column4_yes  491 non-null    int64 
dtypes: int64(7), object(1)
memory usage: 30.8+ KB


In [61]:
result['Class'].value_counts()

B    205
D    195
A     46
C     45
Name: Class, dtype: int64