In [10]:
!pip install imbalanced-learn

Collecting imbalanced-learn
  Downloading imbalanced_learn-0.7.0-py3-none-any.whl (167 kB)
[K     |████████████████████████████████| 167 kB 253 kB/s eta 0:00:01
Collecting scikit-learn>=0.23
  Downloading scikit_learn-0.23.1-cp37-cp37m-manylinux1_x86_64.whl (6.8 MB)
[K     |████████████████████████████████| 6.8 MB 5.2 MB/s eta 0:00:01
Collecting threadpoolctl>=2.0.0
  Downloading threadpoolctl-2.1.0-py3-none-any.whl (12 kB)
Installing collected packages: threadpoolctl, scikit-learn, imbalanced-learn
  Attempting uninstall: scikit-learn
    Found existing installation: scikit-learn 0.22.1
    Uninstalling scikit-learn-0.22.1:
      Successfully uninstalled scikit-learn-0.22.1
Successfully installed imbalanced-learn-0.7.0 scikit-learn-0.23.1 threadpoolctl-2.1.0


In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from imblearn.over_sampling import SMOTE
from random import randrange, choice
from sklearn.neighbors import NearestNeighbors

In [3]:
data = pd.read_csv("./hepatitis.csv")

In [4]:
data.head()

Unnamed: 0,class,age,sex,steroid,antivirals,fatigue,malaise,anorexia,liver_big,liver_firm,spleen_palable,spiders,ascites,varices,bilirubin,alk_phosphate,sgot,albumin,protime,histology
0,2,30,2,1,2,2,2,2,1,2,2,2,2,2,1.0,85,18,4.0,61,1
1,2,50,1,1,2,1,2,2,1,2,2,2,2,2,0.9,135,42,3.5,61,1
2,2,78,1,2,2,1,2,2,2,2,2,2,2,2,0.7,96,32,4.0,61,1
3,2,34,1,2,2,2,2,2,2,2,2,2,2,2,1.0,105,200,4.0,61,1
4,2,34,1,2,2,2,2,2,2,2,2,2,2,2,0.9,95,28,4.0,75,1


In [5]:
data.shape

(142, 20)

In [6]:
data.columns

Index(['class', 'age', 'sex', 'steroid', 'antivirals', 'fatigue', 'malaise',
       'anorexia', 'liver_big', 'liver_firm', 'spleen_palable', 'spiders',
       'ascites', 'varices', 'bilirubin', 'alk_phosphate', 'sgot', 'albumin',
       'protime', 'histology'],
      dtype='object')

In [7]:
data.isnull().sum()

class             0
age               0
sex               0
steroid           0
antivirals        0
fatigue           0
malaise           0
anorexia          0
liver_big         0
liver_firm        0
spleen_palable    0
spiders           0
ascites           0
varices           0
bilirubin         0
alk_phosphate     0
sgot              0
albumin           0
protime           0
histology         0
dtype: int64

In [8]:
data.describe()

Unnamed: 0,class,age,sex,steroid,antivirals,fatigue,malaise,anorexia,liver_big,liver_firm,spleen_palable,spiders,ascites,varices,bilirubin,alk_phosphate,sgot,albumin,protime,histology
count,142.0,142.0,142.0,142.0,142.0,142.0,142.0,142.0,142.0,142.0,142.0,142.0,142.0,142.0,142.0,142.0,142.0,142.0,142.0,142.0
mean,1.816901,40.816901,1.105634,1.514085,1.838028,1.359155,1.619718,1.795775,1.823944,1.584507,1.809859,1.669014,1.880282,1.880282,1.382958,105.647887,83.507042,3.830493,61.704225,1.443662
std,0.388116,12.189182,0.308456,0.501571,0.369729,0.481451,0.487174,0.404561,0.382216,0.494551,0.393801,0.472234,0.325781,0.325781,1.166526,47.375099,82.4177,0.618103,17.696732,0.498575
min,1.0,7.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.3,26.0,14.0,2.1,0.0,1.0
25%,2.0,32.0,1.0,1.0,2.0,1.0,1.0,2.0,2.0,1.0,2.0,1.0,2.0,2.0,0.7,78.5,32.25,3.5,56.25,1.0
50%,2.0,39.0,1.0,2.0,2.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,1.0,100.0,58.0,3.9,61.0,1.0
75%,2.0,50.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,1.48,119.75,99.5,4.2,66.0,2.0
max,2.0,78.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,8.0,295.0,648.0,6.4,100.0,2.0


In [9]:
data["class"].value_counts()

2    116
1     26
Name: class, dtype: int64

In [10]:
df = data.copy()
df_y = df['class']
df_x = df.drop('class', axis = 1)

In [11]:
df_y.shape, df_x.shape

((142,), (142, 19))

In [12]:
#Now to do the oversampling 
oversample = SMOTE()
df_x, df_y = oversample.fit_resample(df_x, df_y)

In [13]:
df_y.shape, df_x.shape

((232,), (232, 19))

In [14]:
df_x.head()

Unnamed: 0,age,sex,steroid,antivirals,fatigue,malaise,anorexia,liver_big,liver_firm,spleen_palable,spiders,ascites,varices,bilirubin,alk_phosphate,sgot,albumin,protime,histology
0,30,2,1,2,2,2,2,1,2,2,2,2,2,1.0,85,18,4.0,61,1
1,50,1,1,2,1,2,2,1,2,2,2,2,2,0.9,135,42,3.5,61,1
2,78,1,2,2,1,2,2,2,2,2,2,2,2,0.7,96,32,4.0,61,1
3,34,1,2,2,2,2,2,2,2,2,2,2,2,1.0,105,200,4.0,61,1
4,34,1,2,2,2,2,2,2,2,2,2,2,2,0.9,95,28,4.0,75,1


In [17]:
#Age, sex, fatigue, malaise, anorexia, liver problem, ascites, 
df = df_x.copy()
df = df[["age", "sex", "malaise", "anorexia", "liver_big", "liver_firm", "ascites", "histology"]]
df.head()

Unnamed: 0,age,sex,malaise,anorexia,liver_big,liver_firm,ascites,histology
0,30,2,2,2,1,2,2,1
1,50,1,2,2,1,2,2,1
2,78,1,2,2,2,2,2,1
3,34,1,2,2,2,2,2,1
4,34,1,2,2,2,2,2,1


In [19]:
#group them together for data generation
df["Y"] = df_y
df.head()

Unnamed: 0,age,sex,malaise,anorexia,liver_big,liver_firm,ascites,histology,Y
0,30,2,2,2,1,2,2,1,2
1,50,1,2,2,1,2,2,1,2
2,78,1,2,2,2,2,2,1,2
3,34,1,2,2,2,2,2,1,2
4,34,1,2,2,2,2,2,1,2


In [35]:
df['sex'].value_counts()

1    217
2     15
Name: sex, dtype: int64

In [38]:
df['malaise'].value_counts()

1    140
2     92
Name: malaise, dtype: int64

In [39]:
df["anorexia"].value_counts()

2    142
1     90
Name: anorexia, dtype: int64

In [40]:
df["liver_big"].value_counts()

2    195
1     37
Name: liver_big, dtype: int64

In [42]:
df["liver_firm"].value_counts()

2    123
1    109
Name: liver_firm, dtype: int64

In [43]:
df["ascites"].value_counts()

2    150
1     82
Name: ascites, dtype: int64

In [44]:
df["histology"].value_counts()

2    120
1    112
Name: histology, dtype: int64

In [23]:
## introducing the smote class

def SMOTE(T, N, k):
# """
# Returns (N/100) * n_minority_samples synthetic minority samples.
#
# Parameters
# ----------
# T : array-like, shape = [n_minority_samples, n_features]
#     Holds the minority samples
# N : percetange of new synthetic samples:
#     n_synthetic_samples = N/100 * n_minority_samples. Can be < 100.
# k : int. Number of nearest neighbours.
#
# Returns
# -------
# S : array, shape = [(N/100) * n_minority_samples, n_features]
# """
    n_minority_samples, n_features = T.shape

    if N < 100:
       #create synthetic samples only for a subset of T.
       #TODO: select random minortiy samples
       N = 100
       pass

    if (N % 100) != 0:
       raise ValueError("N must be < 100 or multiple of 100")

    N = N/100
    N = int(N)
    n_synthetic_samples = N * n_minority_samples
    n_synthetic_samples = int(n_synthetic_samples)
    n_features = int(n_features)
    S = np.zeros(shape=(n_synthetic_samples, n_features))

    #Learn nearest neighbours
    neigh = NearestNeighbors(n_neighbors = k)
    neigh.fit(T)

    #Calculate synthetic samples
    for i in range(n_minority_samples):
       nn = neigh.kneighbors(T[i].reshape(1, -1), return_distance=False)
       for n in range(N):
          nn_index = choice(nn[0])
          #NOTE: nn includes T[i], we don't want to select it
          while nn_index == i:
             nn_index = choice(nn[0])

          dif = T[nn_index] - T[i]
          gap = np.random.random()
          S[n + i * N, :] = T[i,:] + gap * dif[:]

    return S

df_np = df.to_numpy()
df_new = SMOTE(df_np, 500, 5)
df_new.shape

(1160, 9)

In [25]:
df_new = pd.DataFrame(df_new, columns=df.columns)

In [30]:
df.columns

Index(['age', 'sex', 'malaise', 'anorexia', 'liver_big', 'liver_firm',
       'ascites', 'histology', 'Y'],
      dtype='object')

In [29]:
df_new['Y'].value_counts()

1.000000    526
2.000000    493
1.081386      1
1.289870      1
1.964413      1
           ... 
1.498139      1
1.612399      1
1.800695      1
1.592410      1
1.353566      1
Name: Y, Length: 143, dtype: int64

In [31]:
df_new['sex'].value_counts()

1.000000    1027
2.000000       8
1.716974       1
1.662247       1
1.560444       1
            ... 
1.379579       1
1.864274       1
1.345696       1
1.858117       1
1.181549       1
Name: sex, Length: 127, dtype: int64

In [37]:
df_new['malaise'].value_counts()

1.000000    604
2.000000    354
1.372654      1
1.198625      1
1.890527      1
           ... 
1.151764      1
1.613093      1
1.450191      1
1.530737      1
1.817237      1
Name: malaise, Length: 204, dtype: int64

In [45]:
df_new["anorexia"].value_counts()

2.000000    599
1.000000    350
1.372654      1
1.779712      1
1.343122      1
           ... 
1.141883      1
1.969838      1
1.431699      1
1.151764      1
1.845303      1
Name: anorexia, Length: 213, dtype: int64

In [46]:
df_new["liver_big"].value_counts()

2.000000    924
1.000000     57
1.283026      1
1.906745      1
1.913375      1
           ... 
1.431699      1
1.572770      1
1.504890      1
1.101925      1
1.343604      1
Name: liver_big, Length: 181, dtype: int64

In [47]:
df_new["liver_firm"].value_counts()

2.000000    490
1.000000    356
1.913375      1
1.940675      1
1.135155      1
           ... 
1.620421      1
1.748059      1
1.045467      1
1.654304      1
1.641004      1
Name: liver_firm, Length: 316, dtype: int64

In [48]:
df_new["ascites"].value_counts()

2.000000    641
1.000000    339
1.681886      1
1.381504      1
1.168111      1
           ... 
1.426353      1
1.474748      1
1.470804      1
1.584590      1
1.660791      1
Name: ascites, Length: 182, dtype: int64

In [49]:
df_new["histology"].value_counts()

2.000000    468
1.000000    425
1.070413      1
1.204623      1
1.831889      1
           ... 
1.019251      1
1.381526      1
1.131670      1
1.295371      1
1.581623      1
Name: histology, Length: 269, dtype: int64

In [50]:
df_new.to_csv("hepatitis_new.csv")

In [16]:
# Bring in the data
df_new = pd.read_csv("hepatitis_new.csv")
df_new.shape

(1160, 10)

In [18]:
df_new.head()

Unnamed: 0.1,Unnamed: 0,age,sex,malaise,anorexia,liver_big,liver_firm,ascites,histology,Y
0,0,30.0,1.980432,2.0,2.0,1.019568,2.0,2.0,1.0,2.0
1,1,30.0,1.864274,2.0,2.0,1.135726,2.0,2.0,1.0,2.0
2,2,30.0,1.716974,2.0,2.0,1.283026,2.0,2.0,1.0,2.0
3,3,30.0,1.780971,2.0,2.0,1.219029,2.0,2.0,1.0,2.0
4,4,30.0,1.473866,2.0,2.0,1.526134,2.0,2.0,1.0,2.0


In [23]:
#edit the df_new Y file
def adjust_1(x):
    if x > 1.5:
        return 2
    else:
        return 1
    
df_new["Y"] = df_new['Y'].apply(lambda x: adjust_1(x))
df_new["Y"].value_counts()

1    602
2    558
Name: Y, dtype: int64

In [25]:
def adjust_2(x):
    if x > 1.0:
        return 2
    else:
        return 1
    
df_new["sex"] = df_new["sex"].apply(lambda x: adjust_2(x))
df_new["sex"].value_counts()

1    1027
2     133
Name: sex, dtype: int64

In [26]:
def adjust_1(x):
    if x > 1.5:
        return 2
    else:
        return 1

df_new["malaise"] = df_new['malaise'].apply(lambda x: adjust_1(x))
df_new["malaise"].value_counts()


1    706
2    454
Name: malaise, dtype: int64

In [27]:
def adjust_1(x):
    if x > 1.5:
        return 2
    else:
        return 1

df_new["anorexia"] = df_new['anorexia'].apply(lambda x: adjust_1(x))
df_new["anorexia"].value_counts()


2    706
1    454
Name: anorexia, dtype: int64

In [28]:
def adjust_1(x):
    if x > 1.5:
        return 2
    else:
        return 1

df_new["liver_big"] = df_new['liver_big'].apply(lambda x: adjust_1(x))
df_new["liver_big"].value_counts()


2    1011
1     149
Name: liver_big, dtype: int64

In [29]:
def adjust_1(x):
    if x > 1.5:
        return 2
    else:
        return 1

df_new["liver_firm"] = df_new['liver_firm'].apply(lambda x: adjust_1(x))
df_new["liver_firm"].value_counts()


2    638
1    522
Name: liver_firm, dtype: int64

In [30]:
def adjust_1(x):
    if x > 1.5:
        return 2
    else:
        return 1

df_new["ascites"] = df_new['ascites'].apply(lambda x: adjust_1(x))
df_new["ascites"].value_counts()


2    723
1    437
Name: ascites, dtype: int64

In [31]:
def adjust_1(x):
    if x > 1.5:
        return 2
    else:
        return 1

df_new["histology"] = df_new['histology'].apply(lambda x: adjust_1(x))
df_new["histology"].value_counts()


2    604
1    556
Name: histology, dtype: int64

In [40]:
def adjust_age(x):
    return int(x)

df_new["age"] = df_new["age"].apply(lambda x: adjust_age(x))
list(df_new["age"])

[30,
 30,
 30,
 30,
 30,
 50,
 50,
 50,
 50,
 50,
 71,
 71,
 73,
 73,
 77,
 34,
 33,
 34,
 34,
 34,
 34,
 34,
 34,
 34,
 34,
 50,
 51,
 51,
 50,
 50,
 23,
 23,
 23,
 23,
 22,
 39,
 39,
 39,
 39,
 39,
 30,
 30,
 30,
 30,
 30,
 39,
 39,
 39,
 38,
 39,
 32,
 32,
 32,
 32,
 32,
 40,
 41,
 40,
 41,
 41,
 30,
 30,
 30,
 30,
 30,
 47,
 47,
 47,
 47,
 47,
 38,
 38,
 38,
 38,
 37,
 67,
 65,
 65,
 68,
 65,
 39,
 40,
 40,
 40,
 40,
 38,
 37,
 38,
 38,
 38,
 37,
 37,
 37,
 38,
 38,
 22,
 22,
 22,
 22,
 22,
 27,
 27,
 27,
 27,
 27,
 31,
 30,
 31,
 30,
 30,
 42,
 42,
 42,
 42,
 42,
 25,
 25,
 25,
 25,
 25,
 27,
 27,
 27,
 27,
 27,
 49,
 48,
 49,
 49,
 48,
 58,
 57,
 58,
 56,
 57,
 60,
 61,
 61,
 60,
 60,
 51,
 51,
 50,
 50,
 51,
 39,
 38,
 39,
 39,
 38,
 41,
 41,
 41,
 41,
 41,
 25,
 25,
 26,
 26,
 26,
 34,
 34,
 34,
 34,
 34,
 36,
 37,
 37,
 36,
 36,
 23,
 23,
 23,
 22,
 23,
 20,
 21,
 20,
 20,
 20,
 42,
 42,
 42,
 41,
 42,
 62,
 63,
 64,
 64,
 63,
 52,
 51,
 51,
 52,
 52,
 32,
 33,
 33,
 32,
 32,


In [41]:
df_new.head()

Unnamed: 0,age,sex,malaise,anorexia,liver_big,liver_firm,ascites,histology,Y
0,30,2,2,2,1,2,2,1,2
1,30,2,2,2,1,2,2,1,2
2,30,2,2,2,1,2,2,1,2
3,30,2,2,2,1,2,2,1,2
4,30,2,2,2,2,2,2,1,2


In [36]:
df_new.drop(['Unnamed: 0'], axis = 1, inplace=True)

In [38]:
df_new.shape

(1160, 9)

In [42]:
df_new.to_csv("hepatitis_new_2.csv")