In [1]:
import pandas as pd
from sklearn.linear_model import LogisticRegression

# importing plotting libraries
import matplotlib.pyplot as plt   

#importing seaborn for statistical plots
import seaborn as sns

#Let us break the X and y dataframes into training set and test set. For this we will use
#Sklearn package's data splitting function which is based on random function

from sklearn.model_selection import train_test_split

import numpy as np


# calculate accuracy measures and confusion matrix
from sklearn import metrics


from sklearn.metrics import recall_score

from imblearn.over_sampling import SMOTE

In [2]:
# Load the CSV into a DataFrame and preview its first five rows.
data = pd.read_csv(filepath_or_buffer='pima-indians-diabetes (4).csv')
data.head(n=5)

Unnamed: 0,Preg,Plas,Pres,skin,test,mass,pedi,age,class
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [3]:
# Preview the last five rows of the frame.
data.tail(n=5)

Unnamed: 0,Preg,Plas,Pres,skin,test,mass,pedi,age,class
763,10,101,76,48,180,32.9,0.171,63,0
764,2,122,70,27,0,36.8,0.34,27,0
765,5,121,72,23,112,26.2,0.245,30,0
766,1,126,60,0,0,30.1,0.349,47,1
767,1,93,70,31,0,30.4,0.315,23,0


In [5]:
# Peek at five randomly chosen rows; no seed is passed, so the rows shown
# change from run to run.
data.sample(n=5)

Unnamed: 0,Preg,Plas,Pres,skin,test,mass,pedi,age,class
522,6,114,0,0,0,0.0,0.189,26,0
687,1,107,50,19,0,28.3,0.181,29,0
150,1,136,74,50,204,37.4,0.399,24,0
527,3,116,74,15,105,26.3,0.107,24,0
616,6,117,96,0,0,28.7,0.157,30,0


In [7]:
# (rows, columns) of the loaded frame.
data.shape

(768, 9)

In [8]:
# Per-column dtypes as inferred when the CSV was read.
data.dtypes

Preg       int64
Plas       int64
Pres       int64
skin       int64
test       int64
mass     float64
pedi     float64
age        int64
class      int64
dtype: object

In [9]:
# One-shot summary: index range, per-column non-null counts, dtypes and memory use.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Preg    768 non-null    int64  
 1   Plas    768 non-null    int64  
 2   Pres    768 non-null    int64  
 3   skin    768 non-null    int64  
 4   test    768 non-null    int64  
 5   mass    768 non-null    float64
 6   pedi    768 non-null    float64
 7   age     768 non-null    int64  
 8   class   768 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 54.1 KB


In [10]:
# Count missing (NaN) entries in each column.
data.isnull().sum()

Preg     0
Plas     0
Pres     0
skin     0
test     0
mass     0
pedi     0
age      0
class    0
dtype: int64

In [11]:
# Summary statistics for every column, transposed so each column is shown as a row.
# NOTE(review): several measurement columns report a minimum of 0 here — confirm
# whether 0 is a placeholder for "missing", since the NaN count would not catch it.
data.describe(include='all').transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Preg,768.0,3.845052,3.369578,0.0,1.0,3.0,6.0,17.0
Plas,768.0,120.894531,31.972618,0.0,99.0,117.0,140.25,199.0
Pres,768.0,69.105469,19.355807,0.0,62.0,72.0,80.0,122.0
skin,768.0,20.536458,15.952218,0.0,0.0,23.0,32.0,99.0
test,768.0,79.799479,115.244002,0.0,0.0,30.5,127.25,846.0
mass,768.0,31.992578,7.88416,0.0,27.3,32.0,36.6,67.1
pedi,768.0,0.471876,0.331329,0.078,0.24375,0.3725,0.62625,2.42
age,768.0,33.240885,11.760232,21.0,24.0,29.0,41.0,81.0
class,768.0,0.348958,0.476951,0.0,0.0,0.0,1.0,1.0


In [12]:
# Re-type the target column as a pandas categorical
# (default CategoricalDtype: categories inferred from the values, unordered).
data['class'] = data['class'].astype(pd.CategoricalDtype())

In [14]:
# Work on a deep copy (`Data`) so it is a separate object from `data`.
Data = data.copy(deep=True)

In [15]:
from scipy.stats import  zscore

In [20]:
# Separate the feature matrix from the target column.
# `columns=` replaces the positional axis argument (`drop('class', 1)`), which
# pandas deprecated in 1.3 and rejects with a TypeError from 2.0 onward.
x = Data.drop(columns='class')
# Double brackets keep `y` as a one-column DataFrame rather than a Series.
y = Data[['class']]

In [21]:
# Standardise each feature column to z-scores (column-wise apply).
# The mean/std come from every row of `x`, i.e. they are computed over the
# full dataset rather than over a training subset.
scaled_x = x.apply(zscore, axis=0)

In [24]:
# Split the raw features first (70/30, fixed seed), then standardise both parts
# with the TRAINING rows' statistics only. Scaling the whole frame before the
# split lets held-out rows influence the mean/std the training data is scaled
# with (data leakage). Row assignment is unchanged: it depends only on the
# number of rows and the seed, not on the feature values.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=1)
train_mean = x_train.mean()
train_std = x_train.std(ddof=0)  # population std, same convention as scipy.stats.zscore
x_train = (x_train - train_mean) / train_std
x_test = (x_test - train_mean) / train_std

#### Oversampling (SMOTE)

In [26]:
# Oversample the training split with SMOTE (the test split is left untouched).
# A fixed random_state makes the synthetic samples reproducible across runs;
# the seed matches the one passed to train_test_split.
sm = SMOTE(random_state=1)
X_train_os, Y_train_os = sm.fit_resample(x_train, y_train)

In [27]:
# Class counts of the training target before and after SMOTE oversampling.
for label, target in (('Before data imbalance ', y_train),
                      ('After data imbalance ', Y_train_os)):
    print(label, target.value_counts())

Before data imbalance  class
0        354
1        183
dtype: int64
After data imbalance  class
0        354
1        354
dtype: int64


### Undersampling (random)

In [28]:
from imblearn.under_sampling import RandomUnderSampler

In [29]:
# Randomly undersample the training split.
# A fixed random_state makes the choice of retained rows reproducible across
# runs; the seed matches the one passed to train_test_split.
rus = RandomUnderSampler(random_state=1)
X_train_us, Y_train_us = rus.fit_resample(x_train, y_train)

In [30]:
# Class counts of the training target before and after random undersampling.
for label, target in (('Before data imbalance ', y_train),
                      ('After data imbalance ', Y_train_us)):
    print(label, target.value_counts())

Before data imbalance  class
0        354
1        183
dtype: int64
After data imbalance  class
0        183
1        183
dtype: int64


### Removing majority-class samples that form Tomek links (TomekLinks)

In [31]:
from imblearn.under_sampling import TomekLinks  
# TomekLinks removes majority-class instances that are part of a Tomek link

In [32]:
# Clean the training split with Tomek links, then report the class counts of
# the training target before and after the cleaning step.
tl = TomekLinks()
X_train_tl, Y_train_tl = tl.fit_resample(x_train, y_train)
for label, target in (('Before data imbalance ', y_train),
                      ('After data imbalance ', Y_train_tl)):
    print(label, target.value_counts())

Before data imbalance  class
0        354
1        183
dtype: int64
After data imbalance  class
0        319
1        183
dtype: int64


### Upsampling followed by downsampling

In [33]:
from imblearn.combine import SMOTETomek

In [39]:
# Combine SMOTE oversampling with Tomek-link cleaning on the training split.
# A fixed random_state makes the synthetic samples (and therefore the links
# removed afterwards) reproducible across runs; the seed matches the one
# passed to train_test_split.
smt = SMOTETomek(random_state=1)
X_train_smt, Y_train_smt = smt.fit_resample(x_train, y_train)
# Class counts of the training target before and after the combined resampling.
print('Before data imbalance ', y_train.value_counts())
print('After data imbalance ', Y_train_smt.value_counts())

Before data imbalance  class
0        354
1        183
dtype: int64
After data imbalance  class
0        341
1        341
dtype: int64


### Cluster based undersampling

In [37]:
from imblearn.under_sampling import ClusterCentroids

In [40]:
# Undersample the training split by replacing samples with cluster centroids.
# The clustering step is randomly initialised, so a fixed random_state is
# needed for reproducible centroids; the seed matches the one passed to
# train_test_split.
cc = ClusterCentroids(random_state=1)
X_train_cc, Y_train_cc = cc.fit_resample(x_train, y_train)

In [41]:
# Class counts of the training target before and after cluster-centroid undersampling.
for label, target in (('Before data imbalance ', y_train),
                      ('After data imbalance ', Y_train_cc)):
    print(label, target.value_counts())

Before data imbalance  class
0        354
1        183
dtype: int64
After data imbalance  class
0        183
1        183
dtype: int64
