In [1]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import recall_score,precision_score,accuracy_score,confusion_matrix,roc_auc_score,roc_curve,f1_score,classification_report


from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

In [2]:
df=pd.read_csv("healthcare-dataset-stroke-data.csv")
df.shape
df.columns
df.info()
df.shape

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5110 entries, 0 to 5109
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 5110 non-null   int64  
 1   gender             5110 non-null   object 
 2   age                5110 non-null   float64
 3   hypertension       5110 non-null   int64  
 4   heart_disease      5110 non-null   int64  
 5   ever_married       5110 non-null   object 
 6   work_type          5110 non-null   object 
 7   Residence_type     5110 non-null   object 
 8   avg_glucose_level  5110 non-null   float64
 9   bmi                4909 non-null   float64
 10  smoking_status     5110 non-null   object 
 11  stroke             5110 non-null   int64  
dtypes: float64(3), int64(4), object(5)
memory usage: 479.2+ KB


(5110, 12)

In [3]:
df["id"].value_counts()
## so we can drop id columns
df.drop(columns=["id"],inplace=True)

In [4]:
y=df["stroke"]
df.drop(columns=["stroke"],inplace=True)

In [5]:
x_train,x_test,y_train,y_test=train_test_split(df,y,test_size=0.20,random_state=43)

In [6]:
## filling missing values
for col in x_train:
    if x_train[col].dtype=="int64" or x_train[col].dtype=="float64":
        x_train.fillna(x_train[col].mean(),inplace=True)
        x_test.fillna(x_train[col].mean(),inplace=True)

In [7]:
## scaling cont
cont=[i for i in x_train if x_train[i].dtype=="int64" or x_train[i].dtype=="float64"]
scaler=StandardScaler()
for col in cont:
    x_train[col]=scaler.fit_transform(np.array(x_train[col]).reshape(-1,1))
    x_test[col]=scaler.transform(np.array(x_test[col]).reshape(-1,1))

In [8]:
## cat columns one hot encoding
cat=[i for i in x_train if x_train[i].dtype=="object"]
oe_train=pd.get_dummies(x_train[cat])
oe_test=pd.get_dummies(x_test[cat])

In [9]:
final_tn,final_ts=oe_train.align(oe_test,join="inner",axis=1)


In [10]:
final_train=pd.concat([x_train[cont],final_tn],axis=1)
final_test=pd.concat([x_test[cont],final_ts],axis=1)

In [11]:
## model building
leg=LogisticRegression()
leg.fit(final_train,y_train)
test_pred=leg.predict(final_test)

In [12]:
# test accuracy
accuracy_score(test_pred,y_test)

0.9559686888454012

In [13]:
# train accuracy
train_pred=leg.predict(final_train)
accuracy_score(train_pred,y_train)

0.9500978473581213

In [14]:
confusion_matrix(test_pred,y_test)

array([[976,  45],
       [  0,   1]], dtype=int64)

In [15]:
print(classification_report(test_pred,y_test))

              precision    recall  f1-score   support

           0       1.00      0.96      0.98      1021
           1       0.02      1.00      0.04         1

    accuracy                           0.96      1022
   macro avg       0.51      0.98      0.51      1022
weighted avg       1.00      0.96      0.98      1022



In [16]:
recall_score(test_pred,y_test)

1.0

In [17]:
precision_score(test_pred,y_test)

0.021739130434782608

In [18]:
f1_score(test_pred,y_test)

0.042553191489361694

In [19]:
pos_lr=leg.predict_proba(final_test)[::,1]
roc_auc_score(y_test,pos_lr)

0.8656673200285104

In [20]:
## we are getting very low precission so will do with decission tree

In [21]:
dt=DecisionTreeClassifier()
dt.fit(final_train,y_train)
dt_test_pred=dt.predict(final_test)
accuracy_score(dt_test_pred,y_test)

0.9158512720156555

In [22]:
confusion_matrix(dt_test_pred,y_test)

array([[929,  39],
       [ 47,   7]], dtype=int64)

In [23]:
recall_score(dt_test_pred,y_test)

0.12962962962962962

In [24]:
precision_score(dt_test_pred,y_test)

0.15217391304347827

In [25]:
f1_score(dt_test_pred,y_test)


0.13999999999999999

In [26]:
pos_dt=dt.predict_proba(final_test)[::,1]
roc_auc_score(y_test,pos_dt)

0.5520090876692801

In [27]:
## here also we are getting very less recall and pre soo will go with random forest

In [28]:
rf=RandomForestClassifier()
rf.fit(final_train,y_train)

RandomForestClassifier()

In [29]:
rf_test=rf.predict(final_test)
accuracy_score(rf_test,y_test)

0.9559686888454012

In [30]:
confusion_matrix(rf_test,y_test)

array([[973,  42],
       [  3,   4]], dtype=int64)

In [31]:
rf_train=rf.predict(final_train)
accuracy_score(rf_train,y_train)

1.0

In [32]:
recall_score(rf_test,y_test)

0.5714285714285714

In [33]:
precision_score(rf_test,y_test)

0.08695652173913043

In [34]:
pos=rf.predict_proba(final_test)[::,1]
fpr,tpr,thres=roc_curve(y_test,pos)
roc_auc_score(y_test,pos)

0.8218215431218817

# imbalanced data dets sampling techniques

In [35]:
df=pd.read_csv("healthcare-dataset-stroke-data.csv")

In [36]:
df_class_0,df_class_1=y_test.value_counts()

In [37]:
x_train.shape,y_train.shape

((4088, 10), (4088,))

In [38]:
df_class_0=df[df["stroke"]==0]
df_class_1=df[df["stroke"]==1]

In [39]:
df_class_0.shape

(4861, 12)

In [40]:
df_class_1.shape

(249, 12)

# under sampling

In [41]:
df_under=df_class_0.sample(249)

In [42]:
#df_under

In [43]:
## now we have to concat both the data frames  row axis wise 
df_new_under=pd.concat([df_class_1,df_under],axis=0)

In [44]:
df_new_under["stroke"].value_counts()

1    249
0    249
Name: stroke, dtype: int64

In [45]:
# NOW WE CAN UPPLY ML MODELS TO GET CORRECT PREDICTIONS BUT IN THIS WE ARE LOOSING SOME MORE DATA SO WE HAVE ANOTHER MODEL BWLW

In [46]:
under_df=df_new_under.copy()
y_1=under_df["stroke"]
under_df.drop(columns=["id","stroke"],axis=1)


Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status
0,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked
1,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked
2,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked
3,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes
4,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked
...,...,...,...,...,...,...,...,...,...,...
2304,Male,5.0,0,0,No,children,Urban,71.92,18.2,Unknown
722,Male,9.0,0,0,No,children,Rural,121.80,18.7,Unknown
3697,Female,59.0,1,0,Yes,Private,Rural,78.28,31.0,formerly smoked
3684,Female,7.0,0,0,No,children,Rural,89.38,19.0,Unknown


In [47]:
x_tr,x_ts,y_tr,y_ts=train_test_split(under_df,y_1,test_size=0.20,random_state=40)

In [48]:
## filling missing values
for col in x_tr:
    if x_tr[col].dtype=="int64" or x_tr[col].dtype=="float64":
        x_tr[col].fillna(x_tr[col].mean(),inplace=True)
        x_ts[col].fillna(x_tr[col].mean(),inplace=True)

In [49]:
## scaling cont
sc=StandardScaler()
cont=[i for i in x_tr if x_tr[i].dtype=="int64" or x_tr[i].dtype=="float64"]
for col in cont:
    x_tr[col]=sc.fit_transform(np.array(x_tr[col]).reshape(-1,1))
    x_ts[col]=sc.transform(np.array(x_ts[col]).reshape(-1,1))

In [50]:
## one hot encoding cat
cat=[i for i in x_tr if x_tr[i].dtype=="object"]
oe_tr=pd.get_dummies(x_tr[cat])
oe_ts=pd.get_dummies(x_ts[cat])


In [51]:
oe_trs,oe_tss=oe_tr.align(oe_ts,join="inner",axis=1)

In [52]:
finals_tr=pd.concat([x_tr[cont],oe_trs],axis=1)
finals_ts=pd.concat([x_ts[cont],oe_tss],axis=1)

In [53]:
## model building
lrs=LogisticRegression()
lrs.fit(finals_tr,y_tr)
lrs_test=lrs.predict(finals_ts)
confusion_matrix(lrs_test,y_ts)

array([[48,  0],
       [ 0, 52]], dtype=int64)

In [54]:
accuracy_score(lrs_test,y_ts)

1.0

In [55]:
precision_score(lrs_test,y_ts)

1.0

In [56]:
recall_score(lrs_test,y_ts)

1.0

In [57]:
f1_score(lrs_test,y_ts)

1.0

In [58]:
print(classification_report(lrs_test,y_ts))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        48
           1       1.00      1.00      1.00        52

    accuracy                           1.00       100
   macro avg       1.00      1.00      1.00       100
weighted avg       1.00      1.00      1.00       100



# OVER SAMPLING

In [59]:
df_count_0,df_count_1=df["stroke"].value_counts()

In [60]:
df_0=df[df["stroke"]==0]
df_1=df[df["stroke"]==1]

In [61]:
df_0.shape

(4861, 12)

In [62]:
df_1=df_1.sample(4861,replace=True)

In [63]:
df_1.shape,df_0.shape

((4861, 12), (4861, 12))

In [64]:
df_over=pd.concat([df_0,df_1],axis=0)

In [65]:
df_over
y_2=df_over["stroke"]
df_over.drop(columns=["id","stroke"],axis=1)

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status
249,Male,3.0,0,0,No,children,Rural,95.12,18.0,Unknown
250,Male,58.0,1,0,Yes,Private,Urban,87.96,39.2,never smoked
251,Female,8.0,0,0,No,Private,Urban,110.89,17.6,Unknown
252,Female,70.0,0,0,Yes,Private,Rural,69.04,35.9,formerly smoked
253,Male,14.0,0,0,No,Never_worked,Rural,161.28,19.1,Unknown
...,...,...,...,...,...,...,...,...,...,...
230,Female,81.0,0,0,Yes,Self-employed,Rural,81.95,16.9,never smoked
110,Male,79.0,0,1,Yes,Private,Rural,129.98,22.6,formerly smoked
71,Female,67.0,1,0,Yes,Private,Rural,179.12,28.1,formerly smoked
29,Male,59.0,0,0,Yes,Private,Rural,211.78,,formerly smoked


In [66]:
x_tr,x_ts,y_tr,y_ts=train_test_split(df_over,y_2,test_size=0.20,random_state=34)

In [67]:
## filiing missing
for col in x_tr:
    if x_tr[col].dtype=="int64" or x_tr[col].dtype=="float64":
        x_tr.fillna(x_tr[col].mean(),inplace=True)
        x_ts.fillna(x_tr[col].mean(),inplace=True)

In [68]:
## scaling
slr=StandardScaler()
cont=[col for col in x_tr if x_tr[col].dtype=="int64" or x_tr[col].dtype=="float64"]
for col in cont:
    x_tr[col]=slr.fit_transform(np.array(x_tr[col]).reshape(-1,1))
    x_ts[col]=slr.transform(np.array(x_ts[col]).reshape(-1,1))

In [69]:
## one hot enc
cat=[i for i in x_tr if x_tr[i].dtype=="object"]
oet=pd.get_dummies(x_tr[cat])
oes=pd.get_dummies(x_ts[cat])

In [70]:
oess,oett=oet.align(oes,join="inner",axis=1)
finals_train=pd.concat([x_tr[cont],oess],axis=1)
finals_test=pd.concat([x_ts[cont],oett],axis=1)

In [71]:
## decission tree classifier
dts=DecisionTreeClassifier()
dts.fit(finals_train,y_tr)
dts_test_pred=dts.predict(finals_test)

In [72]:
## dts accuracy
accuracy_score(dts_test_pred,y_ts)

1.0

In [73]:
print(classification_report(dts_test_pred,y_ts))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      1003
           1       1.00      1.00      1.00       942

    accuracy                           1.00      1945
   macro avg       1.00      1.00      1.00      1945
weighted avg       1.00      1.00      1.00      1945



In [74]:
confusion_matrix(dts_test_pred,y_ts)

array([[1003,    0],
       [   0,  942]], dtype=int64)

In [75]:
## over sampling is just copying sampls soo it is not perfect method so we have smote,it uses knn -k nearest algorith

# SMOTE- synthtic minority over sampling technique

In [76]:
pip install -U imbalanced-learn

Requirement already up-to-date: imbalanced-learn in c:\users\govip\anaconda3\lib\site-packages (0.8.1)
Note: you may need to restart the kernel to use updated packages.


In [77]:
pip install imbalanced-learn

Note: you may need to restart the kernel to use updated packages.


In [78]:
#from imblearn.over_sampling import SMOTE 
!pip install delayed



In [79]:
from imblearn.over_sampling import SMOTE

In [80]:
from imblearn.over_sampling import SMOTE
oversample = SMOTE()
X, y = oversample.fit_resample(X, y)

NameError: name 'X' is not defined

In [None]:
### will ask manohar , after this topic