In [220]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder,OneHotEncoder,StandardScaler
from sklearn.metrics import recall_score, precision_score,f1_score,roc_auc_score,auc,classification_report
from sklearn.ensemble import RandomForestClassifier

In [221]:
train=pd.read_csv("train_2v.csv")
test=pd.read_csv('test_2v.csv')

Missing Values in bmi

In [222]:
train['bmi']=train['bmi'].fillna(train['bmi'].mean())
test['bmi']=test['bmi'].fillna(test['bmi'].mean())

In [223]:
train_with_smoke=train[train['smoking_status'].notna()]
train_without_smoke=train[~train['smoking_status'].notna()]

In [224]:
train_without_smoke.shape

(13292, 12)

In [225]:
train_with_smoke.shape

(30108, 12)

In [226]:
x_without_smoke=train_without_smoke.drop(['id','stroke','smoking_status'],axis=1)
x_with_smoke=train_with_smoke.drop(['id','stroke'],axis=1)
y_without_smoke=train_without_smoke['stroke']
y_with_smoke=train_with_smoke['stroke']

Handling Labels

In [227]:
lb=LabelEncoder()

x_with_smoke['gender']=lb.fit_transform(x_with_smoke['gender'])
x_with_smoke['ever_married']=lb.fit_transform(x_with_smoke['ever_married'])
x_with_smoke['Residence_type']=lb.fit_transform(x_with_smoke['Residence_type'])


x_without_smoke['gender']=lb.fit_transform(x_without_smoke['gender'])
x_without_smoke['ever_married']=lb.fit_transform(x_without_smoke['ever_married'])
x_without_smoke['Residence_type']=lb.fit_transform(x_without_smoke['Residence_type'])

x_with_smoke=x_with_smoke.join(pd.get_dummies(x_with_smoke['smoking_status'],drop_first=True))
x_without_smoke=x_without_smoke.join(pd.get_dummies(x_without_smoke['work_type'],drop_first=True))
x_with_smoke=x_with_smoke.join(pd.get_dummies(x_with_smoke['work_type'],drop_first=True))
x_with_smoke=x_with_smoke.drop(['smoking_status','work_type'],axis=1)
x_without_smoke=x_without_smoke.drop('work_type',axis=1)


OverSampling using RandomOverSampler

In [228]:
x_oversample,y_oversample=RandomOverSampler(random_state=0).fit_resample(x_with_smoke,y_with_smoke)
x_oversample1,y_oversample1=RandomOverSampler(random_state=0).fit_resample(x_without_smoke,y_without_smoke)

Splitting  values

In [229]:
x_train,x_test,y_train,y_test=train_test_split(x_oversample,y_oversample,test_size=0.2)
x_train1,x_test1,y_train1,y_test1=train_test_split(x_oversample1,y_oversample1,test_size=0.2)

Scaling Values

In [230]:
scaler=StandardScaler()

In [231]:
x_train=scaler.fit_transform(x_train)
x_test=scaler.transform(x_test)

x_train1=scaler.fit_transform(x_train1)
x_test1=scaler.transform(x_test1)

Logistic Regression

In [232]:
lr=LogisticRegression()

In [233]:
lr.fit(x_train,y_train)
y_pred=lr.predict(x_test)

lr.fit(x_train1,y_train1)
y_pred1=lr.predict(x_test1)

Scores

In [234]:
print("With Smoke recall score : {}".format(recall_score(y_test,y_pred)))
print("With Smoke precision score : {}".format(precision_score(y_test,y_pred)))
print("With Smoke f1 score : {}".format(f1_score(y_test,y_pred)))
print()
print("Classification report of with smoke",classification_report(y_test,y_pred),sep="\n")

print("Without Smoke recall score : {}".format(recall_score(y_test1,y_pred1)))
print("Without Smoke precision score : {}".format(precision_score(y_test1,y_pred1)))
print("Without Smoke f1 score : {}".format(f1_score(y_test1,y_pred1)))
print()
print("Classification report of without smoke",classification_report(y_test1,y_pred1),sep="\n")


With Smoke recall score : 0.7969380888290714
With Smoke precision score : 0.7463368520560895
With Smoke f1 score : 0.7708079082255309

Classification report of with smoke
              precision    recall  f1-score   support

           0       0.78      0.72      0.75      5844
           1       0.75      0.80      0.77      5944

    accuracy                           0.76     11788
   macro avg       0.76      0.76      0.76     11788
weighted avg       0.76      0.76      0.76     11788

Without Smoke recall score : 0.8728687916975537
Without Smoke precision score : 0.8154432132963989
Without Smoke f1 score : 0.8431793770139635

Classification report of without smoke
              precision    recall  f1-score   support

           0       0.86      0.79      0.82      2561
           1       0.82      0.87      0.84      2698

    accuracy                           0.83      5259
   macro avg       0.84      0.83      0.83      5259
weighted avg       0.83      0.83      0.83    

Random forest

In [235]:
ran=RandomForestClassifier()
ran1=RandomForestClassifier()

In [236]:
ran.fit(x_train,y_train)
y_pred=ran.predict(x_test)

ran1.fit(x_train1,y_train1)
y_pred1=ran1.predict(x_test1)

In [237]:
print("With Smoke recall score : {}".format(recall_score(y_test,y_pred)))
print("With Smoke precision score : {}".format(precision_score(y_test,y_pred)))
print("With Smoke f1 score : {}".format(f1_score(y_test,y_pred)))
print()
print("Classification report of with smoke",classification_report(y_test,y_pred),sep="\n")

print("Without Smoke recall score : {}".format(recall_score(y_test1,y_pred1)))
print("Without Smoke precision score : {}".format(precision_score(y_test1,y_pred1)))
print("Without Smoke f1 score : {}".format(f1_score(y_test1,y_pred1)))
print()
print("Classification report of without smoke",classification_report(y_test1,y_pred1),sep="\n")


With Smoke recall score : 1.0
With Smoke precision score : 0.995478144364428
With Smoke f1 score : 0.9977339488040285

Classification report of with smoke
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      5844
           1       1.00      1.00      1.00      5944

    accuracy                           1.00     11788
   macro avg       1.00      1.00      1.00     11788
weighted avg       1.00      1.00      1.00     11788

Without Smoke recall score : 1.0
Without Smoke precision score : 0.9974121996303142
Without Smoke f1 score : 0.9987044234684435

Classification report of without smoke
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      2561
           1       1.00      1.00      1.00      2698

    accuracy                           1.00      5259
   macro avg       1.00      1.00      1.00      5259
weighted avg       1.00      1.00      1.00      5259



Random Forest is better in this case.

Predicting on Test Data

In [238]:
test['gender']=lb.fit_transform(test['gender'])
test['ever_married']=lb.fit_transform(test['ever_married'])
test['Residence_type']=lb.fit_transform(test['Residence_type'])

test=test.join(pd.get_dummies(test['work_type'],drop_first=True))

test=test.drop(['work_type','smoking_status'],axis=1)

In [239]:
test_pred=ran1.predict(test)

ValueError: Number of features of the model must match the input. Model n_features is 12 and input n_features is 13 