# **IMPORTS**

In [20]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import OneHotEncoder

# **Get the Data**

In [21]:
data = pd.read_csv("data\healthcare-dataset-stroke-data.csv")

In [22]:
data.head(10)

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1
5,56669,Male,81.0,0,0,Yes,Private,Urban,186.21,29.0,formerly smoked,1
6,53882,Male,74.0,1,1,Yes,Private,Rural,70.09,27.4,never smoked,1
7,10434,Female,69.0,0,0,No,Private,Urban,94.39,22.8,never smoked,1
8,27419,Female,59.0,0,0,Yes,Private,Rural,76.15,,Unknown,1
9,60491,Female,78.0,0,0,Yes,Private,Urban,58.57,24.2,Unknown,1


In [23]:
data.shape

(5110, 12)

In [24]:
data.isna().sum()

id                     0
gender                 0
age                    0
hypertension           0
heart_disease          0
ever_married           0
work_type              0
Residence_type         0
avg_glucose_level      0
bmi                  201
smoking_status         0
stroke                 0
dtype: int64

In [42]:
data = data.drop(["id"],axis=1)

# **Removing Nulls by predicting**

In [43]:
feature_with_missing_values =  "bmi"

In [44]:
complete_data = data.dropna()
incomplete_data = data[data.isnull().any(axis=1)]

In [45]:
incomplete_data

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
1,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
8,Female,59.0,0,0,Yes,Private,Rural,76.15,,Unknown,1
13,Male,78.0,0,1,Yes,Private,Urban,219.84,,Unknown,1
19,Male,57.0,0,1,No,Govt_job,Urban,217.08,,Unknown,1
27,Male,58.0,0,0,Yes,Private,Rural,189.84,,Unknown,1
...,...,...,...,...,...,...,...,...,...,...,...
5039,Male,41.0,0,0,No,Private,Rural,70.15,,formerly smoked,0
5048,Male,40.0,0,0,Yes,Private,Urban,191.15,,smokes,0
5093,Female,45.0,1,0,Yes,Govt_job,Rural,95.02,,smokes,0
5099,Male,40.0,0,0,Yes,Private,Rural,83.94,,smokes,0


## **Encoding**

In [46]:
categorical_features = ['gender', 'ever_married', 'work_type', 'Residence_type', 'smoking_status']
numerical_features = [col for col in complete_data.columns if col not in categorical_features and col != 'bmi']

In [47]:
encoder = OneHotEncoder()
X_cat_encoded = encoder.fit_transform(complete_data[categorical_features])

In [48]:
X_cat_encoded_dataframe =pd.DataFrame(X_cat_encoded.toarray())

In [49]:
X_cat_encoded_dataframe

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
1,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
2,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
3,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
4,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4904,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0
4905,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
4906,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
4907,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0


In [50]:
X_null = pd.concat([X_cat_encoded_dataframe, complete_data[numerical_features]], axis=1)
Y_null = complete_data[feature_with_missing_values]

In [51]:
Y_null

0       36.6
2       32.5
3       34.4
4       24.0
5       29.0
        ... 
5104    18.6
5106    40.0
5107    30.6
5108    25.6
5109    26.2
Name: bmi, Length: 4909, dtype: float64

In [52]:
X_null

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,11,12,13,14,15,age,hypertension,heart_disease,avg_glucose_level,stroke
0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,...,1.0,0.0,1.0,0.0,0.0,67.0,0.0,1.0,228.69,1.0
1,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,,,,,
2,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,...,1.0,0.0,0.0,0.0,1.0,80.0,0.0,1.0,105.92,1.0
3,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,49.0,0.0,0.0,171.23,1.0
4,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,...,1.0,0.0,1.0,0.0,0.0,79.0,1.0,0.0,174.12,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5104,,,,,,,,,,,...,,,,,,13.0,0.0,0.0,103.08,0.0
5106,,,,,,,,,,,...,,,,,,81.0,0.0,0.0,125.20,0.0
5107,,,,,,,,,,,...,,,,,,35.0,0.0,0.0,82.99,0.0
5108,,,,,,,,,,,...,,,,,,51.0,0.0,0.0,166.29,0.0


In [53]:
X_null.isna().sum()

0                    192
1                    192
2                    192
3                    192
4                    192
5                    192
6                    192
7                    192
8                    192
9                    192
10                   192
11                   192
12                   192
13                   192
14                   192
15                   192
age                  192
hypertension         192
heart_disease        192
avg_glucose_level    192
stroke               192
dtype: int64

## **Split**

In [32]:
X_train_null, X_test_null, y_train_null, y_test_null = train_test_split(X_null, Y_null, test_size=0.2)

ValueError: Found input variables with inconsistent numbers of samples: [5101, 4909]

### **Model**

In [None]:
model = RandomForestRegressor()
model.fit(X_train_null, y_train_null)