# **Imports**

In [27]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt

from sklearn.preprocessing import LabelEncoder
from sklearn.feature_selection import mutual_info_classif
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# **Functions**

In [28]:
def read_file(file_path):
    """Load a CSV file into a DataFrame.

    Parameters
    ----------
    file_path : str, path object or file-like
        Anything accepted by ``pd.read_csv``.

    Returns
    -------
    pd.DataFrame
        The parsed table, with the first CSV column used as the index.
    """
    return pd.read_csv(file_path, index_col=0)

In [29]:
def label_encoding(data, column, mapping):
    """Encode one column of ``data`` through ``mapping`` and return it.

    Parameters
    ----------
    data : pd.DataFrame
        Frame holding the column; it is modified in place.
    column : str
        Name of the column to encode.
    mapping : dict, Series or callable
        Passed straight to ``Series.map``.

    Returns
    -------
    pd.Series
        The encoded column, read back from ``data``.

    Notes
    -----
    With a dict mapping, values that have no key become NaN (``Series.map``
    behaviour); no error is raised for them.
    """
    encoded = data[column].map(mapping)
    # Write back into the caller's frame: the encoding is a side effect as well
    # as the return value.
    data[column] = encoded
    return data[column]

# **Loading Data**

In [30]:
# NOTE(review): absolute, machine-specific path — the notebook only runs where
# this exact file exists. Consider a path relative to a configurable data dir.
file_path = r"C:\Users\Space\Documents\py\Projects\TuringCollege\Stroke\Stroke_New\Data\stroke.csv"

# Load the CSV and discard the 'ID' column in a single step.
data = read_file(file_path).drop(columns=['ID'])

In [31]:
# Preview the first rows of the loaded frame as a quick sanity check.
data.head()

Unnamed: 0,Gender,Age,Hypertension,Heart Disease,Ever Married,Work,Residence,AVG Glucose,BMI,Smoking,Stroke
0,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1


# **Divide Data**

In [32]:
# Name of the label column; it is split off from the features below.
target_column = 'Stroke'
# Seed handed to train_test_split so the train/test partition is repeatable.
random_seed = 42

In [33]:
# Features are every column except the target; the target is kept as its own Series.
X, y = data.drop(columns=target_column), data[target_column]

In [34]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=random_seed,
)

# **Impute Missing Values**

In [35]:
from feature_engine.imputation import ArbitraryNumberImputer

In [36]:
# Replace missing BMI values with the arbitrary number 99.
transformer = ArbitraryNumberImputer(
    variables=['BMI'],
    arbitrary_number=99,
)

# Fit on the training split only. fit() returns the fitted imputer itself
# (scikit-learn convention), so `Xt` is the transformer, not transformed data;
# the name is kept in case later cells refer to it.
Xt = transformer.fit(X_train)

# transform() returns a new DataFrame and leaves its input untouched, so the
# results must be assigned to be usable. X_train / X_test themselves still
# contain the missing BMI values; use the *_imputed frames downstream.
X_train_imputed = Xt.transform(X_train)
X_test_imputed = Xt.transform(X_test)

# Display the imputed test split as the cell output.
X_test_imputed

Unnamed: 0,Gender,Age,Hypertension,Heart Disease,Ever Married,Work,Residence,AVG Glucose,BMI,Smoking
4688,Male,31.0,0,0,No,Self-employed,Rural,64.85,23.0,Unknown
4478,Male,40.0,0,0,Yes,Self-employed,Rural,65.29,28.3,never smoked
3521,Male,52.0,0,0,Yes,Private,Rural,111.04,30.0,never smoked
4355,Female,79.0,1,0,Yes,Self-employed,Rural,76.64,19.5,never smoked
3826,Female,75.0,0,0,Yes,Govt_job,Rural,94.77,27.2,never smoked
...,...,...,...,...,...,...,...,...,...,...
818,Male,20.0,0,0,No,Govt_job,Rural,106.97,27.9,formerly smoked
4829,Male,66.0,0,0,Yes,Private,Rural,67.92,31.1,formerly smoked
611,Male,42.0,0,0,Yes,Govt_job,Urban,93.79,27.2,never smoked
3082,Female,57.0,0,0,Yes,Private,Rural,69.40,24.0,Unknown
