In [1]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Read the stroke records from the CSV file (path is relative to the working directory).
source_csv = 'brain_stroke.csv'
dataset = pd.read_csv(source_csv)

In [3]:
# Show the loaded frame; the notebook renders a truncated preview (first and last rows).
dataset

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
2,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
3,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1
4,Male,81.0,0,0,Yes,Private,Urban,186.21,29.0,formerly smoked,1
...,...,...,...,...,...,...,...,...,...,...,...
4976,Male,41.0,0,0,No,Private,Rural,70.15,29.8,formerly smoked,0
4977,Male,40.0,0,0,Yes,Private,Urban,191.15,31.1,smokes,0
4978,Female,45.0,1,0,Yes,Govt_job,Rural,95.02,31.8,smokes,0
4979,Male,40.0,0,0,Yes,Private,Rural,83.94,30.0,smokes,0


In [4]:
# List the column labels of the loaded frame.
dataset.columns

Index(['gender', 'age', 'hypertension', 'heart_disease', 'ever_married',
       'work_type', 'Residence_type', 'avg_glucose_level', 'bmi',
       'smoking_status', 'stroke'],
      dtype='object')

In [5]:
# Print each column's dtype and non-null count, plus the frame's memory usage.
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4981 entries, 0 to 4980
Data columns (total 11 columns):
gender               4981 non-null object
age                  4981 non-null float64
hypertension         4981 non-null int64
heart_disease        4981 non-null int64
ever_married         4981 non-null object
work_type            4981 non-null object
Residence_type       4981 non-null object
avg_glucose_level    4981 non-null float64
bmi                  4981 non-null float64
smoking_status       4981 non-null object
stroke               4981 non-null int64
dtypes: float64(3), int64(3), object(5)
memory usage: 428.2+ KB


In [6]:
# Tally the rows per distinct value of the `stroke` column.
dataset["stroke"].value_counts()

0    4733
1     248
Name: stroke, dtype: int64

In [7]:
# Count the missing entries in each column (isna is the same check as isnull).
dataset.isna().sum()

gender               0
age                  0
hypertension         0
heart_disease        0
ever_married         0
work_type            0
Residence_type       0
avg_glucose_level    0
bmi                  0
smoking_status       0
stroke               0
dtype: int64

In [8]:
# Drop the columns this analysis does not use before preprocessing starts.
# NOTE(review): the original rationale was that these features are unrelated
# to the label; nothing in this notebook verifies that -- confirm.
# errors="ignore" keeps the cell idempotent: because it overwrites `dataset`
# with its own result, re-running it used to raise KeyError once the columns
# were already gone.
dataset = dataset.drop(
    columns=["ever_married", "work_type", "Residence_type"],
    errors="ignore",
)

In [9]:
# Preview the first rows of the frame after the column drop.
dataset.head()

Unnamed: 0,gender,age,hypertension,heart_disease,avg_glucose_level,bmi,smoking_status,stroke
0,Male,67.0,0,1,228.69,36.6,formerly smoked,1
1,Male,80.0,0,1,105.92,32.5,never smoked,1
2,Female,49.0,0,0,171.23,34.4,smokes,1
3,Female,79.0,1,0,174.12,24.0,never smoked,1
4,Male,81.0,0,0,186.21,29.0,formerly smoked,1


In [10]:
# Partition the column names by dtype: object-typed columns are treated as
# qualitative, every other dtype as quantitative. Each column name is printed,
# followed by the bucket it was placed in.
quan, qual = [], []
for columnName in dataset.columns:
    print(columnName)
    is_object = dataset[columnName].dtype == 'O'
    bucket_label, bucket = ("qual", qual) if is_object else ("quan", quan)
    print(bucket_label)
    bucket.append(columnName)

gender
qual
age
quan
hypertension
quan
heart_disease
quan
avg_glucose_level
quan
bmi
quan
smoking_status
qual
stroke
quan


In [11]:
# Names of the columns classified as quantitative (non-object dtype).
quan

['age', 'hypertension', 'heart_disease', 'avg_glucose_level', 'bmi', 'stroke']

In [12]:
# Names of the columns classified as qualitative (object dtype).
qual

['gender', 'smoking_status']

In [13]:
# Build a summary table for the quantitative columns: central tendency,
# quartiles, the 99th percentile, and the IQR-based outlier fences.
# Rows are the statistics; columns are the entries of `quan`.
stat_names = ["Mean","Median","Mode","Q1=25%","Q2=50%",
              "Q3=75%","99%","Q4=100%","IQR","1.5rule","Lesser","Greater","Min","Max"]
descriptive = pd.DataFrame(index=stat_names, columns=quan)
for columnName in quan:
    values = dataset[columnName]
    # Compute the quartiles once per column instead of re-running
    # dataset.describe() over the whole frame for every statistic.
    q1, q2, q3 = values.quantile([0.25, 0.50, 0.75])
    iqr = q3 - q1
    fence = 1.5 * iqr
    stats = {
        "Mean": values.mean(),
        "Median": values.median(),
        "Mode": values.mode()[0],  # first mode when several values tie
        "Q1=25%": q1,
        "Q2=50%": q2,
        "Q3=75%": q3,
        "99%": np.percentile(values, 99),
        "Q4=100%": values.max(),
        "IQR": iqr,
        "1.5rule": fence,
        "Lesser": q1 - fence,   # lower outlier fence
        "Greater": q3 + fence,  # upper outlier fence
        "Min": values.min(),
        "Max": values.max(),
    }
    # Write with a single .loc[row, column] lookup. The chained form
    # descriptive[col][row] = value assigns through a temporary Series, which
    # raises SettingWithCopy warnings and leaves the frame unchanged when
    # pandas copy-on-write is active.
    for stat_name, value in stats.items():
        descriptive.loc[stat_name, columnName] = value

In [14]:
# Display the summary-statistics table for the quantitative columns.
descriptive

Unnamed: 0,age,hypertension,heart_disease,avg_glucose_level,bmi,stroke
Mean,43.4199,0.0961654,0.0552098,105.944,28.4982,0.0497892
Median,45.0,0.0,0.0,91.85,28.1,0.0
Mode,78.0,0.0,0.0,93.88,28.7,0.0
Q1=25%,25.0,0.0,0.0,77.23,23.7,0.0
Q2=50%,45.0,0.0,0.0,91.85,28.1,0.0
Q3=75%,61.0,0.0,0.0,113.86,32.6,0.0
99%,82.0,1.0,1.0,240.82,45.5,1.0
Q4=100%,82.0,1.0,1.0,271.74,48.9,1.0
IQR,36.0,0.0,0.0,36.63,8.9,0.0
1.5rule,54.0,0.0,0.0,54.945,13.35,0.0


In [15]:
from sklearn.impute import SimpleImputer

# Mean-impute missing values in the quantitative columns (numpy is already
# imported as `np` in the notebook's import cell).
# NOTE(review): `quan` also holds what appear to be 0/1 flag columns and the
# `stroke` column; a mean fill would put fractional values there if they ever
# had gaps -- confirm this is intended.
imp = SimpleImputer(missing_values=np.nan, strategy='mean')
imputed_values = imp.fit_transform(dataset[quan])
# Reuse dataset's index so this frame stays row-aligned with the qualitative
# columns when the two are concatenated on axis=1. Without it the frame gets
# a fresh 0..n-1 index.
df = pd.DataFrame(imputed_values, columns=quan, index=dataset.index)

In [16]:
# Inspect the imputed quantitative frame.
df

Unnamed: 0,age,hypertension,heart_disease,avg_glucose_level,bmi,stroke
0,67.0,0.0,1.0,228.69,36.6,1.0
1,80.0,0.0,1.0,105.92,32.5,1.0
2,49.0,0.0,0.0,171.23,34.4,1.0
3,79.0,1.0,0.0,174.12,24.0,1.0
4,81.0,0.0,0.0,186.21,29.0,1.0
...,...,...,...,...,...,...
4976,41.0,0.0,0.0,70.15,29.8,0.0
4977,40.0,0.0,0.0,191.15,31.1,0.0
4978,45.0,1.0,0.0,95.02,31.8,0.0
4979,40.0,0.0,0.0,83.94,30.0,0.0


In [17]:
# Keep only the qualitative (object-dtype) columns, selected by label.
cate = dataset.loc[:, qual]

In [18]:
# Inspect the qualitative columns.
cate

Unnamed: 0,gender,smoking_status
0,Male,formerly smoked
1,Male,never smoked
2,Female,smokes
3,Female,never smoked
4,Male,formerly smoked
...,...,...
4976,Male,formerly smoked
4977,Male,smokes
4978,Female,smokes
4979,Male,smokes


In [19]:
# Collect the quantitative (imputed) and qualitative frames to be merged.

two=[df,cate]

In [20]:
# Join the two frames side by side (column-wise); rows are matched on the index.
preprocessed = pd.concat(two, axis="columns")

In [21]:
# Inspect the merged frame.
preprocessed

Unnamed: 0,age,hypertension,heart_disease,avg_glucose_level,bmi,stroke,gender,smoking_status
0,67.0,0.0,1.0,228.69,36.6,1.0,Male,formerly smoked
1,80.0,0.0,1.0,105.92,32.5,1.0,Male,never smoked
2,49.0,0.0,0.0,171.23,34.4,1.0,Female,smokes
3,79.0,1.0,0.0,174.12,24.0,1.0,Female,never smoked
4,81.0,0.0,0.0,186.21,29.0,1.0,Male,formerly smoked
...,...,...,...,...,...,...,...,...
4976,41.0,0.0,0.0,70.15,29.8,0.0,Male,formerly smoked
4977,40.0,0.0,0.0,191.15,31.1,0.0,Male,smokes
4978,45.0,1.0,0.0,95.02,31.8,0.0,Female,smokes
4979,40.0,0.0,0.0,83.94,30.0,0.0,Male,smokes


In [22]:
# Persist the merged, imputed frame. The earlier call wrote `dataset`, so the
# `preprocessed` frame never reached the file. Columns are put back in
# dataset's order so positional readers of the CSV see the same layout as
# before; numeric columns are written as floats because the imputer output is
# float. The result is not bound to a name: to_csv returns None when given a
# path.
preprocessed[dataset.columns].to_csv("Preprocessed.csv", index=False)