In [2]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# LOADING DATA AND EDA

In [3]:
# Read the stroke dataset CSV from the Kaggle input directory into a DataFrame.
df = pd.read_csv(
    filepath_or_buffer='/kaggle/input/full-filled-brain-stroke-dataset/full_data.csv'
)

In [4]:
# Preview the first few rows of the loaded frame.
df.head()

In [5]:
# Print a concise summary of the frame (columns, dtypes, non-null counts).
df.info()

In [6]:
# Descriptive statistics for the frame's columns.
df.describe()

# VISUALIZATION

* CATEGORICAL VISUALIZATION

In [7]:
# Show the column labels available for the plots below.
df.columns

In [8]:
# Number of rows per value of the target column `stroke`.
sns.countplot(data=df, x='stroke')

In [9]:
# Row counts per gender, with bars split by the `stroke` label.
sns.countplot(data=df, x='gender', hue='stroke')

In [10]:
# Row counts per marital-history value, with bars split by the `stroke` label.
sns.countplot(data=df, x='ever_married', hue='stroke')

In [11]:
# Tally how many rows fall under each distinct `work_type` value.
df['work_type'].value_counts()

In [12]:
# Row counts per work type, with bars split by the `stroke` label.
sns.countplot(data=df, x='work_type', hue='stroke')

In [13]:
# Row counts per residence type, with bars split by the `stroke` label.
sns.countplot(data=df, x='Residence_type', hue='stroke')

In [14]:
# Row counts per smoking status, with bars split by the `stroke` label.
sns.countplot(data=df, x='smoking_status', hue='stroke')

* NUMERICAL VISUALIZATION

In [15]:
# Correlate the remaining columns with each other, keep only the `stroke`
# column of that matrix, and render it as an annotated heatmap.
excluded_cols = ['gender', 'hypertension', 'heart_disease', 'ever_married',
                 'work_type', 'Residence_type', 'smoking_status']
stroke_corr = df.drop(columns=excluded_cols).corr()[['stroke']]
sns.heatmap(stroke_corr, annot=True)

In [16]:
# Distribution of `age` with a KDE overlay, coloured by the `stroke` label.
sns.histplot(data=df, x='age', hue='stroke', kde=True)

In [17]:
# Distribution of `bmi` with a KDE overlay, coloured by the `stroke` label.
sns.histplot(data=df, x='bmi', hue='stroke', kde=True)

In [18]:
# Distribution of `avg_glucose_level` with a KDE overlay, coloured by the `stroke` label.
sns.histplot(data=df, x='avg_glucose_level', hue='stroke', kde=True)

# DATA PREPROCESSING

In [19]:
from sklearn.preprocessing import StandardScaler

In [20]:
# Split the predictors into two groups: the columns treated as categorical,
# and everything else except the `stroke` target (scaled later as numeric).
categorical_cols = ['gender', 'hypertension', 'heart_disease', 'ever_married',
                    'work_type', 'Residence_type', 'smoking_status']
X_cat = df[categorical_cols]
X_num = df.drop(columns=categorical_cols + ['stroke'])

In [21]:
# Replace the categorical frame with its indicator (dummy) encoded version.
X_cat = pd.get_dummies(data=X_cat)

In [22]:
# Preview the first few rows of the encoded categorical features.
X_cat.head()

In [23]:
# Standardize the numeric columns, then join them with the encoded
# categoricals to form the feature matrix X; the target y is `stroke`.
# NOTE(review): the scaler is fitted on every row, before the train/test
# split performed later in the notebook, so test-row statistics feed into
# the scaling. Consider fitting on the training rows only.
scaler = StandardScaler().fit(X_num)
X_scaled = pd.DataFrame(
    scaler.transform(X_num),
    columns=X_num.columns,
    index=X_num.index,
)
X = pd.concat([X_scaled, X_cat], axis=1)
y = df['stroke']

In [24]:
# Preview the first few rows of the assembled feature matrix.
X.head()

# MODEL BUILDING

In [25]:
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.metrics import classification_report

In [26]:
# Hold out 30% of the rows for evaluation, with a fixed seed for repeatability.
# Stratify on the target so both partitions keep the same stroke / no-stroke
# ratio; an unstratified random split can leave the test set with a skewed
# share of positive cases, which distorts the per-class metrics.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=1, stratify=y
)

In [27]:
# Fit an XGBoost classifier with library-default hyperparameters on the
# training partition.
model_xgb = XGBClassifier()
model_xgb.fit(X=X_train, y=y_train)

In [28]:
# Predict labels for the held-out rows and print the text classification
# report comparing them with the true labels.
pred = model_xgb.predict(X_test)
report = classification_report(y_true=y_test, y_pred=pred)
print(report)

# MODEL FEATURE IMPORTANCES

In [29]:
# Horizontal bar chart of the fitted model's importance score for each
# feature column. Uses the explicit figure/axes interface and labels the
# chart so it can be read on its own.
fig, ax = plt.subplots(figsize=(15, 8))
ax.barh(X.columns, model_xgb.feature_importances_)
ax.set_xlabel('Importance')
ax.set_ylabel('Feature')
ax.set_title('XGBoost feature importances')
plt.show()