<a href="https://colab.research.google.com/github/MohamedElhadidy99/Stroke-prediction-using-Python/blob/main/stroke.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


In [None]:
# Read the dataset CSV that the user uploads through the Colab file picker
import io

from google.colab import files

uploaded = files.upload()  # indexed below by uploaded file name to get the raw content

# Prefer the expected file name, but fall back to the single uploaded CSV so
# that an upload saved under a different name does not end in a bare KeyError.
csv_name = 'stroke dataset.csv'
if csv_name not in uploaded:
    csv_candidates = [name for name in uploaded if name.lower().endswith('.csv')]
    if len(csv_candidates) != 1:
        raise KeyError(
            "Expected an upload named {!r}; got: {}".format(csv_name, sorted(uploaded)))
    csv_name = csv_candidates[0]

df = pd.read_csv(io.BytesIO(uploaded[csv_name]))


In [None]:
# Know the shape and column names of the dataset.
# A notebook cell only echoes its last expression, so the shape is printed
# explicitly; the column index is shown as the cell output.
print(df.shape)
df.columns

In [None]:
# Preview the first rows of the freshly loaded dataset
df.head()

In [None]:
# Summary statistics of the dataset's columns
df.describe()

In [None]:
# Check missing values: number of NaN entries per column
df.isna().sum()

In [None]:
# Impute missing BMI values with the mean of the observed BMI values
bmi_mean = df['bmi'].mean()
df['bmi'] = df['bmi'].fillna(bmi_mean)

In [None]:
# Preview the data again after the BMI imputation
df.head()

In [None]:
# Change the age column to integer, showing the dtypes before and after.
# Only the last expression of a cell is echoed, so the dtypes are printed.
print(df.dtypes)  # before the cast
# Going through float first, then int, drops any fractional part of the age
df["age"] = df["age"].astype(float).astype(int)
print(df.dtypes)  # after the cast
df.head()

In [None]:
# Some statistics and percentages
# Gender shares before filtering (printed: only the last expression is echoed)
print(df['gender'].value_counts(normalize=True))

# Drop the 'Other' gender rows. `.copy()` makes the result an independent
# frame, so later column assignments on `df` do not write into a slice of the
# original (which triggers pandas' SettingWithCopyWarning).
df = df[df.gender != 'Other'].copy()

# Gender shares after filtering
df['gender'].value_counts(normalize=True)

In [None]:
# Share of each work type (normalize=True returns proportions instead of counts)
df['work_type'].value_counts(normalize=True)

In [None]:
# Share of each smoking status (normalize=True returns proportions instead of counts)
df['smoking_status'].value_counts(normalize=True)

In [None]:
#exploratory analysis
def barPerc(df,xVar,ax):
    '''
    barPerc(): Add percentage for hues to bar plots

    Within every X category, each hue bar is labelled with its share of the
    summed bar heights of that category. Bars whose height is NaN are left
    unlabelled and ignored in the sum; a category whose bars sum to zero is
    skipped entirely (there is no meaningful percentage to show).

    args:
        df: pandas dataframe
        xVar: (string) X variable
        ax: Axes object (for Seaborn Countplot/Bar plot or
                         pandas bar plot)
    returns:
        None; the labels are added to `ax` via `ax.text`.
    '''
    # 1. how many X categories
    ##   NaN is the only value that is not equal to itself, so `x == x`
    ##   filters NaN categories out
    numX = len([x for x in df[xVar].unique() if x == x])

    # 2. The bars are created in hue order, organize them
    bars = ax.patches
    ## 2a. For each X variable
    for ind in range(numX):
        ## 2b. Get every hue bar
        ##     ex. 8 X categories, 4 hues =>
        ##    [0, 8, 16, 24] are hue bars for 1st X category
        hueBars = bars[ind:][::numX]
        heights = [bar.get_height() for bar in hueBars]
        ## 2c. Get the total height (for percentages), ignoring NaN heights
        ##     so that one missing bar does not turn every label into "nan%"
        total = sum(h for h in heights if h == h)
        if total == 0:
            # nothing to divide by: avoids ZeroDivisionError / "nan%" labels
            continue

        # 3. Print the percentage on the bars
        for bar, height in zip(hueBars, heights):
            if height != height:
                # NaN height: there is no bar to label
                continue
            ax.text(bar.get_x() + bar.get_width()/2.,
                    height,
                    f'{height/total:.0%}',
                    ha="center",va="bottom")
def _stroke_countplot(data, hue, figsize=(7, 7), annotate=True):
    """Draw a countplot of `stroke` split by `hue`, show it, and return its Axes.

    args:
        data: pandas dataframe holding the 'stroke' column and the `hue` column
        hue: (string) column used to split the bars of each stroke value
        figsize: (width, height) of the new figure
        annotate: when True, label the bars with percentages via barPerc()
    returns:
        the Axes object returned by sns.countplot
    """
    plt.figure(figsize=figsize)
    sns.set(style="darkgrid")
    axes = sns.countplot(x='stroke', hue=hue, data=data)
    if annotate:
        barPerc(data, 'stroke', axes)
    plt.show()
    return axes


# One figure per candidate risk factor; the variable names of the original
# cell are kept so that any later reference to them still resolves.
stroke_smoke = _stroke_countplot(df, 'smoking_status', figsize=(8, 8))
stroke_gender = _stroke_countplot(df, 'gender')
stroke_hypertension = _stroke_countplot(df, 'hypertension')
stroke_heart_disease = _stroke_countplot(df, 'heart_disease')
stroke_ever_married = _stroke_countplot(df, 'ever_married')
# work_type is drawn without percentage labels, exactly as before
stroke_work_type = _stroke_countplot(df, 'work_type', annotate=False)
stroke_Residence_type = _stroke_countplot(df, 'Residence_type')

In [None]:
#Stroke distribution by age
# Two overlaid age densities: rows with stroke == 1 (red) and stroke == 0 (blue)
fig = plt.figure(figsize=(10,8))
ax = fig.add_subplot(111)
ax.get_yaxis().set_visible(False)  # hide the density axis

# Colour-coded text labels are used in place of a legend
label_style = {'size': '20', 'weight': 'bold'}
ax.text(-5, 0.03, 'Stroke', {**label_style, 'color': 'red'}, alpha=0.9)
ax.text(15, 0.03, 'Healthy', {**label_style, 'color': 'blue'}, alpha=0.9)

ax.set_title('Stroke distribution by Age', size=15)
# `fill=True` is the current spelling of the deprecated `shade=True`;
# `ax=ax` draws on the axes created above instead of relying on pyplot state.
sns.kdeplot(data=df[df.stroke == 1],
            x='age', fill=True, color='red', ax=ax)
sns.kdeplot(data=df[df.stroke == 0],
            x='age', fill=True, color='blue', alpha=0.7, ax=ax)
plt.show()

In [None]:
#measure association between stroke and different variables
!pip install researchpy
import researchpy
#to obtain degrees of freedom
# Contingency table of heart_disease (rows) against stroke (columns).
# NOTE(review): min(rows, cols) - 1 is the degrees-of-freedom figure used when
# interpreting Cramer's V; the chi-square test itself uses
# (rows - 1) * (cols - 1). Both are equal for a 2 x 2 table -- confirm which
# one is intended before reusing this for larger tables.
contTable=pd.crosstab(df['heart_disease'], df['stroke'])
deg_freedom = min(contTable.shape[0], contTable.shape[1]) - 1
deg_freedom


In [None]:
#measure association between stroke and heart disease
# (chi-square test requested via test='chi-square'; the second return value is displayed)
crosstab, res = researchpy.crosstab(df['heart_disease'], df['stroke'], test='chi-square')
res

In [None]:
#measure association between stroke and hypertension
# (chi-square test requested via test='chi-square'; the second return value is displayed)
crosstab, res = researchpy.crosstab(df['hypertension'], df['stroke'], test='chi-square')
res

In [None]:
#measure association between stroke and gender
# (chi-square test requested via test='chi-square'; the second return value is displayed)
crosstab, res = researchpy.crosstab(df['gender'], df['stroke'], test='chi-square')
res

In [None]:
#measure association between stroke and marital status
# (chi-square test requested via test='chi-square'; the second return value is displayed)
crosstab, res = researchpy.crosstab(df['ever_married'], df['stroke'], test='chi-square')
res

In [None]:
#measure association between stroke and work type
# (chi-square test requested via test='chi-square'; the second return value is displayed)
crosstab, res = researchpy.crosstab(df['work_type'], df['stroke'], test='chi-square')
res

In [None]:
# First 20 rows, for reference before the categorical columns are recoded
df.head(20)

In [None]:
#recoding categorical variables
from sklearn import preprocessing

# Columns stored with the generic object dtype are treated as categorical
cat_columns = [cname for cname in df.columns
                        if df[cname].dtype == 'object']

# Replace each categorical column with integer codes. A separate fitted
# encoder is kept per column so that the code -> label mapping stays
# recoverable (label_encoders[col].classes_ / .inverse_transform); refitting a
# single shared encoder would keep only the mapping of the last column.
label_encoders = {}
for col in cat_columns:
    encoder = preprocessing.LabelEncoder()
    df[col] = encoder.fit_transform(df[col])
    label_encoders[col] = encoder

df.head(20)

In [None]:
# Fitting different models
# import necessary modules
from sklearn import metrics
from sklearn.metrics import precision_score, f1_score, recall_score, classification_report, confusion_matrix,roc_curve
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression

# Target is the stroke column; every remaining column is used as a feature.
# NOTE(review): any identifier-like column still present in df would be fed to
# the models as a feature -- confirm the column list.
y = df['stroke']
x = df.drop(columns=['stroke'])

# Hold out 20% of the rows as a test set (fixed seed for repeatability)
X_train, X_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42)

# first, logistic regression model
# Candidate values for C: 15 log-spaced points between 1e-5 and 1e8
c_space = np.logspace(-5, 8, 15)
param_grid = {'C': c_space}
logreg = LogisticRegression(max_iter=10000)
# 5-fold grid search over C, fitted on the training split only
logreg_cv = GridSearchCV(logreg, param_grid, cv=5)
logreg_cv.fit(X_train, y_train)



In [None]:
# Measure the cross-validated ROC AUC of the logistic regression model.
# logreg_cv is itself a grid search, so the search is re-fitted inside each fold.
logreg_auc_scores = cross_val_score(logreg_cv, X_train, y_train, scoring='roc_auc')
logreg_auc = logreg_auc_scores.mean()
print("AUC of logistic regression model: {}".format(logreg_auc))


In [None]:
# Score of the logistic regression model on the held-out test split
y_predict_log = logreg_cv.predict(X_test)
logreg_test_score = logreg_cv.score(X_test, y_test)
print("score of logistic regression model: {}".format(logreg_test_score))

In [None]:
# ROC curve for the logistic regression model
# Probability of the second class (column 1 of predict_proba) for each test row
y_pred_prob = logreg_cv.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob)

roc_fig, roc_ax = plt.subplots()
roc_ax.plot([0, 1], [0, 1], 'k--')  # diagonal reference line
roc_ax.plot(fpr, tpr)
roc_ax.set_xlabel('False Positive Rate')
roc_ax.set_ylabel('True Positive Rate')
roc_ax.set_title('ROC Curve')
plt.show()


In [None]:
# import KNN and figure out its best parameters
from sklearn.neighbors import KNeighborsClassifier

grid_params = {'n_neighbors': [5, 7, 9, 11, 13, 15],
               'weights': ['uniform', 'distance'],
               'metric': ['minkowski', 'euclidean', 'manhattan'],
               'algorithm': ['auto', 'brute', 'kd_tree', 'ball_tree']}
knn = KNeighborsClassifier()
knn_cv = GridSearchCV(knn, grid_params, cv=5)
# Tune on the training split only. Fitting the search on the full x, y would
# let the held-out test rows influence the chosen hyper-parameters, and would
# be inconsistent with how the logistic regression search is fitted.
knn_cv.fit(X_train, y_train)
print("Tuned KNN Parameters: {}".format(knn_cv.best_params_))

In [None]:
# Use the KNN model and measure its score on the held-out test split.
# NOTE(review): the hyper-parameters are hard-coded, presumably copied from a
# grid-search run -- confirm they still match knn_cv.best_params_.
KNN = KNeighborsClassifier(
    n_neighbors=5,
    weights='uniform',
    metric='minkowski',
    algorithm='auto',
)
KNN.fit(X_train, y_train)
y_pred_knn = KNN.predict(X_test)
knn_test_score = KNN.score(X_test, y_test)
print("score of KNN model: {}".format(knn_test_score))

In [None]:
# Measure the cross-validated ROC AUC of the KNN model on the training split
knn_auc_scores = cross_val_score(KNN, X_train, y_train, scoring='roc_auc')
knn_auc = knn_auc_scores.mean()
print("AUC of knn model: {}".format(knn_auc))

In [None]:
# ROC curve for the KNN model
# Probability of the second class (column 1 of predict_proba) for each test row
y_pred_prob_knn = KNN.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob_knn)

roc_fig, roc_ax = plt.subplots()
roc_ax.plot([0, 1], [0, 1], 'k--')  # diagonal reference line
roc_ax.plot(fpr, tpr)
roc_ax.set_xlabel('False Positive Rate')
roc_ax.set_ylabel('True Positive Rate')
roc_ax.set_title('ROC Curve')
plt.show()


In [None]:
# Random forest model: search for its best parameters
from sklearn.ensemble import RandomForestClassifier

# Seeded so that the search gives the same result on every run
rf = RandomForestClassifier(random_state=1)
# 'auto' is no longer listed under max_features: scikit-learn >= 1.3 rejects
# it, and for a classifier it was only an alias of 'sqrt'.
param_grids_RF = {'n_estimators': [100, 200, 300, 400, 500, 600],
                  'criterion': ['gini', 'entropy'],
                  'max_features': ['sqrt', 'log2']}
rf1 = GridSearchCV(rf, param_grids_RF, cv=5)
# Tune on the training split only, so the held-out test rows cannot influence
# the chosen hyper-parameters.
rf1.fit(X_train, y_train)
print("Tuned RF Parameters: {}".format(rf1.best_params_))

In [None]:
# Apply the Random Forest model and measure its score on the held-out test split.
# NOTE(review): the hyper-parameters are hard-coded, presumably copied from a
# grid-search run -- confirm they still match rf1.best_params_.
RF = RandomForestClassifier(
    n_estimators=300,
    criterion='gini',
    max_features='log2',
    random_state=1,
)
RF.fit(X_train, y_train)
y_pred_RF = RF.predict(X_test)
rf_test_score = RF.score(X_test, y_test)
print("Score of RF model: {}".format(rf_test_score))

In [None]:
# Measure the cross-validated ROC AUC of the Random Forest model on the training split
rf_auc_scores = cross_val_score(RF, X_train, y_train, scoring='roc_auc')
RF_auc = rf_auc_scores.mean()
print("AUC of RF model: {}".format(RF_auc))

In [None]:
# ROC curve for the Random Forest model
# Probability of the second class (column 1 of predict_proba) for each test row
y_pred_prob_RF = RF.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob_RF)

roc_fig, roc_ax = plt.subplots()
roc_ax.plot([0, 1], [0, 1], 'k--')  # diagonal reference line
roc_ax.plot(fpr, tpr)
roc_ax.set_xlabel('False Positive Rate')
roc_ax.set_ylabel('True Positive Rate')
roc_ax.set_title('ROC Curve')
plt.show()
