### Import Libraries

In [1]:
import pandas as pd
import matplotlib as plt
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import mean_squared_error, r2_score, balanced_accuracy_score
from sklearn.metrics import confusion_matrix, classification_report, roc_auc_score
import os

### Load Clean DataSet

In [2]:
# Read the cleaned, already-encoded stroke dataset from the working directory
# and print its column / dtype / null-count summary.
df = pd.read_csv(filepath_or_buffer='clean-healthcare-dataset-stroke-data.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4908 entries, 0 to 4907
Data columns (total 18 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   age                             4908 non-null   float64
 1   hypertension                    4908 non-null   int64  
 2   heart_disease                   4908 non-null   int64  
 3   avg_glucose_level               4908 non-null   float64
 4   bmi                             4908 non-null   float64
 5   Male                            4908 non-null   int64  
 6   ever_married                    4908 non-null   int64  
 7   Urban                           4908 non-null   int64  
 8   work_type_Govt_job              4908 non-null   int64  
 9   work_type_Never_worked          4908 non-null   int64  
 10  work_type_Private               4908 non-null   int64  
 11  work_type_Self-employed         4908 non-null   int64  
 12  work_type_children              49

## About the DataSet

### **EXTREMELY UNBALANCED**

### The "cleaned and encoded" dataset contains 4908 records
### Only 209 (4.26%) of these are positive for stroke; the remaining 4699 (95.74%) are negative

### To compensate, the prediction models are trained on several less unbalanced samples (1:1, 1:2 and 1:3 positive-to-negative ratios)

### **Fortunately for the patients, but unfortunately for modelling, there might not be enough positive cases to predict stroke effectively**

### Training Models

In [3]:
# Registry of the classifiers under comparison. The keys double as the section
# titles printed in the results, so they are kept exactly as written.
models = {
    'Logistic Regression': LogisticRegression(),
    'Random Forest Clasifier': RandomForestClassifier(),
    'SVC': SVC(),
}

### Re-assign an int value to the scaled result

### DataFrame Dictionary with Proportional Positive and negative Samples

In [4]:

def create_sample_dfs_dictionary(df):
    """Build progressively less balanced frames from an imbalanced dataset.

    Rows whose ``stroke`` value equals the column maximum are treated as the
    positive class and rows equal to the column minimum as the negative class.
    Three negative samples, each as large as the positive class, are drawn
    with fixed seeds (1, 3, 5) so the result is reproducible.

    Every negative sample is drawn only from rows that an earlier sample has
    not already taken. Sampling the full negative pool three times
    independently would let the same row appear more than once in the
    'One to Two' / 'One to Three' frames, and such duplicates can end up on
    both sides of a later train/test split.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a ``stroke`` column.

    Returns
    -------
    dict[str, pandas.DataFrame]
        Keys 'One to One', 'One to Two' and 'One to Three'. Each value holds
        every positive row followed by one, two or three negative samples.
        Rows keep their labels from ``df`` (or a fresh 0..n-1 range when the
        index of ``df`` is not unique).

    Raises
    ------
    ValueError
        If there are fewer than three times as many negative rows as positive
        rows, so three disjoint samples cannot be drawn.
    """
    # The bookkeeping below removes sampled rows by label, which is only
    # correct when every row has its own label.
    if not df.index.is_unique:
        df = df.reset_index(drop=True)

    target = df['stroke']
    positives = df.loc[target == target.max()]
    remaining = df.loc[target == target.min()]
    sample_size = len(positives)

    if len(remaining) < 3 * sample_size:
        raise ValueError(
            f'Need at least {3 * sample_size} negative rows to draw three '
            f'disjoint samples of {sample_size}, but only {len(remaining)} are available'
        )

    negative_samples = []
    for seed in (1, 3, 5):
        drawn = remaining.sample(sample_size, random_state=seed)
        negative_samples.append(drawn)
        # Shrink the pool so the next sample cannot repeat these rows.
        remaining = remaining.drop(index=drawn.index)

    DataFrames = {}
    ratio_names = ('One to One', 'One to Two', 'One to Three')
    for how_many, key in enumerate(ratio_names, start=1):
        DataFrames[key] = pd.concat([positives, *negative_samples[:how_many]], axis=0)
    return DataFrames

### Print Explained Confusion Matrix

In [5]:
def print_confusion_matrix(y_test,prediction,Labels):
    """Print a labelled 2x2 confusion matrix for a binary classifier.

    ``sklearn.metrics.confusion_matrix`` returns a matrix whose rows are the
    true classes and whose columns are the predicted classes, both ordered as
    given in ``Labels``. The table header therefore reads ``Actual\\Predicted``:
    each printed row is one actual class, and the number in parentheses after
    the row name is how many samples actually belong to that class.

    Parameters
    ----------
    y_test : array-like
        True class labels.
    prediction : array-like
        Predicted class labels, aligned with ``y_test``.
    Labels : list
        Two class labels; the first is printed as "Positive" and the second
        as "Negative".
    """
    cm = confusion_matrix(y_test, prediction, labels=Labels)
    print('\033[1mConfusion Matrix\033[0m')
    # Rows are actual classes, columns are predictions (sklearn convention).
    print('Actual\\Predicted\tPositive\tNegative')
    print(f'Positive({cm[0][0]+cm[0][1]})\t\t{cm[0][0]}\t\t{cm[0][1]}')
    print(f'Negative({cm[1][0]+cm[1][1]})\t\t{cm[1][0]}\t\t{cm[1][1]}')

### RandomForestClassifier best depth calculator

In [6]:
def best_depth(X_train,X_test,y_train,y_test,max_search_depth=50,random_state=1):
    """Pick a ``max_depth`` for RandomForestClassifier by incremental search.

    Forests of depth 1, 2, 3, ... are fitted until the training balanced
    accuracy reaches 1 or ``max_search_depth`` is hit. Each depth is scored as
    ``test_score * (1 - (train_score - test_score))``, i.e. the test balanced
    accuracy discounted by the train/test gap, and the shallowest depth with
    the highest score is returned.

    NOTE(review): the score uses the ``X_test`` / ``y_test`` passed in, so the
    returned depth is tuned on that data; metrics later computed on the same
    split are optimistic.

    Parameters
    ----------
    X_train, y_train : array-like
        Data the candidate forests are fitted on.
    X_test, y_test : array-like
        Data used to score each candidate depth.
    max_search_depth : int, default 50
        Upper bound on the depths tried. Without a bound the search never
        terminates when the forest cannot reach a perfect training score.
    random_state : int or None, default 1
        Seed given to every candidate forest so that depths are compared on
        equal terms and the result is reproducible. Pass ``None`` for
        unseeded forests.

    Returns
    -------
    int
        The selected depth, always between 1 and ``max_search_depth``.

    Raises
    ------
    ValueError
        If ``max_search_depth`` is smaller than 1.
    """
    if max_search_depth < 1:
        raise ValueError('max_search_depth must be at least 1')

    train_score = 0
    depth = 0
    best = 0
    # Start below any reachable score so the first depth is always recorded;
    # otherwise an all-zero score run would return 0, an invalid max_depth.
    high_score = float('-inf')
    while train_score < 1 and depth < max_search_depth:
        depth += 1
        clf = RandomForestClassifier(max_depth=depth, random_state=random_state)
        clf.fit(X_train, y_train)

        train_pred = clf.predict(X_train)
        test_pred = clf.predict(X_test)

        train_score = balanced_accuracy_score(y_train, train_pred)
        test_score = balanced_accuracy_score(y_test, test_pred)
        score = test_score * (1 - (train_score - test_score))
        # Strict comparison keeps the shallowest depth on ties.
        if high_score < score:
            high_score = score
            best = depth
    return best


In [7]:
# NOTE(review): disabled leftover; Test_Results builds its own sample dictionary.
# DataFrames=create_sample_dfs_dictionary(df)
# Sanity check: print the registered model names in dictionary insertion order.
for name, Learning_model in models.items():
    print(name)

Logistic Regression
Random Forest Clasifier
SVC


### **train_test_split and ALL the tests**

In [8]:
def Test_Results(df):
    """Train every registered model on each balanced sample and print metrics.

    For each estimator in the module-level ``models`` dictionary and each frame
    returned by ``create_sample_dfs_dictionary(df)``, the sample is split with
    ``train_test_split(random_state=1)``, the model is fitted on the training
    part and evaluated twice: on the held-out test part and on the complete
    frame ``df`` ("Available Data").

    Random forests are not taken from ``models`` as-is: a depth is chosen with
    ``best_depth`` for every sample and a new, seeded forest of that depth is
    fitted, so repeated runs print the same numbers.

    NOTE(review): "Available Data" is all of ``df``, which includes the rows
    the model was trained on. The printed "Model Score" is ``model.score`` on
    the training split.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``stroke`` target column holding the labels 1 and 0; all
        other columns are used as features.

    Returns
    -------
    None
        Results are printed.
    """
    X_available = df.drop(columns='stroke')
    y_available = df['stroke']
    tab = '   '
    DataFrames = create_sample_dfs_dictionary(df)
    for name, Learning_model in models.items():
        print(f'\033[94m\033[1m{name}\033[0m')
        # Decide by estimator type, not by the dictionary key, so renaming a
        # key cannot silently switch depth tuning off.
        tune_depth = isinstance(Learning_model, RandomForestClassifier)
        # The loop variable is deliberately not called `df`: that would
        # shadow the function argument.
        for key, sample_df in DataFrames.items():
            X = sample_df.drop(columns='stroke')
            y = sample_df['stroke']
            X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)
            if tune_depth:
                Depth = best_depth(X_train, X_test, y_train, y_test)
                # Seeded so the reported metrics are reproducible.
                model = RandomForestClassifier(max_depth=Depth, random_state=1)
            else:
                model = Learning_model
            model.fit(X_train, y_train)
            predicted = model.predict(X_test)
            available_predict = model.predict(X_available)

            # Score the predictions with mse, r2 and balanced accuracy
            test_mse = mean_squared_error(y_test, predicted)
            test_r2 = r2_score(y_test, predicted)
            available_mse = mean_squared_error(y_available, available_predict)
            available_r2 = r2_score(y_available, available_predict)
            test_balanced_accuracy = balanced_accuracy_score(y_test, predicted)
            available_balanced_accuracy = balanced_accuracy_score(y_available, available_predict)

            #region Print Results
            if tune_depth:
                print(f'{tab}\033[96m\033[1m{key} ({len(sample_df)} items, depth of {Depth})\033[0m : ', end='')
            else:
                print(f'{tab}\033[96m\033[1m{key} ({len(sample_df)} items)\033[0m : ', end='')
            print(f'Model Score: {model.score(X_train,y_train):.4f}')
            print('\t\t\t\t\t\033[1mTest Data\tAvailable Data\t\tDifference\033[0m')
            print(f"{tab}{tab}Mean Squared Error (MSE):\t\t{test_mse:.4f}"+
                f"\t\t{available_mse:.4f}\t\t\t{abs(test_mse-available_mse):.4f}")
            print(f"{tab}{tab}R-squared (R2):\t\t\t{test_r2:.4f}\t\t{available_r2:.4f}"+
                f"\t\t\t{abs(test_r2-available_r2):.4f}")
            print(f"{tab}{tab}Balanced Accuracy:\t\t{test_balanced_accuracy:.4f}"+
                f"\t\t{available_balanced_accuracy:.4f}\t\t\t"+
                f"{abs(test_balanced_accuracy-available_balanced_accuracy):.4f}")
            print()
            print('\033[1mClassification Report on Test Data\033[0m')
            print(f'{classification_report(y_test, predicted,labels=[1,0])}')
            print('\033[1mClassification Report on Available Data\033[0m')
            print(f'{classification_report(y_available, available_predict,labels=[1,0])}')

            print_confusion_matrix(y_test, predicted, [1,0])

            print()
            #endregion

        print('***'*10)

### StandardScaler Scaled

In [9]:
# Standardise every column of the frame and rebuild a labelled DataFrame.
# NOTE(review): the scaler is fitted on the whole frame, so the 'stroke'
# target column is scaled together with the features.
ss = StandardScaler()
ss_scaled = ss.fit_transform(X=df)
ss_df = pd.DataFrame(data=ss_scaled, columns=df.columns)
ss_df

Unnamed: 0,age,hypertension,heart_disease,avg_glucose_level,bmi,Male,ever_married,Urban,work_type_Govt_job,work_type_Never_worked,work_type_Private,work_type_Self-employed,work_type_children,smoking_status_Unknown,smoking_status_formerly smoked,smoking_status_never smoked,smoking_status_smokes,stroke
0,1.069938,-0.318102,4.381499,2.777797,0.981145,1.200240,0.729270,0.985436,-0.383751,-0.067102,0.864071,-0.433030,-0.397953,-0.658022,2.206992,-0.778473,-0.420353,4.741651
1,1.646336,-0.318102,4.381499,0.014016,0.459086,1.200240,0.729270,-1.014779,-0.383751,-0.067102,0.864071,-0.433030,-0.397953,-0.658022,-0.453105,1.284565,-0.420353,4.741651
2,0.271847,-0.318102,-0.228232,1.484266,0.701016,-0.833166,0.729270,0.985436,-0.383751,-0.067102,0.864071,-0.433030,-0.397953,-0.658022,-0.453105,-0.778473,2.378956,4.741651
3,1.601998,3.143642,-0.228232,1.549325,-0.623231,-0.833166,0.729270,-1.014779,-0.383751,-0.067102,-1.157312,2.309308,-0.397953,-0.658022,-0.453105,1.284565,-0.420353,4.741651
4,1.690675,-0.318102,-0.228232,1.821493,0.013426,1.200240,0.729270,0.985436,-0.383751,-0.067102,0.864071,-0.433030,-0.397953,-0.658022,2.206992,-0.778473,-0.420353,4.741651
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4903,-1.324334,-0.318102,-0.228232,-0.049918,-1.310821,-0.833166,-1.371234,-1.014779,-0.383751,-0.067102,-1.157312,-0.433030,2.512858,1.519706,-0.453105,-0.778473,-0.420353,-0.210897
4904,1.690675,-0.318102,-0.228232,0.448045,1.414072,-0.833166,0.729270,0.985436,-0.383751,-0.067102,-1.157312,2.309308,-0.397953,-0.658022,-0.453105,1.284565,-0.420353,-0.210897
4905,-0.348890,-0.318102,-0.228232,-0.502181,0.217156,-0.833166,0.729270,-1.014779,-0.383751,-0.067102,-1.157312,2.309308,-0.397953,-0.658022,-0.453105,1.284565,-0.420353,-0.210897
4906,0.360524,-0.318102,-0.228232,1.373057,-0.419501,1.200240,0.729270,-1.014779,-0.383751,-0.067102,0.864071,-0.433030,-0.397953,-0.658022,2.206992,-0.778473,-0.420353,-0.210897


### Re-assign 1 and 0 to the scaled `stroke` target values

In [10]:
# After standardisation the target holds one positive and one negative value;
# map anything above zero back to 1 and everything else to 0.
ss_df['stroke'] = ss_df['stroke'].gt(0).astype('int64')
ss_df['stroke'].value_counts()

stroke
0    4699
1     209
Name: count, dtype: int64

In [11]:
# Evaluate all models on the StandardScaler-scaled frame.
Test_Results(df=ss_df)

[94m[1mLogistic Regression[0m
   [96m[1mOne to One (418 items)[0m : Model Score: 0.7572
					[1mTest Data	Available Data		Difference[0m
      Mean Squared Error (MSE):		0.1524		0.2934			0.1410
      R-squared (R2):			0.3810		-6.1964			6.5774
      Balanced Accuracy:		0.8524		0.7576			0.0948

[1mClassification Report on Test Data[0m
              precision    recall  f1-score   support

           1       0.79      0.89      0.84        46
           0       0.91      0.81      0.86        59

    accuracy                           0.85       105
   macro avg       0.85      0.85      0.85       105
weighted avg       0.85      0.85      0.85       105

[1mClassification Report on Available Data[0m
              precision    recall  f1-score   support

           1       0.11      0.81      0.19       209
           0       0.99      0.70      0.82      4699

    accuracy                           0.71      4908
   macro avg       0.55      0.76      0.51      4908
weighted 

   [96m[1mOne to One (418 items, depth of 3)[0m : Model Score: 0.7764
					[1mTest Data	Available Data		Difference[0m
      Mean Squared Error (MSE):		0.2286		0.4004			0.1718
      R-squared (R2):			0.0715		-8.8201			8.8916
      Balanced Accuracy:		0.7894		0.7543			0.0351

[1mClassification Report on Test Data[0m
              precision    recall  f1-score   support

           1       0.67      0.93      0.78        46
           0       0.93      0.64      0.76        59

    accuracy                           0.77       105
   macro avg       0.80      0.79      0.77       105
weighted avg       0.82      0.77      0.77       105

[1mClassification Report on Available Data[0m
              precision    recall  f1-score   support

           1       0.09      0.92      0.16       209
           0       0.99      0.59      0.74      4699

    accuracy                           0.60      4908
   macro avg       0.54      0.75      0.45      4908
weighted avg       0.96      0

### MinMaxScaler

In [12]:
# Rescale every column of the frame and rebuild a labelled DataFrame.
# NOTE(review): the scaler is fitted on the whole frame, target included; the
# 'stroke' column comes back as the floats 0.0 / 1.0.
mms = MinMaxScaler()
mms_scaled = mms.fit_transform(X=df)
mms_df = pd.DataFrame(data=mms_scaled, columns=df.columns)
mms_df

Unnamed: 0,age,hypertension,heart_disease,avg_glucose_level,bmi,Male,ever_married,Urban,work_type_Govt_job,work_type_Never_worked,work_type_Private,work_type_Self-employed,work_type_children,smoking_status_Unknown,smoking_status_formerly smoked,smoking_status_never smoked,smoking_status_smokes,stroke
0,0.816895,0.0,1.0,0.801265,0.301260,1.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
1,0.975586,0.0,1.0,0.234512,0.254296,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
2,0.597168,0.0,0.0,0.536008,0.276060,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
3,0.963379,1.0,0.0,0.549349,0.156930,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0
4,0.987793,0.0,0.0,0.605161,0.214204,1.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4903,0.157715,0.0,0.0,0.221402,0.095074,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0
4904,0.987793,0.0,0.0,0.323516,0.340206,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
4905,0.426270,0.0,0.0,0.128658,0.232532,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
4906,0.621582,0.0,0.0,0.513203,0.175258,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


In [13]:
# Evaluate all models on the MinMaxScaler-scaled frame.
Test_Results(df=mms_df)

[94m[1mLogistic Regression[0m
   [96m[1mOne to One (418 items)[0m : Model Score: 0.7572
					[1mTest Data	Available Data		Difference[0m
      Mean Squared Error (MSE):		0.1905		0.3052			0.1147
      R-squared (R2):			0.2262		-6.4863			6.7125
      Balanced Accuracy:		0.8185		0.7583			0.0602

[1mClassification Report on Test Data[0m
              precision    recall  f1-score   support

           1       0.73      0.89      0.80        46
           0       0.90      0.75      0.81        59

    accuracy                           0.81       105
   macro avg       0.82      0.82      0.81       105
weighted avg       0.83      0.81      0.81       105

[1mClassification Report on Available Data[0m
              precision    recall  f1-score   support

           1       0.11      0.83      0.19       209
           0       0.99      0.69      0.81      4699

    accuracy                           0.69      4908
   macro avg       0.55      0.76      0.50      4908
weighted 