### Import Libraries

In [20]:
import pandas as pd
import matplotlib as plt
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import mean_squared_error, r2_score, balanced_accuracy_score
from sklearn.metrics import confusion_matrix, classification_report, ConfusionMatrixDisplay
import os

### Load Clean DataSet

In [21]:
df=pd.read_csv('clean-healthcare-dataset-stroke-data.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4908 entries, 0 to 4907
Data columns (total 18 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   age                             4908 non-null   float64
 1   hypertension                    4908 non-null   int64  
 2   heart_disease                   4908 non-null   int64  
 3   avg_glucose_level               4908 non-null   float64
 4   bmi                             4908 non-null   float64
 5   Male                            4908 non-null   int64  
 6   ever_married                    4908 non-null   int64  
 7   Urban                           4908 non-null   int64  
 8   work_type_Govt_job              4908 non-null   int64  
 9   work_type_Never_worked          4908 non-null   int64  
 10  work_type_Private               4908 non-null   int64  
 11  work_type_Self-employed         4908 non-null   int64  
 12  work_type_children              49

## About the DataSet

### **EXTREMELY UNBALANCED**

### The "cleaned and encoded" dataset contains 4908 records
### Only 209 (4.25%) of these are positive for stroke and the rest (95.74%) are negative

### Will try to create models with different less unbalanced samples to run prediction models

### **Fortunately/Unfortunately there might not be enough positive cases to effectively predict**

### Training Models

In [22]:
models={}
models['Logistic Regression']=LogisticRegression()
models['Random Forest Clasifier']=RandomForestClassifier()
models['SVC']=SVC()

### Re-Assign and int value to the Scaled result

### DataFrame Dictionary with Proportional Positive and negative Samples

In [23]:

def create_sample_dfs_dictionary(df):
    Positives=df.loc[df['stroke']==df['stroke'].max()]
    Negative_sample_1=df.loc[df['stroke']==df['stroke'].min()] \
        .sample(len(Positives),random_state=1, ignore_index=True)
    Negative_sample_2=df.loc[df['stroke']==df['stroke'].min()] \
        .sample(len(Positives),random_state=3, ignore_index=True)
    Negative_sample_3=df.loc[df['stroke']==df['stroke'].min()] \
        .sample(len(Positives),random_state=5, ignore_index=True)
    DataFrames={}
    DataFrames['One to One']= pd.concat([Positives,Negative_sample_1], axis=0)
    DataFrames['One to Two']= pd.concat([Positives,Negative_sample_1, \
                                        Negative_sample_2], axis=0)
    DataFrames['One to Three']= pd.concat([Positives,Negative_sample_1, \
                                        Negative_sample_2,Negative_sample_3], axis=0)
    return DataFrames

### Print Explained Confusion Matrix

In [24]:
def print_confusion_matrix(y_test,prediction,Labels):
    cm=confusion_matrix(y_test,prediction,labels=Labels)
    print(f'\033[1mConfusion Matrix\033[0m')
    print(f'Predicted\\Actual\tPositive\tNegative')
    print(f'Positive({cm[0][0]+cm[0][1]})\t\t{cm[0][0]}\t\t{cm[0][1]}')
    print(f'Negative({cm[1][0]+cm[1][1]})\t\t{cm[1][0]}\t\t{cm[1][1]}')

### RandomForestClassifier best depth calculator

In [25]:
def best_depth(X_train,X_test,y_train,y_test):
    # print('Calculating best depth for RandomForestClassifier')
    train_score=0
    depth=0
    best=0
    hight_score=0
    while train_score<1:
        depth+=1
        clf = RandomForestClassifier(max_depth = depth)
        clf.fit(X_train, y_train)
        train_score = clf.score(X_train,y_train)
        test_score = clf.score(X_test,y_test)
        score=test_score*(1-(train_score-test_score))
        if hight_score<score:
            hight_score=score
            best=depth
    return best      


### Create/Update afile with th results

In [26]:
def append_results(df):
    if os.path.isfile('fidel-test-results-OneHotEncoded.csv'):
        df.to_csv('fidel-test-results-OneHotEncoded.csv', mode='a',index=False, header=False )
    else:
        df.to_csv('fidel-test-results-OneHotEncoded.csv', mode='w',index=False )

### **train_test_split and ALL the tests**

In [27]:
def Test_Results(df,compression):
    
    
    model_results={'Test_Model':[], 'Proportion':[], 'Compression':[], 'Model_Score':[],\
                   'Balanced Accuracy':[], 'Precision':[],'Recall':[],'f1-score':[]}
    X_available=df.copy().drop(columns='stroke')
    y_available=df['stroke']
    tab='   '
    DataFrames=create_sample_dfs_dictionary(df)
    for name, Learning_model in models.items():    
        print(f'\033[94m\033[1m{name}\033[0m')
        for key,df in DataFrames.items():
            X=df.copy().drop(columns='stroke')
            y=df['stroke']
            X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=1)
            if name=='Random Forest Clasifier':
                Depth=best_depth(X_train,X_test,y_train,y_test)
                Learning_model=RandomForestClassifier(max_depth=Depth)
            model=Learning_model
            model.fit(X_train,y_train)
            predicted = model.predict(X_test)
            available_predict=model.predict(X_available)

            # Score the predictions with mse and r2
            test_balanced_accuracy=balanced_accuracy_score(y_test, predicted)
            available_balanced_accuracy=balanced_accuracy_score(y_available, available_predict)
            tcr=classification_report(y_test, predicted, labels=[1,0],output_dict=True)
            acr=classification_report(y_available, available_predict,
                                      labels=[1,0],output_dict=True)
            #region Print Results
            if name=='Random Forest Clasifier':
                print(f'{tab}\033[96m\033[1m{key} ({len(df)} items, depth of {Depth})\033[0m : ', end='')
            else:
                print(f'{tab}\033[96m\033[1m{key} ({len(df)} items)\033[0m : ', end='')
            print(f'Model Score: {model.score(X_train,y_train):.4f}')
            print(f'\t\t\t\t\t\033[1mTest Data\tAvailable Data\t\tDifference\033[0m')
            print(f"{tab}{tab}Balanced Accuracy:\t\t{test_balanced_accuracy:.4f}"+
                f"\t\t\t{available_balanced_accuracy:.4f}\t\t"+
                f"{abs(test_balanced_accuracy-available_balanced_accuracy):.4f}")
            print(f"{tab}{tab}\033[1mNegative(0) results metrics\033[0m")
            print(f"{tab}{tab}{tab}Precision\t\t\t{tcr['0']['precision']:.4f}\t\t\t"+
                  f"{acr['0']['precision']:.4f}\t\t"+
                  f"{(tcr['0']['precision']-acr['0']['precision']):.4f}")
            print(f"{tab}{tab}{tab}Recall\t\t\t\t{tcr['0']['recall']:.4f}\t\t\t"+
                  f"{acr['0']['recall']:.4f}\t\t"+
                  f"{(tcr['0']['recall']-acr['0']['recall']):.4f}")
            print(f"{tab}{tab}{tab}f1-score(confidence)\t\t{tcr['0']['f1-score']:.4f}\t\t\t"+
                  f"{acr['0']['f1-score']:.4f}\t\t"+
                  f"{(tcr['0']['f1-score']-acr['0']['f1-score']):.4f}")
            print(f"{tab}{tab}\033[1mPositive(1) results metrics\033[0m")
            print(f"{tab}{tab}{tab}Precision\t\t\t{tcr['1']['precision']:.4f}\t\t\t"+
                  f"{acr['1']['precision']:.4f}\t\t"+
                  f"{(tcr['1']['precision']-acr['1']['precision']):.4f}")
            print(f"{tab}{tab}{tab}Recall\t\t\t\t{tcr['1']['recall']:.4f}\t\t\t"+
                  f"{acr['1']['recall']:.4f}\t\t"+
                  f"{(tcr['1']['recall']-acr['1']['recall']):.4f}")
            print(f"{tab}{tab}{tab}f1-score(confidence)\t\t{tcr['1']['f1-score']:.4f}\t\t\t"+
                  f"\033[33m\033[1m{acr['1']['f1-score']:.4f}\033[0m\t\t"+
                  f"{(tcr['1']['f1-score']-acr['1']['f1-score']):.4f}")
            print()
            #endregion

            model_results['Test_Model'].append(name)
            model_results['Proportion'].append(key)
            model_results['Compression'].append(compression)
            model_results['Model_Score'].append(model.score(X_train,y_train))
            model_results['Balanced Accuracy'].append(available_balanced_accuracy)
            model_results['Precision'].append(acr['1']['precision'])
            model_results['Recall'].append(acr['1']['recall'])
            model_results['f1-score'].append(acr['1']['f1-score'])
        print('***'*10)
    model_results_df=pd.DataFrame(model_results)
    append_results(model_results_df)
                    

### StandardScaler Scaled

In [28]:
ss=StandardScaler()
ss_scaled=ss.fit_transform(df)
ss_df=pd.DataFrame(ss_scaled, columns=df.columns)
ss_df

Unnamed: 0,age,hypertension,heart_disease,avg_glucose_level,bmi,Male,ever_married,Urban,work_type_Govt_job,work_type_Never_worked,work_type_Private,work_type_Self-employed,work_type_children,smoking_status_Unknown,smoking_status_formerly smoked,smoking_status_never smoked,smoking_status_smokes,stroke
0,1.069938,-0.318102,4.381499,2.777797,0.981145,1.200240,0.729270,0.985436,-0.383751,-0.067102,0.864071,-0.433030,-0.397953,-0.658022,2.206992,-0.778473,-0.420353,4.741651
1,1.646336,-0.318102,4.381499,0.014016,0.459086,1.200240,0.729270,-1.014779,-0.383751,-0.067102,0.864071,-0.433030,-0.397953,-0.658022,-0.453105,1.284565,-0.420353,4.741651
2,0.271847,-0.318102,-0.228232,1.484266,0.701016,-0.833166,0.729270,0.985436,-0.383751,-0.067102,0.864071,-0.433030,-0.397953,-0.658022,-0.453105,-0.778473,2.378956,4.741651
3,1.601998,3.143642,-0.228232,1.549325,-0.623231,-0.833166,0.729270,-1.014779,-0.383751,-0.067102,-1.157312,2.309308,-0.397953,-0.658022,-0.453105,1.284565,-0.420353,4.741651
4,1.690675,-0.318102,-0.228232,1.821493,0.013426,1.200240,0.729270,0.985436,-0.383751,-0.067102,0.864071,-0.433030,-0.397953,-0.658022,2.206992,-0.778473,-0.420353,4.741651
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4903,-1.324334,-0.318102,-0.228232,-0.049918,-1.310821,-0.833166,-1.371234,-1.014779,-0.383751,-0.067102,-1.157312,-0.433030,2.512858,1.519706,-0.453105,-0.778473,-0.420353,-0.210897
4904,1.690675,-0.318102,-0.228232,0.448045,1.414072,-0.833166,0.729270,0.985436,-0.383751,-0.067102,-1.157312,2.309308,-0.397953,-0.658022,-0.453105,1.284565,-0.420353,-0.210897
4905,-0.348890,-0.318102,-0.228232,-0.502181,0.217156,-0.833166,0.729270,-1.014779,-0.383751,-0.067102,-1.157312,2.309308,-0.397953,-0.658022,-0.453105,1.284565,-0.420353,-0.210897
4906,0.360524,-0.318102,-0.228232,1.373057,-0.419501,1.200240,0.729270,-1.014779,-0.383751,-0.067102,0.864071,-0.433030,-0.397953,-0.658022,2.206992,-0.778473,-0.420353,-0.210897


### Re-assign 1 and 0 to the value results

In [29]:
ss_df['stroke']=ss_df['stroke'].apply(lambda x: 1 if x>0 else 0)
ss_df['stroke'].value_counts()

stroke
0    4699
1     209
Name: count, dtype: int64

In [30]:
for i in range(1,11):Test_Results(ss_df,'StandardScaler')

[94m[1mLogistic Regression[0m
   [96m[1mOne to One (418 items)[0m : Model Score: 0.7572
					[1mTest Data	Available Data		Difference[0m
      Balanced Accuracy:		0.8524			0.7576		0.0948
      [1mNegative(0) results metrics[0m
         Precision			0.9057			0.9883		-0.0827
         Recall				0.8136			0.7019		0.1117
         f1-score(confidence)		0.8571			0.8208		0.0363
      [1mPositive(1) results metrics[0m
         Precision			0.7885			0.1082		0.6803
         Recall				0.8913			0.8134		0.0779
         f1-score(confidence)		0.8367			[33m[1m0.1910[0m		0.6457

   [96m[1mOne to Two (627 items)[0m : Model Score: 0.7894
					[1mTest Data	Available Data		Difference[0m
      Balanced Accuracy:		0.7272			0.7406		0.0134
      [1mNegative(0) results metrics[0m
         Precision			0.7946			0.9814		-0.1867
         Recall				0.8725			0.8400		0.0326
         f1-score(confidence)		0.8318			0.9052		-0.0734
      [1mPositive(1) results metrics[0m
         Precision			0.7111			0

   [96m[1mOne to One (418 items, depth of 2)[0m : Model Score: 0.7732
					[1mTest Data	Available Data		Difference[0m
      Balanced Accuracy:		0.7979			0.7588		0.0391
      [1mNegative(0) results metrics[0m
         Precision			0.9286			0.9943		-0.0657
         Recall				0.6610			0.5942		0.0668
         f1-score(confidence)		0.7723			0.7438		0.0284
      [1mPositive(1) results metrics[0m
         Precision			0.6825			0.0919		0.5906
         Recall				0.9348			0.9234		0.0113
         f1-score(confidence)		0.7890			[33m[1m0.1672[0m		0.6218

   [96m[1mOne to Two (627 items, depth of 2)[0m : Model Score: 0.7511
					[1mTest Data	Available Data		Difference[0m
      Balanced Accuracy:		0.6119			0.6277		0.0159
      [1mNegative(0) results metrics[0m
         Precision			0.7080			0.9685		-0.2604
         Recall				0.9510			0.9540		-0.0031
         f1-score(confidence)		0.8117			0.9612		-0.1495
      [1mPositive(1) results metrics[0m
         Precision			0.7500			0.2258		0

### MinMaxScaler

In [31]:
mms=MinMaxScaler()
mms_scaled=mms.fit_transform(df)
mms_df=pd.DataFrame(mms_scaled, columns=df.columns)
mms_df

Unnamed: 0,age,hypertension,heart_disease,avg_glucose_level,bmi,Male,ever_married,Urban,work_type_Govt_job,work_type_Never_worked,work_type_Private,work_type_Self-employed,work_type_children,smoking_status_Unknown,smoking_status_formerly smoked,smoking_status_never smoked,smoking_status_smokes,stroke
0,0.816895,0.0,1.0,0.801265,0.301260,1.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
1,0.975586,0.0,1.0,0.234512,0.254296,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
2,0.597168,0.0,0.0,0.536008,0.276060,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
3,0.963379,1.0,0.0,0.549349,0.156930,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0
4,0.987793,0.0,0.0,0.605161,0.214204,1.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4903,0.157715,0.0,0.0,0.221402,0.095074,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0
4904,0.987793,0.0,0.0,0.323516,0.340206,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
4905,0.426270,0.0,0.0,0.128658,0.232532,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
4906,0.621582,0.0,0.0,0.513203,0.175258,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


In [32]:
for i in range(1,11):Test_Results(mms_df,'MinMaxScaler')

[94m[1mLogistic Regression[0m
   [96m[1mOne to One (418 items)[0m : Model Score: 0.7572
					[1mTest Data	Available Data		Difference[0m
      Balanced Accuracy:		0.8185			0.7583		0.0602
      [1mNegative(0) results metrics[0m
         Precision			0.8980			0.9890		-0.0910
         Recall				0.7458			0.6889		0.0569
         f1-score(confidence)		0.8148			0.8121		0.0027
      [1mPositive(1) results metrics[0m
         Precision			0.7321			0.1058		0.6263
         Recall				0.8913			0.8278		0.0636
         f1-score(confidence)		0.8039			[33m[1m0.1876[0m		0.6163

   [96m[1mOne to Two (627 items)[0m : Model Score: 0.7957
					[1mTest Data	Available Data		Difference[0m
      Balanced Accuracy:		0.7510			0.7489		0.0021
      [1mNegative(0) results metrics[0m
         Precision			0.8070			0.9817		-0.1747
         Recall				0.9020			0.8566		0.0454
         f1-score(confidence)		0.8519			0.9149		-0.0630
      [1mPositive(1) results metrics[0m
         Precision			0.7674			0

In [33]:
results_df=pd.read_csv('fidel-test-results-OneHotEncoded.csv')
results_df

Unnamed: 0,Test_Model,Proportion,Compression,Model_Score,Balanced Accuracy,Precision,Recall,f1-score
0,Logistic Regression,One to One,StandardScaler,0.757188,0.757624,0.108211,0.813397,0.191011
1,Logistic Regression,One to Two,StandardScaler,0.789362,0.740557,0.151242,0.641148,0.244749
2,Logistic Regression,One to Three,StandardScaler,0.797448,0.706873,0.182432,0.516746,0.269663
3,Random Forest Clasifier,One to One,StandardScaler,0.773163,0.758807,0.091905,0.923445,0.167172
4,Random Forest Clasifier,One to Two,StandardScaler,0.751064,0.627734,0.225806,0.301435,0.258197
...,...,...,...,...,...,...,...,...
175,Random Forest Clasifier,One to Two,MinMaxScaler,0.838298,0.747965,0.194785,0.607656,0.295006
176,Random Forest Clasifier,One to Three,MinMaxScaler,0.821372,0.660009,0.316239,0.354067,0.334086
177,SVC,One to One,MinMaxScaler,0.769968,0.736850,0.087828,0.880383,0.159722
178,SVC,One to Two,MinMaxScaler,0.831915,0.751730,0.171575,0.641148,0.270707


In [34]:
results_df['Encoder']='OneHotEncoder'

In [35]:
results_df

Unnamed: 0,Test_Model,Proportion,Compression,Model_Score,Balanced Accuracy,Precision,Recall,f1-score,Encoder
0,Logistic Regression,One to One,StandardScaler,0.757188,0.757624,0.108211,0.813397,0.191011,OneHotEncoder
1,Logistic Regression,One to Two,StandardScaler,0.789362,0.740557,0.151242,0.641148,0.244749,OneHotEncoder
2,Logistic Regression,One to Three,StandardScaler,0.797448,0.706873,0.182432,0.516746,0.269663,OneHotEncoder
3,Random Forest Clasifier,One to One,StandardScaler,0.773163,0.758807,0.091905,0.923445,0.167172,OneHotEncoder
4,Random Forest Clasifier,One to Two,StandardScaler,0.751064,0.627734,0.225806,0.301435,0.258197,OneHotEncoder
...,...,...,...,...,...,...,...,...,...
175,Random Forest Clasifier,One to Two,MinMaxScaler,0.838298,0.747965,0.194785,0.607656,0.295006,OneHotEncoder
176,Random Forest Clasifier,One to Three,MinMaxScaler,0.821372,0.660009,0.316239,0.354067,0.334086,OneHotEncoder
177,SVC,One to One,MinMaxScaler,0.769968,0.736850,0.087828,0.880383,0.159722,OneHotEncoder
178,SVC,One to Two,MinMaxScaler,0.831915,0.751730,0.171575,0.641148,0.270707,OneHotEncoder


In [36]:
results_df.to_csv('fidel-test-results.csv', mode='a',index=False, header=False)

'g:\\My Drive\\SMU AI\\Projects\\Stroke_Prediction-'