### Import Libraries

In [1]:
import pandas as pd
import matplotlib as plt
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import mean_squared_error, r2_score, balanced_accuracy_score
from sklearn.metrics import confusion_matrix, classification_report, ConfusionMatrixDisplay
import os

### Load Clean DataSet

In [2]:
df=pd.read_csv('clean-healthcare-categorical-dataset-stroke-data.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4908 entries, 0 to 4907
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   age                4908 non-null   float64
 1   hypertension       4908 non-null   int64  
 2   heart_disease      4908 non-null   int64  
 3   ever_married       4908 non-null   int64  
 4   work_type          4908 non-null   int64  
 5   avg_glucose_level  4908 non-null   float64
 6   bmi                4908 non-null   float64
 7   smoking_status     4908 non-null   int64  
 8   Male               4908 non-null   int64  
 9   Urban              4908 non-null   int64  
 10  stroke             4908 non-null   int64  
dtypes: float64(3), int64(8)
memory usage: 421.9 KB


## About the DataSet

### **EXTREMELY UNBALANCED**

### The "cleaned and encoded" dataset contains 4908 records
### Only 209 (4.25%) of these are positive for stroke and the rest (95.74%) are negative

### Will try to create models with different less unbalanced samples to run prediction models

### **Fortunately/Unfortunately there might not be enough positive cases to effectively predict**

### Training Models

In [3]:
models={}
models['Logistic Regression']=LogisticRegression()
models['Random Forest Clasifier']=RandomForestClassifier()
models['SVC']=SVC()

### Re-Assign and int value to the Scaled result

### DataFrame Dictionary with Proportional Positive and negative Samples

In [4]:

def create_sample_dfs_dictionary(df):
    Positives=df.loc[df['stroke']==df['stroke'].max()]
    Negative_sample_1=df.loc[df['stroke']==df['stroke'].min()] \
        .sample(len(Positives),random_state=1, ignore_index=True)
    Negative_sample_2=df.loc[df['stroke']==df['stroke'].min()] \
        .sample(len(Positives),random_state=3, ignore_index=True)
    Negative_sample_3=df.loc[df['stroke']==df['stroke'].min()] \
        .sample(len(Positives),random_state=5, ignore_index=True)
    DataFrames={}
    DataFrames['One to One']= pd.concat([Positives,Negative_sample_1], axis=0)
    DataFrames['One to Two']= pd.concat([Positives,Negative_sample_1, \
                                        Negative_sample_2], axis=0)
    DataFrames['One to Three']= pd.concat([Positives,Negative_sample_1, \
                                        Negative_sample_2,Negative_sample_3], axis=0)
    return DataFrames

### Print Explained Confusion Matrix

In [5]:
def print_confusion_matrix(y_test,prediction,Labels):
    cm=confusion_matrix(y_test,prediction,labels=Labels)
    print(f'\033[1mConfusion Matrix\033[0m')
    print(f'Predicted\\Actual\tPositive\tNegative')
    print(f'Positive({cm[0][0]+cm[0][1]})\t\t{cm[0][0]}\t\t{cm[0][1]}')
    print(f'Negative({cm[1][0]+cm[1][1]})\t\t{cm[1][0]}\t\t{cm[1][1]}')

### RandomForestClassifier best depth calculator

In [6]:
def best_depth(X_train,X_test,y_train,y_test):
    # print('Calculating best depth for RandomForestClassifier')
    train_score=0
    depth=0
    best=0
    hight_score=0
    while train_score<1:
        depth+=1
        clf = RandomForestClassifier(max_depth = depth)
        clf.fit(X_train, y_train)
        train_score = clf.score(X_train,y_train)
        test_score = clf.score(X_test,y_test)
        score=test_score*(1-(train_score-test_score))
        if hight_score<score:
            hight_score=score
            best=depth
    return best      


### Create/Update afile with th results

In [35]:
def append_results(df):
    if os.path.isfile('fidel-test-results-LabelEncoded.csv'):
        df.to_csv('fidel-test-results-LabelEncoded.csv', mode='a',index=False, header=False )
    else:
        df.to_csv('fidel-test-results-LabelEncoded.csv', mode='w',index=False )

### **train_test_split and ALL the tests**

In [36]:
def Test_Results(df,compression):
    
    
    model_results={'Test_Model':[], 'Proportion':[], 'Compression':[], 'Model_Score':[],\
                   'Balanced Accuracy':[], 'Precision':[],'Recall':[],'f1-score':[]}
    X_available=df.copy().drop(columns='stroke')
    y_available=df['stroke']
    tab='   '
    DataFrames=create_sample_dfs_dictionary(df)
    for name, Learning_model in models.items():    
        print(f'\033[94m\033[1m{name}\033[0m')
        for key,df in DataFrames.items():
            X=df.copy().drop(columns='stroke')
            y=df['stroke']
            X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=1)
            if name=='Random Forest Clasifier':
                Depth=best_depth(X_train,X_test,y_train,y_test)
                Learning_model=RandomForestClassifier(max_depth=Depth)
            model=Learning_model
            model.fit(X_train,y_train)
            predicted = model.predict(X_test)
            available_predict=model.predict(X_available)

            # Score the predictions with mse and r2
            test_balanced_accuracy=balanced_accuracy_score(y_test, predicted)
            available_balanced_accuracy=balanced_accuracy_score(y_available, available_predict)
            tcr=classification_report(y_test, predicted, labels=[1,0],output_dict=True)
            acr=classification_report(y_available, available_predict,
                                      labels=[1,0],output_dict=True)
            #region Print Results
            if name=='Random Forest Clasifier':
                print(f'{tab}\033[96m\033[1m{key} ({len(df)} items, depth of {Depth})\033[0m : ', end='')
            else:
                print(f'{tab}\033[96m\033[1m{key} ({len(df)} items)\033[0m : ', end='')
            print(f'Model Score: {model.score(X_train,y_train):.4f}')
            print(f'\t\t\t\t\t\033[1mTest Data\tAvailable Data\t\tDifference\033[0m')
            print(f"{tab}{tab}Balanced Accuracy:\t\t{test_balanced_accuracy:.4f}"+
                f"\t\t\t{available_balanced_accuracy:.4f}\t\t"+
                f"{abs(test_balanced_accuracy-available_balanced_accuracy):.4f}")
            print(f"{tab}{tab}\033[1mNegative(0) results metrics\033[0m")
            print(f"{tab}{tab}{tab}Precision\t\t\t{tcr['0']['precision']:.4f}\t\t\t"+
                  f"{acr['0']['precision']:.4f}\t\t"+
                  f"{(tcr['0']['precision']-acr['0']['precision']):.4f}")
            print(f"{tab}{tab}{tab}Recall\t\t\t\t{tcr['0']['recall']:.4f}\t\t\t"+
                  f"{acr['0']['recall']:.4f}\t\t"+
                  f"{(tcr['0']['recall']-acr['0']['recall']):.4f}")
            print(f"{tab}{tab}{tab}f1-score(confidence)\t\t{tcr['0']['f1-score']:.4f}\t\t\t"+
                  f"{acr['0']['f1-score']:.4f}\t\t"+
                  f"{(tcr['0']['f1-score']-acr['0']['f1-score']):.4f}")
            print(f"{tab}{tab}\033[1mPositive(1) results metrics\033[0m")
            print(f"{tab}{tab}{tab}Precision\t\t\t{tcr['1']['precision']:.4f}\t\t\t"+
                  f"{acr['1']['precision']:.4f}\t\t"+
                  f"{(tcr['1']['precision']-acr['1']['precision']):.4f}")
            print(f"{tab}{tab}{tab}Recall\t\t\t\t{tcr['1']['recall']:.4f}\t\t\t"+
                  f"{acr['1']['recall']:.4f}\t\t"+
                  f"{(tcr['1']['recall']-acr['1']['recall']):.4f}")
            print(f"{tab}{tab}{tab}f1-score(confidence)\t\t{tcr['1']['f1-score']:.4f}\t\t\t"+
                  f"\033[33m\033[1m{acr['1']['f1-score']:.4f}\033[0m\t\t"+
                  f"{(tcr['1']['f1-score']-acr['1']['f1-score']):.4f}")
            print()
            #endregion

            model_results['Test_Model'].append(name)
            model_results['Proportion'].append(key)
            model_results['Compression'].append(compression)
            model_results['Model_Score'].append(model.score(X_train,y_train))
            model_results['Balanced Accuracy'].append(available_balanced_accuracy)
            model_results['Precision'].append(acr['1']['precision'])
            model_results['Recall'].append(acr['1']['recall'])
            model_results['f1-score'].append(acr['1']['f1-score'])
        print('***'*10)
    model_results_df=pd.DataFrame(model_results)
    append_results(model_results_df)
            
        

### StandardScaler Scaled

In [11]:
ss=StandardScaler()
ss_scaled=ss.fit_transform(df)
ss_df=pd.DataFrame(ss_scaled, columns=df.columns)
ss_df

Unnamed: 0,age,hypertension,heart_disease,ever_married,work_type,avg_glucose_level,bmi,smoking_status,Male,Urban,stroke
0,1.069938,-0.318102,4.381499,0.729270,-0.155713,2.777797,0.981145,-0.351828,1.200240,0.985436,4.741651
1,1.646336,-0.318102,4.381499,0.729270,-0.155713,0.014016,0.459086,0.585108,1.200240,-1.014779,4.741651
2,0.271847,-0.318102,-0.228232,0.729270,-0.155713,1.484266,0.701016,1.522044,-0.833166,0.985436,4.741651
3,1.601998,3.143642,-0.228232,0.729270,0.759543,1.549325,-0.623231,0.585108,-0.833166,-1.014779,4.741651
4,1.690675,-0.318102,-0.228232,0.729270,-0.155713,1.821493,0.013426,-0.351828,1.200240,0.985436,4.741651
...,...,...,...,...,...,...,...,...,...,...,...
4903,-1.324334,-0.318102,-0.228232,-1.371234,1.674800,-0.049918,-1.310821,-1.288764,-0.833166,-1.014779,-0.210897
4904,1.690675,-0.318102,-0.228232,0.729270,0.759543,0.448045,1.414072,0.585108,-0.833166,0.985436,-0.210897
4905,-0.348890,-0.318102,-0.228232,0.729270,0.759543,-0.502181,0.217156,0.585108,-0.833166,-1.014779,-0.210897
4906,0.360524,-0.318102,-0.228232,0.729270,-0.155713,1.373057,-0.419501,-0.351828,1.200240,-1.014779,-0.210897


### Re-assign 1 and 0 to the value results

In [12]:
ss_df['stroke']=ss_df['stroke'].apply(lambda x: 1 if x>0 else 0)
ss_df['stroke'].value_counts()

stroke
0    4699
1     209
Name: count, dtype: int64

In [37]:
for i in range(1,11):Test_Results(ss_df,'StandardScaler')

[94m[1mLogistic Regression[0m
   [96m[1mOne to One (418 items)[0m : Model Score: 0.7668
					[1mTest Data	Available Data		Difference[0m
      Balanced Accuracy:		0.8355			0.7637		0.0717
      [1mNegative(0) results metrics[0m
         Precision			0.9020			0.9892		-0.0872
         Recall				0.7797			0.6997		0.0799
         f1-score(confidence)		0.8364			0.8196		0.0167
      [1mPositive(1) results metrics[0m
         Precision			0.7593			0.1092		0.6500
         Recall				0.8913			0.8278		0.0636
         f1-score(confidence)		0.8200			[33m[1m0.1930[0m		0.6270

   [96m[1mOne to Two (627 items)[0m : Model Score: 0.7872
					[1mTest Data	Available Data		Difference[0m
      Balanced Accuracy:		0.7545			0.7437		0.0108
      [1mNegative(0) results metrics[0m
         Precision			0.8165			0.9816		-0.1651
         Recall				0.8725			0.8415		0.0311
         f1-score(confidence)		0.8436			0.9062		-0.0626
      [1mPositive(1) results metrics[0m
         Precision			0.7292			0

### MinMaxScaler

In [17]:
mms=MinMaxScaler()
mms_scaled=mms.fit_transform(df)
mms_df=pd.DataFrame(mms_scaled, columns=df.columns)
mms_df

Unnamed: 0,age,hypertension,heart_disease,ever_married,work_type,avg_glucose_level,bmi,smoking_status,Male,Urban,stroke
0,0.816895,0.0,1.0,1.0,0.50,0.801265,0.301260,0.333333,1.0,1.0,1.0
1,0.975586,0.0,1.0,1.0,0.50,0.234512,0.254296,0.666667,1.0,0.0,1.0
2,0.597168,0.0,0.0,1.0,0.50,0.536008,0.276060,1.000000,0.0,1.0,1.0
3,0.963379,1.0,0.0,1.0,0.75,0.549349,0.156930,0.666667,0.0,0.0,1.0
4,0.987793,0.0,0.0,1.0,0.50,0.605161,0.214204,0.333333,1.0,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...
4903,0.157715,0.0,0.0,0.0,1.00,0.221402,0.095074,0.000000,0.0,0.0,0.0
4904,0.987793,0.0,0.0,1.0,0.75,0.323516,0.340206,0.666667,0.0,1.0,0.0
4905,0.426270,0.0,0.0,1.0,0.75,0.128658,0.232532,0.666667,0.0,0.0,0.0
4906,0.621582,0.0,0.0,1.0,0.50,0.513203,0.175258,0.333333,1.0,0.0,0.0


In [38]:
for i in range(1,11):Test_Results(mms_df,'MinMaxScaler')

[94m[1mLogistic Regression[0m
   [96m[1mOne to One (418 items)[0m : Model Score: 0.7700
					[1mTest Data	Available Data		Difference[0m
      Balanced Accuracy:		0.8379			0.7663		0.0715
      [1mNegative(0) results metrics[0m
         Precision			0.9184			0.9899		-0.0716
         Recall				0.7627			0.6906		0.0721
         f1-score(confidence)		0.8333			0.8136		0.0197
      [1mPositive(1) results metrics[0m
         Precision			0.7500			0.1080		0.6420
         Recall				0.9130			0.8421		0.0709
         f1-score(confidence)		0.8235			[33m[1m0.1914[0m		0.6321

   [96m[1mOne to Two (627 items)[0m : Model Score: 0.7894
					[1mTest Data	Available Data		Difference[0m
      Balanced Accuracy:		0.7370			0.7409		0.0039
      [1mNegative(0) results metrics[0m
         Precision			0.7982			0.9808		-0.1826
         Recall				0.8922			0.8598		0.0324
         f1-score(confidence)		0.8426			0.9163		-0.0737
      [1mPositive(1) results metrics[0m
         Precision			0.7442			0

In [39]:
results_df=pd.read_csv('fidel-test-results-LabelEncoded.csv')
results_df

Unnamed: 0,Test_Model,Proportion,Compression,Model_Score,Balanced Accuracy,Precision,Recall,f1-score
0,Logistic Regression,One to One,StandardScaler,0.766773,0.763737,0.109217,0.827751,0.192973
1,Logistic Regression,One to Two,StandardScaler,0.787234,0.743694,0.153409,0.645933,0.247934
2,Logistic Regression,One to Three,StandardScaler,0.792663,0.697678,0.183601,0.492823,0.267532
3,Random Forest Clasifier,One to One,StandardScaler,0.757188,0.745932,0.086898,0.923445,0.158848
4,Random Forest Clasifier,One to Two,StandardScaler,0.821277,0.748328,0.172324,0.631579,0.270769
...,...,...,...,...,...,...,...,...
175,Random Forest Clasifier,One to Two,MinMaxScaler,0.817021,0.729622,0.181818,0.574163,0.276180
176,Random Forest Clasifier,One to Three,MinMaxScaler,0.794258,0.616895,0.284974,0.263158,0.273632
177,SVC,One to One,MinMaxScaler,0.773163,0.757871,0.099070,0.866029,0.177800
178,SVC,One to Two,MinMaxScaler,0.814894,0.738058,0.150113,0.636364,0.242922


In [33]:
results_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 180 entries, 0 to 179
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Test_Model         180 non-null    object 
 1   Proportion         180 non-null    object 
 2   Compression        180 non-null    object 
 3   Model_Score        180 non-null    float64
 4   Balanced Accuracy  180 non-null    float64
 5   Precision          180 non-null    float64
 6   Recall             180 non-null    float64
 7   f1-score           180 non-null    float64
dtypes: float64(5), object(3)
memory usage: 11.4+ KB


In [42]:
results_df.sort_values('f1-score',ascending=False)

Unnamed: 0,Test_Model,Proportion,Compression,Model_Score,Balanced Accuracy,Precision,Recall,f1-score,Encoder
59,Random Forest Clasifier,One to Three,StandardScaler,0.899522,0.757974,0.265086,0.588517,0.365527,LabelEncoder
167,Random Forest Clasifier,One to Three,MinMaxScaler,0.869219,0.728363,0.251716,0.526316,0.340557,LabelEncoder
104,Random Forest Clasifier,One to Three,MinMaxScaler,0.867624,0.729585,0.247216,0.531100,0.337386,LabelEncoder
122,Random Forest Clasifier,One to Three,MinMaxScaler,0.870813,0.731923,0.238397,0.540670,0.330893,LabelEncoder
14,Random Forest Clasifier,One to Three,StandardScaler,0.837321,0.694610,0.261972,0.444976,0.329787,LabelEncoder
...,...,...,...,...,...,...,...,...,...
3,Random Forest Clasifier,One to One,StandardScaler,0.757188,0.745932,0.086898,0.923445,0.158848,LabelEncoder
30,Random Forest Clasifier,One to One,StandardScaler,0.757188,0.744391,0.086799,0.918660,0.158612,LabelEncoder
93,Random Forest Clasifier,One to One,MinMaxScaler,0.766773,0.749649,0.086404,0.942584,0.158297,LabelEncoder
12,Random Forest Clasifier,One to One,StandardScaler,0.760383,0.745022,0.085217,0.937799,0.156238,LabelEncoder


In [None]:
results_df['Encoder']='LabelEncoder'

In [43]:
results_df.to_csv('fidel-test-results.csv', mode='w',index=False )

In [45]:
import os
os.getcwd()

'g:\\My Drive\\SMU AI\\Projects\\Stroke_Prediction-'