### Import Libraries

In [80]:
import pandas as pd
import matplotlib as plt
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import mean_squared_error, r2_score, balanced_accuracy_score
from sklearn.metrics import confusion_matrix, classification_report, ConfusionMatrixDisplay
import os

### Load Clean DataSet

In [81]:
df=pd.read_csv('clean-healthcare-dataset-stroke-data.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4908 entries, 0 to 4907
Data columns (total 18 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   age                             4908 non-null   float64
 1   hypertension                    4908 non-null   int64  
 2   heart_disease                   4908 non-null   int64  
 3   avg_glucose_level               4908 non-null   float64
 4   bmi                             4908 non-null   float64
 5   Male                            4908 non-null   int64  
 6   ever_married                    4908 non-null   int64  
 7   Urban                           4908 non-null   int64  
 8   work_type_Govt_job              4908 non-null   int64  
 9   work_type_Never_worked          4908 non-null   int64  
 10  work_type_Private               4908 non-null   int64  
 11  work_type_Self-employed         4908 non-null   int64  
 12  work_type_children              49

## About the DataSet

### **EXTREMELY UNBALANCED**

### The "cleaned and encoded" dataset contains 4908 records
### Only 209 (4.25%) of these are positive for stroke and the rest (95.74%) are negative

### Will try to create models with different less unbalanced samples to run prediction models

### **Fortunately/Unfortunately there might not be enough positive cases to effectively predict**

### Training Models

In [82]:
models={}
models['Logistic Regression']=LogisticRegression()
models['Random Forest Clasifier']=RandomForestClassifier()
models['SVC']=SVC()

### Re-Assign and int value to the Scaled result

### DataFrame Dictionary with Proportional Positive and negative Samples

In [83]:
def create_sample_dfs_dictionary(df,number_of_samples):
    DataFrames={}
    Positives=df.loc[df['stroke']==df['stroke'].max()]
    for i in range(1,number_of_samples+1):
        Negative_sample=df.loc[df['stroke']==df['stroke'].min()] \
        .sample(len(Positives)*i, ignore_index=True)
        DataFrames[f'Ratio 1 to {i}']= pd.concat([Positives,Negative_sample], axis=0)
    return DataFrames

### Print Explained Confusion Matrix

In [84]:
def print_confusion_matrix(y_test,prediction,Labels):
    cm=confusion_matrix(y_test,prediction,labels=Labels)
    print(f'\033[1mConfusion Matrix\033[0m')
    print(f'Predicted\\Actual\tPositive\tNegative')
    print(f'Positive({cm[0][0]+cm[0][1]})\t\t{cm[0][0]}\t\t{cm[0][1]}')
    print(f'Negative({cm[1][0]+cm[1][1]})\t\t{cm[1][0]}\t\t{cm[1][1]}')

### RandomForestClassifier best depth calculator

In [85]:
def best_depth(X_train,X_test,y_train,y_test):
    train_score=0
    depth=0
    best=0
    hight_score=0
    while train_score<1:
        depth+=1
        clf = RandomForestClassifier(max_depth = depth)
        clf.fit(X_train, y_train)
        train_score = clf.score(X_train,y_train)
        test_score = clf.score(X_test,y_test)
        score=test_score*(1-(train_score-test_score))
        if hight_score<score:
            hight_score=score
            best=depth
    return best      


### Create/Update afile with th results

In [86]:
def append_results(df):
    if os.path.isfile('fidel-test-results-OneHotEncoded.csv'):
        df.to_csv('fidel-test-results-OneHotEncoded.csv', mode='a',index=False, header=False )
    else:
        df.to_csv('fidel-test-results-OneHotEncoded.csv', mode='w',index=False )

### **train_test_split and ALL the tests**

In [87]:
def Test_Results(df,compression,number_of_samples):    
    model_results={'Test_Model':[], 'Proportion':[], 'Compression':[], 'Model_Score':[],\
                   'Balanced Accuracy':[], 'Precision':[],'Recall':[],'f1-score':[],\
                    'Confusion TP':[], 'Confusion FP':[], 'Confusion FN':[],'Confusion TN':[]}
    X_available=df.copy().drop(columns='stroke')
    y_available=df['stroke']
    tab='   '
    DataFrames=create_sample_dfs_dictionary(df,number_of_samples)
    for name, Learning_model in models.items():    
        # print(f'\033[94m\033[1m{name}\033[0m')
        for key,df in DataFrames.items():
            X=df.copy().drop(columns='stroke')
            y=df['stroke']
            X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=1)
            if name=='Random Forest Clasifier':
                Depth=best_depth(X_train,X_test,y_train,y_test)
                Learning_model=RandomForestClassifier(max_depth=Depth)
            model=Learning_model
            model.fit(X_train,y_train)
            available_predict=model.predict(X_available)
            available_balanced_accuracy=balanced_accuracy_score(y_available, available_predict)
            acr=classification_report(y_available, available_predict,
                                      labels=[1,0],output_dict=True)
            cm=confusion_matrix(y_available,available_predict,labels=[1,0])
            model_results['Test_Model'].append(name)
            model_results['Proportion'].append(key)
            model_results['Compression'].append(compression)
            model_results['Model_Score'].append(model.score(X_train,y_train))
            model_results['Balanced Accuracy'].append(available_balanced_accuracy)
            model_results['Precision'].append(acr['1']['precision'])
            model_results['Recall'].append(acr['1']['recall'])
            model_results['f1-score'].append(acr['1']['f1-score'])
            model_results['Confusion TP'].append(cm[0][0])
            model_results['Confusion FP'].append(cm[0][1])
            model_results['Confusion FN'].append(cm[1][0])
            model_results['Confusion TN'].append(cm[1][1])

    
    model_results_df=pd.DataFrame(model_results)
    return(model_results_df)

### StandardScaler Scaled

In [88]:
ss=StandardScaler()
ss_scaled=ss.fit_transform(df)
ss_df=pd.DataFrame(ss_scaled, columns=df.columns)
ss_df

Unnamed: 0,age,hypertension,heart_disease,avg_glucose_level,bmi,Male,ever_married,Urban,work_type_Govt_job,work_type_Never_worked,work_type_Private,work_type_Self-employed,work_type_children,smoking_status_Unknown,smoking_status_formerly smoked,smoking_status_never smoked,smoking_status_smokes,stroke
0,1.069938,-0.318102,4.381499,2.777797,0.981145,1.200240,0.729270,0.985436,-0.383751,-0.067102,0.864071,-0.433030,-0.397953,-0.658022,2.206992,-0.778473,-0.420353,4.741651
1,1.646336,-0.318102,4.381499,0.014016,0.459086,1.200240,0.729270,-1.014779,-0.383751,-0.067102,0.864071,-0.433030,-0.397953,-0.658022,-0.453105,1.284565,-0.420353,4.741651
2,0.271847,-0.318102,-0.228232,1.484266,0.701016,-0.833166,0.729270,0.985436,-0.383751,-0.067102,0.864071,-0.433030,-0.397953,-0.658022,-0.453105,-0.778473,2.378956,4.741651
3,1.601998,3.143642,-0.228232,1.549325,-0.623231,-0.833166,0.729270,-1.014779,-0.383751,-0.067102,-1.157312,2.309308,-0.397953,-0.658022,-0.453105,1.284565,-0.420353,4.741651
4,1.690675,-0.318102,-0.228232,1.821493,0.013426,1.200240,0.729270,0.985436,-0.383751,-0.067102,0.864071,-0.433030,-0.397953,-0.658022,2.206992,-0.778473,-0.420353,4.741651
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4903,-1.324334,-0.318102,-0.228232,-0.049918,-1.310821,-0.833166,-1.371234,-1.014779,-0.383751,-0.067102,-1.157312,-0.433030,2.512858,1.519706,-0.453105,-0.778473,-0.420353,-0.210897
4904,1.690675,-0.318102,-0.228232,0.448045,1.414072,-0.833166,0.729270,0.985436,-0.383751,-0.067102,-1.157312,2.309308,-0.397953,-0.658022,-0.453105,1.284565,-0.420353,-0.210897
4905,-0.348890,-0.318102,-0.228232,-0.502181,0.217156,-0.833166,0.729270,-1.014779,-0.383751,-0.067102,-1.157312,2.309308,-0.397953,-0.658022,-0.453105,1.284565,-0.420353,-0.210897
4906,0.360524,-0.318102,-0.228232,1.373057,-0.419501,1.200240,0.729270,-1.014779,-0.383751,-0.067102,0.864071,-0.433030,-0.397953,-0.658022,2.206992,-0.778473,-0.420353,-0.210897


### Re-assign 1 and 0 to the value results

In [89]:
ss_df['stroke']=ss_df['stroke'].apply(lambda x: 1 if x>0 else 0)
ss_df['stroke'].value_counts()

stroke
0    4699
1     209
Name: count, dtype: int64

In [90]:
for i in range(1,21):
    append_results(Test_Results(ss_df,'StandardScaler',7))

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_pr

### MinMaxScaler

In [91]:
mms=MinMaxScaler()
mms_scaled=mms.fit_transform(df)
mms_df=pd.DataFrame(mms_scaled, columns=df.columns)
mms_df

Unnamed: 0,age,hypertension,heart_disease,avg_glucose_level,bmi,Male,ever_married,Urban,work_type_Govt_job,work_type_Never_worked,work_type_Private,work_type_Self-employed,work_type_children,smoking_status_Unknown,smoking_status_formerly smoked,smoking_status_never smoked,smoking_status_smokes,stroke
0,0.816895,0.0,1.0,0.801265,0.301260,1.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
1,0.975586,0.0,1.0,0.234512,0.254296,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
2,0.597168,0.0,0.0,0.536008,0.276060,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
3,0.963379,1.0,0.0,0.549349,0.156930,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0
4,0.987793,0.0,0.0,0.605161,0.214204,1.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4903,0.157715,0.0,0.0,0.221402,0.095074,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0
4904,0.987793,0.0,0.0,0.323516,0.340206,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
4905,0.426270,0.0,0.0,0.128658,0.232532,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
4906,0.621582,0.0,0.0,0.513203,0.175258,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


In [92]:
for i in range(1,21):
    append_results(Test_Results(mms_df,'MinMaxScaler',7))

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_pr

In [93]:
results_df=pd.read_csv('fidel-test-results-OneHotEncoded.csv')
results_df

Unnamed: 0,Test_Model,Proportion,Compression,Model_Score,Balanced Accuracy,Precision,Recall,f1-score,Confusion TP,Confusion FP,Confusion FN,Confusion TN
0,Logistic Regression,Ratio 1 to 1,StandardScaler,0.782748,0.763524,0.109079,0.827751,0.192758,173,36,1413,3286
1,Logistic Regression,Ratio 1 to 2,StandardScaler,0.785106,0.743269,0.152715,0.645933,0.247027,135,74,749,3950
2,Logistic Regression,Ratio 1 to 3,StandardScaler,0.802233,0.705486,0.168712,0.526316,0.255517,110,99,542,4157
3,Logistic Regression,Ratio 1 to 4,StandardScaler,0.825032,0.614334,0.189711,0.282297,0.226923,59,150,252,4447
4,Logistic Regression,Ratio 1 to 5,StandardScaler,0.843617,0.607960,0.230435,0.253589,0.241458,53,156,177,4522
...,...,...,...,...,...,...,...,...,...,...,...,...
835,SVC,Ratio 1 to 3,MinMaxScaler,0.799043,0.622311,0.215548,0.291866,0.247967,61,148,222,4477
836,SVC,Ratio 1 to 4,MinMaxScaler,0.839080,0.563206,0.333333,0.138756,0.195946,29,180,58,4641
837,SVC,Ratio 1 to 5,MinMaxScaler,0.857447,0.548163,0.343750,0.105263,0.161172,22,187,42,4657
838,SVC,Ratio 1 to 6,MinMaxScaler,0.869644,0.531790,0.466667,0.066986,0.117155,14,195,16,4683


In [94]:
results_df['Encoder']='OneHotEncoder'

In [95]:
results_df

Unnamed: 0,Test_Model,Proportion,Compression,Model_Score,Balanced Accuracy,Precision,Recall,f1-score,Confusion TP,Confusion FP,Confusion FN,Confusion TN,Encoder
0,Logistic Regression,Ratio 1 to 1,StandardScaler,0.782748,0.763524,0.109079,0.827751,0.192758,173,36,1413,3286,OneHotEncoder
1,Logistic Regression,Ratio 1 to 2,StandardScaler,0.785106,0.743269,0.152715,0.645933,0.247027,135,74,749,3950,OneHotEncoder
2,Logistic Regression,Ratio 1 to 3,StandardScaler,0.802233,0.705486,0.168712,0.526316,0.255517,110,99,542,4157,OneHotEncoder
3,Logistic Regression,Ratio 1 to 4,StandardScaler,0.825032,0.614334,0.189711,0.282297,0.226923,59,150,252,4447,OneHotEncoder
4,Logistic Regression,Ratio 1 to 5,StandardScaler,0.843617,0.607960,0.230435,0.253589,0.241458,53,156,177,4522,OneHotEncoder
...,...,...,...,...,...,...,...,...,...,...,...,...,...
835,SVC,Ratio 1 to 3,MinMaxScaler,0.799043,0.622311,0.215548,0.291866,0.247967,61,148,222,4477,OneHotEncoder
836,SVC,Ratio 1 to 4,MinMaxScaler,0.839080,0.563206,0.333333,0.138756,0.195946,29,180,58,4641,OneHotEncoder
837,SVC,Ratio 1 to 5,MinMaxScaler,0.857447,0.548163,0.343750,0.105263,0.161172,22,187,42,4657,OneHotEncoder
838,SVC,Ratio 1 to 6,MinMaxScaler,0.869644,0.531790,0.466667,0.066986,0.117155,14,195,16,4683,OneHotEncoder


In [96]:
results_df.to_csv('fidel-test-results.csv', mode='a',index=False, header=False)