### Import Libraries

In [37]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix, classification_report
import os

### Load Clean DataSet

In [38]:
df=pd.read_csv('clean-healthcare-categorical-dataset-stroke-data.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4908 entries, 0 to 4907
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   age                4908 non-null   float64
 1   hypertension       4908 non-null   int64  
 2   heart_disease      4908 non-null   int64  
 3   ever_married       4908 non-null   int64  
 4   work_type          4908 non-null   int64  
 5   avg_glucose_level  4908 non-null   float64
 6   bmi                4908 non-null   float64
 7   smoking_status     4908 non-null   int64  
 8   Male               4908 non-null   int64  
 9   Urban              4908 non-null   int64  
 10  stroke             4908 non-null   int64  
dtypes: float64(3), int64(8)
memory usage: 421.9 KB


## About the DataSet

### **EXTREMELY UNBALANCED**

### The "cleaned and encoded" dataset contains 4908 records
### Only 209 (4.25%) of these are positive for stroke and the rest (95.74%) are negative

### Will try to create models with different less unbalanced samples to run prediction models

### **Fortunately/Unfortunately there might not be enough positive cases to effectively predict**

### Training Models

In [39]:
models={}
models['Logistic Regression']=LogisticRegression()
models['Random Forest Clasifier']=RandomForestClassifier()
models['SVC']=SVC()

### DataFrame Dictionary with Proportional Positive and negative Samples

In [40]:
def create_sample_dfs_dictionary(df,number_of_samples):
    DataFrames={}
    Positives=df.loc[df['stroke']==df['stroke'].max()]
    for i in range(1,number_of_samples+1):
        Negative_sample=df.loc[df['stroke']==df['stroke'].min()] \
        .sample(len(Positives)*i,ignore_index=True)
        DataFrames[f'Ratio 1 to {i}']= pd.concat([Positives,Negative_sample], axis=0)
    return DataFrames

### Print Explained Confusion Matrix

In [41]:
def print_confusion_matrix(y_test,prediction,Labels):
    cm=confusion_matrix(y_test,prediction,labels=Labels)
    print(f'\033[1mConfusion Matrix\033[0m')
    print(f'Predicted\\Actual\tPositive\tNegative')
    print(f'Positive({cm[0][0]+cm[0][1]})\t\t{cm[0][0]}\t\t{cm[0][1]}')
    print(f'Negative({cm[1][0]+cm[1][1]})\t\t{cm[1][0]}\t\t{cm[1][1]}')

### RandomForestClassifier best depth calculator

In [42]:
def best_depth(X_train,X_test,y_train,y_test):
    train_score=0
    depth=0
    best=0
    hight_score=0
    while train_score<1:
        depth+=1
        clf = RandomForestClassifier(max_depth = depth)
        clf.fit(X_train, y_train)
        train_score = clf.score(X_train,y_train)
        test_score = clf.score(X_test,y_test)
        score=test_score*(1-(train_score-test_score))
        if hight_score<=score:
            hight_score=score
            best=depth
    return best      


### Create/Update a file with the results

In [43]:
def append_results(df):
    if os.path.isfile('fidel-test-results-LabelEncoded.csv'):
        df.to_csv('fidel-test-results-LabelEncoded.csv', mode='a',index=False, header=False )
    else:
        df.to_csv('fidel-test-results-LabelEncoded.csv', mode='w',index=False )


### **train_test_split and ALL the tests**

In [44]:
def Test_Results(df,compression,number_of_samples):    
    model_results={'Test_Model':[], 'Proportion':[], 'Compression':[], 'Model_Score':[],\
                    'Balanced Accuracy':[], 'Precision':[],'Recall':[],'f1-score':[],\
                    'Confusion TP':[], 'Confusion FP':[], 'Confusion FN':[],'Confusion TN':[]}
    X_available=df.copy().drop(columns='stroke')
    y_available=df['stroke']
    tab='   '
    DataFrames=create_sample_dfs_dictionary(df,number_of_samples)
    for name, Learning_model in models.items():    
        # print(f'\033[94m\033[1m{name}\033[0m')
        for key,df in DataFrames.items():
            X=df.copy().drop(columns='stroke')
            y=df['stroke']
            X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=1)
            if name=='Random Forest Clasifier':
                Depth=best_depth(X_train,X_test,y_train,y_test)
                Learning_model=RandomForestClassifier(max_depth=Depth)
            model=Learning_model
            model.fit(X_train,y_train)
            available_predict=model.predict(X_available)
            available_balanced_accuracy=balanced_accuracy_score(y_available, available_predict)
            acr=classification_report(y_available, available_predict,
                                      labels=[1,0],output_dict=True)
            cm=confusion_matrix(y_available,available_predict,labels=[1,0])
            model_results['Test_Model'].append(name)
            model_results['Proportion'].append(key)
            model_results['Compression'].append(compression)
            model_results['Model_Score'].append(model.score(X_train,y_train))
            model_results['Balanced Accuracy'].append(available_balanced_accuracy)
            model_results['Precision'].append(acr['1']['precision'])
            model_results['Recall'].append(acr['1']['recall'])
            model_results['f1-score'].append(acr['1']['f1-score'])
            model_results['Confusion TP'].append(cm[0][0])
            model_results['Confusion FP'].append(cm[0][1])
            model_results['Confusion FN'].append(cm[1][0])
            model_results['Confusion TN'].append(cm[1][1])
    
    model_results_df=pd.DataFrame(model_results)
    return(model_results_df)

### StandardScaler Scaled

In [45]:
ss=StandardScaler()
ss_scaled=ss.fit_transform(df)
ss_df=pd.DataFrame(ss_scaled, columns=df.columns)
ss_df

Unnamed: 0,age,hypertension,heart_disease,ever_married,work_type,avg_glucose_level,bmi,smoking_status,Male,Urban,stroke
0,1.069938,-0.318102,4.381499,0.729270,-0.155713,2.777797,0.981145,-0.351828,1.200240,0.985436,4.741651
1,1.646336,-0.318102,4.381499,0.729270,-0.155713,0.014016,0.459086,0.585108,1.200240,-1.014779,4.741651
2,0.271847,-0.318102,-0.228232,0.729270,-0.155713,1.484266,0.701016,1.522044,-0.833166,0.985436,4.741651
3,1.601998,3.143642,-0.228232,0.729270,0.759543,1.549325,-0.623231,0.585108,-0.833166,-1.014779,4.741651
4,1.690675,-0.318102,-0.228232,0.729270,-0.155713,1.821493,0.013426,-0.351828,1.200240,0.985436,4.741651
...,...,...,...,...,...,...,...,...,...,...,...
4903,-1.324334,-0.318102,-0.228232,-1.371234,1.674800,-0.049918,-1.310821,-1.288764,-0.833166,-1.014779,-0.210897
4904,1.690675,-0.318102,-0.228232,0.729270,0.759543,0.448045,1.414072,0.585108,-0.833166,0.985436,-0.210897
4905,-0.348890,-0.318102,-0.228232,0.729270,0.759543,-0.502181,0.217156,0.585108,-0.833166,-1.014779,-0.210897
4906,0.360524,-0.318102,-0.228232,0.729270,-0.155713,1.373057,-0.419501,-0.351828,1.200240,-1.014779,-0.210897


### Re-assign 1 and 0 to the stroke results

In [46]:
ss_df['stroke']=ss_df['stroke'].apply(lambda x: 1 if x>0 else 0)
ss_df['stroke'].value_counts()

stroke
0    4699
1     209
Name: count, dtype: int64

In [55]:
for i in range(1,21):
    append_results(Test_Results(ss_df,'StandardScaler',7))

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_pr

### MinMaxScaler

In [48]:
mms=MinMaxScaler()
mms_scaled=mms.fit_transform(df)
mms_df=pd.DataFrame(mms_scaled, columns=df.columns)
mms_df

Unnamed: 0,age,hypertension,heart_disease,ever_married,work_type,avg_glucose_level,bmi,smoking_status,Male,Urban,stroke
0,0.816895,0.0,1.0,1.0,0.50,0.801265,0.301260,0.333333,1.0,1.0,1.0
1,0.975586,0.0,1.0,1.0,0.50,0.234512,0.254296,0.666667,1.0,0.0,1.0
2,0.597168,0.0,0.0,1.0,0.50,0.536008,0.276060,1.000000,0.0,1.0,1.0
3,0.963379,1.0,0.0,1.0,0.75,0.549349,0.156930,0.666667,0.0,0.0,1.0
4,0.987793,0.0,0.0,1.0,0.50,0.605161,0.214204,0.333333,1.0,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...
4903,0.157715,0.0,0.0,0.0,1.00,0.221402,0.095074,0.000000,0.0,0.0,0.0
4904,0.987793,0.0,0.0,1.0,0.75,0.323516,0.340206,0.666667,0.0,1.0,0.0
4905,0.426270,0.0,0.0,1.0,0.75,0.128658,0.232532,0.666667,0.0,0.0,0.0
4906,0.621582,0.0,0.0,1.0,0.50,0.513203,0.175258,0.333333,1.0,0.0,0.0


In [49]:
for i in range(1,21):
    append_results(Test_Results(mms_df,'MinMaxScaler',7))

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_pr

In [50]:
results_df=pd.read_csv('fidel-test-results-LabelEncoded.csv')
results_df

Unnamed: 0,Test_Model,Proportion,Compression,Model_Score,Balanced Accuracy,Precision,Recall,f1-score,Confusion TP,Confusion FP,Confusion FN,Confusion TN
0,Logistic Regression,Ratio 1 to 1,StandardScaler,0.763578,0.767520,0.115020,0.813397,0.201541,170,39,1308,3391
1,Logistic Regression,Ratio 1 to 2,StandardScaler,0.768085,0.737053,0.157576,0.622010,0.251451,130,79,695,4004
2,Logistic Regression,Ratio 1 to 3,StandardScaler,0.807018,0.711815,0.180195,0.531100,0.269091,111,98,505,4194
3,Logistic Regression,Ratio 1 to 4,StandardScaler,0.821201,0.610511,0.217391,0.263158,0.238095,55,154,198,4501
4,Logistic Regression,Ratio 1 to 5,StandardScaler,0.850000,0.586272,0.216080,0.205742,0.210784,43,166,156,4543
...,...,...,...,...,...,...,...,...,...,...,...,...
1255,SVC,Ratio 1 to 3,MinMaxScaler,0.800638,0.622359,0.181303,0.306220,0.227758,64,145,289,4410
1256,SVC,Ratio 1 to 4,MinMaxScaler,0.846743,0.593768,0.231156,0.220096,0.225490,46,163,153,4546
1257,SVC,Ratio 1 to 5,MinMaxScaler,0.853191,0.532110,0.518519,0.066986,0.118644,14,195,13,4686
1258,SVC,Ratio 1 to 6,MinMaxScaler,0.867821,0.522221,0.384615,0.047847,0.085106,10,199,16,4683


In [51]:
results_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1260 entries, 0 to 1259
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Test_Model         1260 non-null   object 
 1   Proportion         1260 non-null   object 
 2   Compression        1260 non-null   object 
 3   Model_Score        1260 non-null   float64
 4   Balanced Accuracy  1260 non-null   float64
 5   Precision          1260 non-null   float64
 6   Recall             1260 non-null   float64
 7   f1-score           1260 non-null   float64
 8   Confusion TP       1260 non-null   int64  
 9   Confusion FP       1260 non-null   int64  
 10  Confusion FN       1260 non-null   int64  
 11  Confusion TN       1260 non-null   int64  
dtypes: float64(5), int64(4), object(3)
memory usage: 118.3+ KB


In [52]:
results_df.sort_values('f1-score',ascending=False)

Unnamed: 0,Test_Model,Proportion,Compression,Model_Score,Balanced Accuracy,Precision,Recall,f1-score,Confusion TP,Confusion FP,Confusion FN,Confusion TN
94,Random Forest Clasifier,Ratio 1 to 4,StandardScaler,0.909323,0.730498,0.328173,0.507177,0.398496,106,103,217,4482
495,Random Forest Clasifier,Ratio 1 to 6,StandardScaler,0.914312,0.659590,0.486111,0.334928,0.396601,70,139,74,4625
74,Random Forest Clasifier,Ratio 1 to 5,StandardScaler,0.915957,0.684945,0.393365,0.397129,0.395238,83,126,128,4571
619,Random Forest Clasifier,Ratio 1 to 4,StandardScaler,0.883780,0.668411,0.376238,0.363636,0.369830,76,133,126,4573
829,Random Forest Clasifier,Ratio 1 to 4,MinMaxScaler,0.891443,0.694456,0.314685,0.430622,0.363636,90,119,196,4503
...,...,...,...,...,...,...,...,...,...,...,...,...
326,Random Forest Clasifier,Ratio 1 to 5,MinMaxScaler,0.841489,0.500000,0.000000,0.000000,0.000000,0,209,0,4699
683,Random Forest Clasifier,Ratio 1 to 5,MinMaxScaler,0.841489,0.500000,0.000000,0.000000,0.000000,0,209,0,4699
684,Random Forest Clasifier,Ratio 1 to 6,MinMaxScaler,0.860529,0.500000,0.000000,0.000000,0.000000,0,209,0,4699
748,Random Forest Clasifier,Ratio 1 to 7,MinMaxScaler,0.874801,0.500000,0.000000,0.000000,0.000000,0,209,0,4699


In [53]:
results_df['Encoder']='LabelEncoder'

In [54]:
results_df.to_csv('fidel-test-results.csv', mode='w',index=False )