In [30]:
import time
from pathlib import Path

import numpy as np
import pandas as pd
from lightgbm import LGBMClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, StratifiedKFold
from xgboost import XGBClassifier


In [18]:
# Locations of the train / dev / test CSV files, relative to the notebook.
TRAIN_FILE_PATH = Path('../data/processed/train_set.csv')
DEV_FILE_PATH = Path('../data/processed/dev_set.csv')
TEST_FILE_PATH = Path('../data/processed/test_set.csv')

# Check every file that is read below (the test file included) and report
# exactly which paths are missing instead of a generic message.
missing_paths = [
    str(path)
    for path in (TRAIN_FILE_PATH, DEV_FILE_PATH, TEST_FILE_PATH)
    if not path.exists()
]
if missing_paths:
    raise FileNotFoundError(f'File not found: {", ".join(missing_paths)}')

train_df = pd.read_csv(TRAIN_FILE_PATH)
dev_df = pd.read_csv(DEV_FILE_PATH)
test_df = pd.read_csv(TEST_FILE_PATH)

# Report the shape of each split as a quick sanity check.
print(f'Train Data successfully loaded with {train_df.shape[0]} rows and {train_df.shape[1]} features')
print(f'Dev Data successflly loaded with {dev_df.shape[0]} rows and {dev_df.shape[1]} features')
print(f'Test data successfully loaded with {test_df.shape[0]} rows and {test_df.shape[1]} features')

Train Data successfully loaded with 4500 rows and 18 features
Dev Data successflly loaded with 610 rows and 18 features
Test data successfully loaded with 500 rows and 18 features


In [20]:
# Guard: every column of the training frame must have a numeric dtype.
# Any non-numeric column is reported as a categorical feature and aborts the cell.
non_numeric_found = any(
    not pd.api.types.is_numeric_dtype(column_dtype)
    for column_dtype in train_df.dtypes
)
if non_numeric_found:
    raise ValueError(f'A categorical feature has been detected!')

print(f'No categorical features detected')


No categorical features detected


In [21]:

# Clean the training frame: neutralise infinities, fill gaps, drop duplicate rows.

# Turn +/-inf into NaN first so that they are counted (and filled) as missing.
train_df = train_df.replace([np.inf, -np.inf], np.nan)
print(f'After: {len(train_df)}')

# Total number of missing cells across the whole frame; filled with 0 if any exist.
missing = train_df.isnull().sum().sum()
if missing > 0:
    train_df = train_df.fillna(0)
print(f'Missing: {missing}')

# check for duplicates
duplicates = train_df.duplicated().sum()
if duplicates > 0:
    # drop_duplicates returns a new frame; the result must be assigned back,
    # otherwise the duplicate rows silently stay in train_df.
    train_df = train_df.drop_duplicates()
print(f'duplicates: {duplicates}')


After: 4500
Missing: 0
duplicates: 0


In [22]:
# Apply the same validation and cleaning steps to the dev frame.

# validate dtype of features in the df: every column must be numeric
for col in dev_df.columns:
    if not pd.api.types.is_numeric_dtype(dev_df[col]):
        raise ValueError(f'A categorical feature has been detected!')

print(f'No categorical features detected')

# Turn +/-inf into NaN first so that they are counted (and filled) as missing.
dev_df = dev_df.replace([np.inf, -np.inf], np.nan)
print(f'After: {len(dev_df)}')

# Total number of missing cells across the whole frame; filled with 0 if any exist.
missing = dev_df.isnull().sum().sum()
if missing > 0:
    dev_df = dev_df.fillna(0)
print(f'Missing: {missing}')

# check for duplicates
duplicates = dev_df.duplicated().sum()
if duplicates > 0:
    # drop_duplicates returns a new frame; the result must be assigned back,
    # otherwise the duplicate rows silently stay in dev_df.
    dev_df = dev_df.drop_duplicates()
print(f'duplicates: {duplicates}')


No categorical features detected
After: 610
Missing: 0
duplicates: 0


In [27]:
# Split each frame into predictors (x_*) and the 'stroke' label (y_*).
# Each split is checked for the label column before anything is assigned.
if 'stroke' not in train_df.columns:
    raise ValueError(f'Target variable not found in dataset')
y_train = train_df['stroke']
x_train = train_df.drop(columns=['stroke']).copy()

if 'stroke' not in dev_df.columns:
    raise ValueError(f'Target variable not found in dataset')
y_dev = dev_df['stroke']
x_dev = dev_df.drop(columns=['stroke']).copy()


In [24]:
def format_duration(seconds):
    """Render a duration given in seconds as a short human-readable string.

    Args:
        seconds: Elapsed time in seconds (int or float).

    Returns:
        ``'S.SSs'`` for durations under a minute, ``'Mm S.SSs'`` under an
        hour, and ``'Hh Mm S.SSs'`` otherwise. Seconds always carry two
        decimals; minutes and hours are whole numbers.
    """
    # Round once up front so a value such as 59.999 rolls over to '1m 0.00s'
    # instead of being printed as '60.00s'.
    seconds = round(seconds, 2)
    if seconds < 60:
        # '.2f' = two decimals (the former '2f' meant width 2, six decimals).
        return f'{seconds:.2f}s'
    if seconds < 3600:
        minutes, sec = divmod(seconds, 60)
        # int() avoids the '1.0m' rendering produced by float floor-division.
        return f'{int(minutes)}m {sec:.2f}s'
    hours, remainder = divmod(seconds, 3600)
    minutes, sec = divmod(remainder, 60)
    return f'{int(hours)}h {int(minutes)}m {sec:.2f}s'


In [32]:
# Baseline comparison: each candidate model is cross-validated on the full
# feature set (no feature selection) with stratified 10-fold CV.
#
# NOTE(review): three of the four models use class_weight='balanced', which
# suggests an imbalanced target. If so, accuracy can be dominated by the
# majority class; consider also scoring with 'f1' or 'roc_auc'.
# NOTE(review): unlike the other models, XGBClassifier is created without any
# class weighting; confirm this is intended.

models = {
    'Logistic_regression': LogisticRegression(class_weight='balanced',penalty='l2', max_iter=2000, C=3),
    'randomforest' : RandomForestClassifier(
        class_weight='balanced',random_state=42,n_jobs=-1
    ),
    # Key spelling is kept as-is because it labels the rows of the result table.
    'lightghm' : LGBMClassifier(random_state=42, class_weight='balanced',n_jobs=-1),
    'xgboost' : XGBClassifier(random_state=42)
}
# Shuffled, seeded folds so every model is evaluated on the same splits.
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=1)

results = {}
for name, model in models.items():
    print(f'Training {name}. This may take a while...')
    # perf_counter is a monotonic clock meant for measuring intervals;
    # `time` is imported once at the top of the notebook, not per iteration.
    start_time = time.perf_counter()
    cv_score = cross_val_score(model, x_train, y_train, cv=cv, scoring='accuracy', n_jobs=-1)
    time_elapsed = time.perf_counter() - start_time

    # Keep the per-fold scores plus their mean / std and the wall time.
    results[name] = {
        'cv_score': cv_score,
        'cv_score_mean': cv_score.mean(),
        'cv_score_std': cv_score.std(),
        'time_elapsed': format_duration(time_elapsed)
    }

# One row per model, one column per recorded metric.
result_summary = pd.DataFrame(results).T
result_summary


Training Logistic_regression. This may take a while...
Training randomforest. This may take a while...
Training lightghm. This may take a while...
Training xgboost. This may take a while...


Unnamed: 0,cv_score,cv_score_mean,cv_score_std,time_elapsed
Logistic_regression,"[0.76, 0.7511111111111111, 0.7066666666666667,...",0.722222,0.02416,0.433703s
randomforest,"[0.9511111111111111, 0.9488888888888889, 0.951...",0.950222,0.002266,2.163208s
lightghm,"[0.9244444444444444, 0.9088888888888889, 0.884...",0.899111,0.011683,2.366437s
xgboost,"[0.9466666666666667, 0.9444444444444444, 0.937...",0.940222,0.006082,0.445544s
