In [1]:
import time
from pathlib import Path

import numpy as np
import pandas as pd
from lightgbm import LGBMClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, log_loss
from xgboost import XGBClassifier

In [2]:
# Locations of the pre-processed train/test CSV files, relative to the notebook.
TRAIN_FILE_PATH = Path('../data/processed/train_data.csv')
TEST_FILE_PATH = Path('../data/processed/test_data.csv')

# Fail fast and name exactly which file(s) are missing, so the problem can be
# fixed without having to inspect the paths by hand.
missing_paths = [str(path) for path in (TRAIN_FILE_PATH, TEST_FILE_PATH) if not path.exists()]
if missing_paths:
    raise FileNotFoundError(f"File not found: {', '.join(missing_paths)}")

train_df = pd.read_csv(TRAIN_FILE_PATH)
test_df = pd.read_csv(TEST_FILE_PATH)

# First line reports the training frame, second line the test frame.
print(f'Data successfully loaded with {train_df.shape[0]} rows and {train_df.shape[1]} columns')
print(f'Data successflly loaded with {test_df.shape[0]} rows and {test_df.shape[1]} features')

Data successfully loaded with 533526 rows and 89 columns
Data successflly loaded with 20000 rows and 89 features


In [3]:
# Validate the dtype of every column in the training frame: anything that is
# not numeric is reported by name so it can be encoded or dropped upstream.
non_numeric_cols = [
    col for col in train_df.columns
    if not pd.api.types.is_numeric_dtype(train_df[col])
]
if non_numeric_cols:
    raise ValueError(
        f'A categorical feature has been detected! Non-numeric columns: {non_numeric_cols}'
    )

print('No categorical features detected')

# Treat +/-inf as missing so they are counted and imputed together with NaN.
train_df = train_df.replace([np.inf, -np.inf], np.nan)
print(f'After: {len(train_df)}')

# Count missing cells across the whole frame, then impute them with 0.
# Note that fillna(0) is applied to every column of the frame.
missing = train_df.isnull().sum().sum()
if missing > 0:
    train_df = train_df.fillna(0)
print(f'Missing: {missing}')

# Check for fully duplicated rows. drop_duplicates() returns a new frame, so
# the result has to be assigned back for the rows to actually be removed.
duplicates = train_df.duplicated().sum()
if duplicates > 0:
    train_df = train_df.drop_duplicates()
print(f'duplicates: {duplicates}')

No categorical features detected
After: 533526
Missing: 12324
duplicates: 0


In [4]:
# Validate the dtype of every column in the test frame: anything that is not
# numeric is reported by name so it can be encoded or dropped upstream.
non_numeric_cols = [
    col for col in test_df.columns
    if not pd.api.types.is_numeric_dtype(test_df[col])
]
if non_numeric_cols:
    raise ValueError(
        f'A categorical feature has been detected! Non-numeric columns: {non_numeric_cols}'
    )

print('No categorical features detected')

# Treat +/-inf as missing so they are counted and imputed together with NaN.
test_df = test_df.replace([np.inf, -np.inf], np.nan)
print(f'After: {len(test_df)}')

# Count missing cells across the whole frame, then impute them with 0.
# Note that fillna(0) is applied to every column of the frame.
missing = test_df.isnull().sum().sum()
if missing > 0:
    test_df = test_df.fillna(0)
print(f'Missing: {missing}')

# Check for fully duplicated rows. drop_duplicates() returns a new frame, so
# the result has to be assigned back for the rows to actually be removed.
duplicates = test_df.duplicated().sum()
if duplicates > 0:
    test_df = test_df.drop_duplicates()
print(f'duplicates: {duplicates}')

No categorical features detected
After: 20000
Missing: 496
duplicates: 0


In [5]:
# Separate the training frame into the label vector (y_train) and the feature
# matrix (x_train); abort early when the label column is absent.
if 'Churn_Flag' not in train_df.columns:
    raise ValueError('Target variable not found in dataset')

y_train = train_df['Churn_Flag']
# Independent copy of the features, so later edits cannot touch train_df.
x_train = train_df.drop(columns='Churn_Flag').copy()

In [6]:
# Separate the test frame into the label vector (y_test) and the feature
# matrix (x_test); abort early when the label column is absent.
if 'Churn_Flag' not in test_df.columns:
    raise ValueError('Target variable not found in dataset')

y_test = test_df['Churn_Flag']
# Independent copy of the features, so later edits cannot touch test_df.
x_test = test_df.drop(columns='Churn_Flag').copy()

In [7]:
def format_duration(seconds):
    """Render a duration in seconds as a short human-readable string.

    Parameters
    ----------
    seconds : int or float
        Elapsed time in seconds.

    Returns
    -------
    str
        ``'S.SSs'`` for durations under one minute, ``'Mm S.SSs'`` for
        durations under one hour, and ``'Hh Mm S.SSs'`` otherwise. Seconds
        are shown with two decimals; minutes and hours as whole numbers.
    """
    if seconds < 60:
        return f'{seconds:.2f}s'

    # divmod on a float yields float quotients, so cast the whole-unit parts
    # to int to avoid output such as '1.0m'.
    minutes, sec = divmod(seconds, 60)
    if seconds < 3600:
        return f'{int(minutes)}m {sec:.2f}s'

    hours, minutes = divmod(minutes, 60)
    return f'{int(hours)}h {int(minutes)}m {sec:.2f}s'

In [10]:
# Baseline training on the full feature set (no feature selection applied yet).
# Each model is fitted once, then scored on both the training and the test
# split so the train/test gap can be compared across models.

# Probability cut-off used to turn predicted positive-class probabilities into
# hard 0/1 labels; identical for every model, so it is defined once.
THRESHOLD = 0.5

models = {
    'randomforest' : RandomForestClassifier(
        class_weight='balanced',random_state=42,n_jobs=-1
    ),
    # The key spelling is kept as-is: it is the row label in the summary table
    # and may be used as a lookup key elsewhere.
    'lightghm' : LGBMClassifier(random_state=42, class_weight='balanced',n_jobs=-1),
    # NOTE(review): unlike the two models above, no class weighting is
    # configured here -- confirm this is intentional.
    'xgboost' : XGBClassifier(random_state=42)
}

results = {}
for name, model in models.items():
    print(f'Training {name}. This may take a while...')

    # perf_counter() is a monotonic, high-resolution clock meant for measuring
    # elapsed intervals; it is unaffected by system clock adjustments.
    start_time = time.perf_counter()
    model.fit(x_train, y_train)
    time_elapsed = time.perf_counter() - start_time

    # prediction on training set (column 1 = probability of the positive class)
    y_probs_train = model.predict_proba(x_train)[:,1]
    y_pred_train = (y_probs_train >= THRESHOLD).astype(int)

    # prediction on testing set
    y_probs_test = model.predict_proba(x_test)[:,1]
    y_pred_test = (y_probs_test >= THRESHOLD).astype(int)

    # training metrics
    f1_score_train = f1_score(y_train, y_pred_train)
    log_loss_train = log_loss(y_train, y_probs_train)

    # testing metrics
    f1_score_test = f1_score(y_test, y_pred_test)
    log_loss_test = log_loss(y_test, y_probs_test)

    results[name] = {
        'f1_score_train' : f1_score_train,
        'log_loss_train' : log_loss_train,
        'f1_score_test' : f1_score_test,
        'log_loss_test' : log_loss_test,
        # Test log-loss minus train log-loss: positive means the model does
        # worse on the test split than on the data it was fitted on.
        'variance' : log_loss_test - log_loss_train,
        'time_elapsed' : format_duration(time_elapsed)
    }

# One row per model. Building with orient='index' keeps each metric column
# numeric; transposing a frame that also holds the string 'time_elapsed' row
# would turn every column into object dtype.
result_summary = pd.DataFrame.from_dict(results, orient='index')
result_summary

Training randomforest. This may take a while...
Training lightghm. This may take a while...
[LightGBM] [Info] Number of positive: 342656, number of negative: 190870
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.088158 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4208
[LightGBM] [Info] Number of data points in the train set: 533526, number of used features: 85
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=-0.000000
[LightGBM] [Info] Start training from score -0.000000
Training xgboost. This may take a while...


Unnamed: 0,f1_score_train,log_loss_train,f1_score_test,log_loss_test,variance,time_elapsed
randomforest,1.0,0.017732,0.996345,0.051726,0.033994,1.0m 39.004673s
lightghm,0.998862,0.017871,0.999026,0.017265,-0.000607,12.963425s
xgboost,0.999997,0.002569,1.0,0.002552,-1.6e-05,9.822574s
