In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

In [2]:
FILE_PATH = Path('../data/processed/train_data.csv')
if not FILE_PATH.exists():
    raise FileNotFoundError(f'File not found')
df = pd.read_csv(FILE_PATH)
print(f'Data successfully loaded with {df.shape[0]} rows and {df.shape[1]} columns')

Data successfully loaded with 533526 rows and 89 columns


In [3]:
# validate dtype of features in the df
for col in df.columns:
    if not pd.api.types.is_numeric_dtype(df[col]):
        raise ValueError(f'A categorical feature has been detected!')

print(f'No categorical features detected')


No categorical features detected


In [20]:
# check for missing values in the df
for col in df.columns:
    df[col] = df[col].replace([np.inf, -np.inf],np.nan)


In [21]:
missing = df.isnull().sum().sum()
if missing > 0:
    df.fillna(0, inplace=True)
print(f'Missing: {missing}')

Missing: 0


In [22]:
# check for duplicates 
duplicates = df.duplicated().sum()
if duplicates > 0:
    df.drop_duplicates()
print(f'duplicates: {duplicates}')

duplicates: 0


In [23]:
if 'Churn_Flag' in df.columns:
    y_train = df['Churn_Flag']
    x_train = df.drop(columns=['Churn_Flag']).copy()

else:
    raise ValueError(f'Target variable not found in dataset')

In [24]:
models = {
    'randomforest' : RandomForestClassifier(
        class_weight='balanced',random_state=42,n_jobs=-1
    ),
    'lightghm' : LGBMClassifier(random_state=42, class_weight='balanced',n_jobs=-1),
    'xgboost' : XGBClassifier(random_state=42)
}

results = {}
for name, model in models.items():
    print(f'Training {name}. This may take a while...')
    model.fit(x_train, y_train)

    y_preds = model.predict(x_train)

    from sklearn.metrics import f1_score
    f1 = f1_score(y_train, y_preds)

    results[name] = {
        'f1_score' : f1
    }

print(results)

Training randomforest. This may take a while...
Training lightghm. This may take a while...
[LightGBM] [Info] Number of positive: 342656, number of negative: 190870
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.236353 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4208
[LightGBM] [Info] Number of data points in the train set: 533526, number of used features: 85
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=-0.000000
[LightGBM] [Info] Start training from score -0.000000
Training xgboost. This may take a while...
{'randomforest': {'f1_score': 1.0}, 'lightghm': {'f1_score': 0.9988620344603428}, 'xgboost': {'f1_score': 0.9999970816212178}}
