Importing Packages

In [23]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import preprocessing
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold
from lightgbm import LGBMClassifier
import time
from sklearn.metrics import confusion_matrix

from mpl_toolkits.mplot3d import Axes3D

from sklearn.metrics import silhouette_score, silhouette_samples
import sklearn.metrics
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering
from sklearn.mixture import GaussianMixture

import itertools

import scipy

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

Data ingestion

Convert csvs to Dataframes

In [24]:
invoice_test = pd.read_csv('invoice_test.csv', low_memory=False)
invoice_train = pd.read_csv('invoice_train.csv', low_memory=False)
client_test = pd.read_csv('client_test.csv', low_memory=False)
client_train = pd.read_csv('client_train.csv', low_memory=False)
sample_submission = pd.read_csv('SampleSubmission.csv', low_memory=False)

print(invoice_test.shape, invoice_train.shape, client_test.shape, client_train.shape)

(887059, 16) (864499, 16) (58069, 5) (135493, 6)


Data Exploration: This page: https://zindi.africa/competitions/ai-hack-tunisia-4-predictive-analytics-challenge-1/data has the data dictionary. I'm using "target" as my label.

In [25]:
client_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 135493 entries, 0 to 135492
Data columns (total 6 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   disrict        135493 non-null  int64  
 1   client_id      135493 non-null  object 
 2   client_catg    135493 non-null  int64  
 3   region         135493 non-null  int64  
 4   creation_date  135493 non-null  object 
 5   target         135493 non-null  float64
dtypes: float64(1), int64(3), object(2)
memory usage: 6.2+ MB


In [26]:
invoice_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 864499 entries, 0 to 864498
Data columns (total 16 columns):
 #   Column                Non-Null Count   Dtype 
---  ------                --------------   ----- 
 0   client_id             864499 non-null  object
 1   invoice_date          864499 non-null  object
 2   tarif_type            864499 non-null  int64 
 3   counter_number        864499 non-null  int64 
 4   counter_statue        864499 non-null  int64 
 5   counter_code          864499 non-null  int64 
 6   reading_remarque      864499 non-null  int64 
 7   counter_coefficient   864499 non-null  int64 
 8   consommation_level_1  864499 non-null  int64 
 9   consommation_level_2  864499 non-null  int64 
 10  consommation_level_3  864499 non-null  int64 
 11  consommation_level_4  864499 non-null  int64 
 12  old_index             864499 non-null  int64 
 13  new_index             864499 non-null  int64 
 14  months_number         864499 non-null  int64 
 15  counter_type     

Feature engineering

Recoding certain columns, formats, etc.

In [27]:
def feature_change(cl, inv):

    cl['client_catg'] = cl['client_catg'].astype('category')
    cl['disrict'] = cl['disrict'].astype('category')
    cl['region'] = cl['region'].astype('category')
    cl['region_group'] = cl['region'].apply(lambda x: 100 if x<100 else 300 if x>300 else 200)
    cl['creation_date'] = pd.to_datetime(cl['creation_date'])
    
    cl['coop_time'] = (2019 - cl['creation_date'].dt.year)*12 - cl['creation_date'].dt.month

    inv['counter_type'] = inv['counter_type'].map({"ELEC":1,"GAZ":0})
    inv['counter_statue'] = inv['counter_statue'].map({0:0,1:1,2:2,3:3,4:4,5:5,769:5,'0':0,'5':5,'1':1,'4':4,'A':0,618:5,269375:5,46:5,420:5})
    
    inv['invoice_date'] = pd.to_datetime(inv['invoice_date'], dayfirst=True)
    inv['invoice_month'] = inv['invoice_date'].dt.month
    inv['invoice_year'] = inv['invoice_date'].dt.year
    inv['is_weekday'] = ((pd.DatetimeIndex(inv.invoice_date).dayofweek) // 5 == 1).astype(float)
    inv['delta_index'] = inv['new_index'] - inv['old_index']
    
    return cl, inv

I apply my feature changes to the training and test data. After this point, I don't touch my testing data to prevent leakeage.

In [None]:
client_train1, invoice_train1 = feature_change(client_train, invoice_train)
client_test1, invoice_test1 = feature_change(client_test, invoice_test)

I write a function to create aggregate features

In [29]:
def agg_feature(invoice, client_df, agg_stat):
    
    invoice['delta_time'] = invoice.sort_values(['client_id','invoice_date']).groupby('client_id')['invoice_date'].diff().dt.days.reset_index(drop=True)
    agg_trans = invoice.groupby('client_id')[agg_stat+['delta_time']].agg(['mean','std','min','max'])
    
    agg_trans.columns = ['_'.join(col).strip() for col in agg_trans.columns.values]
    agg_trans.reset_index(inplace=True)

    df = invoice.groupby('client_id').size().reset_index(name='transactions_count')
    agg_trans = pd.merge(df, agg_trans, on='client_id', how='left')
    
    weekday_avg = invoice.groupby('client_id')[['is_weekday']].agg(['mean'])
    weekday_avg.columns = ['_'.join(col).strip() for col in weekday_avg.columns.values]
    weekday_avg.reset_index(inplace=True)
    client_df = pd.merge(client_df, weekday_avg, on='client_id', how='left')
    
    full_df = pd.merge(client_df, agg_trans, on='client_id', how='left')
    
    full_df['invoice_per_cooperation'] = full_df['transactions_count'] / full_df['coop_time']
    
    return full_df

In [30]:
aggs = {}
aggs['consommation_level_1'] = ['mean']
aggs['consommation_level_2'] = ['mean']
aggs['consommation_level_3'] = ['mean']
aggs['consommation_level_4'] = ['mean']
agg_trans = invoice_train.groupby(['client_id', 'counter_type']).agg(aggs)
agg_trans.columns = ['_'.join(col).strip() for col in agg_trans.columns.values]
agg_trans.reset_index(inplace=True)

agg_trans1 = agg_trans

aggs = {}
aggs['consommation_level_1_mean'] = ['mean']
aggs['consommation_level_2_mean'] = ['mean']
aggs['consommation_level_3_mean'] = ['mean']
aggs['consommation_level_4_mean'] = ['mean']
agg_trans = agg_trans1.groupby(['client_id']).agg(aggs)
agg_trans.columns = ['_'.join(col).strip() for col in agg_trans.columns.values]
agg_trans.reset_index(inplace=True)

df = (invoice_train.groupby('client_id')
          .size()
          .reset_index(name='{}transactions_count'.format('1')))
agg_trans = pd.merge(df, agg_trans, on='client_id', how='left')

In [31]:
agg_stat_columns = [
 'tarif_type',
 'counter_number',
 'counter_statue',
 'counter_code',
 'reading_remarque',
 'consommation_level_1',
 'consommation_level_2',
 'consommation_level_3',
 'consommation_level_4',
 'old_index',
 'new_index',
 'months_number',
 'counter_type',
 'invoice_month',
 'invoice_year',
 'delta_index'
]

train_df1 = agg_feature(invoice_train1, client_train1, agg_stat_columns)
test_df1 = agg_feature(invoice_test1, client_test1, agg_stat_columns)

I merge my dataframe with my new aggregate function

In [32]:
train_df1 = pd.merge(train_df1,agg_trans, on='client_id', how='left')

test_df1 = pd.merge(test_df1,agg_trans, on='client_id', how='left')

print(train_df1.columns)

Index(['disrict', 'client_id', 'client_catg', 'region', 'creation_date',
       'target', 'region_group', 'coop_time', 'is_weekday_mean',
       'transactions_count', 'tarif_type_mean', 'tarif_type_std',
       'tarif_type_min', 'tarif_type_max', 'counter_number_mean',
       'counter_number_std', 'counter_number_min', 'counter_number_max',
       'counter_statue_mean', 'counter_statue_std', 'counter_statue_min',
       'counter_statue_max', 'counter_code_mean', 'counter_code_std',
       'counter_code_min', 'counter_code_max', 'reading_remarque_mean',
       'reading_remarque_std', 'reading_remarque_min', 'reading_remarque_max',
       'consommation_level_1_mean', 'consommation_level_1_std',
       'consommation_level_1_min', 'consommation_level_1_max',
       'consommation_level_2_mean', 'consommation_level_2_std',
       'consommation_level_2_min', 'consommation_level_2_max',
       'consommation_level_3_mean', 'consommation_level_3_std',
       'consommation_level_3_min', 'consomma

One more feature engineering step, I create a range and max mean column

In [33]:
def new_features(df):
    
    for col in agg_stat_columns:
        df[col+'_range'] = df[col+'_max'] - df[col+'_min']
        df[col+'_max_mean'] = df[col+'_max']/df[col+'_mean']
    
    return df

In [34]:
train_df2 = new_features(train_df1)
test_df2 = new_features(test_df1)

In [35]:
print('Initial number of columns: ', len(client_train.columns)+len(invoice_train.columns))
print('Number of columns now: ', len(train_df2.columns))

Initial number of columns:  29
Number of columns now:  116


In [36]:
def drop(df):

    col_drop = ['client_id', 'creation_date']
    for col in col_drop:
        df.drop([col], axis=1, inplace=True)
    return df

In [37]:
train_df = drop(train_df2)
test_df = drop(test_df2)

In [38]:
y = train_df['target']
X = train_df.drop('target',axis=1)

feature_name = X.columns.tolist()

Drop irrelevant columns

In [39]:
drop_col=['reading_remarque_max','counter_statue_min','counter_type_min','counter_type_max','counter_type_range',
          'tarif_type_max', 'delta_index_min', 'consommation_level_4_mean']

X = X.drop(drop_col, axis=1)
test_df = test_df.drop(drop_col, axis=1)

Hyperparameter tuning

In [40]:
from optuna import Trial
import gc
import optuna
from sklearn.model_selection import train_test_split

import lightgbm as lgb

def objective(trial:Trial):
    
    gc.collect()
    models=[]
    validScore=0
   
    model,log = fitLGBM(trial,X,y)
    
    models.append(model)
    gc.collect()
    validScore+=log
    validScore/=len(models)
    
    return validScore

In [41]:
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score

def fitLGBM(trial,X, y):
    
    params={
        'n_estimators':trial.suggest_int('n_estimators', 0, 1000),
        'num_leaves':trial.suggest_int('num_leaves', 2, 512),
        'max_depth':trial.suggest_int('max_depth', 2, 128),
        'learning_rate': trial.suggest_loguniform('learning_rate', 0.001, 0.5),
        'min_split_gain': trial.suggest_loguniform('min_split_gain', 0.001, 0.1),
        'feature_fraction':trial.suggest_uniform('feature_fraction',0.1, 1.0),
        'bagging_freq':trial.suggest_int('bagging_freq',0.1,10),
        'verbosity': -1,
            }
    stkfold = StratifiedKFold(n_splits=5, shuffle=True)
    model = LGBMClassifier(**params)
    
    res=[]
    for i, (tdx, vdx) in enumerate(stkfold.split(X, y)):
        X_train, X_valid, y_train, y_valid = X.iloc[tdx], X.iloc[vdx], y[tdx], y[vdx]
        model.fit(X_train, y_train,
                 eval_set=[(X_train, y_train), (X_valid, y_valid)],
                 early_stopping_rounds=30)
        preds = model.predict_proba(X_valid)
        res.append(roc_auc_score(y_valid, preds[:,1]))
    
    err = np.mean(res)
    
    return model, err

In [42]:
study = optuna.create_study(direction='maximize', sampler = None, pruner = None)
study.optimize(objective, n_jobs=2, show_progress_bar = True)

print()
print("Best params:")
print(study.best_params)

[300]	training's binary_logloss: 0.182041	valid_1's binary_logloss: 0.200558
[41]	training's binary_logloss: 0.207979	valid_1's binary_logloss: 0.210896
[301]	training's binary_logloss: 0.181994	valid_1's binary_logloss: 0.200547
[42]	training's binary_logloss: 0.20783	valid_1's binary_logloss: 0.210808
[302]	training's binary_logloss: 0.181948	valid_1's binary_logloss: 0.20054
[43]	training's binary_logloss: 0.207682	valid_1's binary_logloss: 0.21072
[303]	training's binary_logloss: 0.181902	valid_1's binary_logloss: 0.200536
[44]	training's binary_logloss: 0.207537	valid_1's binary_logloss: 0.210632
[304]	training's binary_logloss: 0.181855	valid_1's binary_logloss: 0.200522
[45]	training's binary_logloss: 0.207391	valid_1's binary_logloss: 0.210545
[305]	training's binary_logloss: 0.181808	valid_1's binary_logloss: 0.200513
[46]	training's binary_logloss: 0.207246	valid_1's binary_logloss: 0.210461
[306]	training's binary_logloss: 0.181762	valid_1's binary_logloss: 0.200504
[47]	tra

[32m[I 2022-08-25 16:42:12,220][0m Trial 13 finished with value: 0.7067501323033507 and parameters: {'n_estimators': 360, 'num_leaves': 183, 'max_depth': 92, 'learning_rate': 0.004784825503565179, 'min_split_gain': 0.018240148242843462, 'feature_fraction': 0.9948916043934378, 'bagging_freq': 1}. Best is trial 1 with value: 0.7079213160266236.[0m


[106]	training's binary_logloss: 0.200445	valid_1's binary_logloss: 0.206781
[107]	training's binary_logloss: 0.200352	valid_1's binary_logloss: 0.206738
[108]	training's binary_logloss: 0.200257	valid_1's binary_logloss: 0.206691
[109]	training's binary_logloss: 0.200162	valid_1's binary_logloss: 0.206645
[110]	training's binary_logloss: 0.200069	valid_1's binary_logloss: 0.206598
[111]	training's binary_logloss: 0.199978	valid_1's binary_logloss: 0.206556
[112]	training's binary_logloss: 0.199885	valid_1's binary_logloss: 0.206512
[113]	training's binary_logloss: 0.199794	valid_1's binary_logloss: 0.206464
[114]	training's binary_logloss: 0.199704	valid_1's binary_logloss: 0.206422
[115]	training's binary_logloss: 0.199614	valid_1's binary_logloss: 0.206381
[116]	training's binary_logloss: 0.199523	valid_1's binary_logloss: 0.206342
[117]	training's binary_logloss: 0.19943	valid_1's binary_logloss: 0.206295
[118]	training's binary_logloss: 0.199339	valid_1's binary_logloss: 0.20625
[



[1]	training's binary_logloss: 0.215129	valid_1's binary_logloss: 0.215196
[2]	training's binary_logloss: 0.214893	valid_1's binary_logloss: 0.215043
[3]	training's binary_logloss: 0.214661	valid_1's binary_logloss: 0.214894
[4]	training's binary_logloss: 0.214434	valid_1's binary_logloss: 0.214753
[5]	training's binary_logloss: 0.214211	valid_1's binary_logloss: 0.214614
[6]	training's binary_logloss: 0.214002	valid_1's binary_logloss: 0.21448
[7]	training's binary_logloss: 0.213781	valid_1's binary_logloss: 0.214343
[8]	training's binary_logloss: 0.213565	valid_1's binary_logloss: 0.214205
[9]	training's binary_logloss: 0.213359	valid_1's binary_logloss: 0.214078
[10]	training's binary_logloss: 0.213149	valid_1's binary_logloss: 0.213944
[11]	training's binary_logloss: 0.212943	valid_1's binary_logloss: 0.213811
[12]	training's binary_logloss: 0.21274	valid_1's binary_logloss: 0.213686
[13]	training's binary_logloss: 0.212538	valid_1's binary_logloss: 0.213559
[14]	training's binary_

[32m[I 2022-08-25 16:42:29,825][0m Trial 14 finished with value: 0.7054562595282902 and parameters: {'n_estimators': 348, 'num_leaves': 180, 'max_depth': 89, 'learning_rate': 0.0032375959903458004, 'min_split_gain': 0.01604239048031275, 'feature_fraction': 0.9854252900019601, 'bagging_freq': 0}. Best is trial 1 with value: 0.7079213160266236.[0m


KeyboardInterrupt: 

In [43]:
model = LGBMClassifier(n_estimators=672, num_leaves=46, max_depth=125, 
                       learning_rate=0.018141379194639352, min_split_gain=0.05197891962284165, 
                       feature_fraction=0.545050546948007, bagging_freq=2)

stkfold = StratifiedKFold(n_splits=5, shuffle=True)

def calc(X, y, model, cv):
    res=[]
    local_probs=pd.DataFrame()
    probs = pd.DataFrame()

    for i, (tdx, vdx) in enumerate(cv.split(X, y)):
        X_train, X_valid, y_train, y_valid = X.iloc[tdx], X.iloc[vdx], y[tdx], y[vdx]
        model.fit(X_train, y_train,
                 eval_set=[(X_train, y_train), (X_valid, y_valid)],
                 early_stopping_rounds=30, verbose=False)
        
        preds = model.predict_proba(X_valid)
        oof_predict = model.predict_proba(test_df)
        local_probs['fold_%i'%i] = oof_predict[:,1]
        res.append(roc_auc_score(y_valid, preds[:,1]))

    print('ROC AUC:', round(np.mean(res), 6))    
    local_probs['res'] = local_probs.mean(axis=1)
    probs['target'] = local_probs['res']
    
    return probs

In [44]:
clf_all = LGBMClassifier(n_estimators=672, num_leaves=46, max_depth=125, 
                       learning_rate=0.018141379194639352, min_split_gain=0.05197891962284165, 
                       feature_fraction=0.545050546948007, bagging_freq=2)
clf_all.fit(X, y)

y_pred_dt = clf_all.predict(X)



LGBMClassifier(bagging_freq=2, feature_fraction=0.545050546948007,
               learning_rate=0.018141379194639352, max_depth=125,
               min_split_gain=0.05197891962284165, n_estimators=672,
               num_leaves=46)

In [45]:
from sklearn.metrics import classification_report

#print(classification_report(y, y_pred_dt, target_names=class_names))
print(confusion_matrix(y, y_pred_dt))

[[127925      2]
 [  6544   1022]]


In [46]:
from sklearn.metrics import classification_report
class_names = [str(x) for x in clf_all.classes_]
print(classification_report(y, y_pred_dt, target_names=class_names))

              precision    recall  f1-score   support

         0.0       0.95      1.00      0.98    127927
         1.0       1.00      0.14      0.24      7566

    accuracy                           0.95    135493
   macro avg       0.97      0.57      0.61    135493
weighted avg       0.95      0.95      0.93    135493



In [48]:
%%time
probs = calc(X, y, model, stkfold)























ROC AUC: 0.71144
CPU times: total: 9min 27s
Wall time: 21 s


In [49]:
submission = pd.DataFrame({
        "client_id": sample_submission["client_id"],
        "target": probs['target']
    })
submission.to_csv('submission.csv', index=False)

In [50]:
print(probs['target'])

0        0.036125
1        0.217057
2        0.030098
3        0.007280
4        0.051619
           ...   
58064    0.034047
58065    0.064392
58066    0.042540
58067    0.016612
58068    0.075902
Name: target, Length: 58069, dtype: float64
