In [8]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
import lightgbm as lgb 
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb

In [2]:
# Read the dataset from the CSV file; the 'customer_id' column is used as the row index.
df = pd.read_csv(
    '01_dataset.csv',
    index_col='customer_id',
)

In [3]:
# Separate the predictor columns from the target column 'label'.
X = df.drop(columns='label')
y = df['label']

# Hold out 20% of the rows for evaluation; the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Wrap both partitions as LightGBM Datasets.
# The held-out Dataset is tied to the training Dataset via `reference`, as the LightGBM
# documentation prescribes for validation data, so it is aligned with the training data
# even if it is constructed or used outside of lgb.train.
train_data = lgb.Dataset(X_train, label=y_train)
test_data = lgb.Dataset(X_test, label=y_test, reference=train_data)

In [14]:
# Booster settings: binary objective, 100 leaves per tree, AUC as the metric.
param = {'num_leaves': 100, 'objective': 'binary', 'metric': 'auc'}

# Train for 100 boosting rounds, passing the held-out Dataset as the validation set.
num_round = 100
bst = lgb.train(param, train_data, num_boost_round=num_round, valid_sets=[test_data])

[LightGBM] [Info] Number of positive: 2908, number of negative: 13092
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.009139 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 10153
[LightGBM] [Info] Number of data points in the train set: 16000, number of used features: 122
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.181750 -> initscore=-1.504536
[LightGBM] [Info] Start training from score -1.504536


In [15]:
# Turn the booster's per-row scores into hard 0/1 labels with a 0.5 cut-off.
# The comparison is vectorized on the array returned by bst.predict instead of
# looping over it element by element in Python; the resulting labels are the same.
ybar = (bst.predict(X_train) >= 0.5).astype(int)
ypred = (bst.predict(X_test) >= 0.5).astype(int)

# Report accuracy (as a percentage) on the training and held-out partitions.
print('Accuracy for train data: {}%'.format(accuracy_score(y_train, ybar)*100))
print('Accuracy for test data: {}%'.format(accuracy_score(y_test, ypred)*100))

Accuracy for train data: 99.7875%
Accuracy for test data: 88.47500000000001%


In [10]:
# Fit a 100-tree random forest on the training partition (seeded with random_state=42).
model = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

# Predict class labels for the training and held-out partitions.
ybar, ypred = model.predict(X_train), model.predict(X_test)

# Report accuracy (as a percentage) on each partition.
print('Accuracy for train data: {}%'.format(accuracy_score(y_true=y_train, y_pred=ybar) * 100))
print('Accuracy for test data: {}%'.format(accuracy_score(y_true=y_test, y_pred=ypred) * 100))

Accuracy for train data: 99.97500000000001%
Accuracy for test data: 88.675%


In [11]:
# Fit an XGBoost classifier with 100 estimators on the training partition (random_state=42).
model = xgb.XGBClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

# Predict class labels for the training and held-out partitions.
ybar, ypred = model.predict(X_train), model.predict(X_test)

# Report accuracy (as a percentage) on each partition.
print('Accuracy for train data: {}%'.format(accuracy_score(y_true=y_train, y_pred=ybar) * 100))
print('Accuracy for test data: {}%'.format(accuracy_score(y_true=y_test, y_pred=ypred) * 100))

Accuracy for train data: 98.85000000000001%
Accuracy for test data: 88.02499999999999%
