<h1>Classification algorithms<h1/>

<h2>Imports</h2>

In [2]:
import xgboost as xgb
import lightgbm as lgb
from parameters import *
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.ensemble import StackingClassifier
from sklearn.metrics import confusion_matrix, average_precision_score, f1_score
import pandas as pd
import pickle

<h2>Loading data</h2>

In [13]:
classes = pd.read_csv('../data/elliptic_txs_classes.csv')
features = pd.read_csv('../data/elliptic_txs_features.csv', header=None)

data = features.merge(classes, left_on=0, right_on='txId', how='left')
data = data.loc[data['class'] != 'unknown']

y = (data['class'].astype(int)-2)*(-1)
X = data.drop([1, 'txId', 0, 'class'], axis=1)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

<h3>Extreme Gradient Boost</h3>

In [14]:
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

xgb_model = xgb.train(parameter_xgb, dtrain)

y_pred = xgb_model.predict(dtest)
print(confusion_matrix(y_test, (y_pred > 0.5).astype(int)))
print(average_precision_score(y_test, y_pred))
print(f1_score(y_test, (y_pred > 0.5).astype(int)))

pickle.dump(xgb_model, open('../models_result/xgboost' + '.sav', 'wb'))

[[12577    10]
 [  217  1166]]
0.9653001068834757
0.9112934740132863


In [12]:
print(average_precision_score(y_test, y_pred, average='macro'))
print(f1_score(y_test, (y_pred > 0.5).astype(int)))

0.9653001068834757
0.9112934740132863


<h3>Cat Boost Classifier</h3>

In [15]:
cat_model = CatBoostClassifier()
cat_model.set_params(**parameter_cat)
cat_model.fit(X_train, y_train, verbose=False)

y_pred = cat_model.predict(X_test)
print(confusion_matrix(y_test, (y_pred > 0.5).astype(int)))
print(average_precision_score(y_test, y_pred))
print(f1_score(y_test, (y_pred > 0.5).astype(int)))

pickle.dump(cat_model, open('../models_result/catboost' + '.sav', 'wb'))

[[12581     6]
 [  120  1263]]
0.9175040571939735
0.9524886877828054


<h3>Light Gradient Boost</h3>

In [22]:
train = lgb.Dataset(X_train, label=y_train)

lgb_model = lgb.train(parameter_lgb, train)

ypred = lgb_model.predict(X_test)
print(confusion_matrix(y_test, (ypred > 0.5).astype(int)))

pickle.dump(lgb_model, open('../models_result/lgboost' + '.sav', 'wb'))

Please use silent argument of the Dataset constructor to pass this parameter.


You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 38638
[LightGBM] [Info] Number of data points in the train set: 31197, number of used features: 165
[LightGBM] [Info] Start training from score 0.902875
[[ 1332   183]
 [   13 13839]]
