# CS5284 Group07 - XGBoost

In [73]:
# To run certain networkX function in parallel
# !nvcc --version
!pip install nx-cugraph-cu12 --extra-index-url=https://pypi.nvidia.com
%env NX_CUGRAPH_AUTOCONFIG=True

Looking in indexes: https://pypi.org/simple, https://pypi.nvidia.com
env: NX_CUGRAPH_AUTOCONFIG=True


In [74]:
# Import
import pandas as pd
import networkx as nx
from xgboost import DMatrix, train
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score

nx.config.warnings_to_ignore.add("cache") # Ignore networkx cugraph warnings

# Tabular Features Only

## Data Preparation

In [75]:
# Load data from CSV file
# data = pd.read_csv('elliptic_bitcoin_dataset_50K_samples_final.csv', header=None)
data = pd.read_csv('elliptic_txs_features_labelled_only.csv', header=None, low_memory=False)

# Separate features and target
X = data.iloc[:, 1:-1]   # all columns except last and first column (txn id)
Y = data.iloc[:, -1]    # last column

# Remap Y labels to be either 0 or 1
Y = Y.map({2: 0, 1: 1})

In [76]:
# Split into train and test data
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, random_state=1)

In [77]:
# Create DMatrix from X and Y
dtrain = DMatrix(X_train, Y_train)
dtest = DMatrix(X_test, Y_test)

## Train Model

In [78]:
# Set XGBoost parameters
params = {
    'objective': 'binary:logistic',
    'learning_rate': 0.1,
    'random_state': 1
}

In [79]:
# Train the model
model = train(params, dtrain)

## Test Model

In [80]:
# Predict
preds = model.predict(dtest) # output a probability
preds_binary = (preds > 0.5).astype(int) # convert back to binary for comparison

# Calculate error
acc = accuracy_score(Y_test, preds_binary)
prec = precision_score(Y_test, preds_binary)
rec = recall_score(Y_test, preds_binary)
f1  = f1_score(Y_test, preds_binary)
print("Accuracy: ", acc)
print("Precision: ", prec)
print("Recall: ", rec)
print("F1 Score: ", f1)

Accuracy:  0.9750880508547376
Precision:  0.9976133651551312
Recall:  0.7437722419928826
F1 Score:  0.8521916411824668


# Tabular + Graph Features

In [81]:
# Load data from CSV file
data = pd.read_csv('elliptic_txs_features_labelled_only_with_graph_feature.csv', header=None, low_memory=False)
edgelist = pd.read_csv('elliptic_txs_edgelist_labelled_only.csv', header=None, low_memory=False)

## Prepare Graph Features
- PageRank
- Betweeness Centrality
- In-degree

Note: We already included the in-degree feature in the features dataset

In [82]:
# Page Rank
graph = nx.from_pandas_edgelist(edgelist, source=0, target=1, create_using=nx.DiGraph())
pageRankScores = nx.pagerank(graph)

print(pageRankScores)

betweennessScores = nx.betweenness_centrality(graph, k=10000, seed=1) # Take approx ~1/3 of nodes as sample
print(betweennessScores)

# Add to existing DF
data.insert(len(data.columns) - 1, "pagerank", '')
data["pagerank"] = data[0].map(pageRankScores)

data.insert(len(data.columns) - 1, "betweenness", '')
data["betweenness"] = data[0].map(betweennessScores)
# print(data)

{232344069: 1.1827902174535751e-05, 27553029: 1.689100582566251e-05, 3881097: 1.188712259274715e-05, 232457116: 2.2004725351375804e-05, 232051089: 1.1827902174535751e-05, 232470704: 4.2989398597715243e-05, 230473487: 1.1827902174535751e-05, 7089694: 4.220652408129631e-05, 231182296: 2.2286692149199097e-05, 14660781: 2.1357008409931035e-05, 43358239: 1.1827902174535751e-05, 230528714: 2.1954109476789267e-05, 5488136: 2.2004725351375804e-05, 27553759: 3.064503312639492e-05, 231181128: 2.1954109476789267e-05, 232453404: 3.0601258959444824e-05, 230423018: 2.1954109476789267e-05, 232392936: 2.1214580566990286e-05, 89273: 1.689100582566251e-05, 3906238: 1.188712259274715e-05, 230973596: 1.188712259274715e-05, 232455956: 2.2004725351375804e-05, 230423318: 1.1827902174535751e-05, 232453639: 1.1827902174535751e-05, 232453630: 2.1954109476789267e-05, 51860219: 1.188712259274715e-05, 1913117: 1.1827902174535751e-05, 232909272: 1.2217371686160887e-05, 206222134: 1.1827902174535751e-05, 192431356: 

In [83]:
# Separate features and target
X = data.iloc[:, 1:-1]   # all columns except last and first column (txn id)
Y = data.iloc[:, -1]    # last column (target)

# Remap Y labels to be either 0 or 1
Y = Y.map({2: 0, 1: 1})

In [84]:
# Split into train and test data
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, random_state=1)

## Train Model

In [85]:
# Create DMatrix from X and Y
dtrain = DMatrix(X_train, Y_train)
dtest = DMatrix(X_test, Y_test)

In [86]:
# Set XGBoost parameters
params = {
    'objective': 'binary:logistic',
    'learning_rate': 0.1,
    'random_state': 1
}

In [87]:
# Train the model
model = train(params, dtrain)

## Test Model

In [88]:
# Predict
preds = model.predict(dtest) # output a probability
preds_binary = (preds > 0.5).astype(int) # convert back to binary for comparison

# Calculate error
acc = accuracy_score(Y_test, preds_binary)
prec = precision_score(Y_test, preds_binary)
rec = recall_score(Y_test, preds_binary)
f1  = f1_score(Y_test, preds_binary)
print("Accuracy: ", acc)
print("Precision: ", prec)
print("Recall: ", rec)
print("F1 Score: ", f1)

Accuracy:  0.9779228588609226
Precision:  0.9977037887485649
Recall:  0.7731316725978647
F1 Score:  0.8711779448621554
