# Model Development for Gora Competition Dataset

## Importing Libraries and Loading the Dataset

In [1]:
import certifi
import os

# Point TLS verification at certifi's CA bundle. The variable OpenSSL (and
# therefore Python's `ssl` module) looks up is `SSL_CERT_FILE`, with
# underscores; the previous key 'SSL-CERT_FILE' contained a hyphen and so did
# not match that name. It is set before importing giza_datasets, keeping the
# original statement order.
os.environ['SSL_CERT_FILE'] = certifi.where()
from giza_datasets import DatasetsLoader

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns 
from sklearn.linear_model import LinearRegression 

# Show up to 200 columns when a DataFrame is rendered.
pd.options.display.max_columns = 200

In [2]:

loader = DatasetsLoader()

# Load the training set as a pandas DataFrame, then normalise the columns that
# need it in a single chained step: lower-case the address strings and parse
# the three timestamp columns with pd.to_datetime.
df = (
    loader.load("gora-competition-training")
    .to_pandas()
    .assign(
        address=lambda frame: frame["address"].str.lower(),
        added_at=lambda frame: pd.to_datetime(frame["added_at"]),
        first_borrow_date=lambda frame: pd.to_datetime(frame["first_borrow_date"]),
        calc_start_time=lambda frame: pd.to_datetime(frame["calc_start_time"]),
    )
)


Dataset read from cache.
Loading dataset gora-competition-training from cache.


## Data Preparation & Preprocessing

In [53]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Columns of `df` used as model inputs.
features = ['total_borrow', 'count_borrow', 'avg_borrow_amount',
       'std_borrow_amount', 
       'borrow_amount_cv', 'total_repay', 'count_repay', 'avg_repay_amount',
       'std_repay_amount', 'repay_amount_cv', 'total_deposit', 'count_deposit',
       'avg_deposit_amount', 'std_deposit_amount', 'deposit_amount_cv',
       'total_redeem', 'count_redeem', 'avg_redeem_amount',
       'std_redeem_amount', 'redeem_amount_cv',
       'days_since_first_borrow', 'net_outstanding',
       'int_paid', 'net_deposits', 'count_repays_to_count_borrows',
       'avg_repay_to_avg_borrow', 'net_outstanding_to_total_borrowed',
       'net_outstanding_to_total_repaid', 'count_redeems_to_count_deposits',
       'total_redeemed_to_total_deposits', 'avg_redeem_to_avg_deposit',
       'net_deposits_to_total_deposits', 'net_deposits_to_total_redeemed',
       'dex_total_sum_added',
       'dex_total_sum_removed', 'dex_total_sum_swapped']

X = df[features].values
# Regression target: log1p of the "total_liquidation_to_total_borrow" column.
y_reg = np.log1p(df["total_liquidation_to_total_borrow"].values)

# NOTE(review): this cell previously split on an undefined name `y`, which only
# worked with leftover kernel state. `y_reg` is the only target defined in the
# notebook, so it is used here -- confirm this is the intended target.
X_train, X_test, y_train, y_test = train_test_split(
    X, y_reg, test_size=0.2, random_state=42
)

# The scaler was used without ever being created; instantiate it here.
# It is fitted on the training split only and then applied to the test split,
# so no test-set statistics leak into the scaling.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


Save the fitted scaler so that it can be reused on the evaluation dataset.

In [59]:
import joblib

# Serialise the fitted scaler to 'scaler.pkl' in the working directory.
joblib.dump(value=scaler, filename='scaler.pkl')

['scaler.pkl']

## Model Training

In [55]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader

# Use the first CUDA device when one is available; otherwise run on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

class CustomDataset(TensorDataset):
    """Dataset of (features, target) pairs stored as float tensors.

    Args:
        X: Array-like of input rows; converted with ``torch.tensor(..., dtype=torch.float)``.
        y: Array-like of targets, one per row of ``X``; converted the same way.

    The base ``TensorDataset`` is now initialised with both tensors, so the
    inherited ``tensors`` attribute is populated and the base class can check
    that both tensors agree on their first dimension. Previously the base
    initialiser was never called, leaving ``tensors`` undefined.
    """

    def __init__(self, X, y):
        feature_tensor = torch.tensor(X, dtype=torch.float)
        target_tensor = torch.tensor(y, dtype=torch.float)
        super().__init__(feature_tensor, target_tensor)
        # Kept as named attributes for code that reads `.X` / `.y` directly.
        self.X = feature_tensor
        self.y = target_tensor

    def __len__(self):
        """Return the number of samples (length of the target tensor)."""
        return len(self.y)

    def __getitem__(self, idx):
        """Return the ``(features, target)`` pair at position ``idx``."""
        return self.X[idx], self.y[idx]

batch_size = 256

# Wrap the scaled train / test splits as datasets.
dataset = CustomDataset(X_train_scaled, y_train)
test_dataset = CustomDataset(X_test_scaled, y_test)

# Training batches are shuffled; the test loader keeps a fixed order.
data_loader = DataLoader(dataset=dataset, batch_size=batch_size, shuffle=True)
test_data_loader = DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=False)

class RegressionModel(nn.Module):
    """Fully connected regressor: in_features -> 128 -> 64 -> 1 with ReLU between layers.

    Args:
        in_features: Number of input features per sample. When omitted, it is
            taken from the notebook-level array ``X`` (``X.shape[1]``), which
            is what the class always did before this parameter existed.
    """

    def __init__(self, in_features=None):
        super().__init__()
        if in_features is None:
            # Backward-compatible default: size the first layer from the
            # notebook's feature matrix.
            in_features = X.shape[1]
        self.fc1 = nn.Linear(in_features, 128)
        self.fc2 = nn.Linear(128, 64)
        self.fc3 = nn.Linear(64, 1)

    def forward(self, x):
        """Map a ``(batch, in_features)`` tensor to a ``(batch, 1)`` prediction."""
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        # No activation on the output layer: the network emits raw regression values.
        x = self.fc3(x)
        return x

# Build the network and move it to the selected device.
model = RegressionModel()
model.to(device)

# Mean-squared-error objective optimised with Adam.
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

for epoch in range(100):
    running_loss = 0.0
    for batch_inputs, batch_labels in data_loader:
        batch_inputs = batch_inputs.to(device)
        batch_labels = batch_labels.to(device)

        optimizer.zero_grad()

        # Targets come out of the dataset as 1-D; add a trailing dimension so
        # they line up with the (batch, 1) model output.
        predictions = model(batch_inputs)
        loss = criterion(predictions, batch_labels.unsqueeze(1))

        loss.backward()
        optimizer.step()

        running_loss += loss.item()

    # Report the mean of the per-batch losses for this epoch.
    print(f'Epoch {epoch+1}, Loss: {running_loss / len(data_loader)}')


Epoch 1, Loss: 0.01998627418309779
Epoch 2, Loss: 0.016461457654657268
Epoch 3, Loss: 0.01638819464007575
Epoch 4, Loss: 0.014373368583239152
Epoch 5, Loss: 0.013604687344400042
Epoch 6, Loss: 0.012967826533199781
Epoch 7, Loss: 0.012500038252362762
Epoch 8, Loss: 0.012072282633002317
Epoch 9, Loss: 0.011758371297379626
Epoch 10, Loss: 0.011456938566897959
Epoch 11, Loss: 0.011230375290942665
Epoch 12, Loss: 0.01102182969376384
Epoch 13, Loss: 0.010800945730438589
Epoch 14, Loss: 0.010708479370424993
Epoch 15, Loss: 0.010456715814892532
Epoch 16, Loss: 0.010292702480036692
Epoch 17, Loss: 0.01019308622298744
Epoch 18, Loss: 0.01005765921615058
Epoch 19, Loss: 0.009968796233913642
Epoch 20, Loss: 0.009894624177546824
Epoch 21, Loss: 0.009800230820105544
Epoch 22, Loss: 0.009718837159525245
Epoch 23, Loss: 0.009664836409749019
Epoch 24, Loss: 0.009602327818630014
Epoch 25, Loss: 0.00948735103342116
Epoch 26, Loss: 0.009445878561225482
Epoch 27, Loss: 0.009451451200327617
Epoch 28, Loss: 

### Calculating Root Mean Squared Error (RMSE)

In [57]:
def calculate_rmse(model, data_loader, device=None):
    """Compute the root mean squared error of ``model`` over ``data_loader``.

    The squared error is summed over every sample and divided by the total
    sample count before taking the square root. The previous implementation
    averaged per-batch MSE values, which gives a short final batch the same
    weight as a full one and therefore does not equal the true RMSE when batch
    sizes differ.

    Args:
        model: ``torch.nn.Module`` producing one prediction per input row.
        data_loader: Iterable yielding ``(inputs, labels)`` tensor pairs, with
            ``labels`` one-dimensional (a trailing dimension is added here).
        device: Device to evaluate on. Defaults to the device of the model's
            first parameter (CPU if the model has no parameters).

    Returns:
        The RMSE as returned by ``np.sqrt``.

    Raises:
        ValueError: If ``data_loader`` yields no samples.

    Note:
        The model is switched to eval mode and is left in that mode.
    """
    if device is None:
        first_param = next(model.parameters(), None)
        device = first_param.device if first_param is not None else torch.device("cpu")

    model.to(device)
    model.eval()

    squared_error_sum = 0.0
    sample_count = 0
    with torch.no_grad():
        for inputs, labels in data_loader:
            inputs, labels = inputs.to(device), labels.to(device)
            residual = model(inputs) - labels.unsqueeze(1)
            squared_error_sum += torch.sum(residual ** 2).item()
            sample_count += residual.numel()

    if sample_count == 0:
        raise ValueError("calculate_rmse received a data_loader with no samples")

    rmse = np.sqrt(squared_error_sum / sample_count)
    return rmse

# Score the trained model on the held-out test loader. `Module.to` returns the
# module itself, so the device move and the call can be combined.
rmse = calculate_rmse(model.to(device), test_data_loader)
print(f'RMSE: {rmse:.4f}')

RMSE: 0.0841


Saving the model

In [58]:
# Persist the whole model object (not just its state_dict) to 'model_2.pth'.
# NOTE(review): loading this file back requires the RegressionModel class to be
# importable under the same name.
torch.save(obj=model, f='model_2.pth')