In [1]:
!pip install pymongo
import torch
import torch.nn as nn
from pymongo import MongoClient
import pandas as pd
from datetime import datetime
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import LabelEncoder
import pickle
from tqdm import tqdm
from torch.optim.lr_scheduler import ReduceLROnPlateau
from torch.optim import AdamW
# Prefer the GPU when CUDA is available; otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

Collecting pymongo
  Downloading pymongo-4.6.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (677 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m677.1/677.1 kB[0m [31m6.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting dnspython<3.0.0,>=1.16.0 (from pymongo)
  Downloading dnspython-2.4.2-py3-none-any.whl (300 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m300.4/300.4 kB[0m [31m10.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: dnspython, pymongo
Successfully installed dnspython-2.4.2 pymongo-4.6.1


In [2]:
# Show which device was selected for this session.
device

device(type='cpu')

# Define new model

In [3]:
from transformers import AutoModel, AutoTokenizer, AutoConfig

# Hub identifier of the pre-trained checkpoint used as the text encoder.
model_name = "kk08/CryptoBERT"

# Fetch the encoder weights and the tokenizer that belongs to them.
bert_model = AutoModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Freeze the encoder: none of its weights will receive gradients.
for weight in bert_model.parameters():
    weight.requires_grad = False

# The checkpoint's max_position_embeddings is reused later as the
# tokenizer's padding/truncation length.
config = AutoConfig.from_pretrained(model_name)
max_length = config.max_position_embeddings
print("Max Length:", max_length)

config.json:   0%|          | 0.00/726 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/366 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

Max Length: 512


In [4]:
class CustomBERTModel(nn.Module):
    """Encoder followed by dropout and a single linear classification layer.

    Args:
        bert_model: Encoder module. It must expose ``config.hidden_size`` and
            return an object with a ``pooler_output`` attribute when called
            with ``input_ids``, ``attention_mask`` and ``token_type_ids``.
        num_labels: Number of output classes (width of the logits).
        dropout_prob: Dropout probability applied to the pooled output before
            the classifier. Defaults to 0.1.
    """

    def __init__(self, bert_model, num_labels, dropout_prob=0.1):
        super().__init__()
        self.bert = bert_model
        # Dropout on the pooled representation, for regularization.
        self.dropout = nn.Dropout(dropout_prob)
        # Linear head mapping the encoder's hidden size to class logits.
        self.classifier = nn.Linear(bert_model.config.hidden_size, num_labels)

    def forward(self, input_ids, attention_mask=None, token_type_ids=None):
        """Return unnormalized class logits for a batch of token ids.

        Raises:
            ValueError: If the encoder returns no pooled output.
        """
        outputs = self.bert(input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)

        pooled_output = outputs.pooler_output
        if pooled_output is None:
            # Fail with a clear message instead of an opaque error in dropout.
            raise ValueError("The encoder returned no pooler_output; a pooling layer is required.")

        pooled_output = self.dropout(pooled_output)
        return self.classifier(pooled_output)

# Dataset

In [5]:
# Knowledge Graph db
# SECURITY: the connection string carries a user name and password, so it must
# not be stored in the notebook. It is read from the KG_MONGO_URI environment
# variable, with an interactive (non-echoing) prompt as a fallback.
# The credential that used to be written in this cell has been exposed and
# should be rotated.
import os
from getpass import getpass

CONNECTION_STRING = os.environ.get("KG_MONGO_URI") or getpass("MongoDB connection string: ")
client_kg = MongoClient(CONNECTION_STRING)
db_kg = client_kg['knowledge_graph']
db_kg

Database(MongoClient(host=['34.124.205.24:27017', '35.198.222.97:27017', '34.124.133.164:27017'], document_class=dict, tz_aware=False, connect=True), 'knowledge_graph')

In [6]:
# Handle to the `projects` collection of the knowledge-graph database.
project_db =db_kg.projects

In [7]:
# Empty filter document: no condition is placed on the projects returned.
query = dict()
# Inclusion projection: only these fields are requested for each project.
projection = {field: 1 for field in ("_id", "name", "description", "category")}

In [8]:
# Issue the find() with the filter and projection defined in the previous cell.
cursor = project_db.find(query, projection)

In [9]:
# Drain the cursor into an in-memory list of project documents.
res = list(cursor)

In [10]:
# Build the frame in one call. Growing a DataFrame with pd.concat inside a
# loop re-copies all previous rows on every iteration (quadratic cost) and
# left every row with the same index label 0.
# NOTE(review): a list-valued field would be stored as-is in its cell here;
# verify that the projected fields are scalars in the collection.
df = pd.DataFrame(res)
# Keep only projects that have both a label and a text to classify. Assigning
# a new frame (instead of inplace=True) keeps the cell safe to re-run.
df = df.dropna(subset=["category", "description"]).reset_index(drop=True)
print("Length of full data : {}".format(df.shape[0]))

Length of full data : 13391


In [63]:
# Maps each raw project category to the coarser class used for training.
# Every target class also maps to itself, so applying the mapping to already
# mapped data is a no-op. "Stablecoins" was the only target class without
# such an identity entry, which turned those rows into NaN on a second pass.
category_map = {
  "PFPs": "PFPs",
  "Art": "Art",
  "Dexes": "Dexes",
  "Gaming": "Gaming",
  "Memberships": "Memberships",
  "Yield": "Yield",
  "Lending": "Lending",
  "Derivatives": "Derivatives",
  "Virtual Worlds": "Virtual Worlds",
  "Services": "Services",
  "Photography": "Art",
  "Cexes": "Cexes",
  "Liquid Staking": "Yield",
  "Yield Aggregator": "Yield",
  "Reserve Currency": "Services",
  "Music": "Art",
  "CDP": "Lending",
  "Algo-Stables": "Stablecoins",
  "Farm": "Yield",
  "Indexes": "Indexes",
  "Options": "Derivatives",
  "Bridge": "Infrastructure",
  "Launchpad": "Services",
  "SoFi": "Services",
  "RWA": "Services",
  "NFT Marketplace": "Services",
  "Synthetics": "Services",
  "NFT Lending": "Lending",
  "Prediction Market": "Services",
  "Liquidity manager": "Services",
  "Sports Collectibles": "Art",
  "Chain": "Infrastructure",
  "Cross Chain": "Infrastructure",
  "Insurance": "Services",
  "Domain Names": "Services",
  "Leveraged Farming": "Yield",
  "Staking Pool": "Yield",
  "Payments": "Services",
  "Privacy": "Services",
  "DEX Aggregator": "Services",
  "Options Vault": "Derivatives",
  "Uncollateralized Lending": "Lending",
  "Oracle": "Services",
  "Decentralized Stablecoin": "Stablecoins",
  "RWA Lending": "Lending",
  "Infrastructure": "Infrastructure",
  "Stablecoins": "Stablecoins"
}

In [67]:
# Collapse the raw categories into the coarser classes of category_map.
# Series.map yields NaN for any value that is not a key of the dict.
df['category'] = df['category'].map(category_map)

In [68]:
# Class distribution after the remapping (value_counts skips NaN by default).
df['category'].value_counts()

PFPs              3754
Art               3479
Dexes             1200
Gaming            1157
Yield              897
Memberships        778
Services           692
Lending            530
Derivatives        270
Virtual Worlds     200
Cexes              143
Stablecoins        120
Infrastructure     113
Indexes             58
Name: category, dtype: int64

# Dataloader

In [69]:
class CustomDataset(Dataset):
    """Pairs each text with its label and tokenizes it on access.

    Args:
        data: Indexable collection of texts (each item is passed through str()).
        labels: Indexable collection of labels, aligned with ``data``.
        tokenizer: Callable invoked with the text plus truncation/padding
            options; it must return a mapping of tensors.
        max_length: Length every sample is padded/truncated to.
    """

    def __init__(self, data, labels, tokenizer, max_length):
        self.data, self.labels = data, labels
        self.tokenizer, self.max_length = tokenizer, max_length

    def __len__(self):
        """Number of samples, taken from ``data``."""
        return len(self.data)

    def __getitem__(self, index):
        """Return ``(features, target)`` for one sample, both on ``device``."""
        text = str(self.data[index])
        raw_label = self.labels[index]

        encoded = self.tokenizer(
            text,
            truncation=True,
            padding="max_length",
            max_length=self.max_length,
            return_tensors="pt",
        )

        # Flatten every returned tensor to 1-D and move it to the module-level
        # `device`.
        features = {}
        for name, tensor in encoded.items():
            features[name] = tensor.view(-1).to(device)

        target = torch.tensor(raw_label, dtype=torch.long).to(device)
        return features, target

In [72]:
import joblib
# Fit a fresh encoder that turns the category names into integer class ids.
label_encoder = LabelEncoder()
# Alternative kept for reference: reuse an encoder persisted by an earlier run.
# label_encoder = joblib.load("/kaggle/input/encodedlabel/label_encoder.joblib")
# fit_transform both learns the classes and returns the encoded column;
# numerical_labels is not referenced again in the cells shown here.
numerical_labels = label_encoder.fit_transform(df['category'])

In [73]:
# Model inputs: the project description texts.
X = list(df["description"])
# Targets: integer class ids produced by the already fitted label encoder.
y = label_encoder.transform(list(df["category"]))

In [74]:
# Distinct category names. The order of a set is arbitrary, so this list is
# suitable for counting classes, not for decoding class ids.
category = list(set(df["category"]))

In [77]:
import random
# One seed for every source of randomness used in this notebook.
SEED = 10
random.seed(SEED)
# Seeds torch's global generator (e.g. dropout masks).
torch.manual_seed(SEED)
# random.seed() does not affect train_test_split; without random_state the
# split changes on every run, so rows used for training in one session can end
# up in the validation set of the next (notably when resuming from a
# checkpoint).
# NOTE(review): classes are imbalanced; consider stratify=y as well.
X_train,X_valid,y_train,y_valid = train_test_split(X,y,test_size=0.1,random_state=SEED)

In [78]:
# Report how many samples ended up in each split.
print("Size of train set is {}".format(len(X_train)))
print("Size of valid set is {}".format(len(X_valid)))
# No separate test split is created in this notebook.
# print("Size of test set is {}".format(len(X_test)))

Size of train set is 12051
Size of valid set is 1340


In [79]:
# Training batches are drawn in a new random order every epoch (shuffle=True).
train_loader = DataLoader(CustomDataset(X_train, y_train, tokenizer = tokenizer, max_length = max_length),
                          batch_size = 8, shuffle = True)
# Validation batches keep the dataset order (no shuffling requested).
valid_loader = DataLoader(CustomDataset(X_valid, y_valid, tokenizer = tokenizer, max_length = max_length),
                          batch_size = 8)

In [None]:
# for batch in train_loader:
#     print(batch)
#     break

# Optimize model

## Model

In [80]:
# Wrap the encoder with a classification head that has one output per
# distinct category, and place the whole model on the selected device.
custom_model = CustomBERTModel(bert_model, num_labels= len(category)).to(device)

In [81]:
# map_location remaps the stored tensors onto the device in use, so a
# checkpoint written from a GPU session can be restored on a CPU-only kernel.
# NOTE(review): the training cell writes to /kaggle/working/best_model.pth,
# while this reads from /kaggle/input/bertmodel/ (the recorded run raised
# FileNotFoundError) - confirm the intended path.
checkpoint = torch.load("/kaggle/input/bertmodel/best_model.pth", map_location=device)

FileNotFoundError: ignored

In [None]:
# Restore the model weights stored under the checkpoint's 'model_state_dict' key.
custom_model.load_state_dict(checkpoint['model_state_dict'])

<All keys matched successfully>

## Training loop

In [None]:
def _train_one_epoch(model, loader, criterion, optimizer):
    """Run one optimization pass over `loader`; return the mean batch loss."""
    # NOTE(review): .train() also puts the frozen encoder into training mode;
    # if it contains dropout, the features fed to the classifier are
    # stochastic. Consider calling model.bert.eval() here - confirm intent.
    model.train()
    running_loss = 0
    for inputs, labels in tqdm(loader):
        optimizer.zero_grad()
        outputs = model(**inputs)
        loss = criterion(outputs, labels)
        running_loss += loss.item()
        loss.backward()
        optimizer.step()
    return running_loss / len(loader)


def _evaluate(model, loader, criterion):
    """Return (mean batch loss, accuracy in percent) of `model` over `loader`."""
    model.eval()
    running_loss = 0
    correct = 0
    total = 0
    with torch.no_grad():
        for inputs, labels in loader:
            labels = labels.to(device)
            outputs = model(**inputs)
            running_loss += criterion(outputs, labels).item()
            # Predicted class = index of the largest logit.
            _, predicted = torch.max(outputs, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
    return running_loss / len(loader), correct / total * 100


criterion = nn.CrossEntropyLoss().to(device)
# Only the classification head is handed to the optimizer.
optimizer = AdamW(custom_model.classifier.parameters(), lr=1e-5)
# Divide the learning rate by 10 once the validation loss has stopped
# improving for more than `patience` epochs.
scheduler = ReduceLROnPlateau(optimizer, mode='min', patience=3, factor=0.1, verbose=True)
early_stopping_patience = 10
early_stopping_counter = 0
EPOCHS = 100
best_valid_loss = float('inf')
valid_losses = []
for epoch in range(EPOCHS):
    average_loss = _train_one_epoch(custom_model, train_loader, criterion, optimizer)
    print(f"Epoch {epoch + 1}/{EPOCHS}, Training Loss: {average_loss}")

    # Validation step
    valid_loss, valid_accuracy = _evaluate(custom_model, valid_loader, criterion)
    print(f"Validation Loss: {valid_loss}, Validation Accuracy: {valid_accuracy}%")

    # Learning rate scheduler step based on validation loss
    scheduler.step(valid_loss)

    # Full history of validation losses, including the final epoch.
    valid_losses.append(valid_loss)

    # One comparison drives both checkpointing and early stopping. A NaN loss
    # compares False here, so it counts as "no improvement" instead of
    # resetting the patience counter.
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        early_stopping_counter = 0
        torch.save({"model_state_dict" : custom_model.state_dict(),
                    "optimizer": optimizer.state_dict(),
                    'loss': (average_loss, valid_loss, valid_accuracy)
                   }, "/kaggle/working/best_model.pth")
        print("Best model saved!")
    else:
        early_stopping_counter += 1
        if early_stopping_counter >= early_stopping_patience:
            print(f"Early stopping after {epoch + 1} epochs without improvement.")
            break

100%|██████████| 1506/1506 [03:51<00:00,  6.50it/s]


Epoch 1/100, Training Loss: 1.922908688367284
Validation Loss: 1.925550843278567, Validation Accuracy: 41.74757281553398%
Best model saved!


100%|██████████| 1506/1506 [03:49<00:00,  6.56it/s]


Epoch 2/100, Training Loss: 1.9207378534048518
Validation Loss: 1.9240768910163926, Validation Accuracy: 41.89693801344287%
Best model saved!


100%|██████████| 1506/1506 [03:49<00:00,  6.56it/s]


Epoch 3/100, Training Loss: 1.9217231981703642
Validation Loss: 1.923985546188695, Validation Accuracy: 41.67289021657954%
Best model saved!


100%|██████████| 1506/1506 [03:49<00:00,  6.56it/s]


Epoch 4/100, Training Loss: 1.9183794890900217
Validation Loss: 1.9250646520938193, Validation Accuracy: 42.046303211351756%


100%|██████████| 1506/1506 [03:50<00:00,  6.55it/s]


Epoch 5/100, Training Loss: 1.9162169160200146
Validation Loss: 1.9233705436899549, Validation Accuracy: 41.59820761762509%
Best model saved!


100%|██████████| 1506/1506 [03:50<00:00,  6.54it/s]


Epoch 6/100, Training Loss: 1.920113924767075
Validation Loss: 1.9246657739082973, Validation Accuracy: 41.89693801344287%


100%|██████████| 1506/1506 [03:49<00:00,  6.56it/s]


Epoch 7/100, Training Loss: 1.9144141178127938
Validation Loss: 1.9255427906201, Validation Accuracy: 41.15011202389843%


100%|██████████| 1506/1506 [03:48<00:00,  6.59it/s]


Epoch 8/100, Training Loss: 1.9183483489876882
Validation Loss: 1.9215360649284863, Validation Accuracy: 42.19566840926065%
Best model saved!


100%|██████████| 1506/1506 [03:48<00:00,  6.60it/s]


Epoch 9/100, Training Loss: 1.918417431443811
Validation Loss: 1.9241912056292807, Validation Accuracy: 42.046303211351756%


100%|██████████| 1506/1506 [03:48<00:00,  6.60it/s]


Epoch 10/100, Training Loss: 1.917013365988079
Validation Loss: 1.923621239406722, Validation Accuracy: 42.19566840926065%


100%|██████████| 1506/1506 [03:48<00:00,  6.59it/s]


Epoch 11/100, Training Loss: 1.9102245333343546
Validation Loss: 1.920429208448955, Validation Accuracy: 42.046303211351756%
Best model saved!


100%|██████████| 1506/1506 [03:48<00:00,  6.60it/s]


Epoch 12/100, Training Loss: 1.9117698099904485
Validation Loss: 1.923006292964731, Validation Accuracy: 41.59820761762509%


100%|██████████| 1506/1506 [03:48<00:00,  6.60it/s]


Epoch 13/100, Training Loss: 1.9147340672345434
Validation Loss: 1.9235295341128396, Validation Accuracy: 41.299477221807315%


100%|██████████| 1506/1506 [03:48<00:00,  6.60it/s]


Epoch 14/100, Training Loss: 1.9132773037925659
Validation Loss: 1.9203181241949399, Validation Accuracy: 41.67289021657954%
Best model saved!


100%|██████████| 1506/1506 [03:48<00:00,  6.60it/s]


Epoch 15/100, Training Loss: 1.9100105822244646
Validation Loss: 1.9174921516151655, Validation Accuracy: 41.67289021657954%
Best model saved!


100%|██████████| 1506/1506 [03:48<00:00,  6.60it/s]


Epoch 16/100, Training Loss: 1.9134769805003289
Validation Loss: 1.921668940711589, Validation Accuracy: 42.34503360716953%


100%|██████████| 1506/1506 [03:48<00:00,  6.60it/s]


Epoch 17/100, Training Loss: 1.91111597252557
Validation Loss: 1.9189304720078195, Validation Accuracy: 42.41971620612397%


100%|██████████| 1506/1506 [03:48<00:00,  6.60it/s]


Epoch 18/100, Training Loss: 1.9082554878783258
Validation Loss: 1.9169848160374732, Validation Accuracy: 42.56908140403286%
Best model saved!


100%|██████████| 1506/1506 [03:48<00:00,  6.60it/s]


Epoch 19/100, Training Loss: 1.9105028504156971
Validation Loss: 1.9177049377134867, Validation Accuracy: 42.34503360716953%


100%|██████████| 1506/1506 [03:48<00:00,  6.60it/s]


Epoch 20/100, Training Loss: 1.9134145325240228
Validation Loss: 1.9154867380857468, Validation Accuracy: 40.85138162808066%
Best model saved!


100%|██████████| 1506/1506 [03:48<00:00,  6.61it/s]


Epoch 21/100, Training Loss: 1.9094929545919892
Validation Loss: 1.9195722064801626, Validation Accuracy: 41.822255414488424%


100%|██████████| 1506/1506 [03:48<00:00,  6.60it/s]


Epoch 22/100, Training Loss: 1.904993560090958
Validation Loss: 1.917291589436077, Validation Accuracy: 42.94249439880508%


100%|██████████| 1506/1506 [03:48<00:00,  6.60it/s]


Epoch 23/100, Training Loss: 1.9171622346834358
Validation Loss: 1.9126997905827703, Validation Accuracy: 42.1209858103062%
Best model saved!


100%|██████████| 1506/1506 [03:48<00:00,  6.60it/s]


Epoch 24/100, Training Loss: 1.9008213411843475
Validation Loss: 1.9153731962045033, Validation Accuracy: 42.27035100821509%


100%|██████████| 1506/1506 [03:48<00:00,  6.60it/s]


Epoch 25/100, Training Loss: 1.901845235788173
Validation Loss: 1.9142842849805242, Validation Accuracy: 41.97162061239731%


100%|██████████| 1506/1506 [03:47<00:00,  6.61it/s]


Epoch 26/100, Training Loss: 1.905411498757156
Validation Loss: 1.912399230613595, Validation Accuracy: 42.19566840926065%
Best model saved!


100%|██████████| 1506/1506 [03:48<00:00,  6.60it/s]


Epoch 27/100, Training Loss: 1.9020189675360246
Validation Loss: 1.913411404050532, Validation Accuracy: 42.1209858103062%


100%|██████████| 1506/1506 [03:48<00:00,  6.60it/s]


Epoch 28/100, Training Loss: 1.903685293745393
Validation Loss: 1.9130505965579123, Validation Accuracy: 41.89693801344287%


100%|██████████| 1506/1506 [03:48<00:00,  6.60it/s]


Epoch 29/100, Training Loss: 1.9049037166563163
Validation Loss: 1.9125920817965554, Validation Accuracy: 42.49439880507842%


100%|██████████| 1506/1506 [03:48<00:00,  6.60it/s]


Epoch 30/100, Training Loss: 1.9038481978306256
Validation Loss: 1.9136974921538716, Validation Accuracy: 41.67289021657954%
Epoch 00030: reducing learning rate of group 0 to 1.0000e-06.


100%|██████████| 1506/1506 [03:48<00:00,  6.60it/s]


Epoch 31/100, Training Loss: 1.9022215015068156
Validation Loss: 1.9128081624706585, Validation Accuracy: 41.59820761762509%


100%|██████████| 1506/1506 [03:48<00:00,  6.60it/s]


Epoch 32/100, Training Loss: 1.9002739730505038
Validation Loss: 1.9121950651918138, Validation Accuracy: 41.822255414488424%
Best model saved!


100%|██████████| 1506/1506 [03:48<00:00,  6.60it/s]


Epoch 33/100, Training Loss: 1.89801535069705
Validation Loss: 1.911745867558888, Validation Accuracy: 42.19566840926065%
Best model saved!


100%|██████████| 1506/1506 [03:48<00:00,  6.60it/s]


Epoch 34/100, Training Loss: 1.8965944453856227
Validation Loss: 1.9116279787960506, Validation Accuracy: 42.046303211351756%
Best model saved!


100%|██████████| 1506/1506 [03:48<00:00,  6.60it/s]


Epoch 35/100, Training Loss: 1.9004739436653664
Validation Loss: 1.9114179675068175, Validation Accuracy: 42.27035100821509%
Best model saved!


100%|██████████| 1506/1506 [03:48<00:00,  6.60it/s]


Epoch 36/100, Training Loss: 1.9002847208840596
Validation Loss: 1.911274100698176, Validation Accuracy: 42.19566840926065%
Best model saved!


100%|██████████| 1506/1506 [03:48<00:00,  6.60it/s]


Epoch 37/100, Training Loss: 1.9018266864744315
Validation Loss: 1.9113219483267694, Validation Accuracy: 42.046303211351756%


100%|██████████| 1506/1506 [03:48<00:00,  6.60it/s]


Epoch 38/100, Training Loss: 1.8978344099534301
Validation Loss: 1.9112048195231528, Validation Accuracy: 42.1209858103062%
Best model saved!


100%|██████████| 1506/1506 [03:48<00:00,  6.60it/s]


Epoch 39/100, Training Loss: 1.9002808374791822
Validation Loss: 1.91097091847942, Validation Accuracy: 42.27035100821509%
Best model saved!


100%|██████████| 1506/1506 [03:48<00:00,  6.60it/s]


Epoch 40/100, Training Loss: 1.8990465063022903
Validation Loss: 1.9107732705417133, Validation Accuracy: 42.34503360716953%
Best model saved!


100%|██████████| 1506/1506 [03:48<00:00,  6.60it/s]


Epoch 41/100, Training Loss: 1.8967039551076343
Validation Loss: 1.9105277817164148, Validation Accuracy: 42.41971620612397%
Best model saved!


100%|██████████| 1506/1506 [03:48<00:00,  6.60it/s]


Epoch 42/100, Training Loss: 1.896137356164446
Validation Loss: 1.9107138975745155, Validation Accuracy: 42.34503360716953%


100%|██████████| 1506/1506 [03:48<00:00,  6.60it/s]


Epoch 43/100, Training Loss: 1.8973301024550935
Validation Loss: 1.9107013752772695, Validation Accuracy: 42.41971620612397%


100%|██████████| 1506/1506 [03:48<00:00,  6.60it/s]


Epoch 44/100, Training Loss: 1.9035009587032703
Validation Loss: 1.9103211196405547, Validation Accuracy: 42.49439880507842%
Best model saved!


100%|██████████| 1506/1506 [03:47<00:00,  6.61it/s]


Epoch 45/100, Training Loss: 1.8974867971690685
Validation Loss: 1.9105308708690463, Validation Accuracy: 42.41971620612397%


100%|██████████| 1506/1506 [03:47<00:00,  6.61it/s]


Epoch 46/100, Training Loss: 1.8986992340084725
Validation Loss: 1.9103776337135405, Validation Accuracy: 42.49439880507842%


100%|██████████| 1506/1506 [03:48<00:00,  6.60it/s]


Epoch 47/100, Training Loss: 1.892796438212097
Validation Loss: 1.9103353435084933, Validation Accuracy: 42.41971620612397%


100%|██████████| 1506/1506 [03:47<00:00,  6.61it/s]


Epoch 48/100, Training Loss: 1.8965328424021226
Validation Loss: 1.9104232408461117, Validation Accuracy: 42.41971620612397%
Epoch 00048: reducing learning rate of group 0 to 1.0000e-07.


100%|██████████| 1506/1506 [03:47<00:00,  6.61it/s]


Epoch 49/100, Training Loss: 1.8959373485440436
Validation Loss: 1.9104109466785477, Validation Accuracy: 42.41971620612397%


100%|██████████| 1506/1506 [03:48<00:00,  6.60it/s]


Epoch 50/100, Training Loss: 1.9000132063707982
Validation Loss: 1.910398152257715, Validation Accuracy: 42.49439880507842%


100%|██████████| 1506/1506 [03:48<00:00,  6.60it/s]


Epoch 51/100, Training Loss: 1.8937125506391563
Validation Loss: 1.9103902727365494, Validation Accuracy: 42.41971620612397%


100%|██████████| 1506/1506 [03:48<00:00,  6.60it/s]


Epoch 52/100, Training Loss: 1.8952457050324119
Validation Loss: 1.9103714667615437, Validation Accuracy: 42.41971620612397%
Epoch 00052: reducing learning rate of group 0 to 1.0000e-08.


100%|██████████| 1506/1506 [03:48<00:00,  6.60it/s]


Epoch 53/100, Training Loss: 1.896691193499888
Validation Loss: 1.9103695249983244, Validation Accuracy: 42.41971620612397%


100%|██████████| 1506/1506 [03:47<00:00,  6.61it/s]


Epoch 54/100, Training Loss: 1.892923368559099
Validation Loss: 1.9103686646336602, Validation Accuracy: 42.41971620612397%
Early stopping after 54 epochs without improvement.
