# RoBERTa / DeBERTa + LGBM

In [None]:
import pandas as pd
import csv
import random
import numpy as np
import pandas as pd

import torch
from sklearn.metrics import accuracy_score

if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")  # GPU acc on mac : "mps"


# Read The data
training_set = pd.read_json('./data/train_set.json')
test_set = pd.read_json('./data/test_set.json')

from transformers import DebertaModel, DebertaTokenizer

tokenizer = DebertaTokenizer.from_pretrained('microsoft/deberta-base')
model = DebertaModel.from_pretrained("model_deberta")  # raw weight : ('microsoft/deberta-base')
model.to(device)

max_length = 256
train_test_split = 3000



class NewsGroupsDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {k: torch.tensor(v[idx]) for k, v in self.encodings.items()}
        item["labels"] = torch.tensor([self.labels[idx]])
        return item

    def __len__(self):
        return len(self.labels)


def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    # calculate accuracy using sklearn's function
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
    }

In [25]:
train_encodings = tokenizer(training_set['text'].to_list()[0:train_test_split], truncation=True, padding=True,
                                max_length=max_length)
valid_encodings = tokenizer(training_set['text'].to_list()[train_test_split:], truncation=True, padding=True,
                               max_length=max_length)
test_encodings = tokenizer(test_set['text'].to_list(), truncation=True, padding=True,
                               max_length=max_length)

train_y = training_set['label'].to_list()[0:train_test_split]
valid_y = training_set['label'].to_list()[train_test_split:]
test_y = [-1]*4000

# convert our tokenized data into a torch Dataset
train_dataset = NewsGroupsDataset(train_encodings, train_y)
valid_dataset = NewsGroupsDataset(valid_encodings, valid_y)
test_dataset = NewsGroupsDataset(test_encodings, test_y)

## Embeddings

In [3]:
def get_embedding(text):
    # prepare our text into tokenized sequence
    inputs = tokenizer.encode(text, return_tensors='pt').to(device)
    # perform inference to our model
    outputs = model(inputs)
    return outputs.last_hidden_state[:,0,:] # probs.argmax()

In [4]:
output = get_embedding("It is an incontestable fact that the Emperor Napoléon is a genius")
output.shape

torch.Size([1, 768])

In [5]:
from torch.utils.data import DataLoader
train_loader = DataLoader(train_dataset, batch_size=24, shuffle=False)
valid_loader = DataLoader(valid_dataset, batch_size=24, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=24, shuffle=False)

In [6]:

from tqdm.notebook import tqdm
embeddings = []
for text in tqdm(training_set['text'].to_list()):
    embeddings.append(get_embedding(text).cpu().detach().numpy())
embeddings = np.array(embeddings)
np.save("data/deberta_embeddings_train", embeddings)

  0%|          | 0/4000 [00:00<?, ?it/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (746 > 512). Running this sequence through the model will result in indexing errors


In [7]:
embeddings = []
for text in tqdm(test_set['text'].to_list()):
    embeddings.append(get_embedding(text).cpu().detach().numpy())
embeddings = np.array(embeddings)
np.save("data/deberta_embeddings_test", embeddings)

  0%|          | 0/4000 [00:00<?, ?it/s]

In [26]:
train_embeddings = np.load("data/deberta_embeddings_train.npy")
test_embeddings = np.load("data/deberta_embeddings_test.npy")
train_embeddings.shape, test_embeddings.shape

((4000, 1, 768), (4000, 1, 768))

## Preparing data for classifier

In [28]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

tfidf_vect = TfidfVectorizer(max_features=800)
tfidf_train = tfidf_vect.fit_transform(training_set['text']).toarray()
tfidf_test = tfidf_vect.transform(test_set['text']).toarray()

count_vect = CountVectorizer(max_features=100)
count_train = count_vect.fit_transform(training_set['text']).toarray()
count_test = count_vect.transform(test_set['text']).toarray()

In [31]:
X_train = np.hstack((train_embeddings.squeeze(), tfidf_train, count_train))
X_test =  np.hstack((test_embeddings.squeeze(), tfidf_test, count_test))
X_train.shape, X_test.shape

((4000, 1668), (4000, 1668))

In [30]:
X_train = train_embeddings.squeeze()
X_test = test_embeddings.squeeze()
X_train.shape, X_test.shape

((4000, 768), (4000, 768))

### PCA

In [32]:
from sklearn.decomposition import PCA
pca = PCA(n_components=512)
pca.fit(X_train)
X_train = pca.transform(X_train)
X_test = pca.transform(X_test)
X_train.shape, X_test.shape

((4000, 512), (4000, 512))

In [33]:
y_train = np.array(training_set['label'].to_list())
y_train.shape

(4000,)

## LGBM

In [37]:
from lightgbm import LGBMClassifier
bst = LGBMClassifier()
split = 3600
bst.fit(X_train[0:split], y_train[0:split])

In [38]:
preds = bst.predict(X_train[split:])

In [39]:
np.sum(y_train[split:] == np.array(preds))/(4000-split)

1.0

In [40]:
from sklearn.model_selection import GridSearchCV
from lightgbm import LGBMClassifier

# Define the parameter grid
param_grid = {
    'num_leaves': [16,32,64,],
    'learning_rate': [0.01, 0.05, 0.1],
    'n_estimators': [100,200,500]
}

# Create a LGBMClassifier object
lgbm = LGBMClassifier()

# Create a GridSearchCV object
grid_search = GridSearchCV(estimator=lgbm,
                           param_grid=param_grid,
                           cv=5,
                           scoring='accuracy',verbose=3)

# Fit the GridSearchCV object to the data
grid_search.fit(X_train, y_train)

# Get the best parameters and score
best_params = grid_search.best_params_
best_score = grid_search.best_score_

print(f'Best parameters: {best_params}')
print(f'Best score: {best_score}')

Fitting 5 folds for each of 27 candidates, totalling 135 fits
[CV 1/5] END learning_rate=0.01, n_estimators=100, num_leaves=16;, score=0.975 total time=   0.9s
[CV 2/5] END learning_rate=0.01, n_estimators=100, num_leaves=16;, score=0.993 total time=   0.8s
[CV 3/5] END learning_rate=0.01, n_estimators=100, num_leaves=16;, score=0.996 total time=   0.7s
[CV 4/5] END learning_rate=0.01, n_estimators=100, num_leaves=16;, score=0.994 total time=   0.7s
[CV 5/5] END learning_rate=0.01, n_estimators=100, num_leaves=16;, score=0.993 total time=   0.7s
[CV 1/5] END learning_rate=0.01, n_estimators=100, num_leaves=32;, score=0.975 total time=   1.2s
[CV 2/5] END learning_rate=0.01, n_estimators=100, num_leaves=32;, score=0.993 total time=   1.1s
[CV 3/5] END learning_rate=0.01, n_estimators=100, num_leaves=32;, score=0.996 total time=   1.2s
[CV 4/5] END learning_rate=0.01, n_estimators=100, num_leaves=32;, score=0.994 total time=   1.1s
[CV 5/5] END learning_rate=0.01, n_estimators=100, num_l

In [41]:
predictions = grid_search.predict(X_test)
predictions.shape

(4000,)

In [42]:
grid_search

In [43]:
with open("submission_deberta_lgbm.csv", "w") as pred:
    csv_out = csv.writer(pred)
    csv_out.writerow(['id', 'label'])
    for i, row in enumerate(predictions):
        csv_out.writerow([i, row])