In [1]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.26.1-py3-none-any.whl (6.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.3/6.3 MB[0m [31m48.8 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m87.0 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.13.1-py3-none-any.whl (199 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.2/199.2 KB[0m [31m27.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.13.1 tokenizers-0.13.2 transformers-4.26.1


In [9]:
import pandas as pd
import csv
import random
import numpy as np
import pandas as pd

import torch
from sklearn.metrics import accuracy_score

# Run on the GPU when CUDA is available, otherwise stay on the CPU.
# (For GPU acceleration on a Mac the device string would be "mps".)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


# Load the training and test data from the local JSON files
training_set = pd.read_json('./data/train_set.json')
test_set = pd.read_json('./data/test_set.json')

from transformers import DebertaModel, DebertaTokenizer

# Pretrained DeBERTa tokenizer and encoder; the encoder is moved to `device`
tokenizer = DebertaTokenizer.from_pretrained('microsoft/deberta-base')
model = DebertaModel.from_pretrained('microsoft/deberta-base').to(device)

# Token cap used when tokenizing, and the row index that separates the
# training slice from the validation slice of `training_set`
max_length, train_test_split = 256, 3000



class NewsGroupsDataset(torch.utils.data.Dataset):
    """Torch dataset pairing tokenizer encodings with their labels.

    ``encodings`` is a mapping from field name to a per-example sequence and
    ``labels`` is a sequence with one label per example. Indexing returns a
    dict of tensors: one entry per encoding field plus a ``"labels"`` entry
    holding the label wrapped in a length-1 tensor.
    """

    def __init__(self, encodings, labels):
        self.encodings, self.labels = encodings, labels

    def __len__(self):
        # The dataset size is defined by the number of labels.
        return len(self.labels)

    def __getitem__(self, idx):
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        sample["labels"] = torch.tensor([self.labels[idx]])
        return sample


def compute_metrics(pred):
    """Return the accuracy of a prediction object as ``{'accuracy': value}``.

    ``pred`` must expose ``label_ids`` (the reference labels) and
    ``predictions`` (an array whose arg-max over the last axis is taken as the
    predicted class). Accuracy is computed with sklearn's ``accuracy_score``.
    """
    reference = pred.label_ids
    predicted_classes = pred.predictions.argmax(-1)
    return {'accuracy': accuracy_score(reference, predicted_classes)}

Some weights of the model checkpoint at microsoft/deberta-base were not used when initializing DebertaModel: ['lm_predictions.lm_head.bias', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.LayerNorm.weight']
- This IS expected if you are initializing DebertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [10]:
# Tokenize each split separately. The first `train_test_split` rows of the
# training data form the training slice, the remainder the validation slice.
# Every text is truncated to at most `max_length` tokens and padded.
train_encodings = tokenizer(training_set['text'].to_list()[0:train_test_split], truncation=True, padding=True,
                                max_length=max_length)
valid_encodings = tokenizer(training_set['text'].to_list()[train_test_split:], truncation=True, padding=True,
                               max_length=max_length)
test_encodings = tokenizer(test_set['text'].to_list(), truncation=True, padding=True,
                               max_length=max_length)

train_y = training_set['label'].to_list()[0:train_test_split]
valid_y = training_set['label'].to_list()[train_test_split:]
# Placeholder labels for the test split: one -1 per test row, sized from the
# data itself so the dataset length always matches the encodings.
test_y = [-1] * len(test_set)

# convert our tokenized data into a torch Dataset
train_dataset = NewsGroupsDataset(train_encodings, train_y)
valid_dataset = NewsGroupsDataset(valid_encodings, valid_y)
test_dataset = NewsGroupsDataset(test_encodings, test_y)

## Embeddings

In [11]:
def get_embedding(text, max_tokens=None):
    """Embed ``text`` using the hidden state at the first sequence position.

    The text is encoded with the module-level ``tokenizer``, moved to
    ``device`` and passed through the module-level ``model``. The last-layer
    hidden state at sequence position 0 is returned.

    Args:
        text: The string to embed.
        max_tokens: Optional cap on the encoded sequence length. When given,
            longer inputs are truncated to this many tokens. When ``None``
            (the default) the text is encoded in full, exactly as before.

    Returns:
        A tensor on ``device`` with a leading batch axis of size 1 (one row
        per call), carrying no autograd graph.
    """
    # prepare our text into tokenized sequence
    inputs = tokenizer.encode(
        text,
        return_tensors='pt',
        truncation=max_tokens is not None,
        max_length=max_tokens,
    ).to(device)
    # Inference only: disable autograd so no graph is built and intermediate
    # activations are released immediately, which matters when this is called
    # once per document over thousands of documents.
    with torch.no_grad():
        outputs = model(inputs)
    return outputs.last_hidden_state[:, 0, :]

In [12]:
# Smoke test: embed a single sentence and inspect the shape of the result.
output = get_embedding(
    "It is an incontestable fact that The Emporer Napoléon is a genius"
)
output.shape

torch.Size([1, 768])

In [13]:
from torch.utils.data import DataLoader

# One loader per split: batches of 24 examples, original order preserved.
train_loader, valid_loader, test_loader = (
    DataLoader(split_dataset, batch_size=24, shuffle=False)
    for split_dataset in (train_dataset, valid_dataset, test_dataset)
)

In [15]:

from tqdm.notebook import tqdm

# Embed every training text one at a time, collect the per-text arrays into a
# single NumPy array and persist it to disk.
embeddings = np.array([
    get_embedding(text).cpu().detach().numpy()
    for text in tqdm(training_set['text'].to_list())
])
np.save("data/bert_embeddings_train", embeddings)

  0%|          | 0/4000 [00:00<?, ?it/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (746 > 512). Running this sequence through the model will result in indexing errors


In [16]:
# Same per-text embedding pass, this time over the test texts; the stacked
# result is written to its own file.
embeddings = np.array([
    get_embedding(text).cpu().detach().numpy()
    for text in tqdm(test_set['text'].to_list())
])
np.save("data/bert_embeddings_test", embeddings)

  0%|          | 0/4000 [00:00<?, ?it/s]

In [17]:
# Reload the saved embedding arrays from disk and show their shapes.
train_embeddings, test_embeddings = (
    np.load(saved_path)
    for saved_path in ("data/bert_embeddings_train.npy", "data/bert_embeddings_test.npy")
)
train_embeddings.shape, test_embeddings.shape

((4000, 1, 768), (4000, 1, 768))

## Preparing data for classifier

In [25]:
# Disabled experiment: TF-IDF and bag-of-words count features built from the raw text.
# from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

# tfidf_vect = TfidfVectorizer(max_features=800)
# tfidf_train = tfidf_vect.fit_transform(training_set['text']).toarray()
# tfidf_test = tfidf_vect.transform(test_set['text']).toarray()

# count_vect = CountVectorizer(max_features=100)
# count_train = count_vect.fit_transform(training_set['text']).toarray()
# count_test = count_vect.transform(test_set['text']).toarray()

In [26]:
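# Disabled experiment: stack the embeddings with the TF-IDF and count features.
# (The shapes shown below were recorded by an earlier run with this code enabled.)
# X_train = np.hstack((train_embeddings.squeeze(), tfidf_train, count_train))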
# X_test =  np.hstack((test_embeddings.squeeze(), tfidf_test, count_test))
# X_train.shape, X_test.shape

((4000, 1668), (4000, 1668))

In [38]:
# The saved embeddings carry a singleton middle axis (one row per text, then a
# size-1 axis, then the features). Drop exactly that axis: a bare .squeeze()
# would also collapse the row axis if an array ever held a single text.
X_train = train_embeddings.squeeze(axis=1)
X_test = test_embeddings.squeeze(axis=1)
X_train.shape, X_test.shape

((4000, 768), (4000, 768))

### PCA

In [33]:
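# Disabled experiment: project the features onto 512 principal components.
# (The shapes shown below were recorded by an earlier run with this code enabled.)
# from sklearn.decomposition import PCA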
# pca = PCA(n_components=512)
# pca.fit(X_train)
# X_train = pca.transform(X_train)
# X_test = pca.transform(X_test)
# X_train.shape, X_test.shape

((4000, 512), (4000, 512))

In [39]:
# Labels of the full training set as a NumPy vector.
y_train = np.asarray(training_set['label'].tolist())
y_train.shape

(4000,)

## LGBM

In [40]:
from lightgbm import LGBMClassifier

# Baseline: a default-configured LightGBM classifier trained on the first
# `split` rows; the remaining rows are kept aside for evaluation.
split = 3000
bst = LGBMClassifier()
bst.fit(X_train[:split], y_train[:split])

In [44]:
# Predict classes for the rows from index `split` onward, i.e. the rows the
# baseline classifier was not fitted on.
preds = bst.predict(X_train[split:])

In [45]:
# Hold-out accuracy: fraction of held-out rows whose prediction matches the
# label. Taking the mean of the comparison avoids hard-coding the dataset size.
np.mean(y_train[split:] == np.asarray(preds))

0.791

In [48]:
from sklearn.model_selection import GridSearchCV
from lightgbm import LGBMClassifier

# Hyper-parameter candidates (currently a single combination)
param_grid = dict(
    num_leaves=[64],
    learning_rate=[0.05],
    n_estimators=[200],
)

# Estimator to tune
lgbm = LGBMClassifier()

# 5-fold cross-validated search scored by accuracy, with per-fold progress output
grid_search = GridSearchCV(
    lgbm,
    param_grid,
    scoring='accuracy',
    cv=5,
    verbose=3,
)

# Run the search over the full training features and labels
grid_search.fit(X_train, y_train)

# Report the winning configuration and its mean cross-validated score
best_params, best_score = grid_search.best_params_, grid_search.best_score_

print(f'Best parameters: {best_params}')
print(f'Best score: {best_score}')

Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV 1/5] END learning_rate=0.05, n_estimators=200, num_leaves=64;, score=0.780 total time= 1.1min
[CV 2/5] END learning_rate=0.05, n_estimators=200, num_leaves=64;, score=0.811 total time= 1.0min
[CV 3/5] END learning_rate=0.05, n_estimators=200, num_leaves=64;, score=0.807 total time= 1.0min


KeyboardInterrupt: ignored