In [1]:
# Bert + XGBoost

In [278]:
import pandas as pd
import csv
import random
import numpy as np
import pandas as pd

import torch
from sklearn.metrics import accuracy_score

if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")  # GPU acc on mac : "mps"


# Read The data
training_set = pd.read_json('./data/train_set.json')
test_set = pd.read_json('./data/test_set.json')


# tokenizer 2 https://huggingface.co/docs/transformers/tokenizer_summary
from transformers import BertTokenizer, BertForSequenceClassification, TrainingArguments, Trainer

model_name = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(model_name)
max_length = 128
train_test_split = 3000



class NewsGroupsDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {k: torch.tensor(v[idx]) for k, v in self.encodings.items()}
        item["labels"] = torch.tensor([self.labels[idx]])
        return item

    def __len__(self):
        return len(self.labels)


def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    # calculate accuracy using sklearn's function
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
    }


In [279]:
train_encodings = tokenizer(training_set['text'].to_list()[0:train_test_split], truncation=True, padding=True,
                                max_length=max_length)
valid_encodings = tokenizer(training_set['text'].to_list()[train_test_split:], truncation=True, padding=True,
                               max_length=max_length)
test_encodings = tokenizer(test_set['text'].to_list(), truncation=True, padding=True,
                               max_length=max_length)

train_y = training_set['label'].to_list()[0:train_test_split]
valid_y = training_set['label'].to_list()[train_test_split:]
test_y = [-1]*4000

# convert our tokenized data into a torch Dataset
train_dataset = NewsGroupsDataset(train_encodings, train_y)
valid_dataset = NewsGroupsDataset(valid_encodings, valid_y)
test_dataset = NewsGroupsDataset(test_encodings, test_y)

In [280]:
# model = BertForSequenceClassification.from_pretrained(model_name, num_labels=2).to(device)
model = BertForSequenceClassification.from_pretrained("./model").to(device)

In [281]:
def get_embedding(text):
    # prepare our text into tokenized sequence
    inputs = tokenizer(text, padding=True, truncation=True, max_length=max_length, return_tensors="pt").to(device)
    # perform inference to our model
    outputs = model.bert(**inputs)
    return outputs # probs.argmax()

In [282]:
output = get_embedding("It is an incontestable fact that The Emporer Napoléon is a genius")
output["last_hidden_state"].shape, output["pooler_output"].shape

(torch.Size([1, 19, 768]), torch.Size([1, 768]))

In [283]:
from torch.utils.data import DataLoader
train_loader = DataLoader(train_dataset, batch_size=24, shuffle=False)
valid_loader = DataLoader(valid_dataset, batch_size=24, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=24, shuffle=False)

In [284]:
from tqdm.notebook import tqdm
embeddings = []
for text in tqdm(training_set['text'].to_list()):
    # print(get_embedding(text)["pooler_output"].detach().numpy().shape)
    embeddings.append(get_embedding(text)["pooler_output"].cpu().detach()[0].numpy())
embeddings = np.array(embeddings)
np.save("data/bert_embeddings_train", embeddings)

  0%|          | 0/4000 [00:00<?, ?it/s]

In [285]:
from tqdm.notebook import tqdm
embeddings = []
for text in tqdm(test_set['text'].to_list()):
    # print(get_embedding(text)["pooler_output"].detach().numpy().shape)
    embeddings.append(get_embedding(text)["pooler_output"].cpu().detach()[0].numpy())
embeddings = np.array(embeddings)
np.save("data/bert_embeddings_test", embeddings)

  0%|          | 0/4000 [00:00<?, ?it/s]

## Embeddings

In [1]:
train_embeddings = np.load("data/bert_embeddings_train.npy")
test_embeddings = np.load("data/bert_embeddings_test.npy")
train_embeddings.shape, test_embeddings.shape

NameError: name 'np' is not defined

In [263]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

tfidf_vect = TfidfVectorizer(max_features=800)
tfidf_train = tfidf_vect.fit_transform(training_set['text']).toarray()
tfidf_test = tfidf_vect.transform(test_set['text']).toarray()

count_vect = CountVectorizer(max_features=100)
count_train = count_vect.fit_transform(training_set['text']).toarray()
count_test = count_vect.transform(test_set['text']).toarray()

In [264]:
X_train = np.hstack((train_embeddings, tfidf_train, count_train))
X_test =  np.hstack((test_embeddings, tfidf_test, count_test))
X_train.shape, X_test.shape

((4000, 1668), (4000, 1668))

In [265]:
y_train = np.array(training_set['label'].to_list())

## Dimensionality Reduction

In [266]:
import numpy as np
from sklearn.decomposition import PCA
pca = PCA(n_components=800)
pca.fit(X_train)
X_train = pca.transform(X_train)
X_test = pca.transform(X_test)
X_train.shape, X_test.shape

((4000, 800), (4000, 800))

## XGBoost

In [273]:
from xgboost import XGBClassifier

# create model instance
bst = XGBClassifier(n_estimators=3000, max_depth=4, learning_rate=0.1, objective='binary:hinge', tree_method='gpu_hist', gpu_id=0)
# fit model
split = 4000
bst.fit(X_train[0:split], y_train[0:split])
#bst.fit(train_embeddings[0:split], y_train[0:split])
# make predictions

In [274]:
# preds = bst.predict(train_embeddings[split:])
preds = bst.predict(X_train[split:])

In [275]:
np.sum(y_train[split:] == np.array(preds))/(4000-split)

  np.sum(y_train[split:] == np.array(preds))/(4000-split)


nan

In [276]:
predictions = bst.predict(X_test)
predictions.shape

(4000,)

In [277]:
with open("submission.csv", "w") as pred:
    csv_out = csv.writer(pred)
    csv_out.writerow(['id', 'label'])
    for i, row in enumerate(predictions):
        csv_out.writerow([i, row])