In [None]:
!pip install fasttext transformers sacremoses
!pip install -U kaleido
from matplotlib import pyplot as plt
import plotly.graph_objects as go
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.model_selection import StratifiedKFold
from google.colab import drive
import pandas as pd
import plotly.express as px
import os
import json
import spacy
import re
import pickle
import sklearn
from pathlib import Path
import time
import fasttext
import torch
import torch.nn as nn
from tqdm.notebook import tqdm
import numpy as np
import torch.optim as optim
from torch.utils.tensorboard import SummaryWriter
from torch.utils.data import DataLoader
from sklearn.metrics import f1_score  
from sklearn.experimental import enable_halving_search_cv 
from sklearn.model_selection import HalvingGridSearchCV
from transformers import AutoTokenizer, AutoModel

tqdm.pandas()

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
# Google Drive mount point and the project root that lives on it.
drive_path = '/content/drive/'
BASE_DIR = drive_path + 'MyDrive/ColabNotebooks/studia/NLP/L2'
drive.mount(drive_path, force_remount=True)

# Locations derived from the project root.
log_dir = BASE_DIR + '/logs/'
FIGURES_DIR = BASE_DIR + "/figures"
booking_path = os.path.join(BASE_DIR, "booking.txt")
dataframes_path = os.path.join(BASE_DIR, "dataframes")

# (file name inside `dataframes_path`, human-readable label) for every
# embedding variant compared in this notebook.
_EMBEDDING_SOURCES = (
    ("clean_inputs_clean_booking_cbow.pkl", "Booking CBOW"),
    ("clean_inputs_clean_booking_skipgram.pkl", "Booking Skip-Gram"),
    ("clean_inputs_clean_oscar_cbow.pkl", "Oscar CBOW"),
    ("clean_inputs_clean_oscar_skipgram.pkl", "Oscar Skip-Gram"),
    ("clean_herbert.pkl", "HerBERT"),
    ("clean_kgr10.pkl", "KGR10 CBOW"),
)

# Same (full path, label) pairs the rest of the notebook iterates over.
EMBEDDING_FILES = [
    (os.path.join(dataframes_path, file_name), label)
    for file_name, label in _EMBEDDING_SOURCES
]

Mounted at /content/drive/


## Model klasyfikacji

In [None]:
class training_set(torch.utils.data.Dataset):
    """Dataset pairing feature rows with their targets as float tensors.

    Both inputs must be NumPy arrays (they go through ``torch.from_numpy``);
    each is converted to ``float32`` once, at construction time.
    """

    def __init__(self, X, Y):
        # Convert features first, then targets; both end up as float32 tensors.
        features, targets = (torch.from_numpy(array).float() for array in (X, Y))
        self.X = features
        self.Y = targets

    def __len__(self):
        """Number of samples, i.e. the length of the feature tensor."""
        return len(self.X)

    def __getitem__(self, idx):
        """Return ``[features, target]`` for the sample(s) selected by ``idx``."""
        sample = self.X[idx]
        target = self.Y[idx]
        return [sample, target]

def to_dl(x, y, shuffle=False, batch_size=32):
    """Wrap feature and target arrays in a ``DataLoader`` over ``training_set``.

    Args:
        x: NumPy array of features, passed to ``training_set``.
        y: NumPy array of targets, passed to ``training_set``.
        shuffle: Whether the loader reshuffles the samples on every pass.
            Defaults to ``False`` (storage order), as before.
        batch_size: Number of samples per batch. Defaults to 32, as before.

    Returns:
        A ``torch.utils.data.DataLoader`` yielding ``[features, targets]``
        batches of float tensors.
    """
    train_data = training_set(x, y)
    return torch.utils.data.DataLoader(
        train_data, shuffle=shuffle, batch_size=batch_size
    )


class BookingClassifier(sklearn.base.BaseEstimator, sklearn.base.ClassifierMixin):
    """Feed-forward binary classifier exposed through the scikit-learn API.

    The network consists of ``layers`` blocks of ``Linear`` + ``ReLU`` followed
    by a single-output ``Linear`` + ``Sigmoid``. It is trained with Adam and
    binary cross-entropy, on the GPU when one is available and on the CPU
    otherwise.

    Parameters:
        epochs: Number of passes over the training data in ``fit``.
        in_dim: Size of one input vector.
        hid_dim: Width of every hidden layer.
        layers: Number of hidden ``Linear`` + ``ReLU`` blocks.
        **kwargs: Accepted for call-site compatibility and ignored.
    """

    # Mini-batch size used while training.
    _BATCH_SIZE = 32

    def __init__(self, epochs=20, in_dim=100, hid_dim=50, layers=2, **kwargs):
        self.epochs = epochs
        self.in_dim = in_dim
        self.hid_dim = hid_dim
        self.layers = layers
        # Placeholder; the real network is built by init_model() (called from fit()).
        self.model = ...

    @staticmethod
    def _device():
        """Return the CUDA device when one is available, otherwise the CPU."""
        return torch.device("cuda" if torch.cuda.is_available() else "cpu")

    def init_model(self):
        """Build a freshly initialised network from the current hyper-parameters."""
        self.model = nn.Sequential()
        curr_in = self.in_dim
        for i in range(self.layers):
            self.model.add_module(f"lin_{i}", torch.nn.Linear(curr_in, self.hid_dim))
            self.model.add_module(f"lin_{i}_act", torch.nn.ReLU())
            curr_in = self.hid_dim

        self.model.add_module("lin_last", torch.nn.Linear(curr_in, 1))
        self.model.add_module("last_sigm", torch.nn.Sigmoid())
        # Fall back to the CPU instead of crashing when no GPU is present.
        self.model.to(self._device())

    def fit(self, X, y):
        """Train a new network on ``X`` / ``y`` and return ``self``.

        Args:
            X: Array-like of shape ``(n_samples, in_dim)``.
            y: Binary targets, either 1-D ``(n_samples,)`` or a column vector
                ``(n_samples, 1)``.

        Returns:
            The fitted estimator.
        """
        self.init_model()
        device = self._device()

        X = np.asarray(X)
        # BCELoss needs targets shaped like the (n, 1) network output, so a
        # 1-D label vector is turned into a column; a column stays unchanged.
        y = np.asarray(y).reshape(-1, 1)

        # Reshuffle every epoch: batches taken in storage order can consist of
        # a single class when the rows are grouped by label, which biases the
        # network towards whichever class it saw last.
        train_dl = torch.utils.data.DataLoader(
            training_set(X, y), shuffle=True, batch_size=self._BATCH_SIZE
        )
        optimiser = optim.Adam(self.model.parameters())
        loss_fn = torch.nn.BCELoss()
        for _ in range(self.epochs):
            for X_batch, y_batch in train_dl:
                optimiser.zero_grad()
                y_pred = self.model(X_batch.to(device))
                loss = loss_fn(y_pred, y_batch.to(device))
                loss.backward()
                optimiser.step()
        return self

    def predict(self, X):
        """Predict hard 0/1 labels for ``X``.

        Args:
            X: Array-like of shape ``(n_samples, in_dim)``.

        Returns:
            ``float32`` NumPy array of shape ``(n_samples, 1)`` holding the
            rounded sigmoid outputs.
        """
        inputs = torch.from_numpy(np.asarray(X)).float().to(self._device())
        # Inference only: skip building the autograd graph.
        with torch.no_grad():
            probabilities = self.model(inputs)
        return torch.round(probabilities).cpu().numpy()

## Uczenie i wyniki

In [None]:
def get_training_data(df, column, max_samples):
    """Draw up to ``max_samples`` non-null vectors from one DataFrame column.

    Args:
        df: DataFrame holding the embeddings.
        column: Name of the column whose non-null cells are sampled.
        max_samples: Upper bound on the number of rows drawn. When fewer
            non-null rows exist, all of them are returned instead of raising.
            ``None`` is forwarded to ``DataFrame.sample`` unchanged.

    Returns:
        NumPy array built from the sampled cells, one row per sample. The
        draw is reproducible (``random_state=1``).
    """
    valid_rows = df[df[column].notnull()]
    if max_samples is not None:
        # DataFrame.sample raises when asked for more rows than exist
        # (without replacement), so clamp the request to what is available.
        max_samples = min(max_samples, len(valid_rows))
    sampled = valid_rows.sample(max_samples, random_state=1)
    return np.array(sampled[column].values.tolist())

def load_embeddings(path, max_samples_per_class=100):
    """Load a pickled DataFrame and build a binary classification data set.

    Vectors from the ``vec_positive`` column get label 1 and vectors from the
    ``vec_negative`` column get label 0; each class contributes
    ``max_samples_per_class`` vectors drawn by ``get_training_data``.

    Args:
        path: Path to the pickled DataFrame.
        max_samples_per_class: Number of vectors requested per class.

    Returns:
        Tuple ``(X, y)``: the stacked vectors (positives first) and a column
        vector of labels with one row per vector.
    """
    frame = pd.read_pickle(path)
    labelled_columns = ((1, "vec_positive"), (0, "vec_negative"))

    vectors_per_class = [
        get_training_data(frame, column_name, max_samples_per_class)
        for _, column_name in labelled_columns
    ]
    labels_per_class = [
        np.full((vectors.shape[0], 1), label)
        for (label, _), vectors in zip(labelled_columns, vectors_per_class)
    ]

    features = np.concatenate(vectors_per_class, axis=0)
    labels = np.concatenate(labels_per_class, axis=0)
    return features, labels

In [None]:
# For every embedding: pick hyper-parameters with a successive-halving grid
# search, then score the selected configuration with 10-fold cross-validation.
metrics = ["Accuracy", "F1", "Recall"]
results = {}
for path, name in EMBEDDING_FILES:
    X, y = load_embeddings(path, max_samples_per_class=5000)
    print(X.shape)

    # The input width is dictated by the embedding, so it is a one-value axis.
    n_features = X.shape[-1]
    param_grid = {
        "in_dim": [n_features],
        "epochs": [20, 35, 50],
        "layers": [1, 2, 5, 8, 10],
        "hid_dim": [50, 100, int(n_features * 0.6)],
    }

    model = BookingClassifier()
    search = HalvingGridSearchCV(
        model,
        param_grid,
        resource='n_samples',
        scoring='accuracy',
        random_state=1,
        cv=StratifiedKFold(n_splits=4, random_state=1, shuffle=True),
        verbose=1,
        n_jobs=-1,
    )
    search.fit(X, y)
    params = search.best_params_
    print(params)

    # Re-evaluate the winning configuration on fresh stratified folds.
    model = BookingClassifier(**params)
    cv_results = cross_validate(
        model,
        X,
        y,
        cv=StratifiedKFold(n_splits=10, random_state=1, shuffle=True),
        scoring=('accuracy', 'f1_weighted', 'recall'),
        n_jobs=3,
    )

    # Map the display name of each metric to its key in `cv_results`.
    score_keys = {
        "Accuracy": "test_accuracy",
        "F1": "test_f1_weighted",
        "Recall": "test_recall",
    }
    summary = {metric: cv_results[key].tolist() for metric, key in score_keys.items()}
    summary["params"] = params
    results[name] = summary

(10000, 100)
n_iterations: 4
n_required_iterations: 4
n_possible_iterations: 4
min_resources_: 370
max_resources_: 10000
aggressive_elimination: False
factor: 3
----------
iter: 0
n_candidates: 45
n_resources: 370
Fitting 4 folds for each of 45 candidates, totalling 180 fits



A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().



----------
iter: 1
n_candidates: 15
n_resources: 1110
Fitting 4 folds for each of 15 candidates, totalling 60 fits
----------
iter: 2
n_candidates: 5
n_resources: 3330
Fitting 4 folds for each of 5 candidates, totalling 20 fits
----------
iter: 3
n_candidates: 2
n_resources: 9990
Fitting 4 folds for each of 2 candidates, totalling 8 fits
{'epochs': 20, 'hid_dim': 50, 'in_dim': 100, 'layers': 2}
(10000, 100)



A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().



n_iterations: 4
n_required_iterations: 4
n_possible_iterations: 4
min_resources_: 370
max_resources_: 10000
aggressive_elimination: False
factor: 3
----------
iter: 0
n_candidates: 45
n_resources: 370
Fitting 4 folds for each of 45 candidates, totalling 180 fits
----------
iter: 1
n_candidates: 15
n_resources: 1110
Fitting 4 folds for each of 15 candidates, totalling 60 fits
----------
iter: 2
n_candidates: 5
n_resources: 3330
Fitting 4 folds for each of 5 candidates, totalling 20 fits
----------
iter: 3
n_candidates: 2
n_resources: 9990
Fitting 4 folds for each of 2 candidates, totalling 8 fits
{'epochs': 35, 'hid_dim': 100, 'in_dim': 100, 'layers': 1}
(10000, 100)



A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().



n_iterations: 4
n_required_iterations: 4
n_possible_iterations: 4
min_resources_: 370
max_resources_: 10000
aggressive_elimination: False
factor: 3
----------
iter: 0
n_candidates: 45
n_resources: 370
Fitting 4 folds for each of 45 candidates, totalling 180 fits
----------
iter: 1
n_candidates: 15
n_resources: 1110
Fitting 4 folds for each of 15 candidates, totalling 60 fits
----------
iter: 2
n_candidates: 5
n_resources: 3330
Fitting 4 folds for each of 5 candidates, totalling 20 fits
----------
iter: 3
n_candidates: 2
n_resources: 9990
Fitting 4 folds for each of 2 candidates, totalling 8 fits
{'epochs': 35, 'hid_dim': 50, 'in_dim': 100, 'layers': 1}
(10000, 100)



A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().



n_iterations: 4
n_required_iterations: 4
n_possible_iterations: 4
min_resources_: 370
max_resources_: 10000
aggressive_elimination: False
factor: 3
----------
iter: 0
n_candidates: 45
n_resources: 370
Fitting 4 folds for each of 45 candidates, totalling 180 fits
----------
iter: 1
n_candidates: 15
n_resources: 1110
Fitting 4 folds for each of 15 candidates, totalling 60 fits
----------
iter: 2
n_candidates: 5
n_resources: 3330
Fitting 4 folds for each of 5 candidates, totalling 20 fits
----------
iter: 3
n_candidates: 2
n_resources: 9990
Fitting 4 folds for each of 2 candidates, totalling 8 fits
{'epochs': 50, 'hid_dim': 100, 'in_dim': 100, 'layers': 1}
(10000, 768)
n_iterations: 4
n_required_iterations: 4
n_possible_iterations: 4
min_resources_: 370
max_resources_: 10000
aggressive_elimination: False
factor: 3
----------
iter: 0
n_candidates: 45
n_resources: 370
Fitting 4 folds for each of 45 candidates, totalling 180 fits



A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().



----------
iter: 1
n_candidates: 15
n_resources: 1110
Fitting 4 folds for each of 15 candidates, totalling 60 fits
----------
iter: 2
n_candidates: 5
n_resources: 3330
Fitting 4 folds for each of 5 candidates, totalling 20 fits
----------
iter: 3
n_candidates: 2
n_resources: 9990
Fitting 4 folds for each of 2 candidates, totalling 8 fits
{'epochs': 20, 'hid_dim': 50, 'in_dim': 768, 'layers': 1}
(10000, 100)



A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().



n_iterations: 4
n_required_iterations: 4
n_possible_iterations: 4
min_resources_: 370
max_resources_: 10000
aggressive_elimination: False
factor: 3
----------
iter: 0
n_candidates: 45
n_resources: 370
Fitting 4 folds for each of 45 candidates, totalling 180 fits
----------
iter: 1
n_candidates: 15
n_resources: 1110
Fitting 4 folds for each of 15 candidates, totalling 60 fits
----------
iter: 2
n_candidates: 5
n_resources: 3330
Fitting 4 folds for each of 5 candidates, totalling 20 fits
----------
iter: 3
n_candidates: 2
n_resources: 9990
Fitting 4 folds for each of 2 candidates, totalling 8 fits
{'epochs': 50, 'hid_dim': 100, 'in_dim': 100, 'layers': 1}


In [None]:
# One grouped bar chart per metric: the score of every cross-validation fold,
# with one bar series per embedding. Each figure is shown and saved as PNG.
# Create the output folder up front so saving does not fail when it is missing.
os.makedirs(FIGURES_DIR, exist_ok=True)
for metric in metrics:
    fig = go.Figure(layout_title_text=f"{metric} - folds")
    for name, data in results.items():
        fold_scores = data[metric]
        fold_labels = [f"fold_{i}" for i in range(len(fold_scores))]
        fig.add_trace(go.Bar(x=fold_labels, y=fold_scores, name=name))
    fig.show(renderer='colab')
    fig.write_image(os.path.join(FIGURES_DIR, f"{metric}_folds.png"))

In [None]:
# One box plot per metric: the distribution of the cross-validation scores,
# with one box per embedding. Each figure is shown and saved as PNG.
# Create the output folder up front so saving does not fail when it is missing.
os.makedirs(FIGURES_DIR, exist_ok=True)
for metric in metrics:
    fig = go.Figure(layout_title_text=f"{metric}")
    for name, data in results.items():
        fig.add_trace(go.Box(y=data[metric], name=name, boxpoints='outliers'))
    fig.show(renderer='colab')
    fig.write_image(os.path.join(FIGURES_DIR, f"{metric}.png"))