In [None]:
import os

import numpy as np
import time

from sklearn.svm import SVC
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
import torch
import pytorch_lightning as pl

In [None]:
# TensorBoard logger that writes under the "tensorboard_logs/" directory.
logger = pl.loggers.TensorBoardLogger("tensorboard_logs/")

In [None]:
# Locations of the pre-computed embedding files (tab-separated text).
train_embeddings = "../assets/annotated-corpus/train_embeddings.tsv"
test_embeddings = "../assets/annotated-corpus/test_embeddings.tsv"
# os.listdir() returns entries in arbitrary order; sorting keeps the
# topic -> integer label mapping (topics.index(...)) stable across runs
# and machines.
topics = sorted(os.listdir("../assets/annotated-corpus/train"))
topics

In [None]:
def prepare_dataset(filename):
    """Parse a tab-separated embeddings file into feature and label arrays.

    Every line is ``<topic>/<rest>\\t<float>\\t<float>...``. The text before the
    first ``/`` of the first column is looked up in the module-level ``topics``
    list to obtain the integer label; the remaining columns are the features.

    Args:
        filename: path of the embeddings file to read.

    Returns:
        Tuple ``(x, y)`` of numpy arrays: features and integer labels.
    """
    features = []
    labels = []
    with open(filename) as handle:
        # NOTE: the final line of the file is intentionally left out, exactly
        # as the slice [:-1] dictates.
        for row in handle.readlines()[:-1]:
            columns = row.split("\t", 1)
            topic_name = columns[0].split("/")[0]
            labels.append(topics.index(topic_name))
            features.append([float(value) for value in columns[1].split("\t")])

    return np.array(features), np.array(labels)

In [None]:
# Parse the train and test embedding files into (features, labels) arrays.
x_train, y_train = prepare_dataset(train_embeddings)
x_test, y_test = prepare_dataset(test_embeddings)

In [None]:
# Show the training feature matrix as the cell output.
x_train

In [None]:
# Baseline classifier: SVC with every hyper-parameter left at its default.
clf1 = SVC()

In [None]:
# Fit the baseline SVC on the training embeddings.
clf1.fit(x_train, y_train)

In [None]:
# Sanity check: print the index of every test row whose length differs from
# the length of the first row (i.e. rows that would make the matrix ragged).
ln = len(x_test[0])
for l in range(len(x_test)):
  if len(x_test[l]) != ln:
    print(f"trouble at: {l}")

In [None]:
# Predict test-set labels with the baseline SVC.
preds = clf1.predict(x_test)

In [None]:
def calculate_raw_metrics(pred, gt):
    """Count one-vs-rest confusion-matrix entries for every class.

    Args:
        pred: 1-D array of predicted class labels.
        gt: 1-D array of ground-truth class labels, same shape as ``pred``.

    Returns:
        dict mapping every class label that occurs in ``gt`` or ``pred`` to
        ``{"tp": int, "fp": int, "tn": int, "fn": int}``.

    Raises:
        RuntimeError: if ``pred`` and ``gt`` have different shapes.
    """
    if pred.shape != gt.shape:
        raise RuntimeError("Shapes doesn't fit")

    metrics_dict = {}
    # Register ground-truth classes first (preserves their key order), then
    # classes that occur only in the predictions, so that counting a false
    # positive for such a class cannot raise KeyError.
    for labels in (gt, pred):
        for label in labels:
            if label not in metrics_dict:
                metrics_dict[label] = {"tp": 0, "fp": 0, "tn": 0, "fn": 0}

    n_samples = pred.shape[0]
    for i in range(n_samples):
        if pred[i] == gt[i]:
            metrics_dict[gt[i]]["tp"] += 1
        else:
            metrics_dict[pred[i]]["fp"] += 1
            metrics_dict[gt[i]]["fn"] += 1

    # For a given class each sample is exactly one of tp / fp / fn / tn, so the
    # true negatives are whatever remains. This also counts misclassified
    # samples as true negatives for the classes they do not involve.
    for counts in metrics_dict.values():
        counts["tn"] = n_samples - counts["tp"] - counts["fp"] - counts["fn"]

    return metrics_dict

In [None]:
def calculate_metrics(pred, gt):
    """Compute per-class, macro and micro precision/recall/F1 plus accuracy.

    Args:
        pred: 1-D array of predicted class labels.
        gt: 1-D array of ground-truth class labels, same shape as ``pred``.

    Returns:
        dict with the string keys ``precision_macro``, ``recall_macro``,
        ``f1_macro``, ``precision_micro``, ``recall_micro``, ``f1_micro`` and
        ``accuracy``, plus one entry per class label holding
        ``{"precision", "recall", "f1"}`` for that class.

    Raises:
        RuntimeError: propagated from ``calculate_raw_metrics`` on a shape
            mismatch.
    """

    def _ratio(numerator, denominator):
        # A zero denominator (class never predicted, class without true
        # samples, or empty input) yields 0.0 instead of ZeroDivisionError.
        return numerator / denominator if denominator else 0.0

    raw_dict = calculate_raw_metrics(pred, gt)
    n_classes = len(raw_dict)

    metrics_dict = {"precision_macro": 0, "recall_macro": 0, "f1_macro": 0}
    global_tp = 0
    global_fp = 0
    global_fn = 0

    for label, counts in raw_dict.items():
        tp, fp, fn = counts["tp"], counts["fp"], counts["fn"]
        per_class = {
            "precision": _ratio(tp, tp + fp),
            "recall": _ratio(tp, tp + fn),
            "f1": _ratio(2 * tp, 2 * tp + fp + fn),
        }
        metrics_dict[label] = per_class

        global_tp += tp
        global_fp += fp
        global_fn += fn

        # Macro averages: unweighted mean of the per-class scores.
        metrics_dict["precision_macro"] += per_class["precision"] / n_classes
        metrics_dict["recall_macro"] += per_class["recall"] / n_classes
        metrics_dict["f1_macro"] += per_class["f1"] / n_classes

    # Micro averages: computed from the counts pooled over all classes.
    metrics_dict["precision_micro"] = _ratio(global_tp, global_tp + global_fp)
    metrics_dict["recall_micro"] = _ratio(global_tp, global_tp + global_fn)
    metrics_dict["f1_micro"] = _ratio(2 * global_tp, 2 * global_tp + global_fn + global_fp)
    metrics_dict["accuracy"] = _ratio(global_tp, gt.shape[0])
    return metrics_dict

In [None]:
def reference_metrics(pred, gt):
    """Print scikit-learn's precision/recall/F1 (per-class, macro, micro) and accuracy.

    Args:
        pred: predicted class labels.
        gt: ground-truth class labels.
    """
    # (caption, scoring function, value passed as ``average``), in print order.
    report = (
        ("Precision:", precision_score, None),
        ("Recall:", recall_score, None),
        ("F1:", f1_score, None),
        ("Precision macro:", precision_score, "macro"),
        ("Recall macro:", recall_score, "macro"),
        ("F1 macro:", f1_score, "macro"),
        ("Precision micro:", precision_score, "micro"),
        ("Recall micro:", recall_score, "micro"),
        ("F1 micro:", f1_score, "micro"),
    )
    for caption, scorer, averaging in report:
        print(caption, scorer(gt, pred, average=averaging))
    print("Accuracy:", accuracy_score(gt, pred))

In [None]:
# Hand-rolled metrics for the baseline SVC predictions.
calculate_metrics(preds, y_test)

In [None]:
# The same predictions scored with scikit-learn's metric functions.
reference_metrics(preds, y_test)

In [None]:
def train_evaluate_model(model, x_train, y_train, x_test, y_test):
    """Fit ``model``, print its test metrics and fit duration, and return it.

    Args:
        model: estimator exposing ``fit(x, y)`` and ``predict(x)``.
        x_train, y_train: training features and labels.
        x_test, y_test: evaluation features and labels.

    Returns:
        The fitted ``model``.
    """
    # Only the fit call is timed; prediction happens afterwards.
    started_at = time.time()
    model.fit(x_train, y_train)
    finished_at = time.time()

    scores = calculate_metrics(model.predict(x_test), y_test)
    summary = (
        ("Precision macro:", "precision_macro"),
        ("Recall macro:", "recall_macro"),
        ("F1 macro:", "f1_macro"),
        ("Precision micro:", "precision_micro"),
        ("Recall micro:", "recall_micro"),
        ("F1 micro:", "f1_micro"),
        ("Accuracy:", "accuracy"),
    )
    for caption, key in summary:
        print(caption, scores[key])
    print("Time:", finished_at - started_at)
    return model

In [None]:
# SVC with a linear kernel: fit, time and evaluate on the test set.
svm_linear = train_evaluate_model(SVC(kernel="linear"), x_train, y_train, x_test, y_test)

In [None]:
# SVC with a polynomial kernel: fit, time and evaluate on the test set.
svm_poly = train_evaluate_model(SVC(kernel="poly"), x_train, y_train, x_test, y_test)

In [None]:
# SVC with an RBF kernel: fit, time and evaluate on the test set.
svm_rbf = train_evaluate_model(SVC(kernel="rbf"), x_train, y_train, x_test, y_test)

In [None]:
# SVC with a sigmoid kernel: fit, time and evaluate on the test set.
svm_sigmoid = train_evaluate_model(SVC(kernel="sigmoid"), x_train, y_train, x_test, y_test)

In [None]:
!pip install pytorch-lightning

In [None]:
import torch
import pytorch_lightning as pl

In [None]:
class MLP(pl.LightningModule):
    """Fully connected classifier over fixed-size feature vectors.

    Every batch is a 2-D float tensor whose first ``input_dim`` columns are
    the features and whose remaining columns are the one-hot target, i.e. the
    layout ``[features | one-hot target]``.

    Args:
        input_dim: number of feature columns (default 100).
        num_classes: number of output classes (default 4).
    """

    def __init__(self, input_dim=100, num_classes=4):
        super().__init__()
        self.input_dim = input_dim
        self.num_classes = num_classes

        # The network ends with a plain Linear layer and outputs raw logits.
        # CrossEntropyLoss applies log-softmax itself, so an explicit Softmax
        # must not precede it; the previous Softmax(dim=0) additionally
        # normalised across the batch axis instead of across the classes.
        # Softmax has no parameters, so state_dict keys are unchanged.
        self.layers = torch.nn.Sequential(
            torch.nn.Linear(input_dim, 512),
            torch.nn.ReLU(),
            torch.nn.Linear(512, 1024),
            torch.nn.ReLU(),
            torch.nn.Linear(1024, 2048),
            torch.nn.ReLU(),
            torch.nn.Linear(2048, 2048),
            torch.nn.ReLU(),
            torch.nn.Linear(2048, 1024),
            torch.nn.ReLU(),
            torch.nn.Linear(1024, 512),
            torch.nn.ReLU(),
            torch.nn.Linear(512, num_classes),
        )
        self.loss_func = torch.nn.CrossEntropyLoss()

    def forward(self, x):
        """Return class logits for ``x``.

        Use ``torch.softmax(logits, dim=1)`` for per-sample probabilities;
        ``argmax`` over dim 1 gives the predicted class directly.
        """
        return self.layers(x)

    def _batch_loss(self, batch):
        """Split a ``[features | target]`` batch and return its cross-entropy loss."""
        x = batch[:, :self.input_dim]
        y = batch[:, self.input_dim:]
        y_hat = self.layers(x)
        return self.loss_func(y_hat, y)

    def training_step(self, batch, batch_idx):
        """Compute and log the training loss for one batch."""
        loss = self._batch_loss(batch)
        self.log("train_loss", loss)
        return loss

    def validation_step(self, batch, batch_idx):
        """Compute and log the validation loss for one batch."""
        loss = self._batch_loss(batch)
        self.log("val_loss", loss)
        return loss

    def configure_optimizers(self):
        """Return the Adam optimizer used for training."""
        optimizer = torch.optim.Adam(self.parameters(),
                                     lr=1e-5,
                                     weight_decay=1e-8)
        return optimizer

In [None]:
# One-hot encode the training labels (4 classes) and append them to the
# feature columns, giving one [features | one-hot target] row per sample.
y_train_onehot = torch.nn.functional.one_hot(torch.tensor(y_train, dtype=torch.int64), num_classes = 4).float()
data = torch.cat((torch.tensor(x_train, dtype=torch.float32), y_train_onehot), dim=1)

In [None]:
# Same [features | one-hot target] layout for the test split.
y_test_onehot = torch.nn.functional.one_hot(torch.tensor(y_test, dtype=torch.int64), num_classes = 4).float()
data_test = torch.cat((torch.tensor(x_test, dtype=torch.float32), y_test_onehot), dim=1)

In [None]:
!pip install -U tensorboardx
!pip3 install -U tensorboardx

import sys
!{sys.executable} -m venv venv  # создание виртуальной среды
!{sys.executable} -m pip install -U tensorboard tensorboardX

In [None]:
from lightning_utilities.core.imports import RequirementCache

# Each dataset row is [features | one-hot target]; batches are stacks of rows.
train_loader = torch.utils.data.DataLoader(dataset=data, batch_size=32, shuffle=True)
test_loader = torch.utils.data.DataLoader(dataset=data_test, batch_size=32, shuffle=False)

# Both callbacks watch the logged "val_loss". They only take effect when
# handed to the Trainer, which was previously missing.
early_stop_callback = pl.callbacks.EarlyStopping(monitor="val_loss")
checkpoint_callback = pl.callbacks.ModelCheckpoint(monitor="val_loss")
logger = pl.loggers.TensorBoardLogger("tensorboard_logs/")
trainer = pl.Trainer(
    max_epochs=200,
    logger=logger,
    callbacks=[early_stop_callback, checkpoint_callback],
)

# Report whether the TensorBoard backends are importable in this environment.
print(RequirementCache("tensorboard"))
print(RequirementCache("tensorboardx"))

In [None]:
# Train a fresh MLP. The test loader is passed as the validation dataloader,
# so "val_loss" is measured on the test split.
mlp = MLP()
trainer.fit(mlp, train_loader, test_loader)

In [None]:
# Predicted class = index of the largest network output along dim 1 for each
# test row; the result overwrites the earlier SVC `preds`.
preds = torch.argmax(mlp.forward(torch.tensor(x_test).float()).detach(), dim=1).numpy()

In [None]:
# Score the MLP predictions with the hand-rolled metrics.
calculate_metrics(preds, y_test)

In [None]:
# RBF-kernel SVC trained on the full training set.
train_evaluate_model(SVC(kernel="rbf"), x_train, y_train, x_test, y_test)

In [None]:
# Random ordering of the training indices, drawn from a seeded generator so
# that every use of `perm` selects the same samples on each run.
perm = np.random.default_rng(42).permutation(len(x_train))

In [None]:
# RBF-kernel SVC trained on the first 9000 samples of the shuffled training set.
train_evaluate_model(SVC(kernel="rbf"), x_train[perm][:9000], y_train[perm][:9000], x_test, y_test)

In [None]:
# RBF-kernel SVC trained on the first 6000 samples of the shuffled training set.
train_evaluate_model(SVC(kernel="rbf"), x_train[perm][:6000], y_train[perm][:6000], x_test, y_test)

In [None]:
# RBF-kernel SVC trained on the first 2000 samples of the shuffled training set.
train_evaluate_model(SVC(kernel="rbf"), x_train[perm][:2000], y_train[perm][:2000], x_test, y_test)

In [None]:
# RBF-kernel SVC using only feature columns 10..89 of each vector.
train_evaluate_model(SVC(kernel="rbf"), x_train[:, 10:90], y_train, x_test[:, 10:90], y_test)

In [None]:
from sklearn.decomposition import PCA

# Fit a 50-component PCA on the training features only.
pca = PCA(n_components=50)
pca.fit(x_train)

In [None]:
# RBF-kernel SVC on the PCA-projected features; train and test share the
# projection fitted on the training set.
train_evaluate_model(SVC(kernel="rbf"), pca.transform(x_train), y_train, pca.transform(x_test), y_test)

In [None]:
# Two-component PCA for a 2-D projection of the samples.
pca_2 = PCA(n_components=2)

In [None]:
# Fit the 2-D PCA on the first 500 shuffled training samples and project them.
draw_data = pca_2.fit_transform(x_train[perm][:500])

In [None]:
# Show the projected 2-D coordinates as the cell output.
draw_data

In [None]:
import matplotlib.pyplot as plt

In [None]:
# Scatter plot of the 2-D projection, coloured by the integer topic label.
plt.scatter(draw_data[:, 0], draw_data[:, 1], c=y_train[perm][:500], cmap="tab20")

In [None]:
# Augment every feature vector with its element-wise sine and cosine,
# concatenated along the column axis (three times as many columns).
x_train_extended = np.concatenate((x_train, np.sin(x_train), np.cos(x_train)), axis=1)
x_test_extended = np.concatenate((x_test, np.sin(x_test), np.cos(x_test)), axis=1)

In [None]:
# Show the shape of the augmented training matrix.
x_train_extended.shape

In [None]:
# RBF-kernel SVC on the sin/cos-augmented features.
train_evaluate_model(SVC(kernel="rbf"), x_train_extended, y_train, x_test_extended, y_test)