In [None]:
import os
from pprint import pprint
from collections import defaultdict
import time

In [None]:
from sklearn import svm
from sklearn import metrics
import pandas as pd
import numpy as np

In [None]:
# Canonical (absolute, symlink-resolved) paths of the train / test embedding TSV files.
DATASET_EMBEDDINGS_TRAIN_FILE, DATASET_EMBEDDINGS_TEST_FILE = (
    os.path.realpath(relative_path)
    for relative_path in (
        "./assets/embeddings/test/train_embeddings.tsv",
        "./assets/embeddings/test/test_embeddings.tsv",
    )
)

In [None]:
# Tab-separated file without a header row; the first column becomes the row index.
embeddings_train = pd.read_csv(
    DATASET_EMBEDDINGS_TRAIN_FILE,
    sep="\t",
    header=None,
    index_col=0,
)
embeddings_train.head()

In [None]:
# Tab-separated file without a header row; the first column becomes the row index.
embeddings_test = pd.read_csv(
    DATASET_EMBEDDINGS_TEST_FILE,
    sep="\t",
    header=None,
    index_col=0,
)
embeddings_test.head()

In [None]:
def get_y(df: pd.DataFrame) -> pd.Index:
    """Derive binary labels from the row index of ``df``.

    Each index entry is a string; the part before the first ``"/"`` is
    compared with ``"toxic"``.

    Args:
        df: Frame whose index holds string keys.

    Returns:
        An index of the same length holding 1 where the key's leading
        segment is ``"toxic"`` and 0 otherwise.
    """

    def _label_for(row_key):
        # Everything before the first "/" (the whole key when there is none).
        leading_segment, _, _ = row_key.partition("/")
        return int(leading_segment == "toxic")

    return df.index.map(_label_for)

In [None]:
# Binary labels (1 = "toxic" prefix in the row key, 0 = anything else) for both splits.
train_y, test_y = (
    get_y(split_frame) for split_frame in (embeddings_train, embeddings_test)
)

In [None]:
def train_model(raw_model, X_train, y_train, model_kwargs: dict = None) -> "tuple[svm.SVC, float]":
    """Fit ``raw_model`` on the training data and time the fit.

    Args:
        raw_model: Estimator exposing ``fit(X, y, **kwargs)``.
        X_train: Training features, passed to ``fit`` unchanged.
        y_train: Training labels, passed to ``fit`` unchanged.
        model_kwargs: Optional extra keyword arguments forwarded to ``fit``.
            ``None`` (the default) means no extra arguments.

    Returns:
        A ``(model, seconds)`` tuple: the same estimator object that was
        passed in (after ``fit`` has been called on it) and the elapsed
        fitting time in seconds.
    """
    # A fresh dict per call avoids the shared-mutable-default pitfall.
    fit_kwargs = {} if model_kwargs is None else model_kwargs

    # perf_counter is monotonic, so the interval cannot be skewed (or go
    # negative) when the system clock is adjusted during training.
    start_time = time.perf_counter()
    raw_model.fit(X_train, y_train, **fit_kwargs)
    t_delta = time.perf_counter() - start_time

    return raw_model, t_delta

In [None]:
def test_model(model, X_test, y_test):
    y_pred = model.predict(X_test)

    prec = metrics.precision_score(y_test, y_pred)
    rec = metrics.recall_score(y_test, y_pred)
    acc = metrics.accuracy_score(y_test, y_pred)
    f1 = metrics.f1_score(y_test, y_pred)

    return {
        "precision": prec,
        "recall": rec,
        "accuracy": acc,
        "f1": f1,
    }

In [None]:
# Baseline classifier: support vector classifier with a linear kernel.
model_svm_linear = svm.SVC(kernel="linear")

In [None]:
# Fit the linear-kernel baseline on the training embeddings and keep the elapsed fit time.
model_svm_linear, training_svc_1_time = train_model(
    raw_model=model_svm_linear,
    X_train=embeddings_train,
    y_train=train_y,
)

In [None]:
# Score the fitted baseline on the test embeddings.
metrics_svc_1 = test_model(
    model=model_svm_linear,
    X_test=embeddings_test,
    y_test=test_y,
)

In [None]:
# Display the metrics dict computed for the linear-kernel baseline.
metrics_svc_1

In [None]:
# Display the elapsed training time measured for the linear-kernel baseline.
training_svc_1_time

In [None]:
def make_experiment(model_class, models_kwargs: list[dict], X_train, y_train, X_test, y_test) -> list[dict]:
    """Train and evaluate one model per set of constructor arguments.

    For every entry of ``models_kwargs`` a new ``model_class(**kwargs)`` is
    built, fitted with ``train_model`` and scored with ``test_model``.

    Args:
        model_class: Callable that builds an estimator from keyword arguments.
        models_kwargs: One dict of constructor keyword arguments per model.
        X_train: Training features.
        y_train: Training labels.
        X_test: Evaluation features.
        y_test: Evaluation labels.

    Returns:
        A list with one dict per entry of ``models_kwargs`` (same order), each
        holding ``"training_time"`` (as returned by ``train_model``) and
        ``"metrics"`` (as returned by ``test_model``).
    """
    results = []
    for experiment_kwargs in models_kwargs:
        model = model_class(**experiment_kwargs)
        model, training_time = train_model(model, X_train, y_train)
        # Score the model trained in THIS iteration; evaluating a notebook-level
        # global here would report identical metrics for every configuration.
        model_metrics = test_model(model, X_test, y_test)
        results.append({"training_time": training_time, "metrics": model_metrics})

    return results


In [None]:
# Experiment 1: each SVC kernel on the unmodified embeddings.
experiments_res_1 = make_experiment(
    svm.SVC,
    [{"kernel": kernel_name} for kernel_name in ("linear", "rbf", "sigmoid")],
    embeddings_train,
    train_y,
    embeddings_test,
    test_y,
)

pprint(experiments_res_1)

In [None]:
# Experiment 2 inputs: copies of the embeddings with the first 30 columns set to 0.
X_train_exp_2 = embeddings_train.copy()
X_train_exp_2.iloc[:, :30] = 0

X_test_exp_2 = embeddings_test.copy()
X_test_exp_2.iloc[:, :30] = 0

X_train_exp_2

In [None]:
# Experiment 2: each SVC kernel on the embeddings whose first 30 columns are zeroed.
experiments_res_2 = make_experiment(
    svm.SVC,
    [{"kernel": kernel_name} for kernel_name in ("linear", "rbf", "sigmoid")],
    X_train_exp_2,
    train_y,
    X_test_exp_2,
    test_y,
)

pprint(experiments_res_2)

In [None]:
# Experiment 3 inputs: first 30 columns set to 0, columns from position 60 onward set to 1.
X_train_exp_3 = embeddings_train.copy()
X_train_exp_3.iloc[:, :30] = 0
X_train_exp_3.iloc[:, 60:] = 1

X_test_exp_3 = embeddings_test.copy()
X_test_exp_3.iloc[:, :30] = 0
X_test_exp_3.iloc[:, 60:] = 1

X_train_exp_3

In [None]:
# Experiment 3: each SVC kernel on the embeddings with zeroed left and ones-filled right columns.
experiments_res_3 = make_experiment(
    svm.SVC,
    [{"kernel": kernel_name} for kernel_name in ("linear", "rbf", "sigmoid")],
    X_train_exp_3,
    train_y,
    X_test_exp_3,
    test_y,
)

pprint(experiments_res_3)

In [None]:
# Experiment 4 inputs: element-wise sine of (copies of) the embeddings.
X_train_exp_4 = np.sin(embeddings_train.copy())
X_test_exp_4 = np.sin(embeddings_test.copy())

X_train_exp_4

In [None]:
# Experiment 4: each SVC kernel on the sine-transformed embeddings.
experiments_res_4 = make_experiment(
    svm.SVC,
    [{"kernel": kernel_name} for kernel_name in ("linear", "rbf", "sigmoid")],
    X_train_exp_4,
    train_y,
    X_test_exp_4,
    test_y,
)

pprint(experiments_res_4)

In [None]:
def experiments_results_to_dataframe(exp_results: list[dict], exp_params: list[dict], experiments_meta: list[dict], exp_names: list[str] = None) -> pd.DataFrame:
    """Flatten nested experiment results into one row per (experiment, parameter set).

    Args:
        exp_results: One entry per experiment; each entry is the list returned
            by ``make_experiment`` (dicts with ``"training_time"`` and
            ``"metrics"``), ordered like ``exp_params``.
        exp_params: Parameter dicts shared by all experiments, one per run
            inside an experiment; their items become columns.
        experiments_meta: One metadata dict per experiment (aligned with
            ``exp_results``); its items are repeated on every row of that
            experiment.
        exp_names: Optional experiment labels aligned with ``exp_results``.
            When given, they fill an ``"experiment_name"`` column for
            experiments whose metadata does not define that key. When omitted,
            no column is added.

    Returns:
        DataFrame with columns ``training_time``, one per metric, one per
        parameter key and one per metadata key. Rows are ordered by experiment,
        then by parameter set.

    Raises:
        ValueError: If ``experiments_meta`` or ``exp_names`` does not have one
            entry per experiment, or an experiment does not have one result
            per entry of ``exp_params``.
    """
    if len(exp_results) != len(experiments_meta):
        raise ValueError(
            f"expected one metadata dict per experiment, got "
            f"{len(experiments_meta)} for {len(exp_results)} experiments"
        )
    if exp_names is not None and len(exp_names) != len(exp_results):
        raise ValueError(
            f"expected one name per experiment, got "
            f"{len(exp_names)} for {len(exp_results)} experiments"
        )

    rows = []
    # Metadata is per experiment, so it is paired with the OUTER loop; pairing
    # it with the per-parameter results would label rows with the wrong experiment.
    for exp_idx, (runs, exp_meta) in enumerate(zip(exp_results, experiments_meta)):
        if len(runs) != len(exp_params):
            raise ValueError(
                f"experiment {exp_idx} has {len(runs)} results "
                f"but {len(exp_params)} parameter sets were given"
            )

        for run, exp_param in zip(runs, exp_params):
            row = {"training_time": run["training_time"]}
            row.update(run["metrics"])
            row.update(exp_param)
            if exp_names is not None:
                row["experiment_name"] = exp_names[exp_idx]
            # Metadata is applied last so an explicit "experiment_name" wins.
            row.update(exp_meta)
            rows.append(row)

    # Building from records tolerates rows with differing keys (filled with NaN).
    return pd.DataFrame(rows)

In [None]:
# Collect the four experiments into one table: one parameter dict per kernel,
# one metadata dict per experiment.
experiments_df = experiments_results_to_dataframe(
    [experiments_res_1, experiments_res_2, experiments_res_3, experiments_res_4],
    exp_params=[
        {"kernel": kernel_name} for kernel_name in ("linear", "rbf", "sigmoid")
    ],
    experiments_meta=[
        {"experiment_name": experiment_label, "model": "SVC"}
        for experiment_label in (
            "full_mat",
            "zeros_left",
            "zeros_left__ones_right",
            "full_mat__sin",
        )
    ],
)

In [None]:
# Display the combined experiment results table.
experiments_df