In [None]:
from utils.tokenizer import load_tokenizer
import pandas as pd
import os
from utils.const import PREPROCESSED_DATA_DIR
import numpy as np
import matplotlib.pyplot as plt
from sklearn import svm, metrics
import tensorflow as tf
from gensim.models import Word2Vec
import multiprocessing

In [None]:
def load_datasets() -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    """Read the preprocessed train, validation and test splits from disk.

    Returns:
        A ``(train, valid, test)`` tuple of DataFrames, read from
        ``train.xlsx``, ``valid.xlsx`` and ``test.xlsx`` inside
        ``PREPROCESSED_DATA_DIR`` (in that order).
    """
    # The annotation lists all three frames; `tuple[pd.DataFrame]` would
    # describe a 1-tuple and mislead type checkers and readers.
    return (
        pd.read_excel(os.path.join(PREPROCESSED_DATA_DIR, "train.xlsx")),
        pd.read_excel(os.path.join(PREPROCESSED_DATA_DIR, "valid.xlsx")),
        pd.read_excel(os.path.join(PREPROCESSED_DATA_DIR, "test.xlsx")),
    )

In [None]:
cores = multiprocessing.cpu_count()  # Number of CPUs reported for this machine; used below to size the Word2Vec worker pool

In [None]:
# Configure a Word2Vec model without a corpus; the vocabulary is built and the
# weights are trained in a later cell via build_vocab() / train().
w2v_model = Word2Vec(
    min_count=20,
    window=2,
    vector_size=300,
    sample=6e-5,
    alpha=0.03,
    min_alpha=0.0007,
    negative=20,
    # Leave one core free for the rest of the system, but never go below one
    # worker: on a single-core machine `cores - 1` is 0, which would start no
    # training worker at all.
    workers=max(1, cores - 1),
)

In [None]:
# Load the three splits and whitespace-split the "text" column of each one,
# giving one token list per row.
trn, val, test = load_datasets()

tokens_trn, tokens_val, tokens_test = (
    frame["text"].str.split() for frame in (trn, val, test)
)


In [None]:
# First pass: collect the vocabulary from the training tokens only.
w2v_model.build_vocab(tokens_trn, progress_per=10_000)

# Second pass: train on the same corpus; corpus_count was recorded by build_vocab().
w2v_model.train(
    tokens_trn,
    total_examples=w2v_model.corpus_count,
    epochs=30,
    report_delay=1,
)

In [None]:
# w2v_model.init_sims(replace=True)  # This call means the model will not be trained any further (which saves memory)

In [None]:
# Sanity check of the trained embeddings: display the words most similar to a probe word.
# NOTE(review): presumably this fails if the probe word was dropped by min_count=20 — confirm.
# w2v_model.wv.most_similar(positive=["poland"])
w2v_model.wv.most_similar(positive=["beautiful"])


### Tokenization tensorflow

In [None]:
trn, val, test = load_datasets()

# Convert every text into a sequence of integer token ids.
# NOTE(review): load_tokenizer() presumably returns an already-fitted Keras tokenizer — confirm.
tokenizer = load_tokenizer()
tokens_trn = tokenizer.texts_to_sequences(trn["text"])
tokens_val = tokenizer.texts_to_sequences(val["text"])
tokens_test = tokenizer.texts_to_sequences(test["text"])

vector_size = 20  # a longer vector did not give better accuracy

# Padding: every text ends up with the same length (vector_size); shorter
# sequences are padded at the end ("post").
padding_trn = tf.keras.utils.pad_sequences(tokens_trn, maxlen=vector_size, padding="post")
padding_val = tf.keras.utils.pad_sequences(tokens_val, maxlen=vector_size, padding="post")
# Build the test features from the test tokens. They were previously built from
# tokens_val, so test-set predictions were compared against the wrong rows.
padding_test = tf.keras.utils.pad_sequences(tokens_test, maxlen=vector_size, padding="post")

### Tokenization Word2Vec

In [None]:
# Fit and score one independent SVM per label column (every column after the first).
for emotion in trn.columns[1:]:
    y_trn, y_test = trn[emotion], test[emotion]

    # Train on the padded training sequences, then predict for the padded test sequences.
    clf = svm.SVC().fit(padding_trn, y_trn)
    y_pred = clf.predict(padding_test)

    print("Accuracy: ", metrics.accuracy_score(y_true=y_test, y_pred=y_pred))

In [None]:
# y_test still holds the label column from the last loop iteration above:
# print its total size, then display how many of its entries are greater than 0.
print(len(y_test))
len(y_test[y_test > 0])