In [1]:
from sklearn.model_selection import train_test_split
%load_ext autoreload
import pandas as pd
from matplotlib import pyplot as plt

from sklearn.feature_extraction.text import CountVectorizer

In [2]:
%load_ext autoreload
%autoreload 2
from utils import prepare_dataframe, sentences_by_lang
df = prepare_dataframe()

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


### Przygotowanie danych

In [3]:
from typing import List

from utils import languages, create_feature_dictionary, trigrams_by_lang

# A fixed seed makes the train/test split reproducible, and with it the learned
# vocabulary and every downstream metric, under "Restart & Run All".
SPLIT_SEED = 42
TEST_FRACTION = 0.2

# Sentences and their language labels, accumulated language by language.
X_train: List[str] = []
X_test: List[str] = []
y_train: List[str] = []
y_test: List[str] = []

# Union of the per-language vocabularies learned from the training sentences.
words = set()

for lang in languages:
    sentences = df.loc[df["lang"] == lang, "sentence"]
    Xt, xtest = train_test_split(
        sentences, test_size=TEST_FRACTION, random_state=SPLIT_SEED
    )

    X_train.extend(Xt)
    y_train.extend([lang] * len(Xt))
    X_test.extend(xtest)
    y_test.extend([lang] * len(xtest))

    # The vocabulary is learned from the training part only, so no test
    # sentence influences the feature set. `max_features` keeps the most
    # frequent tokens; `min_df=5` drops tokens seen in fewer than five
    # training sentences of this language.
    lang_vectorizer = CountVectorizer(max_features=trigrams_by_lang, min_df=5)
    # Only the fitted vocabulary is needed here, so the document-term matrix
    # is not materialized.
    lang_vectorizer.fit(Xt)
    words.update(lang_vectorizer.vocabulary_)

vocabulary = create_feature_dictionary(words)

In [4]:
# Count occurrences of the shared vocabulary in every training sentence and
# expose the result as a dense frame with one column per vocabulary entry.
vectorizer = CountVectorizer(vocabulary=vocabulary)
X = vectorizer.fit_transform(X_train)
features = pd.DataFrame(
    data=X.toarray(),
    columns=vectorizer.get_feature_names_out(),
)

In [5]:
from utils import normalize

# The value returned by `normalize` is only displayed as the cell output; it is
# never assigned back to `features`.
# NOTE(review): later cells therefore train on normalized values only if
# `utils.normalize` modifies its argument in place — confirm in utils.
normalize(features)

Unnamed: 0,qué,alla,tell,stato,zum,sein,in,se,nos,siempre,...,el,están,como,deine,eres,detto,let,aus,sagte,bene
0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
319995,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
319996,0.0,0.0,0.0,0.0,0.0,0.0,0.111111,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
319997,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
319998,0.0,0.0,0.0,0.0,0.0,0.0,0.111111,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [6]:
from utils import FFN_Hyperparams, build_model
# Positional arguments, in order: number of feature columns, number of
# languages, a list of three integers and the string 'relu'
# (presumably hidden-layer widths and the activation — confirm in utils).
params = FFN_Hyperparams(
    features.shape[1],
    len(languages),
    [500, 500, 250],
    'relu',
)
model = build_model(params)

In [7]:
from utils import create_encoder, encode

# Create the label encoder and use it to encode the training labels.
# The encoding scheme itself is defined by `create_encoder` / `encode` in utils.
encoder = create_encoder()
y = encode(y_train, encoder)

In [None]:
from utils import DataGenerator

# Third argument of DataGenerator — presumably the batch size (confirm in utils).
BATCH_SIZE = 32
EPOCHS = 30

train_gen = DataGenerator(features, y, BATCH_SIZE)
# `fit` receives no separate targets, so the generator yields the batches
# itself. Keras documents that `batch_size` must not be specified for such
# inputs; the former `batch_size=6` contradicted the generator's 32 and is
# dropped.
history = model.fit(train_gen, epochs=EPOCHS)

Epoch 1/30
Epoch 2/30
Epoch 3/30

In [None]:
def plot_training_curve(history_dict, metric, title, ylabel):
    """Plot one training metric per epoch.

    Args:
        history_dict: mapping from metric name to a list of per-epoch values
            (the `history` attribute of the object returned by `model.fit`).
        metric: key of the training series to plot, e.g. 'accuracy' or 'loss'.
        title: figure title.
        ylabel: label of the y axis.

    The validation series (`val_<metric>`) is drawn only when it exists, so
    the legend never names a curve that is not on the figure.
    """
    fig, ax = plt.subplots()
    ax.plot(history_dict[metric], label='train')
    val_key = f'val_{metric}'
    if val_key in history_dict:
        ax.plot(history_dict[val_key], label='validation')
    ax.set_title(title)
    ax.set_ylabel(ylabel)
    ax.set_xlabel('Epoch')
    ax.legend(loc='upper left')
    plt.show()


plot_training_curve(history.history, 'accuracy', 'Trafność modelu na podstawie słów', 'Trafność')
# "Loss"
plot_training_curve(history.history, 'loss', 'Funkcja kosztu', 'Koszt')

In [None]:
from utils import test_model

# Build the test feature frame over the same vocabulary as the training frame.
# NOTE: this rebinds `features` and `y`; re-running the training cell after
# this one would fit the model on the test data.
vectorizer = CountVectorizer(vocabulary=vocabulary)
X = vectorizer.fit_transform(X_test)
features = pd.DataFrame(data=X.toarray(), columns=vectorizer.get_feature_names_out())

# Mirror the training pipeline, which calls `normalize(features)` on the
# training frame without assigning the result. Calling it the same way keeps
# train and test preprocessing identical whether or not `normalize` works in
# place (NOTE(review): confirm its semantics in utils).
normalize(features)

encoder = create_encoder()
y = encode(y_test, encoder)

In [None]:
# Evaluate the trained model on the test features and encoded test labels
# prepared in the previous cell; the metric is whatever `test_model` returns.
accuracy = test_model(model, encoder, features, y)

In [None]:
# Display the value returned by `test_model`.
accuracy