In [1]:
import pickle
import numpy as np
import pandas as pd
import tensorflow as tf
from utils import languages, get_trigrams_sets, encode, prepare_dataframe, normalize, \
    FFN_Hyperparams, build_model, create_encoder, test_model, create_feature_dictionary
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer

### Przygotowanie danych

In [2]:
# Build the working DataFrame with the project helper from utils.
# Later cells read its "sentence" and "lang" columns.
df = prepare_dataframe()

In [3]:
# Collect the trigram collection from the data. It feeds the vectorizer
# vocabulary and labels the feature-matrix columns further down; the second
# return value is intentionally discarded.
all_trigrams, _ = get_trigrams_sets(df)

### Bag of words

In [4]:
# Bag of words with raw counts rather than a binary encoding: a trigram can
# occur several times within one sentence and a binary BoW would lose that.
# A known BoW drawback is that every trigram weighs the same, which is
# acceptable for this problem.
dic = create_feature_dictionary(all_trigrams)

# ngram_range=(3, 3) keeps trigrams only; analyzer="char" works on characters.
# ("char_wb" was tried and did not count the columns correctly.)
vectorizer = CountVectorizer(
    vocabulary=dic,
    ngram_range=(3, 3),
    analyzer="char",
)
# This snapshot is written before the vectorizer is fitted.
with open('count_vectorizer.pkl', 'wb') as f:
    pickle.dump(vectorizer, f)

langs = df["lang"]
sentences = df["sentence"]

# Matrix of occurrences of the individual trigrams, labelled with all_trigrams.
X = vectorizer.fit_transform(sentences)
features = pd.DataFrame(columns=all_trigrams, data=X.toarray())

### Normalizacja

In [5]:
# Run the trigram counts through the project's normalize() helper, then
# append the target labels as an extra "lang" column.
features = normalize(features)
features["lang"] = langs.tolist()

### Podział na dane testowe i treningowe

In [6]:
# Separate the feature columns from the "lang" target, encode the target with
# the project encoder, and keep 20% of the rows aside for testing.
encoder = create_encoder()
x = features.drop('lang', axis=1)
y = encode(features['lang'], encoder)
# A fixed seed keeps the train/test partition, and therefore the reported
# accuracy, reproducible across Restart & Run All.
X_train, X_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42
)

### Przygotowanie modelu

In [7]:
# First two arguments: number of feature columns in the training matrix and
# number of supported languages.
# NOTE(review): [500, 500, 250] and 'relu' are presumably the hidden-layer
# sizes and the activation; confirm against utils.FFN_Hyperparams.
n_inputs = X_train.shape[1]
n_outputs = len(languages)
params = FFN_Hyperparams(n_inputs, n_outputs, [500, 500, 250], 'relu')
model = build_model(params)

### INFO o urządzeniach

In [8]:
# Show the physical devices (CPU/GPU) TensorFlow can see before training.
tf.config.list_physical_devices()

[PhysicalDevice(name='/physical_device:CPU:0', device_type='CPU'),
 PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

### Uruchomienie treningu

In [None]:
# Explicit GPU placement (kept for reference):
# with tf.device('/GPU:0'):
#     model.fit(X_train, y_train, epochs=25, batch_size=6)

# Default device placement (no explicit tf.device scope).
# Fit on the training split only: fitting on the full (x, y) would let the
# model see the rows held in X_test and inflate the accuracy measured later.
model.fit(X_train, y_train, epochs=30, batch_size=6)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30

### Test dokładności (accuracy)

In [None]:
# Score the model on the X_test / y_test split using the project's
# test_model helper; the result is compared against a threshold below.
accuracy = test_model(model, encoder, X_test, y_test)

In [None]:
# Report the score; persist the model together with its encoder and
# vectorizer only when the accuracy exceeds the 0.8 threshold.
print("Accuracy: ", accuracy)
if accuracy > 0.8:
    model.save("trigrams_recognition")
    artifacts = (
        ('trigrams_encoder.pkl', encoder),
        ('trigrams_count_vectorizer.pkl', vectorizer),
    )
    for path, obj in artifacts:
        with open(path, 'wb') as f:
            pickle.dump(obj, f)