In [1]:
import pickle
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.preprocessing import LabelEncoder
from utils import sentences_by_lang, languages, get_trigrams_sets, encode, get_words_set
from keras.models import Sequential
from keras.layers import Dense
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer

### Przygotowanie danych

In [2]:
# Wczytujemy plik csv tworząc DF z dwoma kolumanmi "lang" oraz "sentence"
csv_file = pd.read_csv('sentences.csv', on_bad_lines='skip', sep='\t', index_col=0, names=["lang", "sentence" ])

In [3]:
# Filtrujemy tabelę, zostawiamy tylko wspierane języki. Dla każdego języka zostawiamy SENTENCES_BY_LANG zdań.
dataset = csv_file[csv_file['lang'].isin(languages)]
results = pd.DataFrame(columns=["lang","sentence"])
for l in languages:
    ds = dataset[dataset["lang"] == l].sample(sentences_by_lang)
    results = pd.concat([results, ds])
results["sentence"] = results["sentence"].str.lower() # pomijamy wielkość liter, aby nie traktować osobno np. "He" i "he"

In [4]:
all_words = get_words_set(results)
display(all_words)

{'"in',
 'a',
 'abbiamo',
 'aber',
 'able',
 'about',
 'ad',
 'after',
 'again.',
 'ahora',
 'al',
 'algo',
 'alguien',
 'all',
 'alla',
 'alle',
 'alles',
 'als',
 'always',
 'am',
 'an',
 'an.',
 'anche',
 'ancora',
 'and',
 'andare',
 'andate',
 'anderen',
 'andiamo',
 'antes',
 'any',
 'anything',
 'aquí',
 'aquí.',
 'arbeit',
 'are',
 'as',
 'asked',
 'así',
 'at',
 'auch',
 'auf',
 'aus',
 'aus.',
 'avere',
 'avete',
 'aveva',
 'años',
 'años.',
 'back',
 'be',
 'been',
 'before',
 'bei',
 'beim',
 'bene.',
 'besser',
 'better',
 'bien',
 'bien.',
 'bin',
 'bis',
 'bisogno',
 'bist',
 'bitte',
 'boston',
 'boston.',
 'buch',
 'buen',
 'buena',
 'but',
 'buy',
 'by',
 "c'è",
 'cada',
 'can',
 "can't",
 'casa',
 'casa.',
 'casi',
 'cercare',
 'che',
 'chi',
 'ci',
 'come',
 'como',
 'con',
 'cosa',
 'cosas',
 'cose',
 'costruire',
 'così',
 'could',
 'creo',
 'cuando',
 'cui',
 'cómo',
 'da',
 'dal',
 'dalla',
 'dann',
 'das',
 'dass',
 'davvero',
 'de',
 'debería',
 'decir',
 'deg

### Bag of words

In [5]:
# Tworzymy bag of words, nie wykorzystujemy binarnego bag of words ponieważ trigramy w zdaniu mogą się powtórzyć i stracilibyśmy tę informację.
# Wadą BoW jest fakt, że każdy trigram jest tak samo ważny, ale w naszym problemie to nie przeszkadza.
dic = dict()
for i,t in enumerate(all_words):
    dic[t]=i


In [6]:
vectorizer = CountVectorizer(vocabulary=dic, tokenizer= lambda sentence : sentence.split(" "))

#with open('words_count_vectorizer.pkl', 'wb') as f:
#    pickle.dump(vectorizer, f)

train_sentences = results["sentence"]
train_langs = results["lang"]
X = vectorizer.fit_transform(train_sentences)
train_features = pd.DataFrame(data=X.toarray(), columns=all_words)

display(train_features)
train_min = train_features.min() # najmniejsza wartość z każdej kolumny
train_max = train_features.max() # największa wartość z każdej kolumny
train_features = (train_features - train_min)/(train_max-train_min) # do poprawy bo zwraca NaN, jeśli max value = 0
train_features["lang"] = list(results["lang"]) # dodajemy dodatkową kolumnę z naszym outputem
display(train_features)

Unnamed: 0,be,ser,lot,andare,today.,casa,mir,i've,fare,hai,...,por,del,visto,who,la,los,ni,tiempo,hacer,demasiado
0,2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
799995,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
799996,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
799997,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
799998,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0


Unnamed: 0,be,ser,lot,andare,today.,casa,mir,i've,fare,hai,...,del,visto,who,la,los,ni,tiempo,hacer,demasiado,lang
0,0.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,eng
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,eng
2,0.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,eng
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,eng
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,eng
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
799995,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,ita
799996,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,ita
799997,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,ita
799998,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.011905,0.0,0.0,0.0,0.0,0.0,ita


In [37]:
maxx = train_features.drop("lang", axis=1).max()
minn = train_features.drop("lang", axis=1).min()


In [42]:
sub = np.subtract(maxx,minn)

In [52]:
1. in sub.unique()

True

### Asercje czy wszystko przebiegło pomyślnie, czy w naszym zbiorze nie ma NaN

In [8]:
assert 0 not in train_max, "Jeden z najczęściej występujących trigramów nie wystąpił ani razu"
assert 0 not in (train_max-train_min), "Nie można dzielić przez 0"
assert not train_max.isnull().values.any(),  "NaN w wektorze train_max"
assert not train_min.isnull().values.any(), "NaN w wektorze train_min"
assert not (train_max-train_min).isnull().values.any(), "NaN w mianowniku train_min"
assert not train_features.isnull().values.any(), "NaN w wynikowym DataFrame"

#display(X)
#display(train_features)
#display(train_langs)

### Przygotowanie danych testowych i treningowych 

In [9]:

encoder = LabelEncoder()
encoder.fit(languages)
with open('words_encoder.pkl', 'wb') as f:
    pickle.dump(encoder, f)
x = train_features.drop('lang',axis=1)
y = encode(train_features['lang'], encoder)
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size = 0.2)

### Przygotowanie modelu

In [10]:
model = Sequential()
model.add(Dense(500, input_dim=len(X_train.columns), activation='relu'))
model.add(Dense(500, activation='relu'))
model.add(Dense(250, activation='relu'))
model.add(Dense(4, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

### INFO korzystania z GPU lub CPU, można zakomentować jeśli nie chce się korzystać z GPU

In [11]:
# from tensorflow.python.client import device_lib
# from keras import backend as K
# 
# print(device_lib.list_local_devices())
# 
# K._get_available_gpus()
# 
# print(tf.config.experimental.list_physical_devices())
# 
# print("------------------------------------------------------------------------------------------")
# physical_devices = tf.config.list_physical_devices('GPU')
# try:
#   print(physical_devices)
#   tf.config.experimental.set_memory_growth(physical_devices[0], True)
# except:
#   # Invalid device or cannot modify virtual devices once initialized.
#   pass
# 
# #Train model
# # Create a MirroredStrategy.
# tf.debugging.set_log_device_placement(True)

### Uruchomienie treningu

In [12]:
# Z użyciem gpu
#with tf.device('/GPU:0'):
model.fit(X_train, y_train, epochs=1, batch_size=6)
    
# Bez użycia gpu
#model.fit(x, y, epochs=30, batch_size=6)

RuntimeError: Data adapters should be mutually exclusive for handling inputs. Found multiple adapters [<class 'keras.engine.data_adapter.TensorLikeDataAdapter'>, <class 'keras.engine.data_adapter.GeneratorDataAdapter'>] to handle input: <class 'pandas.core.frame.DataFrame'>, <class 'numpy.ndarray'>

### Test precyzji

In [None]:
labels=model.predict(X_test)
display(labels)
predictions = [encoder.classes_[np.argmax(label)] for label in labels]
correct = [encoder.classes_[np.argmax(label)] for label in y_test]
display(predictions)
display(correct)
accuracy_score(correct,predictions)

In [None]:
model.save("words_recognition")
del model