In [1]:
from typing import Set, Any, Dict, List, Tuple
import matplotlib.pyplot as pt
import numpy as np
import pandas as pd

### Przygotowanie danych

In [2]:
languages = ["eng", "deu","spa", "ita"] # wspierane języki
sentences_by_lang = 200000 # ile zdań bierzemy pod uwagę w poszczególnych językach
trigrams_by_lang = 200 # ile trigramów z danego języka bierzemy pod uwagę

In [3]:
from collections import Counter
def get_trigrams(sentence: str):
    """
    Wydobywa trigramy ze zdania.
    :param sentence: Zdanie w postaci łańcucha znaków.
    :return: Wszystkie trigramy znajdujące się w zdaniu.
    """
    trigrams = []
    for i, c in enumerate(sentence):
        trigram = sentence[i:(i + 3)]
        if len(trigram) == 3:
            trigrams.append(trigram)
    return trigrams

def get_trigrams_sets(df: pd.DataFrame) -> tuple[set, dict]:
    """
    Funkcja dla każdego zdania w tabeli zlicza ilość wystąpień trigramów,
    następnie dla każdego języka zwraca  TRIGRAMS_BY_LANG najpopularniejszych trigramów.
    Na końcu tworzy zbiór składający się z najpopularniejszych trigramów każdego języka.
    :param df: DataFrame zawierająca kolumny "sentence" oraz "lang"
    :return: biór najpopularniejszych trigramów ogólnie, słownik [lang] -> [trigramy w danym języku...]
    """
    all_trigrams: set[str] = set()
    lang_trigrams: dict[str, list[tuple[str, int]]] = dict()
    for lang in languages:
        trigrams = Counter()
        series = df[df["lang"] == lang]["sentence"]
        for sentence in series:
            tri = get_trigrams(sentence)
            trigrams.update(tri)
        trigrams += Counter() # usuwa elemnty z count=0
        mc = trigrams.most_common(trigrams_by_lang)
        all_trigrams.update([v[0] for v in mc])
        lang_trigrams[lang] = mc
    return all_trigrams, lang_trigrams

In [4]:
# Wczytujemy plik csv tworząc DF z dwoma kolumanmi "lang" oraz "sentence"
csv_file = pd.read_csv('sentences.csv', on_bad_lines='skip', sep='\t', index_col=0, names=["lang", "sentence" ])

In [5]:
# Filtrujemy tabelę, zostawiamy tylko wspierane języki. Dla każdego języka zostawiamy SENTENCES_BY_LANG zdań.
dataset = csv_file[csv_file['lang'].isin(languages)]
results = pd.DataFrame(columns=["lang","sentence"])
for l in languages:
    ds = dataset[dataset["lang"] == l].sample(sentences_by_lang)
    results = pd.concat([results, ds])
results["sentence"] = results["sentence"].str.lower() # pomijamy wielkość liter, aby nie traktować osobno np. "He" i "he"

In [6]:
all_trigrams, lang_trigrams = get_trigrams_sets(results)

In [7]:
### Bag of words

In [8]:
# Tworzymy bag of words, nie wykorzystujemy binarnego bag of words ponieważ trigramy w zdaniu mogą się powtórzyć i stracilibyśmy tę informację.
# Wadą BoW jest fakt, że każdy trigram jest tak samo ważny, ale w naszym problemie to nie przeszkadza.
from sklearn.feature_extraction.text import CountVectorizer
dic = dict()
for i,t in enumerate(all_trigrams):
    dic[t]=i
vectorizer = CountVectorizer(vocabulary=dic, ngram_range=(3,3), analyzer="char") #ngram_range bierzemy tylko trigramy, analyzer bierzemy pod uwagę znaki,// char_wb nie zliczało poprawnie kolumn
train_sentences = results["sentence"]
train_langs = results["lang"]
X = vectorizer.fit_transform(train_sentences)
train_features = pd.DataFrame(data=X.toarray(), columns=all_trigrams)
train_min = train_features.min() # najmniejsza wartość z każdej kolumny
train_max = train_features.max() # największa wartość z każdej kolumny
train_features = (train_features - train_min)/(train_max-train_min) # do poprawy bo zwraca NaN, jeśli max value = 0
train_features["lang"] = list(results["lang"]) # dodajemy dodatkową kolumnę z naszym outputem

In [9]:
# Asercje czy wszystko przebiegło pomyślnie
# assert 0 not in train_max
# assert 0 not in (train_max-train_min)
# assert not train_max.isnull().values.any()
# assert not train_min.isnull().values.any()
# assert not (train_features - train_min).isnull().values.any()
# assert not (train_max-train_min).isnull().values.any()
# assert not train_features.isnull().values.any(), train_features.isnull().sum().sum()

display(X)
display(train_features)
display(train_langs)

<800000x547 sparse matrix of type '<class 'numpy.int64'>'
	with 15859081 stored elements in Compressed Sparse Row format>

Unnamed: 0,ca,las,ehr,on',go,zio,sei,va,e p,ed,...,ir,che,ha,on,he,qua,hen,i w,sch,lang
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.019231,...,0.0,0.0,0.00,0.0,0.0,0.0,0.000000,0.0,0.0,eng
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,...,0.0,0.0,0.00,0.0,0.0,0.0,0.000000,0.0,0.0,eng
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,...,0.0,0.0,0.00,0.0,0.0,0.0,0.000000,0.0,0.0,eng
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.019231,...,0.0,0.0,0.00,0.0,0.0,0.0,0.000000,0.0,0.0,eng
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.019231,...,0.0,0.0,0.00,0.0,0.0,0.0,0.083333,0.0,0.0,eng
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
799995,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.043478,0.000000,...,0.0,0.0,0.00,0.0,0.0,0.0,0.000000,0.0,0.0,ita
799996,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,...,0.0,0.0,0.00,0.0,0.0,0.0,0.000000,0.0,0.0,ita
799997,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,...,0.0,0.0,0.00,0.0,0.0,0.0,0.000000,0.0,0.0,ita
799998,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,...,0.0,0.0,0.02,0.0,0.0,0.0,0.000000,0.0,0.0,ita


9165666    eng
6577477    eng
6357152    eng
4488244    eng
6930358    eng
          ... 
7614388    ita
5168654    ita
1647260    ita
2862669    ita
4528234    ita
Name: lang, Length: 800000, dtype: object

### Przygotowanie modelu 

In [10]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, train_langs, test_size = 0.2)

In [11]:
import tensorflow as tf 
from tensorflow import keras 
from sklearn.preprocessing import LabelEncoder
from keras.utils import np_utils

#Fit encoder
encoder = LabelEncoder()
encoder.fit(languages)

def encode(y):
    """
    Returns a list of one hot encodings
    Params
    ---------
        y: list of language labels
    """
    
    y_encoded = encoder.transform(y)
    y_dummy = np_utils.to_categorical(y_encoded)
    
    return y_dummy

### fit -> stochastic gradient

In [12]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from sklearn.metrics import accuracy_score

#Get training data
x = train_features.drop('lang',axis=1)
y = encode(train_features['lang'])
#Define model
model = Sequential()
model.add(Dense(500, input_dim=len(x.columns), activation='relu'))
model.add(Dense(500, activation='relu'))
model.add(Dense(250, activation='relu'))
model.add(Dense(4, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

#Train model
model.fit(x, y, epochs=5, batch_size=6)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x2b4e3326430>

### epoches - number of times you go through the training set
### batch_size - size of training set before changing values of variables (accuracy etc.)

In [13]:
labels=model.predict(X_test)



In [14]:
predictions = [encoder.classes_[np.argmax(label)] for label in labels]

accuracy_score(y_test,predictions)

0.9386875