## Классификатор языков 

In [62]:
from sklearn.model_selection import train_test_split

import tensorflow as tf

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt

import string

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

from sklearn.feature_extraction.text import CountVectorizer

In [38]:
import nltk
nltk.download('stopwords')

from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to C:\Users\Администратор.WI
[nltk_data]     N-A4RBFQ3J62N\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [39]:
punctuation_chars = string.punctuation

In [43]:
data_dutch = pd.read_csv('dutch.txt', sep=" ", header=None)
data_dutch.columns = ["words"]

#preproccesing
data_dutch["words"] = data_dutch["words"].str.lower()

for char in punctuation_chars:
    data_dutch["words"] = data_dutch["words"].map(lambda x: x.replace(char,""))


dutch_sw = stopwords.words('dutch')

data_dutch = data_dutch[~data_dutch["words"].isin(dutch_sw)]

#Let 0 - dutch, 1 - hungarian, 2 - portugese

data_dutch["0"] = 1
data_dutch["1"] = 0
data_dutch["2"] = 0
print(data_dutch.head())
print(data_dutch.size)


   words  0  1  2
11    we  1  0  0
32   jij  1  0  0
36  weet  1  0  0
42   wel  1  0  0
48  goed  1  0  0
18088


In [44]:
data_hungarian = pd.read_csv('hungarian.txt', sep=" ", header=None)
data_hungarian.columns = ["words"]

#preproccesing
data_hungarian["words"] = data_hungarian["words"].str.lower()

for char in punctuation_chars:
    data_hungarian["words"] = data_hungarian["words"].map(lambda x: x.replace(char,""))


hungarian_sw = stopwords.words('hungarian')

data_hungarian = data_hungarian[~data_hungarian["words"].isin(hungarian_sw)]

#Let 0 - dutch, 1 - hungarian, 2 - portugese

data_hungarian["0"] = 0
data_hungarian["1"] = 1
data_hungarian["2"] = 0
print(data_hungarian.head())
print(data_hungarian.size)

      words  0  1  2
7        is  0  1  0
12       ha  0  1  0
23       te  0  1  0
31    tudom  0  1  0
43  rendben  0  1  0
17512


In [47]:
data_portuguese = pd.read_csv('portugese.txt', sep=" ", header=None)
data_portuguese.columns = ["words"]

#preproccesing
data_portuguese["words"] = data_portuguese["words"].str.lower()

for char in punctuation_chars:
    data_portuguese["words"] = data_portuguese["words"].map(lambda x: x.replace(char,""))


portuguese_sw = stopwords.words('portuguese')

data_portuguese = data_portuguese[~data_portuguese["words"].isin(portuguese_sw)]

#Let 0 - dutch, 1 - hungarian, 2 - portugese

data_portuguese["0"] = 0
data_portuguese["1"] = 0
data_portuguese["2"] = 1
print(data_portuguese.head())
print(data_portuguese.size)

    words  0  1  2
5     ser  0  0  1
12    ter  0  0  1
17  estar  0  0  1
20  fazer  0  0  1
21  poder  0  0  1
19804


In [120]:
#concating all together

frames = [data_dutch, data_hungarian,data_portuguese]
  
result_df = pd.concat(frames)
result_df.reset_index(drop=True)

from sklearn.utils import shuffle
result_df = shuffle(result_df).reset_index(drop=True)

result_df["words"].replace('\d+', '', regex=True)

0              sujo
1                pó
2         állítólag
3              galo
4        velocidade
            ...    
13846    ministerie
13847          vége
13848        gyenge
13849       místico
13850       bewaken
Name: words, Length: 13851, dtype: object

In [121]:
#converting words to vectors of char

result_df["words"] = result_df["words"].map(lambda x: list(x))
result_df.head()

Unnamed: 0,words,0,1,2
0,"[s, u, j, o]",0,0,1
1,"[p, ó]",0,0,1
2,"[á, l, l, í, t, ó, l, a, g]",0,1,0
3,"[g, a, l, o]",0,0,1
4,"[v, e, l, o, c, i, d, a, d, e]",0,0,1


In [122]:
flat_list = [x for xs in result_df["words"] for x in xs]

In [123]:
#set of unique chars
chars_set = set(flat_list)

In [124]:
len(chars_set)

45

In [125]:
chars_dict = {k: v for v, k in enumerate(chars_set)}

In [126]:
chars_dict

{'7': 0,
 'r': 1,
 'õ': 2,
 'í': 3,
 'ã': 4,
 'y': 5,
 'â': 6,
 's': 7,
 'ï': 8,
 'c': 9,
 'i': 10,
 'ê': 11,
 'ő': 12,
 't': 13,
 'ë': 14,
 'n': 15,
 'm': 16,
 'e': 17,
 'ü': 18,
 'z': 19,
 'è': 20,
 'o': 21,
 'u': 22,
 'b': 23,
 'f': 24,
 'x': 25,
 'g': 26,
 'l': 27,
 'a': 28,
 'h': 29,
 'j': 30,
 'ó': 31,
 'd': 32,
 'q': 33,
 'ç': 34,
 'ű': 35,
 'ú': 36,
 'v': 37,
 'é': 38,
 'p': 39,
 'k': 40,
 'ö': 41,
 'á': 42,
 'ô': 43,
 'w': 44}

In [127]:
#функция, которая преобразует слово в вектор по принципу bag-of-words
def vectorize(chars_list, chars_dict):
    vec = np.zeros(len(chars_dict)).astype(int)

    for char in chars_list:
        index = chars_dict[char]
        vec[index] += 1
    
    return vec


In [128]:
result_df["words"] = result_df["words"].map(lambda x: vectorize(x,chars_dict))
result_df.head()

Unnamed: 0,words,0,1,2
0,"[0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, ...",0,0,1
1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0,0,1
2,"[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, ...",0,1,0
3,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0,0,1
4,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, ...",0,0,1


In [157]:
cols = ['0', '1', '2']
labels = result_df[cols].values.tolist()

data = result_df["words"].values.tolist()

In [172]:
data_train, data_test, labels_train, labels_test = train_test_split(data, labels, test_size=0.2, random_state=42)

In [184]:
data_train = np.asarray(data_train)
data_test = np.asarray(data_test)
labels_train = np.asarray(labels_train)
labels_test = np.asarray(labels_test)

In [173]:
#creating model


model = Sequential()
model.add(Dense(300, input_dim=len(chars_dict), activation='relu'))
model.add(Dense(100, activation='relu'))
model.add(Dense(3, activation='softmax'))
# compile the keras model
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [185]:
model.fit(data_train, labels_train, epochs=20, batch_size=5)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<tensorflow.python.keras.callbacks.History at 0x1e434fe8790>

In [186]:
_, accuracy = model.evaluate(np.asarray(data_test), np.asarray(labels_test))
print('Accuracy: %.2f' % (accuracy*100))

Accuracy: 81.16
