In [1]:
from typing import Iterable
import requests
import tarfile
import os
import pandas as pd
import re
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from typing import Any
import unittest

In [2]:
def filter_language(members: tarfile.TarFile,
                    language: str) -> Iterable[tarfile.TarInfo]:
    """Lazily yield the archive members whose file extension is ``language``.

    Args:
        members: Iterable of ``tarfile.TarInfo`` objects (for example an
            opened ``TarFile``).
        language: Extension to keep, given without the leading dot
            (``"bg"`` keeps names ending in ``.bg``).

    Yields:
        Every member whose name has exactly the extension ``.<language>``.
    """
    wanted_extension = "." + language
    for member in members:
        _, extension = os.path.splitext(member.name)
        if extension == wanted_extension:
            yield member

Creating a list of the considered language codes and a dictionary mapping each code to its language name

In [3]:
# Mapping from Europarl language code to the human-readable language name.
# The insertion order of this dict defines the order of ``languages`` below.
language_names = dict(
    bg='Bulgarian',
    cs='Czech',
    da='Danish',
    de='German',
    el='Greek',
    en='English',
    es='Spanish',
    et='Estonian',
    fi='Finnish',
    fr='French',
    hu='Hungarian',
    it='Italian',
    lt='Lithuanian',
    lv='Latvian',
    nl='Dutch',
    pl='Polish',
    pt='Portuguese',
    ro='Romanian',
    sk='Slovak',
    sl='Slovenian',
    sv='Swedish',
)
# All considered language codes, in the same order as the mapping above.
languages = list(language_names)

Downloading and unpacking the documents for all considered languages from the EU
website; they are translated versions of the same set of *documents*

In [None]:
# Download one "<xx>-en" archive per language and extract only the file
# written in that language (selected by its extension via filter_language).
for language in languages:
    # There is no "en-en" archive, so the English text is taken from the
    # Bulgarian-English archive.
    source = 'bg' if language == 'en' else language
    url = "https://www.statmt.org/europarl/v7/{}-en.tgz".format(source)
    # The context managers release the HTTP connection and the tar stream
    # even when the download or the extraction fails part-way.
    with requests.get(url, stream=True) as response:
        # Fail loudly on an HTTP error instead of handing an error page
        # to tarfile.
        response.raise_for_status()
        # "r|gz" reads the archive as a forward-only stream, so nothing is
        # buffered on disk before extraction.
        with tarfile.open(fileobj=response.raw, mode="r|gz") as tar:
            # NOTE(review): extractall trusts the member paths stored in the
            # downloaded archive; consider passing filter="data" on Python
            # versions that support it.
            tar.extractall(members=filter_language(tar, language))

Connecting every document with its language and merging them into one dataset

In [None]:
# Build one wide frame with a column per language, each row holding one line
# of the corresponding extracted corpus file.
huge_df = pd.DataFrame([])
for language, language_name in language_names.items():
    # The English text lives in the Bulgarian-English archive
    # ("europarl-v7.bg-en.en"); every other language xx is stored as
    # "europarl-v7.xx-en.xx".
    source = 'bg' if language == 'en' else language
    # The second positional argument of read_csv is `sep`, so the original
    # call used the literal "utf-8" as the field separator. That keeps every
    # line in a single field as long as the text never contains "utf-8".
    # It is now passed by keyword because recent pandas versions accept only
    # the path positionally; the file encoding is stated explicitly too.
    huge_df[language_name] = pd.read_csv(
        "europarl-v7.{}-en.{}".format(source, language),
        sep="utf-8", header=None, names=[language_name], engine='python',
        encoding="utf-8")

Function which, given a line of text, converts it to lowercase and removes digits

In [None]:
def uniform(line: Any) -> Any:
    """Normalise one line of text for language identification.

    The text is lower-cased, digits and punctuation are removed, and runs of
    whitespace are collapsed to single spaces (leading and trailing
    whitespace is dropped).

    Args:
        line: Text to normalise; ``None`` and empty input are accepted.

    Returns:
        The normalised string, or ``None`` when ``line`` is ``None`` or
        empty.
    """
    if line is None or len(line) == 0:
        return None
    line = line.lower()
    line = re.sub(r"\d+", "", line)
    # Drop everything that is neither a word character nor whitespace.
    line = re.sub(r"[^\w\s]", "", line)
    # Removing digits and punctuation can leave double spaces behind.
    return " ".join(line.split())

In [None]:
# Strip punctuation, then normalise every non-missing cell with uniform().
# Missing cells are skipped (na_action='ignore') instead of being cast to the
# literal text "nan", which would otherwise become a training sample.
huge_uniform_df = (
    huge_df
    .replace(r'[^\w\s]', '', regex=True)
    .apply(lambda column: column.map(
        lambda cell: uniform(str(cell)), na_action='ignore'))
)
# Long format: one row per (language name, text) pair. Rows without text
# (missing cells, or cells for which uniform() returned None) are dropped so
# they cannot end up as "nan"/"none" documents further down.
huge_uniform_df_melted = (
    pd.melt(huge_uniform_df)
    .dropna(subset=['value'])
    .reset_index(drop=True)
)

Extracting the independent and dependent variables from the
dataframe and reducing the number of samples (keeping every 700th row) due to limited RAM

In [None]:
# Keep only every 700th row of the melted frame to limit memory use.
sampled_rows = huge_uniform_df_melted[::700]
# Features: the text of each sampled row, as a NumPy array.
X = np.array(sampled_rows["value"])
# Targets: the language name of each sampled row, encoded as integer labels.
le = LabelEncoder()
Y = le.fit_transform(sampled_rows["variable"])

In [None]:
# Lower-cased string form of every sample, in the same order as X.
data_list = [str(sample).lower() for sample in X]

In [None]:
# Bag-of-words features: one row per document, one column per vocabulary term.
cv = CountVectorizer()
X = cv.fit_transform(data_list).toarray()

# cv.vocabulary_ maps term -> column index, and its iteration order is not the
# column order. Sort the terms by index so each column gets its own term.
feature_names = sorted(cv.vocabulary_, key=cv.vocabulary_.get)
# X is already a dense ndarray here, so it is passed to DataFrame directly.
dataframe_vocabulary = pd.DataFrame(X, columns=feature_names)
dataframe_vocabulary[:5]

Train - test split

In [None]:
# Hold out 20% of the samples for evaluation. The seed is fixed so that the
# split, and therefore every metric below, is reproducible across re-runs.
x_train, x_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.20, random_state=42)

Model training

In [None]:
# Multinomial naive Bayes classifier fitted on the training word counts.
model = MultinomialNB()
model.fit(X=x_train, y=y_train)

In [None]:
# Encoded language labels predicted for the held-out samples.
y_pred = model.predict(X=x_test)

Evaluation metrics (computed on the test set)

In [None]:
# Compare the held-out labels with the model's predictions.
ac = accuracy_score(y_true=y_test, y_pred=y_pred)
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
cr = classification_report(y_true=y_test, y_pred=y_pred)

In [None]:
print("Accuracy is :", ac)
# Row/column i of the confusion matrix belongs to encoded label i, i.e. to
# le.classes_[i]. The encoder orders the language *names* alphabetically,
# which differs from the order of the `languages` code list, so the axes must
# be labelled with le.classes_ to match the data.
class_names = le.classes_
cm_df = pd.DataFrame(cm, index=class_names, columns=class_names)
plt.figure(figsize=(20, 15))
# fmt="d" prints the integer counts as-is instead of in scientific notation.
ax = sns.heatmap(cm_df, annot=True, fmt="d",
                 cmap=sns.cubehelix_palette(as_cmap=True))
# confusion_matrix puts the true labels on rows and predictions on columns.
ax.set(xlabel="Predicted language", ylabel="True language",
       title="Confusion matrix on the test set")
plt.show()

Examples

In [None]:
def preprocess_sentence(sentence: str) -> np.ndarray:
    """Turn a raw sentence into the feature row expected by the classifier.

    The sentence goes through the same normalisation as the training text
    (punctuation removed, then ``uniform``) and is vectorised with the
    fitted ``cv``.

    Args:
        sentence: Raw input text; ``None`` or an empty string is treated as
            an empty document.

    Returns:
        A dense array with a single row of term counts.
    """
    # The training data had punctuation stripped before uniform() was
    # applied; do the same here so both sides are tokenised identically.
    without_punctuation = re.sub(r'[^\w\s]', '', sentence or "")
    # uniform() returns None for empty input, which the vectorizer cannot
    # process; fall back to an empty document instead of crashing.
    processed_sentence = uniform(without_punctuation) or ""
    processed_sentence = cv.transform([processed_sentence]).toarray()
    return processed_sentence

In [None]:
def predict_language(sentence: str) -> str:
    """Predict the language name of a single sentence.

    Args:
        sentence: Raw text to classify.

    Returns:
        The language label decoded from the model's prediction with the
        fitted label encoder ``le``.
    """
    features = preprocess_sentence(sentence)
    encoded_prediction = model.predict(features)
    # inverse_transform returns an array; a single sentence yields one label.
    return le.inverse_transform(encoded_prediction)[0]

In [None]:
# Classify one hand-written sentence end to end as a quick sanity check.
input_sentence = "Exemplary sentence"
predicted_language = predict_language(sentence=input_sentence)
print("Predicted Language:", predicted_language)

Test

In [None]:
class TestNotebook(unittest.TestCase):
    """Unit tests for the text-normalisation helper ``uniform``."""

    def test_uniform_normalises_text(self):
        """Mixed-case text with a digit and punctuation is normalised."""
        self.assertEqual(uniform("To jest 1. Unit Test."), "to jest unit test")

    def test_uniform_empty_input_returns_none(self):
        """``None`` and the empty string both yield ``None``."""
        self.assertIsNone(uniform(None))
        self.assertIsNone(uniform(""))


# argv=[''] keeps unittest from parsing the kernel's own command-line
# arguments; exit=False stops it from calling sys.exit() inside the notebook.
unittest.main(argv=[''], verbosity=2, exit=False)