In [1]:
import pandas as pd
import numpy as np

import sys
import numpy
# Never abbreviate printed arrays: the summarization threshold is raised to the
# largest representable size.
np.set_printoptions(threshold=sys.maxsize)

In [29]:
# Location of the raw dataset (absolute Colab path).
data_path = "/content/language-identification-datasets.csv"

# `data` is the working frame; `submission` is an independent deep copy that
# later receives the predicted languages.
data = pd.read_csv(filepath_or_buffer=data_path)
submission = data.copy()

print(f"Number of data: {len(data)}")
data.head(5)

Number of data: 31012


Unnamed: 0,Text,Language,ID
0,şirket icra kurulu başkanı melih abdülhayoğlu ...,Turkish,26110
1,"매춘업의 공식화를 주장한 민원식은 ""우又 근경近頃 일본인이 주가 음식매점 등국내 도...",Korean,26862
2,خلال عظاته ركز التعليم اللاهوتي للبابا على الش...,Arabic,20562
3,soggin was born on march he took his first d...,English,14298
4,kulturmejeriet är sedan mitten av -talet en vi...,Swedish,17338


## Split training and testing data

In [18]:
# Training set: rows without a missing value in any column.
train = data.dropna()
print(f"Number of data to train: {len(train)}")

# Test set: rows whose Language label is missing (the ones to predict).
test = data.loc[data["Language"].isna()]
print(f"Number of data to test: {len(test)}")

Number of data to train: 24326
Number of data to test: 6686


## Data exploration

In [32]:
# Number of rows per language label (rows with a missing label are not counted).
occur = data.groupby("Language").size()
print(occur)

Language
Arabic        1190
Chinese        791
Danish         342
Dutch         1226
English       1877
Estonian       768
French        1596
German         379
Greek          293
Hindi          842
Indonesian     794
Italian        537
Japanese       824
Kannada        301
Korean         795
Latin          784
Malayalam      452
Persian        784
Portugese      785
Pushto         786
Romanian       762
Russian       1329
Spanish       1418
Swedish        788
Tamil         1156
Thai           770
Turkish       1170
Urdu           787
dtype: int64


In [33]:
# Distinct label values, in order of first appearance (includes NaN for unlabeled rows).
data["Language"].unique()

array(['Turkish', 'Korean', 'Arabic', 'English', 'Swedish', 'Tamil',
       'Greek', 'Dutch', 'Portugese', 'Latin', 'French', 'Persian',
       'Chinese', 'Spanish', 'Estonian', 'Indonesian', 'Romanian', 'Thai',
       'Malayalam', 'Hindi', 'Russian', 'Japanese', 'Italian', 'Pushto',
       'Kannada', 'Urdu', 'German', 'Danish', nan], dtype=object)

In [5]:
# Peek at the rows labelled as French.
french_text = data.loc[data["Language"] == "French"]
french_text.head(5)

Unnamed: 0,Text,Language,ID
15,le nationalisme völkischer promeut une forme p...,French,22757
21,la commune est sectorisée sur le collège franç...,French,26956
54,en juin noël godin se rend avec quelques comp...,French,16477
79,À la suite de l'accord de Paris sur le climat ...,French,3819
80,en les cultivateurs du village se cotisèrent ...,French,30043


In [None]:
# Number of distinct languages among the labelled rows.
languages = train["Language"].unique()
print(len(languages))

# Row count per language over the whole dataset.
occur = data.groupby("Language").size()
print(occur)

## Model each word as a graph of character transitions with probabilities

In [4]:
def proba_matrix(language_df):
    """Build a character-bigram frequency matrix from the texts of one language.

    Every text in ``language_df["Text"]`` is split on single spaces and
    lower-cased. Each *distinct* word is counted once, however often it occurs.
    For every pair of adjacent characters inside a word the corresponding cell
    is incremented, and the matrix is finally divided by the total number of
    counted pairs, so its entries sum to 1 (joint frequencies, not
    row-normalized conditional probabilities).

    Parameters
    ----------
    language_df : pandas.DataFrame
        Frame with a ``"Text"`` column. Non-string entries (e.g. NaN) are
        skipped.

    Returns
    -------
    tuple[numpy.ndarray, dict[str, int]]
        ``(matrix, caract_index)`` where ``matrix[caract_index[a], caract_index[b]]``
        is the relative frequency of character ``a`` being immediately followed
        by ``b``. Characters are indexed in sorted order, so the mapping is
        reproducible between runs. If no bigram exists at all, the matrix is
        all zeros instead of NaN.
    """
    # Collect the distinct lower-cased words of the corpus.
    words = set()
    for sentence in language_df["Text"].to_numpy():
        if not isinstance(sentence, str):
            # Missing text carries no information; skip instead of crashing on .split.
            continue
        for word in sentence.split(" "):
            words.add(word.lower())

    # Sorted alphabet -> deterministic row/column assignment.
    caracts = sorted({caract for word in words for caract in word})
    caract_index = {caract: i for i, caract in enumerate(caracts)}

    # Count adjacent-character transitions over the distinct words.
    counts = np.zeros((len(caracts), len(caracts)))
    n = 0
    for w in words:
        for first, second in zip(w, w[1:]):
            counts[caract_index[first], caract_index[second]] += 1
            n += 1

    if n == 0:
        # No transition observed (empty corpus or only one-character words):
        # dividing by zero would fill the matrix with NaN.
        return counts, caract_index

    return counts / n, caract_index

In [5]:
def evaluate(proba_matrix, caract_index, sentence):
    """Score a sentence against one language's character-transition matrix.

    The sentence is split on single spaces and lower-cased; for every pair of
    adjacent characters inside a word, the matching matrix entry is added to
    the score. Pairs containing a character absent from ``caract_index`` add
    nothing.

    Returns the accumulated score (``0`` when no known pair is found).
    """
    total = 0
    for token in sentence.split(" "):
        chars = token.replace(" ", "").lower()

        # Walk over consecutive character pairs of the word.
        for left, right in zip(chars, chars[1:]):
            if left not in caract_index or right not in caract_index:
                continue
            total += proba_matrix[caract_index[left], caract_index[right]]

    return total

## Test this solution with two languages

In [6]:
from sklearn.model_selection import train_test_split

In [7]:
""" Split train and test data for fr and en + find proba matrix """
# French: hold out 10% of the rows (fixed seed) and fit the matrix on the rest.
french_text = data.loc[data["Language"] == "French"]
fr_train, fr_test = train_test_split(french_text, test_size=0.1, random_state=0)
french_matrix, french_index = proba_matrix(fr_train)

# English: same split parameters and fitting procedure.
english_text = data.loc[data["Language"] == "English"]
en_train, en_test = train_test_split(english_text, test_size=0.1, random_state=0)
english_matrix, english_index = proba_matrix(en_train)

In [8]:
""" Test """
# Count correct / incorrect decisions of the two-language classifier on the
# held-out French and English sentences.
good = 0
bad = 0

# Each entry: (held-out frame, whether French is the expected winner).
for held_out, expect_french in ((fr_test, True), (en_test, False)):
    for sentence in held_out["Text"]:
        fr_score = evaluate(french_matrix, french_index, sentence)
        en_score = evaluate(english_matrix, english_index, sentence)
        # Strict comparison in both directions: a tie counts as a wrong prediction.
        if expect_french:
            correct = fr_score > en_score
        else:
            correct = fr_score < en_score
        if correct:
            good += 1
        else:
            bad += 1

print(f"N. good pred: {good}, N. bad pred: {bad}")
# The `%` format type multiplies by 100 itself; previously the raw fraction was
# printed followed by a literal percent sign.
print(f"Accuracy: {good / (good + bad):.2%}")

N. good pred: 336, N. bad bad: 12
Accurace: 0.9655172413793104%


## Now try it with all the languages

In [9]:
""" Create matrix for each language """
# Keep 10% of the labelled rows aside (fixed seed) to measure accuracy.
sub_train, sub_test = train_test_split(train, test_size=0.1, random_state=0)

# language -> (transition matrix, character index), fitted on the training part only.
matrix_lang_dict = {
    lang: proba_matrix(sub_train[sub_train['Language'] == lang])
    for lang in train.Language.unique()
}

In [None]:

good_pred = 0
bad_pred = 0

for sentence, language in zip(sub_test["Text"], sub_test["Language"]):
    # Predict: keep the language whose matrix gives the strictly highest score
    # (on ties, the language seen first wins).
    lang_pred = None
    lang_score = -1
    for lang, (matrix, index) in matrix_lang_dict.items():
        score = evaluate(matrix, index, sentence)
        if score > lang_score:
            lang_pred, lang_score = lang, score

    # Compare with the true label.
    if lang_pred == language:
        good_pred += 1
    else:
        bad_pred += 1

print(f"Accuracy: {good_pred/(good_pred + bad_pred)}")

## Submission with this model

In [19]:
""" Create matrix for each language """
# language -> (transition matrix, character index), this time fitted on every labelled row.
matrix_lang_dict = {
    lang: proba_matrix(train[train['Language'] == lang])
    for lang in train.Language.unique()
}

In [20]:
# First unlabeled rows (the ones that need a prediction).
test.head(5)

Unnamed: 0,Text,Language,ID
24326,[6] Encyclopædia Britannica Inc. şirketi ise a...,,7823
24327,disculparse.,,5551
24328,em bon jovi lançou o álbum have a nice day a ...,,12728
24329,mai fostul președinte egiptean mohamed morsi ...,,14197
24330,otok škulj är en ö i kroatien den ligger i län...,,18982


In [32]:
# Inspect the submission row of the first unlabeled sample. The row is looked up
# by the label of the first test row instead of a hard-coded position, so the
# check stays valid if the dataset size changes (the prediction loop also writes
# by label).
# NOTE(review): the saved output of this cell shows a placeholder string in
# `Language` that no visible cell assigns -- presumably leftover kernel state;
# re-run from a fresh kernel to confirm.
print(submission.loc[test.index[0]])

Text        [6] Encyclopædia Britannica Inc. şirketi ise a...
Language                                  We are on the moon!
ID                                                       7823
Name: 24326, dtype: object


In [35]:
for row_label, sentence in test["Text"].items():
    # Predict: keep the language whose matrix gives the strictly highest score
    # (on ties, the language seen first wins).
    lang_pred = None
    lang_score = -1
    for lang, (matrix, index) in matrix_lang_dict.items():
        score = evaluate(matrix, index, sentence)
        if score > lang_score:
            lang_pred, lang_score = lang, score

    # Store the prediction in the submission frame, addressed by row label.
    submission.at[row_label, "Language"] = lang_pred

In [36]:
# Write the completed submission to disk.
# NOTE(review): the DataFrame index is written as an extra leading column, in
# addition to the existing "Unnamed: 0" column -- confirm the expected
# submission format.
submission.to_csv(path_or_buf="/content/challenge-3-HALL-9000.csv")

In [37]:
# Sanity check: last rows of the submission with their predicted language.
submission.tail(5)

Unnamed: 0,Text,Language,ID
31007,그의 조선인 참정권 운동은 점차 조선인 지식인들 일각에서 호응을 얻기 시작하였다 민...,Korean,29033
31008,มีความยาวสูงสุดราว เซนติเมตร เป็นปลาที่พบกระจ...,Thai,15461
31009,เอเธนส์มีประชาชนชายเพียงแค่ คน แต่มีหลายพันที...,Thai,15441
31010,marty ally y buttons se encuentran en el camin...,Spanish,16450
31011,วัฏจักรเหล็กเป็นการหมุนเวียนของเหล็กในสิ่งแวดล...,Thai,19697
