# **<ins>Language Detection for Text</ins>**

## **table of contents:**
-------------------------------------------

## #1: <ins>dataset</ins>

# **Solution #1:**

TF-IDF / CountVec + some models

## #2: <ins>pre-processing</ins>

## #3: <ins>preparing test data</ins>

## #4.0: <ins>my KNN implementation</ins>

## #4: <ins>model & prediction</ins>

## #5: <ins>output</ins>

# **Solution #2:**

TF-IDF + a new feature + SVM

--------------------------------------------------------

In [1]:
!pip install bottleneck

Collecting bottleneck
  Downloading Bottleneck-1.3.2.tar.gz (88 kB)
[K     |████████████████████████████████| 88 kB 725 kB/s eta 0:00:01
[?25h  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h    Preparing wheel metadata ... [?25ldone
Building wheels for collected packages: bottleneck
  Building wheel for bottleneck (PEP 517) ... [?25ldone
[?25h  Created wheel for bottleneck: filename=Bottleneck-1.3.2-cp37-cp37m-linux_x86_64.whl size=334952 sha256=dd3429f5e991fde6be6d54e94d8426822ed0fe60dfd8c3d8fc808c49f2467249
  Stored in directory: /root/.cache/pip/wheels/87/85/9c/a325c89ff0498660ef8a335fb4b3912939c273ea4f094af29f
Successfully built bottleneck
Installing collected packages: bottleneck
Successfully installed bottleneck-1.3.2


In [2]:
import numpy as np
import pandas as pd

import gc
import time

# pre-process:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

# models:
from sklearn import linear_model
from sklearn import neighbors
from sklearn import svm

# KNN implementation
from numpy import linalg
import bottleneck as bn
from scipy import stats
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [3]:
# Labels to keep from the source dataset. The list uses the spelling "Pushto"
# (not "Pashto") for the row filter; the label is renamed to "Pashto" afterwards.
languages = ['Persian','Arabic','Pushto','English','German','French']

## **#1: <ins>dataset</ins>**
I'm using 2 datasets from:

https://www.kaggle.com/zarajamshaid/language-identification-datasst<br>
https://www.kaggle.com/basilb2s/language-detection

The first dataset does not contain German, so the German rows are taken from the second one.

In [4]:
# Read both Kaggle datasets from the mounted input directory.
raw_train_2 = pd.read_csv('../input/language-detection/Language Detection.csv')
raw_train_1 = pd.read_csv('../input/language-identification-datasst/dataset.csv')

# Give the first dataset the second dataset's column names so both frames share one schema.
raw_train_1.columns = raw_train_2.columns

In [5]:
# Target-language rows from the first dataset, plus the German rows of the second.
selected_from_first = raw_train_1.loc[raw_train_1['Language'].isin(languages)]
german_from_second = raw_train_2.loc[raw_train_2['Language'] == 'German']
train = pd.concat([selected_from_first, german_from_second]).reset_index(drop=True)

In [6]:
# Rename the "Pushto" label to "Pashto". Only the label column is rewritten,
# so text cells are never touched and re-running this cell is a no-op.
train['Language'] = train['Language'].replace('Pushto', 'Pashto')

In [7]:
set(train['Language'])

{'Arabic', 'English', 'French', 'German', 'Pashto', 'Persian'}

# **Solution #1:**

## **#2: <ins>pre-processing</ins>**

In [8]:
def simple_preProcess(X, code):
    """Fit a bag-of-words vectorizer on ``X`` and return the encoded documents.

    Parameters
    ----------
    X : iterable of str
        Raw documents the vectorizer is fitted on.
    code : str
        ``'tfidf'`` selects ``TfidfVectorizer``; ``'countVec'`` selects
        ``CountVectorizer``.

    Returns
    -------
    tuple of (pandas.DataFrame, list)
        A sparse-backed frame built from the vectorizer output (one row per
        document) and the vocabulary terms reported by the vectorizer.

    Raises
    ------
    ValueError
        If ``code`` is neither ``'tfidf'`` nor ``'countVec'``.
    """
    if code == 'tfidf':
        vectorizer = TfidfVectorizer()
    elif code == 'countVec':
        vectorizer = CountVectorizer()
    else:
        raise ValueError('invalid value for code')

    df = pd.DataFrame.sparse.from_spmatrix(vectorizer.fit_transform(X))

    # get_feature_names() was deprecated in scikit-learn 1.0 and removed in
    # 1.2. Prefer the replacement when it exists and keep returning a plain
    # list so callers see the same type on every version.
    if hasattr(vectorizer, 'get_feature_names_out'):
        feature_names = list(vectorizer.get_feature_names_out())
    else:
        feature_names = vectorizer.get_feature_names()

    return df, feature_names

In [9]:
# Both calls are fitted on the same texts and are expected to yield the same
# vocabulary, so the second assignment to `dictionary` just repeats the first.
X_tfidf, dictionary = simple_preProcess(train['Text'], code='tfidf')
X_countVec, dictionary = simple_preProcess(train['Text'], code='countVec')

In [10]:
Y = train['Language']

## **#3: <ins>preparing test data</ins>**

In [11]:
def simple_test_preprocess(T, dictionary, code, fitted_vectorizer=None):
    """Encode unseen documents over a fixed vocabulary.

    Parameters
    ----------
    T : iterable of str
        Documents to encode.
    dictionary : iterable of str or mapping
        Vocabulary handed to the vectorizer when a new one is built.
    code : str
        ``'tfidf'`` or ``'countVec'``.
    fitted_vectorizer : object with a ``transform`` method, optional
        A vectorizer that was already fitted on the training texts. When
        given, it is used as-is via ``transform`` and ``dictionary`` is not
        consulted.

    Returns
    -------
    pandas.DataFrame
        Sparse-backed frame with one row per document in ``T``.

    Raises
    ------
    ValueError
        If ``code`` is neither ``'tfidf'`` nor ``'countVec'``.

    Notes
    -----
    Without ``fitted_vectorizer`` a fresh vectorizer is fitted on ``T``. For
    ``'tfidf'`` this means the IDF weights are estimated from ``T`` itself
    rather than from the training corpus, so the resulting features are not
    weighted the same way as the training features. Pass the training
    vectorizer through ``fitted_vectorizer`` to avoid that.
    """
    if code not in ('tfidf', 'countVec'):
        raise ValueError('invalid value for code')

    if fitted_vectorizer is not None:
        # Re-use the statistics learned on the training data; never refit.
        matrix = fitted_vectorizer.transform(T)
    elif code == 'tfidf':
        matrix = TfidfVectorizer(vocabulary = dictionary).fit_transform(T)
    else:
        matrix = CountVectorizer(vocabulary = dictionary).fit_transform(T)

    return pd.DataFrame.sparse.from_spmatrix(matrix)

In [12]:
test = pd.read_csv('../input/final-test/task1-final-withoutlabel-HoseinGhanbari.csv')

In [13]:
test

Unnamed: 0,Id
0,"Das Deutsche ist ein plurizentrische Sprache, ..."
1,منذ دخوله لعالم صناعة الأفلام ساهم المخرج والم...
2,اللُّغَة العَرَبِيّة هي أكثر اللغات السامية تح...
3,امیدوار درباره این‌که آیا راننده تخلفی داشته ی...
4,ابوعلی حسین بن عبدالله بن حسن بن علی بن سینا، ...
...,...
95,The bills are rooted in concerns that I have l...
96,"Es gab Ärger wegen massiven Ruhestörungen, hin..."
97,شنت نجمة البوب ​​الأمريكية بريتني سبيرز هجوما ...
98,یادگیری ماشینی (به انگلیسی: Machine learning) ...


In [14]:
# Encode the test texts (held in the 'Id' column) over the training vocabulary.
tfidf_preprocessed_test = simple_test_preprocess(T=test['Id'], dictionary=dictionary, code='tfidf')
countVec_preprocessed_test = simple_test_preprocess(T=test['Id'], dictionary=dictionary, code='countVec')

## **#4.0: <ins>my KNN implementation</ins>**

In [15]:
def euclidean_distance(X1, X2):
    """Return the L2 norm of ``X2 - X1`` taken along axis 1 (one value per row)."""
    difference = X2 - X1
    row_norms = linalg.norm(difference, axis=1)
    return row_norms

In [16]:
def find_nearest_values(max_k, X_dataset, X_input):
    """Find, for every row of ``X_input``, its closest rows in ``X_dataset``.

    Parameters
    ----------
    max_k : int
        Partition position; up to ``max_k + 1`` neighbours are returned per
        query row. Must be non-negative.
    X_dataset : array-like with ``.shape``
        Reference rows; converted to a dense array with ``np.array``.
    X_input : frame-like supporting ``len()`` and ``.iloc``
        Query rows.

    Returns
    -------
    list of numpy.ndarray
        One array per query row holding row positions into ``X_dataset``,
        ordered from nearest to farthest. Each array has
        ``min(max_k + 1, number of dataset rows)`` entries.

    Raises
    ------
    ValueError
        If ``max_k`` is negative or ``X_dataset`` has no rows.
    """
    if max_k < 0:
        raise ValueError('max_k must be non-negative')

    samples_num = X_dataset.shape[0]
    if samples_num == 0:
        raise ValueError('X_dataset must contain at least one row')

    # The partition position must be a valid index into the distances vector.
    kth = min(max_k, samples_num - 1)

    # The reference matrix does not change between queries; convert it once.
    dataset = np.array(X_dataset)

    result = []

    for test_index in range(len(X_input)):

        distances = euclidean_distance(np.array(X_input.iloc[test_index]), dataset)

        # argpartition only guarantees that the kth + 1 smallest distances
        # occupy the first kth + 1 slots -- in arbitrary order. Callers slice
        # the result with [:k], so order the candidates by actual distance.
        candidates = bn.argpartition(distances, kth = kth)[:kth + 1]
        knn_indices = candidates[np.argsort(distances[candidates], kind='stable')]

        result.append(knn_indices)

    return result

In [17]:
max_k = 6

In [18]:
def my_KNN_validation(X_train, X_validation, Y, y_validation):
    """Pick the neighbour count (3 .. max_k) with the best validation accuracy.

    Parameters
    ----------
    X_train : frame-like
        Reference feature rows.
    X_validation : frame-like
        Held-out feature rows to classify.
    Y : sequence
        Labels of ``X_train``, aligned by position (``Y[i]`` labels row ``i``).
    y_validation : sequence
        True labels of ``X_validation``.

    Returns
    -------
    int
        The ``k`` with the highest accuracy; the smallest such ``k`` on ties.

    Raises
    ------
    ValueError
        If ``Y`` does not have exactly one label per row of ``X_train``.

    Notes
    -----
    Reads the module-level ``max_k`` and prints timing and per-k accuracy.
    """
    # Neighbour positions refer to rows of X_train, so labels are looked up
    # by position in a plain array rather than by pandas index label.
    train_labels = np.asarray(Y)
    if len(train_labels) != len(X_train):
        raise ValueError('Y must contain exactly one label per row of X_train')

    strt_time = time.time()
    print('validating ...')
    knn_indices = find_nearest_values(max_k, X_train, X_validation)
    print('validation time : ', time.time() - strt_time)

    def majority_label(neighbour_positions):
        # Most frequent label among the neighbours; on a tie the label that
        # sorts first wins (np.unique returns labels in sorted order).
        labels, counts = np.unique(train_labels[neighbour_positions], return_counts=True)
        return labels[np.argmax(counts)]

    best_accuracy = 0
    best_k = 3
    for k in range(3, max_k+1):
        # NOTE(review): neighbours[:k] are the k nearest only if
        # find_nearest_values returns positions ordered by distance -- verify.
        y_predict = [majority_label(neighbours[:k]) for neighbours in knn_indices]
        accuracy = accuracy_score(y_validation, y_predict)
        print('k: ', k, ' - accuracy: ', accuracy)

        if accuracy > best_accuracy:
            best_accuracy = accuracy
            best_k = k

    print('best k: ', best_k, best_accuracy)
    return best_k

In [19]:
def my_KNN_predict(X_train, X_test, Y, best_k):
    """Classify every row of ``X_test`` by majority vote of its nearest neighbours.

    Parameters
    ----------
    X_train : frame-like
        Reference feature rows.
    X_test : frame-like
        Rows to classify.
    Y : sequence
        Labels of ``X_train``, aligned by position (``Y[i]`` labels row ``i``).
    best_k : int
        Number of neighbours that vote; must be at least 1.

    Returns
    -------
    pandas.Series
        Predicted labels, one per row of ``X_test``, in row order.

    Raises
    ------
    ValueError
        If ``best_k`` is smaller than 1 or ``Y`` does not match ``X_train``.
    """
    if best_k < 1:
        raise ValueError('best_k must be at least 1')

    # Neighbour positions refer to rows of X_train, so labels are looked up
    # by position in a plain array rather than by pandas index label.
    train_labels = np.asarray(Y)
    if len(train_labels) != len(X_train):
        raise ValueError('Y must contain exactly one label per row of X_train')

    # Partitioning at best_k - 1 yields exactly the best_k closest rows, so
    # the vote below does not depend on the order they come back in.
    knn_indices = find_nearest_values(best_k - 1, X_train, X_test)

    predictions = []
    for neighbours in knn_indices:
        # Most frequent label wins; on a tie the label that sorts first wins.
        labels, counts = np.unique(train_labels[neighbours[:best_k]], return_counts=True)
        predictions.append(labels[np.argmax(counts)])

    return pd.Series(predictions, name=getattr(Y, 'name', None))

## **validation & finding the best k:**

This will take quite a long time ...

In [20]:
# X_new_train, X_validation, y_new_train, y_validation = train_test_split(X_tfidf, Y, test_size=0.10)

In [21]:
# best_k = my_KNN_validation(X_new_train, X_validation, Y, y_validation)

## **#4.1: <ins>model</ins>**

### **#4.1.1: <ins>tf-idf</ins>**

In [22]:
# Fixed seeds keep the stochastic solvers reproducible across kernel restarts.
SGD_model_tfidf = linear_model.SGDClassifier(random_state=42)
KNN_model_tfidf = neighbors.KNeighborsClassifier(n_neighbors=5)
linSVM_model_tfidf = svm.LinearSVC(C=0.1, random_state=42)

In [23]:
SGD_model_tfidf.fit(X_tfidf, Y)

SGDClassifier()

In [24]:
KNN_model_tfidf.fit(X_tfidf, Y)

KNeighborsClassifier()

In [25]:
linSVM_model_tfidf.fit(X_tfidf, Y)

LinearSVC(C=0.1)

### **#4.1.2: <ins>CountVec</ins>**

In [26]:
# Fixed seeds keep the stochastic solvers reproducible across kernel restarts.
SGD_model_countVec = linear_model.SGDClassifier(random_state=42)
KNN_model_countVec = neighbors.KNeighborsClassifier(n_neighbors=5)
linSVM_model_countVec = svm.LinearSVC(C=0.1, random_state=42)

In [27]:
SGD_model_countVec.fit(X_countVec, Y)

SGDClassifier()

In [28]:
KNN_model_countVec.fit(X_countVec, Y)

KNeighborsClassifier()

In [29]:
linSVM_model_countVec.fit(X_countVec, Y)

LinearSVC(C=0.1)

## **#4.2: <ins>prediction</ins>**

### **#4.2.1: <ins>tf-idf</ins>**

In [30]:
SGD_prediction_tfidf = SGD_model_tfidf.predict(tfidf_preprocessed_test)

In [31]:
KNN_prediction_tfidf = KNN_model_tfidf.predict(tfidf_preprocessed_test)

In [32]:
linSVM_prediction_tfidf = linSVM_model_tfidf.predict(tfidf_preprocessed_test)

This will take a long time:

In [None]:
# my_KNN_prediction_tfidf = my_KNN_predict(X_tfidf, tfidf_preprocessed_test, Y, 3)

### **#4.2.2: <ins>CountVec</ins>**

In [34]:
SGD_prediction_countVec = SGD_model_countVec.predict(countVec_preprocessed_test)

In [35]:
KNN_prediction_countVec = KNN_model_countVec.predict(countVec_preprocessed_test)

In [36]:
linSVM_prediction_countVec = linSVM_model_countVec.predict(countVec_preprocessed_test)

## **#5: <ins>output</ins>**

### **#5.1: <ins>tf-idf</ins>**

In [37]:
SGD_output_tfidf = pd.DataFrame({'Id' : test['Id'], 'Category' : SGD_prediction_tfidf})

In [38]:
SGD_output_tfidf

Unnamed: 0,Id,Category
0,"Das Deutsche ist ein plurizentrische Sprache, ...",German
1,منذ دخوله لعالم صناعة الأفلام ساهم المخرج والم...,Arabic
2,اللُّغَة العَرَبِيّة هي أكثر اللغات السامية تح...,Arabic
3,امیدوار درباره این‌که آیا راننده تخلفی داشته ی...,Persian
4,ابوعلی حسین بن عبدالله بن حسن بن علی بن سینا، ...,Persian
...,...,...
95,The bills are rooted in concerns that I have l...,English
96,"Es gab Ärger wegen massiven Ruhestörungen, hin...",German
97,شنت نجمة البوب ​​الأمريكية بريتني سبيرز هجوما ...,Arabic
98,یادگیری ماشینی (به انگلیسی: Machine learning) ...,Persian


In [39]:
SGD_output_tfidf.to_csv('./SGD_output_tfidf.csv', index=False)

In [40]:
KNN_output_tfidf = pd.DataFrame({'Id' : test['Id'], 'Category' : KNN_prediction_tfidf})

In [41]:
KNN_output_tfidf.to_csv('./KNN_output_tfidf.csv', index=False)

In [42]:
linSVM_output_tfidf = pd.DataFrame({'Id' : test['Id'], 'Category' : linSVM_prediction_tfidf})

In [43]:
linSVM_output_tfidf.to_csv('./linSVM_output_tfidf.csv', index=False)

In [44]:
linSVM_output_tfidf

Unnamed: 0,Id,Category
0,"Das Deutsche ist ein plurizentrische Sprache, ...",German
1,منذ دخوله لعالم صناعة الأفلام ساهم المخرج والم...,Arabic
2,اللُّغَة العَرَبِيّة هي أكثر اللغات السامية تح...,Arabic
3,امیدوار درباره این‌که آیا راننده تخلفی داشته ی...,Persian
4,ابوعلی حسین بن عبدالله بن حسن بن علی بن سینا، ...,Persian
...,...,...
95,The bills are rooted in concerns that I have l...,English
96,"Es gab Ärger wegen massiven Ruhestörungen, hin...",German
97,شنت نجمة البوب ​​الأمريكية بريتني سبيرز هجوما ...,Arabic
98,یادگیری ماشینی (به انگلیسی: Machine learning) ...,Persian


In [None]:
# my_KNN_T_tfidf = pd.DataFrame({'Category' : my_KNN_prediction_tfidf})
# my_KNN_T_tfidf = my_KNN_T_tfidf.reset_index(drop = True)

In [None]:
# my_KNN_output_tfidf = pd.DataFrame({'Id' : test['Id'], 'Category' : my_KNN_T_tfidf['Category']})

In [None]:
# my_KNN_output_tfidf.to_csv('./my_KNN_output_tfidf.csv', index=False)

In [None]:
# my_KNN_output_tfidf

### **#5.2: <ins>CountVec</ins>**

In [45]:
SGD_output_countVec = pd.DataFrame({'Id' : test['Id'], 'Category' : SGD_prediction_countVec})

In [46]:
SGD_output_countVec

Unnamed: 0,Id,Category
0,"Das Deutsche ist ein plurizentrische Sprache, ...",German
1,منذ دخوله لعالم صناعة الأفلام ساهم المخرج والم...,Arabic
2,اللُّغَة العَرَبِيّة هي أكثر اللغات السامية تح...,Arabic
3,امیدوار درباره این‌که آیا راننده تخلفی داشته ی...,Persian
4,ابوعلی حسین بن عبدالله بن حسن بن علی بن سینا، ...,Persian
...,...,...
95,The bills are rooted in concerns that I have l...,English
96,"Es gab Ärger wegen massiven Ruhestörungen, hin...",German
97,شنت نجمة البوب ​​الأمريكية بريتني سبيرز هجوما ...,Arabic
98,یادگیری ماشینی (به انگلیسی: Machine learning) ...,Persian


In [47]:
SGD_output_countVec.to_csv('./SGD_output_countVec.csv', index=False)

In [48]:
KNN_output_countVec = pd.DataFrame({'Id' : test['Id'], 'Category' : KNN_prediction_countVec})

In [49]:
KNN_output_countVec.to_csv('./KNN_output_countVec.csv', index=False)

In [50]:
linSVM_output_countVec = pd.DataFrame({'Id' : test['Id'], 'Category' : linSVM_prediction_countVec})

In [51]:
linSVM_output_countVec.to_csv('./linSVM_output_countVec.csv', index=False)

# **Solution #2:**
Same as Solution #1, with one extra kind of feature added (computed separately for Persian, Pashto, German and French):

[# words containing special characters of a language] / [# words of the text]

In [52]:
# Marker letters per language, passed to specs_ratio() to build one feature
# column per language.
# NOTE(review): presumably chosen as letters that do not occur in the other
# target languages -- verify (e.g. only one Persian letter is listed).
persian_specs = {'گ'}
pashto_specs = {'ټ', 'څ', 'ځ', 'ډ', 'ړ', 'ږ', 'ښ', 'ګ' , 'ڼ'}
french_specs = {'É' , 'À', 'È' , 'Ù', 'Â', 'Ê', 'Î', 'Ô', 'Û', 'Ë', 'Ï', 'Ç', 'é', 'à', 'è', 'ù', 'â', 'ê', 'î', 'ô', 'û', 'ë', 'ï', 'ç' }
german_specs = {'ß', 'ä', 'ö', 'ü', 'Ä', 'Ö', 'Ü'}

In [53]:
def specs_ratio(entry, specs):
    """Return the share of words in ``entry`` that contain a marker from ``specs``.

    Parameters
    ----------
    entry : str
        Text to inspect; it is split on whitespace into words.
    specs : iterable of str
        Marker strings (typically single letters) to look for inside words.

    Returns
    -------
    float
        ``(# words containing at least one marker) / (# words)``, always in
        ``[0, 1]``. ``0.0`` when ``entry`` contains no words.
    """
    words = entry.split()
    if not words:
        # Empty or whitespace-only text: avoid dividing by zero.
        return 0.0

    # Count each word once, however many different markers it contains.
    marked = sum(1 for word in words if any(letter in word for letter in specs))
    return marked / len(words)

In [54]:
# Per-language marker ratios for every test text (timed).
t1 = time.time()
Pers = test['Id'].apply(specs_ratio, specs=persian_specs)
Pash = test['Id'].apply(specs_ratio, specs=pashto_specs)
Germ = test['Id'].apply(specs_ratio, specs=german_specs)
Fren = test['Id'].apply(specs_ratio, specs=french_specs)
t2 = time.time()

In [55]:
t2 -t1

0.030729055404663086

In [56]:
new_preprocessed_test = tfidf_preprocessed_test.copy()

In [57]:
# Append the four ratio columns after the TF-IDF columns.
# NOTE(review): this mixes integer and string column labels in one frame;
# newer scikit-learn releases appear to reject mixed-type column names --
# confirm against the installed version.
for column_name, ratios in (('Pers', Pers), ('Pash', Pash), ('Germ', Germ), ('Fren', Fren)):
    new_preprocessed_test[column_name] = ratios

In [58]:
new_preprocessed_test

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,70255,70256,70257,70258,70259,70260,Pers,Pash,Germ,Fren
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.193548,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.043478,0.0,0.000000,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.019231,0.0,0.000000,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0
96,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.112676,0.0
97,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0
98,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.108434,0.0,0.000000,0.0


In [59]:
# Per-language marker ratios for every training text (timed).
t1 = time.time()
Pers = train['Text'].apply(specs_ratio, specs=persian_specs)
Pash = train['Text'].apply(specs_ratio, specs=pashto_specs)
Germ = train['Text'].apply(specs_ratio, specs=german_specs)
Fren = train['Text'].apply(specs_ratio, specs=french_specs)
t2 = time.time()

In [60]:
t2 - t1

1.0108623504638672

In [61]:
new_preprocessed_X = X_tfidf.copy()

In [62]:
# Append the four ratio columns after the TF-IDF columns.
# NOTE(review): this mixes integer and string column labels in one frame;
# newer scikit-learn releases appear to reject mixed-type column names --
# confirm against the installed version.
for column_name, ratios in (('Pers', Pers), ('Pash', Pash), ('Germ', Germ), ('Fren', Fren)):
    new_preprocessed_X[column_name] = ratios

In [63]:
new_preprocessed_X

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,70255,70256,70257,70258,70259,70260,Pers,Pash,Germ,Fren
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.105263
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.260870
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.144578
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.259259
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.026549,0.221239,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5465,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.058824,0.000000
5466,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.095238,0.000000
5467,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.083333,0.000000
5468,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.105263,0.000000


In [64]:
# Fixed seed keeps the solver reproducible across kernel restarts.
model = svm.LinearSVC(C=0.1, random_state=42)

In [65]:
type(new_preprocessed_X)

pandas.core.frame.DataFrame

In [66]:
model.fit(new_preprocessed_X, Y)

  "pandas.DataFrame with sparse columns found."


LinearSVC(C=0.1)

In [67]:
pred = model.predict(new_preprocessed_test)

  "pandas.DataFrame with sparse columns found."


In [68]:
sol2_output = pd.DataFrame({'Id' : test['Id'], 'Category' : pred})

In [69]:
sol2_output.to_csv('./sol2_output.csv', index=False)