<h1><B><U>Language Detection for Text</U></B></h1>
<h3>Mohamad Javad Sahebnasi</h3>


## #1: <ins>dataset</ins>

# **Solution #1:**

TF-IDF / CountVec + some models

## #2: <ins>pre-processing</ins>

## #3: <ins>preparing test data</ins>

## #4.0: <ins>my KNN implementation</ins>

## #4: <ins>model & prediction</ins>

## #5: <ins>output</ins>

# **Solution #2:**

TF-IDF + a new feature + SVM

In [None]:
!pip install bottleneck

In [None]:
import numpy as np
import pandas as pd

import gc
import time

# pre-process:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

# models:
from sklearn import linear_model
from sklearn import neighbors
from sklearn import svm

# KNN implementation
from numpy import linalg
import bottleneck as bn
from scipy import stats
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [None]:
# Language labels kept from the first dataset. Pashto is spelled "Pushto" here
# because this list is used to filter the raw labels (presumably the dataset's
# own spelling); the label is renamed to "Pashto" after the data is merged.
languages = ['Persian','Arabic','Pushto','English','German','French']

<h2><B>#1: <u>dataset</u></B></h2>
I'm using 2 datasets from:

https://www.kaggle.com/zarajamshaid/language-identification-datasst<br>
https://www.kaggle.com/basilb2s/language-detection

In [None]:
# Load the two training CSV files.
raw_train_1 = pd.read_csv('../input/language-identification-datasst/dataset.csv')
raw_train_2 = pd.read_csv('../input/language-detection/Language Detection.csv')

# Relabel the first frame's columns with the second frame's column names so
# both frames can be addressed by the same names ('Language', 'Text' below).
# Assumes both files have the same number of columns in the same order — TODO confirm.
raw_train_1.columns = raw_train_2.columns

In [None]:
# Training set = rows of the first dataset whose label is in `languages`,
# followed by the German rows of the second dataset, renumbered from 0.
train = pd.concat(
    [
        raw_train_1.loc[raw_train_1['Language'].isin(languages)],
        raw_train_2.loc[raw_train_2['Language'] == 'German'],
    ],
    ignore_index=True,
)

In [None]:
# Normalise the label spelling "Pushto" -> "Pashto".
# Only the 'Language' column is rewritten, so a text that happens to be exactly
# "Pushto" is left alone, and re-running the cell is a no-op.
train['Language'] = train['Language'].replace('Pushto', 'Pashto')

In [None]:
set(train['Language'])

# **Solution #1:**

## #2: <ins>pre-processing</ins>

In [None]:
def simple_preProcess(X, code):
    """Fit a text vectorizer on X and return the features plus the vocabulary.

    Parameters
    ----------
    X : iterable of str
        Documents the vectorizer is fitted on and that are transformed.
    code : str
        'tfidf' selects TfidfVectorizer, 'countVec' selects CountVectorizer
        (both with default settings).

    Returns
    -------
    tuple
        (df, feature_names): a sparse-backed DataFrame built from the
        vectorizer output, and the list of feature names reported by the
        fitted vectorizer.

    Raises
    ------
    ValueError
        If code is neither 'tfidf' nor 'countVec'.
    """
    if code == 'tfidf':
        vectorizer = TfidfVectorizer()
    elif code == 'countVec':
        vectorizer = CountVectorizer()
    else:
        raise ValueError('invalid value for code')

    df = pd.DataFrame.sparse.from_spmatrix(vectorizer.fit_transform(X))

    # get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
    # use its replacement when present and fall back on older installations.
    # The result is converted to a plain list so the return type stays the same.
    if hasattr(vectorizer, 'get_feature_names_out'):
        feature_names = vectorizer.get_feature_names_out().tolist()
    else:
        feature_names = vectorizer.get_feature_names()

    return df, feature_names

In [None]:
# Vectorise the training texts twice: once as TF-IDF weights, once as raw counts.
# `dictionary` is assigned by both calls, so the value from the second
# (CountVectorizer) call is the one kept; per the original author's note,
# both calls create/return the same dictionary.
X_tfidf, dictionary = simple_preProcess(train['Text'], 'tfidf')
X_countVec, dictionary = simple_preProcess(train['Text'], 'countVec')

In [None]:
Y = train['Language']

<h2><B>#3: <u>preparing test data</u></B></h2>


In [None]:
def simple_test_preprocess(T, dictionary, code, fit_corpus=None):
    """Vectorise texts T against a fixed vocabulary.

    Parameters
    ----------
    T : iterable of str
        Documents to transform.
    dictionary : iterable of str or mapping
        Vocabulary handed to the vectorizer, which fixes the output columns.
    code : str
        'tfidf' selects TfidfVectorizer, 'countVec' selects CountVectorizer.
    fit_corpus : iterable of str, optional
        Documents to fit the vectorizer on before transforming T. When omitted
        (the default, matching the previous behaviour) the vectorizer is fitted
        on T itself. For 'tfidf' this means the IDF weights are learned from T;
        pass the training texts here to weight T with the training IDF values
        instead.

    Returns
    -------
    DataFrame
        Sparse-backed DataFrame built from the vectorizer output for T.

    Raises
    ------
    ValueError
        If code is neither 'tfidf' nor 'countVec'.
    """
    if code == 'tfidf':
        vectorizer = TfidfVectorizer(vocabulary = dictionary)
    elif code == 'countVec':
        vectorizer = CountVectorizer(vocabulary = dictionary)
    else:
        raise ValueError('invalid value for code')

    if fit_corpus is None:
        # Previous behaviour: fit and transform on the same texts.
        features = vectorizer.fit_transform(T)
    else:
        # Learn the weights from the reference corpus, then only transform T.
        features = vectorizer.fit(fit_corpus).transform(T)

    return pd.DataFrame.sparse.from_spmatrix(features)

In [None]:
test = pd.read_csv('../input/final-test/task1-final-withoutlabel-HoseinGhanbari.csv')

In [None]:
test

In [None]:
# The test texts are read from the 'Id' column of the test file.
# NOTE(review): simple_test_preprocess fits a fresh vectorizer on the texts it
# is given, so the TF-IDF weights computed here come from the test texts, not
# from the training texts the models are fitted on — confirm this is intended.
tfidf_preprocessed_test = simple_test_preprocess(test['Id'], dictionary, 'tfidf')
countVec_preprocessed_test = simple_test_preprocess(test['Id'], dictionary, 'countVec')

<h2><B>#4.0: <u>my KNN implementation</u></B></h2>

In [None]:
def euclidean_distance(X1, X2):
    """Return the Euclidean distance between X1 and each row of X2.

    X1 is subtracted from X2 (NumPy broadcasting applies) and the L2 norm of
    the difference is taken along axis 1, giving one distance per row.
    """
    offsets = X2 - X1
    row_norms = linalg.norm(offsets, axis=1)
    return row_norms

In [None]:
def find_nearest_values(max_k, X_dataset, X_input):
    """Find, for every row of X_input, its nearest rows in X_dataset.

    Parameters
    ----------
    max_k : int
        Partition position: the max_k + 1 closest dataset rows are returned
        for each input row (fewer if the dataset has fewer rows).
    X_dataset : DataFrame or 2-D array-like
        Reference rows; converted to a NumPy array once.
    X_input : DataFrame
        Query rows (accessed with .iloc).

    Returns
    -------
    list of numpy.ndarray
        One array of positional row indices into X_dataset per query row,
        ordered from nearest to farthest, so any prefix [:k] holds the k
        nearest neighbours.

    Raises
    ------
    ValueError
        If X_dataset has no rows.
    """
    # Convert the dataset once instead of once per query row.
    dataset = np.array(X_dataset)
    samples_num = dataset.shape[0]
    if samples_num == 0:
        raise ValueError('X_dataset must contain at least one row')

    # argpartition requires kth to be a valid position in the distance vector.
    kth = min(max_k, samples_num - 1)

    result = []

    for test_index in range(len(X_input)):

        distances = euclidean_distance(np.array(X_input.iloc[test_index]), dataset)

        # The kth + 1 smallest distances, in no particular order ...
        candidates = bn.argpartition(distances, kth = kth)[:kth + 1]
        # ... then ordered nearest first, because callers slice prefixes of
        # this array to obtain smaller neighbourhoods.
        knn_indices = candidates[np.argsort(distances[candidates], kind='stable')]

        result.append(knn_indices)

    return result

In [None]:
# Upper bound on the neighbourhood size: my_KNN_validation tries every k from 3 up to max_k.
max_k = 6

In [None]:
def my_KNN_validation(X_train, X_validation, Y, y_validation):
    """Pick the k (3 .. max_k) with the best validation accuracy.

    For every validation row the neighbours are looked up once with
    find_nearest_values; for each k the first k neighbours of every row vote
    and the most common label wins. The first k entries of each neighbour
    list are used as the k-neighbourhood, so the quality of the result relies
    on the order in which find_nearest_values returns the neighbours.

    Parameters
    ----------
    X_train : DataFrame
        Reference rows the neighbours are searched in.
    X_validation : DataFrame
        Rows to classify.
    Y : array-like
        Labels of X_train, aligned by position: Y's i-th element is the label
        of X_train's i-th row.
    y_validation : array-like
        True labels of X_validation.

    Returns
    -------
    int
        The k with the highest accuracy (3 if no k scores above 0).
    """
    from collections import Counter

    strt_time = time.time()
    print('validating ...')
    knn_indeices = find_nearest_values(max_k, X_train, X_validation)
    print('validation time : ', time.time() - strt_time)

    # Neighbour indices are row positions, so labels are looked up by position.
    labels = np.asarray(Y)

    best_accuracy = 0
    best_k = 3
    for k in range(3, max_k+1):
        # Majority vote over the neighbours' labels (not over their indices).
        y_predict = [
            Counter(labels[np.asarray(neighbours[:k])].tolist()).most_common(1)[0][0]
            for neighbours in knn_indeices
        ]
        accuracy = accuracy_score(y_validation, y_predict)
        print('k: ', k, ' - accuracy: ', accuracy)

        if accuracy > best_accuracy:
            best_accuracy = accuracy
            best_k = k

    print('best k: ', best_k, best_accuracy)
    return best_k

In [None]:
def my_KNN_predict(X_train, X_test, Y, best_k):
    """Classify every row of X_test by majority vote of its best_k nearest rows.

    Parameters
    ----------
    X_train : DataFrame
        Reference rows the neighbours are searched in.
    X_test : DataFrame
        Rows to classify.
    Y : array-like
        Labels of X_train, aligned by position: Y's i-th element is the label
        of X_train's i-th row.
    best_k : int
        Number of neighbours that vote; must be at least 1.

    Returns
    -------
    pandas.Series
        One predicted label per row of X_test, in row order.

    Raises
    ------
    ValueError
        If best_k is smaller than 1.
    """
    from collections import Counter

    if best_k < 1:
        raise ValueError('best_k must be at least 1')

    # find_nearest_values(m, ...) returns the m + 1 closest rows, so asking for
    # best_k - 1 yields exactly the best_k nearest neighbours and the vote does
    # not depend on the order in which they are returned.
    knn_indeices = find_nearest_values(best_k - 1, X_train, X_test)

    # Neighbour indices are row positions, so labels are looked up by position.
    labels = np.asarray(Y)

    # Majority vote over the neighbours' labels (not over their indices).
    votes = [
        Counter(labels[np.asarray(neighbours[:best_k])].tolist()).most_common(1)[0][0]
        for neighbours in knn_indeices
    ]
    return pd.Series(votes, name=getattr(Y, 'name', None))

<h2><b>validation & finding the best k:</b></h2>

Warning: this step takes a long time to run. The two cells below are commented out; uncomment them to run the validation.

In [None]:
# X_new_train, X_validation, y_new_train, y_validation = train_test_split(X_tfidf, Y, test_size=0.10)

In [None]:
# best_k = my_KNN_validation(X_new_train, X_validation, Y, y_validation)

<h2><B>#4.1: <u>model</u></B></h2>


<h2><B>#4.1.1: <u>tfidf</u></B></h2>

In [None]:
# Classifiers for the TF-IDF features.
# random_state is fixed on the estimators that shuffle data internally, so
# re-running the notebook on the same data reproduces the same models.
SGD_model_tfidf = linear_model.SGDClassifier(random_state=42)
KNN_model_tfidf = neighbors.KNeighborsClassifier(n_neighbors=5)
linSVM_model_tfidf = svm.LinearSVC(C=0.1, random_state=42)

In [None]:
SGD_model_tfidf.fit(X_tfidf, Y)

In [None]:
KNN_model_tfidf.fit(X_tfidf, Y)

In [None]:
linSVM_model_tfidf.fit(X_tfidf, Y)

<h2><B>#4.1.2: <u>CountVec</u></B></h2>

In [None]:
# Classifiers for the raw-count features.
# random_state is fixed on the estimators that shuffle data internally, so
# re-running the notebook on the same data reproduces the same models.
SGD_model_countVec = linear_model.SGDClassifier(random_state=42)
KNN_model_countVec = neighbors.KNeighborsClassifier(n_neighbors=5)
linSVM_model_countVec = svm.LinearSVC(C=0.1, random_state=42)

In [None]:
SGD_model_countVec.fit(X_countVec, Y)

In [None]:
KNN_model_countVec.fit(X_countVec, Y)

In [None]:
linSVM_model_countVec.fit(X_countVec, Y)

<h2><B>#4.2: <u>prediction</u></B></h2>

<h2><B>#4.2.1: <u>tfidf</u></B></h2>

In [None]:
SGD_prediction_tfidf = SGD_model_tfidf.predict(tfidf_preprocessed_test)

In [None]:
KNN_prediction_tfidf = KNN_model_tfidf.predict(tfidf_preprocessed_test)

In [None]:
linSVM_prediction_tfidf = linSVM_model_tfidf.predict(tfidf_preprocessed_test)

In [None]:
my_KNN_prediction_tfidf = my_KNN_predict(X_tfidf, tfidf_preprocessed_test, Y, 3)

<h2><B>#4.2.2: <u>CountVec</u></B></h2>

In [None]:
SGD_prediction_countVec = SGD_model_countVec.predict(countVec_preprocessed_test)

In [None]:
KNN_prediction_countVec = KNN_model_countVec.predict(countVec_preprocessed_test)

In [None]:
linSVM_prediction_countVec = linSVM_model_countVec.predict(countVec_preprocessed_test)

<h2><B>#5: <u>output</u></B></h2>

<h2><B>#5.1: <u>tfidf</u></B></h2>

In [None]:
SGD_output_tfidf = pd.DataFrame({'Id' : test['Id'], 'Category' : SGD_prediction_tfidf})

In [None]:
SGD_output_tfidf

In [None]:
SGD_output_tfidf.to_csv('./SGD_output_tfidf.csv', index=False)

In [None]:
KNN_output_tfidf = pd.DataFrame({'Id' : test['Id'], 'Category' : KNN_prediction_tfidf})

In [None]:
KNN_output_tfidf.to_csv('./KNN_output_tfidf.csv', index=False)

In [None]:
linSVM_output_tfidf = pd.DataFrame({'Id' : test['Id'], 'Category' : linSVM_prediction_tfidf})

In [None]:
linSVM_output_tfidf.to_csv('./linSVM_output_tfidf.csv', index=False)

In [None]:
linSVM_output_tfidf

In [None]:
my_KNN_T_tfidf = pd.DataFrame({'Category' : my_KNN_prediction_tfidf})
my_KNN_T_tfidf = my_KNN_T_tfidf.reset_index(drop = True)

In [None]:
my_KNN_output_tfidf = pd.DataFrame({'Id' : test['Id'], 'Category' : my_KNN_T_tfidf['Category']})

In [None]:
my_KNN_output_tfidf.to_csv('./my_KNN_output_tfidf.csv', index=False)

In [None]:
my_KNN_output_tfidf

<h2><B>#5.2: <u>countVec</u></B></h2>

In [None]:
SGD_output_countVec = pd.DataFrame({'Id' : test['Id'], 'Category' : SGD_prediction_countVec})

In [None]:
SGD_output_countVec

In [None]:
SGD_output_countVec.to_csv('./SGD_output_countVec.csv', index=False)

In [None]:
KNN_output_countVec = pd.DataFrame({'Id' : test['Id'], 'Category' : KNN_prediction_countVec})

In [None]:
KNN_output_countVec.to_csv('./KNN_output_countVec.csv', index=False)

In [None]:
linSVM_output_countVec = pd.DataFrame({'Id' : test['Id'], 'Category' : linSVM_prediction_countVec})

In [None]:
linSVM_output_countVec.to_csv('./linSVM_output_countVec.csv', index=False)

<h1><B>Solution #2:</B></h1>

In [None]:
# Per-language sets of marker characters; specs_ratio looks for these
# characters inside each word of a text.
persian_specs = {'گ'}
pashto_specs = {'ټ', 'څ', 'ځ', 'ډ', 'ړ', 'ږ', 'ښ', 'ګ' , 'ڼ'}
french_specs = {'É' , 'À', 'È' , 'Ù', 'Â', 'Ê', 'Î', 'Ô', 'Û', 'Ë', 'Ï', 'Ç', 'é', 'à', 'è', 'ù', 'â', 'ê', 'î', 'ô', 'û', 'ë', 'ï', 'ç' }
german_specs = {'ß', 'ä', 'ö', 'ü', 'Ä', 'Ö', 'Ü'}

In [None]:
def specs_ratio(entry, specs):
    """Return the fraction of words in `entry` that contain a marker from `specs`.

    Parameters
    ----------
    entry : str
        Text to inspect; it is split on whitespace into words.
    specs : iterable of str
        Marker strings (single characters in this notebook). A word is a hit
        when at least one marker occurs in it.

    Returns
    -------
    float
        Number of hit words divided by the number of words, always within
        [0, 1]; 0.0 when `entry` contains no words.
    """
    words = entry.split()
    if not words:
        # Empty or whitespace-only text: no words, so no ratio to divide.
        return 0.0

    # Each word is counted at most once, however many markers it contains,
    # so the result cannot exceed 1.
    hits = sum(1 for word in words if any(letter in word for letter in specs))
    return hits / len(words)

In [None]:
# Compute the four marker-character ratios for every test text (the texts are
# in the 'Id' column) and record the start/end times of the computation.
t1 = time.time()
Pers = test['Id'].apply(specs_ratio, args=(persian_specs,))
Pash = test['Id'].apply(specs_ratio, args=(pashto_specs,))
Germ = test['Id'].apply(specs_ratio, args=(german_specs,))
Fren = test['Id'].apply(specs_ratio, args=(french_specs,))
t2 = time.time()

In [None]:
t2 -t1

In [None]:
new_preprocessed_test = tfidf_preprocessed_test.copy()

In [None]:
new_preprocessed_test['Pers'] = Pers
new_preprocessed_test['Pash'] = Pash
new_preprocessed_test['Germ'] = Germ
new_preprocessed_test['Fren'] = Fren

In [None]:
new_preprocessed_test

In [None]:
# Compute the four marker-character ratios for every training text and record
# the start/end times of the computation.
t1 = time.time()
Pers = train['Text'].apply(specs_ratio, args=(persian_specs,))
Pash = train['Text'].apply(specs_ratio, args=(pashto_specs,))
Germ = train['Text'].apply(specs_ratio, args=(german_specs,))
Fren = train['Text'].apply(specs_ratio, args=(french_specs,))
t2 = time.time()

In [None]:
t2 - t1

In [None]:
new_preprocessed_X = X_tfidf.copy()

In [None]:
new_preprocessed_X['Pers'] = Pers
new_preprocessed_X['Pash'] = Pash
new_preprocessed_X['Germ'] = Germ
new_preprocessed_X['Fren'] = Fren

In [None]:
new_preprocessed_X

In [None]:
# Linear SVM for the TF-IDF + marker-ratio features; random_state is fixed so
# re-running the notebook on the same data reproduces the same model.
model = svm.LinearSVC(C=0.1, random_state=42)

In [None]:
type(new_preprocessed_X)

In [None]:
model.fit(new_preprocessed_X, Y)

In [None]:
pred = model.predict(new_preprocessed_test)

In [None]:
sol2_output = pd.DataFrame({'Id' : test['Id'], 'Category' : pred})

In [None]:
sol2_output.to_csv('./sol2_output.csv', index=False)