In [1]:
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.svm import LinearSVC
import pickle

In [2]:
# Read the corpus: a semicolon-delimited CSV holding one column of English
# texts and one column of French texts.
df = pd.read_csv("../data/fr_eng.csv", sep = ';', index_col = False)
# Number of rows that contain no missing value.
print(df.dropna().shape[0])
df.head()

25000


Unnamed: 0,english,french
0,"this film was probably the best ""scary film"" i...",THE RINGMASTER interprète Jerry Springer comme...
1,Only if you are crazy about Amber Smith should...,J'ai 14 ans et j'adore ce dessin animé. Burt R...
2,Every Saturday morning at 11 a.m. I watched Su...,"Quand j'ai commencé à regarder ""Fay Grim"", je ..."
3,The four LA cops in fedoras driving around in ...,"""Masters of Horror"" s'est avéré être une médio..."
4,Comparing this movie to anything by Almodovar ...,C'est un grand film britannique. Un script hab...


In [3]:
# French half of the training data: the texts go into a 'text' column and
# every row receives the numeric language label 0.
fr = pd.DataFrame({'text': df['french']})
fr['lang'] = 0
print(fr.shape[0])
fr.head()

25000


Unnamed: 0,text,lang
0,THE RINGMASTER interprète Jerry Springer comme...,0
1,J'ai 14 ans et j'adore ce dessin animé. Burt R...,0
2,"Quand j'ai commencé à regarder ""Fay Grim"", je ...",0
3,"""Masters of Horror"" s'est avéré être une médio...",0
4,C'est un grand film britannique. Un script hab...,0


In [4]:
# English half of the training data: the texts go into a 'text' column and
# every row receives the numeric language label 1.
eng = df[['english']].copy()
eng.columns = ['text']
eng['lang'] = 1
print(eng.shape[0])
eng.head()

25000


Unnamed: 0,text,lang
0,"this film was probably the best ""scary film"" i...",1
1,Only if you are crazy about Amber Smith should...,1
2,Every Saturday morning at 11 a.m. I watched Su...,1
3,The four LA cops in fedoras driving around in ...,1
4,Comparing this movie to anything by Almodovar ...,1


In [5]:
# Stack the French and English rows into one frame and shuffle them.
# The seed is fixed so the row order -- and therefore the later unshuffled
# train/test split -- is the same on every run.
SHUFFLE_SEED = 42
# Rebinding `df` already releases the raw frame, and `fr` / `eng` are kept
# so this cell can be executed again without re-running the previous ones.
df = pd.concat([fr, eng]).sample(frac = 1, random_state = SHUFFLE_SEED)
print(len(df))
df.head()

50000


Unnamed: 0,text,lang
13406,"À l'exception du son, aucun des commentaires c...",0
21555,Fame was released in the U.S. a year before I ...,1
22556,This movie is about 3 stories put together rev...,1
15543,"Oui, même en tant que fan de spectacles comme ...",0
12764,"Comme la plupart des gens, j'ai été intrigué q...",0


In [6]:
# Split texts and labels into a training part and a held-out part.
# shuffle = False keeps the current row order: the frame was already
# shuffled when the two languages were concatenated.
x_train, x_true, y_train, y_true = train_test_split(df["text"], df['lang'], shuffle = False)

# Classifier pipeline: token counts -> TF-IDF weighting -> linear SVM.
# random_state pins the solver's internal data shuffling so that repeated
# runs fit the same model.
model = Pipeline([('vect', CountVectorizer()),
                 ('tfidf', TfidfTransformer()),
                 ('clf-svm', LinearSVC(random_state = 0))])

# Train on the training part only.
model.fit(x_train, y_train)

# Evaluate on the held-out part (rows not used for fitting).
pred = model.predict(x_true)
accuracy = np.mean(pred == y_true) * 100
# Format numerically (rounded to two decimals) rather than slicing the
# string representation, which truncates and mangles scientific notation.
print("ratio de similitude: {:.2f}%".format(accuracy))

ratio de similitude: 100.0%


In [7]:
# Read a separate sample to try the trained model on; only the 'resume'
# text column is loaded.
test = pd.read_csv("../data/artificial_intelligence_sample.csv", usecols = ['resume'], index_col = False)
print(test.shape[0])
test.head()

210


Unnamed: 0,resume
0,Artificial Intelligence est une série de huit ...
1,Artificial intelligence is intelligence demons...
2,Why do we need research to ensure that artific...
3,Artificial intelligence (AI) is an area of com...
4,Artificial intelligence is the simulation of h...


In [8]:
def get_language(x):
    """Translate a predicted class label into a language name.

    Returns "english" when ``x`` equals 1 and "french" for any other value.
    """
    return "english" if x == 1 else "french"

In [9]:
# Run the trained model on the new texts and store the language name
# that corresponds to each predicted label.
pred = model.predict(test['resume'])
test['language'] = [get_language(label) for label in pred]

In [10]:
# Results: the rows predicted as French.
fr = test.loc[test['language'].eq("french")].copy()
print(fr.shape[0])
fr.head(10)

21


Unnamed: 0,resume,language
0,Artificial Intelligence est une série de huit ...,french
22,"Séquence intitulée 'Artificial intellligence',...",french
64,Français. Résumé. Introduction aux techniques ...,french
76,artificial intelligence - Traduction Anglais-F...,french
93,20 févr. 2018 - Symposium de l'Académie des sc...,french
119,A.I. Intelligence artificielle est un film réa...,french
135,Livraison en 1 jour ouvré gratuite possible po...,french
138,10 mai 2018 - Découvrez l'avis des employés co...,french
139,"Tous les articles avec le sujet ""artificial in...",french
142,Artificial Intelligence and Dynamic Systems fo...,french


In [11]:
# Results: the rows predicted as English.
eng = test.loc[lambda frame: frame['language'] == "english"].copy()
print(eng.shape[0])
eng.head(10)

189


Unnamed: 0,resume,language
1,Artificial intelligence is intelligence demons...,english
2,Why do we need research to ensure that artific...,english
3,Artificial intelligence (AI) is an area of com...,english
4,Artificial intelligence is the simulation of h...,english
5,Will artificial intelligence give us human-lik...,english
6,"Current AI is impressive, but it's not intelli...",english
7,"Learn artificial intelligence basics, includin...",english
8,"Artificial Intelligence, which commenced publi...",english
9,artificial intelligence \ˌɑɹ.tɪ.ˌfɪʃ.əl ɪn.ˈtɛ...,english
10,Free online course in AI from Columbia Univers...,english


In [None]:
# Serialize the fitted pipeline to disk.
with open("../data/language_model.pickle", "wb") as model_file:
    pickle.dump(model, model_file)