# Détection de langue (fr,en)

In [1]:
import pandas as pd
from pyspark.sql import SparkSession
spark = SparkSession.builder.getOrCreate()

In [2]:
from pyspark.sql.functions import *
from pyspark.sql.types import *
import numpy as np
from pyspark.sql.functions import col
from pyspark.sql.functions import lit

In [3]:
from pyspark.sql.types import IntegerType
from pyspark.ml.feature import *

In [4]:
from pyspark.ml.feature import StopWordsRemover
from pyspark.ml.classification import *
from pyspark.ml.evaluation import *

In [5]:
#https://www.kaggle.com/apachefranklin/multilingual-translation-experimentation/data?select=english_french.csv
df = spark.read.format('csv').options(header=True, inferSchema=True).load("dataset.csv")

In [6]:
#créer un dataframe qui contient la colonne "English"
df_en =df.select("English")
df_en=df_en.dropna()

In [7]:
#créer un dataframe qui contient la colonne "French"
df_fr =df.select("French")
df_fr=df_fr.dropna()

In [8]:
#renommer les colenne English et French par text
data_eng = df_en.select(col("English").alias("text"))
data_fr = df_fr.select(col("French").alias("text"))

In [9]:
#ajouter une colonne label qui egale 1 (classe anglais)
df_en= data_eng.withColumn("label", lit(1))

In [10]:
#ajouter une colonne label qui egale 0 (classe français)
df_fr = data_fr.withColumn("label", lit(0))

In [11]:
#on tokenise les deux ensembles
tokenizer_df_en = Tokenizer(inputCol="text", outputCol="text_token") 
token_df_en=tokenizer_df_en.transform(df_en)

In [12]:
tokenizer_df_fr = Tokenizer(inputCol="text", outputCol="text_token") 
token_df_fr=tokenizer_df_fr.transform(df_fr)

In [13]:
#supprimer les stopwords
stop_word_fr=[".",","," ","a","à","â","abord","afin","ah","ai","aie","ainsi","allaient","allo","allô","allons","après","assez","attendu","au","aucun","aucune","aujourd","aujourd'hui","auquel","aura","auront","aussi","autre","autres","aux","auxquelles","auxquels","avaient","avais","avait","avant","avec","avoir","ayant","b","bah","beaucoup","bien","bigre","boum","bravo","brrr","c","ça","car","ce","ceci","cela","comme","oui","le","avez","celle","celle-ci","celle-là","celles","celles-ci","celles-là","celui","celui-ci","celui-là","cent","cependant","certain","certaine","certaines","certains","certes","ces","cet","cette","ceux","ceux-ci","ceux-là","chacun","chaque","cher","chère","chères","chers","chez","chiche","chut","ci","cinq","cinquantaine","cinquante","cinquantième","cinquième","clac","clic","combien","comme","comment","compris","concernant","contre","couic","crac","d","da","dans","de","debout","dedans","dehors","delà","depuis","derrière","des","dès","désormais","desquelles","desquels","dessous","dessus","deux","deuxième","deuxièmement","devant","devers","devra","différent","différente","différentes","différents","dire","divers","diverse","diverses","dix","dix-huit","dixième","dix-neuf","dix-sept","doit","doivent","donc","dont","douze","douzième","dring","du","duquel","durant","e","effet","eh","elle","elle-même","elles","elles-mêmes","en","encore","entre","envers","environ","es","ès","est","et","etant","étaient","étais","était","étant","etc","été","etre","être","eu","euh","eux","eux-mêmes","excepté","f","façon","fais","faisaient","faisant","fait","feront","fi","flac","floc","font","g","gens","h","ha","hé","hein","hélas","hem","hep","hi","ho","holà","hop","hormis","hors","hou","houp","hue","hui","huit","huitième","hum","hurrah","i","il","ils","importe","j","je","jusqu","jusque","k","l","la","là","laquelle","las","le","lequel","les","lès","lesquelles","lesquels","leur","leurs","longtemps","lorsque","lui","lui-même","m","ma","maint","mais","malgré","me","même","mêmes","merci","mes","mien","mienne","miennes","miens","mille","mince","moi","moi-même","moins","mon","moyennant","n","na","ne","néanmoins","neuf","neuvième","ni","nombreuses","nombreux","non","nos","notre","nôtre","nôtres","nous","nous-mêmes","nul","o","o|","ô","oh","ohé","olé","ollé","on","ont","onze","onzième","ore","ou","où","ouf","ouias","oust","ouste","outre","p","paf","pan","par","parmi","partant","particulier","particulière","particulièrement","pas","passé","pendant","personne","peu","peut","peuvent","peux","pff","pfft","pfut","pif","plein","plouf","plus","plusieurs","plutôt","pouah","pour","pourquoi","premier","première","premièrement","près","proche","psitt","puisque","q","qu","quand","quant","quanta","quant-à-soi","quarante","quatorze","quatre","quatre-vingt","quatrième","quatrièmement","que","quel","quelconque","quelle","quelles","quelque","quelques","quelqu'un","quels","qui","quiconque","quinze","quoi","quoique","r","revoici","revoilà","rien","s","sa","sacrebleu","sans","sapristi","sauf","se","seize","selon","sept","septième","sera","seront","ses","si","sien","sienne","siennes","siens","sinon","six","sixième","soi","soi-même","soit","soixante","son","sont","sous","stop","suis","suivant","sur","surtout","t","ta","tac","tant","te","té","tel","telle","tellement","telles","tels","tenant","tes","tic","tien","tienne","tiennes","tiens","toc","toi","toi-même","ton","touchant","toujours","tous","tout","toute","toutes","treize","trente","très","trois","troisième","troisièmement","trop","tsoin","tsouin","tu","u","un","une","unes","uns","v","va","vais","vas","vé","vers","via","vif","vifs","vingt","vivat","vive","vives","vlan","voici","voilà","vont","vos","votre","vôtre","vôtres","vous","vous-mêmes","vu","w","x","y","z","zut","alors","aucuns","bon","devrait","dos","droite","début","essai","faites","fois","force","haut","ici","juste","maintenant","mine","mot","nommés","nouveaux","parce","parole","personnes","pièce","plupart","seulement","soyez","sujet","tandis","valeur","voie","voient","état","étions"]

In [14]:
cleaned_df_en=StopWordsRemover(inputCol="text_token", outputCol="text_cleaned")
df_en_cleaned=cleaned_df_en.transform(token_df_en)

In [15]:
cleaned_df_fr=StopWordsRemover(stopWords=stop_word_fr,inputCol="text_token", outputCol="text_cleaned")
df_fr_cleaned=cleaned_df_fr.transform(token_df_fr)

In [38]:
df_fr_cleaned.show()

+--------------------+-----+--------------------+--------------------+
|                text|label|          text_token|        text_cleaned|
+--------------------+-----+--------------------+--------------------+
|Reprise de la ses...|    0|[reprise, de, la,...|  [reprise, session]|
|Je déclare repris...|    0|[je, déclare, rep...|[déclare, reprise...|
|"Comme vous avez ...|    0|["comme, vous, av...|["comme, pu, cons...|
|Vous avez souhait...|    0|[vous, avez, souh...|[souhaité, débat,...|
|En attendant, je ...|    0|[en, attendant,, ...|[attendant,, souh...|
|Je vous invite à ...|    0|[je, vous, invite...|[invite, lever, m...|
|(Le Parlement, de...|    0|[(le, parlement,,...|[(le, parlement,,...|
|Madame la Préside...|    0|[madame, la, prés...|[madame, présiden...|
|Vous avez probabl...|    0|[vous, avez, prob...|[probablement, ap...|
|L'une des personn...|    0|[l'une, des, pers...|[l'une, vient, d'...|
|Ne pensez-vous pa...|    0|[ne, pensez-vous,...|[pensez-vous, pas...|
|Oui, 

In [16]:
#on divise le dataframe francais en ensemble de 
train_fr,test_fr = df_fr_cleaned.randomSplit([0.8,0.2])
train_en,test_en = df_en_cleaned.randomSplit([0.8,0.2])

In [17]:
#concatener les 2 dataframes d'entrainement
train= train_fr.union(train_en)

In [18]:
#concatener les 2 dataframes de test
test= test_fr.union(test_en)

In [19]:
train = train.withColumn("label",train["label"].cast(IntegerType()))
test = test.withColumn("label",test["label"].cast(IntegerType()))

In [20]:
ngram=NGram(n=2, inputCol="text_cleaned", outputCol="text_ngram")
ngram_train=ngram.transform(train)

In [21]:
ngram=NGram(n=2, inputCol="text_cleaned", outputCol="text_ngram")
ngram_test=ngram.transform(test)

In [22]:

hashingtf_train=HashingTF(inputCol="text_ngram", outputCol="text_hashing")
hashingtf_train.setNumFeatures(10000)
train_hash=hashingtf_train.transform(ngram_train)

In [23]:
hashingtf_test=HashingTF(inputCol="text_ngram", outputCol="text_hashing")
hashingtf_test.setNumFeatures(10000)
test_hash=hashingtf_test.transform(ngram_test)

In [24]:
idf_train = IDF(minDocFreq = 3, inputCol="text_hashing", outputCol="text_idf")
Model_train= idf_train.fit(train_hash)
train_idf = Model_train.transform(train_hash)

In [25]:
idf_test = IDF(minDocFreq = 3, inputCol="text_hashing", outputCol="text_idf")
Model_test = idf_test.fit(test_hash)
test_idf = Model_test.transform(test_hash)

In [26]:
train_final=train_idf.select("label","text_idf")
test_final=test_idf.select("label","text_idf")

In [27]:
lr = LogisticRegression(featuresCol="text_idf", labelCol="label")

In [28]:
training=lr.fit(train_final)

In [29]:
model_result=training.transform(test_final)

In [30]:
evaluator=BinaryClassificationEvaluator(rawPredictionCol="rawPrediction",labelCol="label", metricName="areaUnderPR")

In [31]:
evaluator.evaluate(model_result)

0.8212704389201342