In [1]:
import pandas as pd
from pyspark.sql import SparkSession
spark = SparkSession.builder.getOrCreate()

In [2]:
from pyspark.sql.functions import *
from pyspark.sql.types import *
import numpy as np
from pyspark.sql.functions import col
from pyspark.sql.functions import lit

In [3]:
from pyspark.sql.types import IntegerType
from pyspark.ml.feature import *

In [4]:
from pyspark.ml.feature import StopWordsRemover
from pyspark.ml.classification import *
from pyspark.ml.evaluation import *

In [5]:
from pyspark.ml import Pipeline

In [6]:
dataset1 =spark.read.format('csv').options(header=True, inferSchema=True).load("dataset.csv")

In [7]:
df_en =dataset1.select("English")
df_en=df_en.dropna()

In [8]:
df_fr =dataset1.select("French")
df_fr=df_fr.dropna()

In [9]:
data_eng = df_en.select(col("English").alias("text"))
data_fr = df_fr.select(col("French").alias("text"))

In [10]:
df_en= data_eng.withColumn("label", lit(1))
df_fr = data_fr.withColumn("label", lit(0))

In [11]:
#on tokenise les deux ensembles
tokenizer_df_en = Tokenizer(inputCol="text", outputCol="text_token") 
token_df_en=tokenizer_df_en.transform(df_en)

In [12]:
tokenizer_df_fr = Tokenizer(inputCol="text", outputCol="text_token") 
token_df_fr=tokenizer_df_fr.transform(df_fr)

In [13]:
import nltk.corpus 
from nltk.corpus import stopwords
nltk.download('stopwords')
stopword_en= stopwords.words('english')

[nltk_data] Downloading package stopwords to /home/jovyan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [14]:
nltk.download('stopwords')
stopword_fr= stopwords.words('french')

[nltk_data] Downloading package stopwords to /home/jovyan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [15]:
cleaned_df_en=StopWordsRemover(stopWords=stopword_en, inputCol="text_token", outputCol="text_cleaned")
df_en_cleaned=cleaned_df_en.transform(token_df_en)

In [16]:
cleaned_df_fr=StopWordsRemover(stopWords=stopword_fr,inputCol="text_token", outputCol="text_cleaned")
df_fr_cleaned=cleaned_df_fr.transform(token_df_fr)

In [17]:
#on divise le dataframe francais en ensemble de 
train_fr,test_fr = df_fr_cleaned.randomSplit([0.8,0.2])
train_en,test_en = df_en_cleaned.randomSplit([0.8,0.2])

In [18]:
#concatener les 2 dataframes d'entrainement
train= train_fr.union(train_en)

In [19]:
#concatener les 2 dataframes de test
test= test_fr.union(test_en)

In [20]:
train = train.withColumn("label",train["label"].cast(IntegerType()))
test = test.withColumn("label",test["label"].cast(IntegerType()))

In [21]:
ngram=NGram(n=2, inputCol="text_cleaned", outputCol="text_ngram")
ngram_train=ngram.transform(train)

In [22]:
ngram=NGram(n=2, inputCol="text_cleaned", outputCol="text_ngram")
ngram_test=ngram.transform(test)

In [23]:
hashingtf_train=HashingTF(inputCol="text_ngram", outputCol="text_hashing")
hashingtf_train.setNumFeatures(10000)
train_hash=hashingtf_train.transform(ngram_train)

In [24]:
hashingtf_test=HashingTF(inputCol="text_ngram", outputCol="text_hashing")
hashingtf_test.setNumFeatures(10000)
test_hash=hashingtf_test.transform(ngram_test)

In [25]:
idf_train = IDF(minDocFreq = 3, inputCol="text_hashing", outputCol="text_idf")
Model_train= idf_train.fit(train_hash)
train_idf = Model_train.transform(train_hash)

In [26]:
idf_test = IDF(minDocFreq = 3, inputCol="text_hashing", outputCol="text_idf")
Model_test = idf_test.fit(test_hash)
test_idf = Model_test.transform(test_hash)

In [27]:
train_final=train_idf.select("label","text_idf")
test_final=test_idf.select("label","text_idf")

In [28]:
lr = LogisticRegression(featuresCol="text_idf", labelCol="label")

In [29]:
training=lr.fit(train_final)

In [30]:
model_result=training.transform(test_final)

In [31]:
evaluator=BinaryClassificationEvaluator(rawPredictionCol="rawPrediction",labelCol="label", metricName="areaUnderPR")

In [32]:
evaluator.evaluate(model_result)

0.8481775618172201

In [37]:
model_result.groupby('rawPrediction').count().collect()

[Row(rawPrediction=DenseVector([1.164, -1.164]), count=1),
 Row(rawPrediction=DenseVector([-0.531, 0.531]), count=1),
 Row(rawPrediction=DenseVector([-0.4272, 0.4272]), count=1),
 Row(rawPrediction=DenseVector([-1.0806, 1.0806]), count=1),
 Row(rawPrediction=DenseVector([-0.7222, 0.7222]), count=1),
 Row(rawPrediction=DenseVector([3.9618, -3.9618]), count=1),
 Row(rawPrediction=DenseVector([2.3382, -2.3382]), count=1),
 Row(rawPrediction=DenseVector([0.6853, -0.6853]), count=1),
 Row(rawPrediction=DenseVector([2.1922, -2.1922]), count=1),
 Row(rawPrediction=DenseVector([-0.5199, 0.5199]), count=1),
 Row(rawPrediction=DenseVector([-0.9744, 0.9744]), count=1),
 Row(rawPrediction=DenseVector([8.3248, -8.3248]), count=1),
 Row(rawPrediction=DenseVector([3.9352, -3.9352]), count=1),
 Row(rawPrediction=DenseVector([-0.8477, 0.8477]), count=1),
 Row(rawPrediction=DenseVector([-0.0003, 0.0003]), count=1),
 Row(rawPrediction=DenseVector([-0.6526, 0.6526]), count=1),
 Row(rawPrediction=DenseVect

sauvegarder le modele

In [35]:
training.save("\model_lr")