In [144]:
from pyspark.sql import SparkSession
from pyspark.sql.types import *
import pyspark.sql.functions as F

import nltk,string
from nltk import word_tokenize,PorterStemmer,LancasterStemmer,SnowballStemmer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import string
import pyspark.ml.feature  as feat
from pyspark.ml import Pipeline

In [145]:
schema = StructType([
StructField("sentences", StringType(),True),
StructField("sentiments", StringType(),True)
])

spark = SparkSession.builder.master("local[*]").appName("test").getOrCreate()
rdd = spark.sparkContext.textFile("data_file.txt")

In [146]:
rdd = rdd.map(lambda x:x.split("\t"))

In [147]:
df = rdd.toDF(['sentences','sentiments'])

In [148]:
df.toPandas().head(5)

Unnamed: 0,sentences,sentiments
0,: Jordan Bardella parle beaucoup de JP. Delevo...,1
1,": Pour Sandra Rigot, Maîtresse de conférences ...",1
2,: Réforme des retraites : la Macronie à son hy...,0
3,: DIRECT. Grève contre la réforme des retraite...,0
4,: Grève contre la réforme des retraites : les ...,0


In [149]:
@F.udf(StringType())
def process_text(text):    
    tokenizer = nltk.data.load('tokenizers/punkt/french.pickle')
    stemmer_snowball = SnowballStemmer('french')
    stopword = set(stopwords.words('french'))
    sentence = tokenizer.tokenize(text)
    sentence = " ".join([sent.lower() for sent in sentence])
    clean_words = [word for word in nltk.word_tokenize(sentence) if word not in stopword.union(string.punctuation)]
    clean_words = [stemmer_snowball.stem(lem) for lem in clean_words]
    clean_words = " ".join(clean_words)
    return clean_words


In [150]:
df = df.select("sentences","sentiments").withColumn("wordSentences",process_text("sentences"))\
.withColumn("label", F.col("sentiments").cast(DoubleType()))

In [151]:
df.toPandas().head(5)

Unnamed: 0,sentences,sentiments,wordSentences,label
0,: Jordan Bardella parle beaucoup de JP. Delevo...,1,jordan bardel parl beaucoup jp delevoy ’ aim ’...,1.0
1,": Pour Sandra Rigot, Maîtresse de conférences ...",1,sandr rigot maîtress conférent économ réform r...,1.0
2,: Réforme des retraites : la Macronie à son hy...,0,réform retrait macron hymn écout don `` voix '' …,0.0
3,: DIRECT. Grève contre la réforme des retraite...,0,direct grev contr réform retrait `` chacun pre...,0.0
4,: Grève contre la réforme des retraites : les ...,0,grev contr réform retrait transport encor pert...,0.0


In [152]:
df = df.select("sentences","wordSentences","label").withColumn("words",F.split(F.col("wordSentences"),' '))
df = df.select("sentences","words","label")
df.toPandas().head(5)

Unnamed: 0,sentences,words,label
0,: Jordan Bardella parle beaucoup de JP. Delevo...,"[jordan, bardel, parl, beaucoup, jp, delevoy, ...",1.0
1,": Pour Sandra Rigot, Maîtresse de conférences ...","[sandr, rigot, maîtress, conférent, économ, ré...",1.0
2,: Réforme des retraites : la Macronie à son hy...,"[réform, retrait, macron, hymn, écout, don, ``...",0.0
3,: DIRECT. Grève contre la réforme des retraite...,"[direct, grev, contr, réform, retrait, ``, cha...",0.0
4,: Grève contre la réforme des retraites : les ...,"[grev, contr, réform, retrait, transport, enco...",0.0


In [153]:
tf = feat.HashingTF(inputCol="words", outputCol="rawFeatures")
idf = feat.IDF(inputCol="rawFeatures",outputCol="features")
pipelineTFIDF = Pipeline(stages=[tf,idf])
pipelineFit = pipelineTFIDF.fit(df)
df = pipelineFit.transform(df)

df.show()


+--------------------+--------------------+-----+--------------------+--------------------+
|           sentences|               words|label|         rawFeatures|            features|
+--------------------+--------------------+-----+--------------------+--------------------+
|: Jordan Bardella...|[jordan, bardel, ...|  1.0|(262144,[12478,40...|(262144,[12478,40...|
|: Pour Sandra Rig...|[sandr, rigot, ma...|  1.0|(262144,[50339,54...|(262144,[50339,54...|
|: Réforme des ret...|[réform, retrait,...|  0.0|(262144,[27160,13...|(262144,[27160,13...|
|: DIRECT. Grève c...|[direct, grev, co...|  0.0|(262144,[2685,806...|(262144,[2685,806...|
|: Grève contre la...|[grev, contr, réf...|  0.0|(262144,[50826,66...|(262144,[50826,66...|
|: Le journal de q...|[journal, démont,...|  0.0|(262144,[49759,16...|(262144,[49759,16...|
|: Edouard Philipp...|[edouard, philipp...|  0.0|(262144,[28460,67...|(262144,[28460,67...|
|" Mardi, sortez !...|[``, mard, sort, ...|  0.0|(262144,[64353,13...|(262144,[6

In [154]:
from pyspark.ml.classification import LogisticRegression
logreg = LogisticRegression()

In [155]:
logregModel = logreg.fit(df)

In [156]:
#logregModel.save('model_nlp')