## Use scattertext

Voir issue : https://github.com/MRGotIdeas/twitter_fakenews/issues/6


In [2]:
import pandas as pd 
import pickle

import re
import spacy
import fr_core_news_md
import scattertext as sc

import seaborn as sns
import matplotlib.pyplot as plt

In [3]:
# Move into the data directory; the relative file paths used by the cells below resolve against it.
# The target is relative, so running this cell again in the same kernel moves two levels further up.
%cd ../../data

C:\Users\ManonRICHARD\Documents\PFE\twitter-fakenews\data


In [None]:
# Load the tweets frame. The file is a pickle despite its .txt extension.
# NOTE(review): pickle.load can execute arbitrary code; only load files from a trusted source.
with open("data_tweets.txt", "rb") as file :  
    data_tweets = pickle.load(file)

# Preview the first five rows
data_tweets.head(5)

In [None]:
# Count rows for each (liability code, liability label) pair
data_tweets.value_counts(subset=["liability", "liability_label"])

#### Création tag fake / pas fake 

In [None]:
# Binary reliability tag: liability codes 2 and 3 become "pas fiable", every other row "fiable"
data_tweets["fakenews"] = (
    data_tweets["liability"]
    .isin([2, 3])
    .map({True: "pas fiable", False: "fiable"})
)

In [None]:
# Exclude the rows labelled as parody sites
data_tweets = data_tweets[data_tweets["liability_label"].ne("site parodique")]

In [None]:
# (rows, columns) remaining after the parody-site filter
data_tweets.shape

## Nettoyage du texte



In [6]:
# Load spaCy's French model
nlp = fr_core_news_md.load()

# Disabled: extra stop words added on top of spaCy's default list.
# NOTE(review): the r"invité\w+" entry looks like a regex, but stop words are presumably
# matched as literal strings. Confirm before re-enabling.
#nlp.Defaults.stop_words |= {"mlp","a","faut","faire","monsieur","rendez-vous","direct",
#                            "interview",r"invité\w+","dit","livetweet","suivez","celui",
#                            "ce","cette","emot_right_arrow"}

# Number of stop words in the model defaults
len(nlp.Defaults.stop_words)

600

In [None]:
regexp_hashtags = re.compile(r"[#@]\w+")    # hashtags and @-mentions; preprocess_tweet replaces each match with "desident_hashtag"
regexp_link = re.compile(r"http\S+") # links; preprocess_tweet replaces each match with "desident_link"

In [None]:
def remove_punct(doc):
    """Return, in order, the tokens of ``doc`` whose ``is_punct`` flag is falsy."""
    kept = []
    for tok in doc:
        if tok.is_punct:
            continue
        kept.append(tok)
    return kept


def remove_stop_words(doc):
    """Return, in order, the tokens of ``doc`` whose ``is_stop`` flag is falsy."""
    return list(filter(lambda tok: not tok.is_stop, doc))


def lemmatize(doc):
    """Join the ``lemma_`` of every token in ``doc`` into one space-separated string."""
    lemmas = (tok.lemma_ for tok in doc)
    return ' '.join(lemmas)

In [None]:
def preprocess_tweet(text, lemmatizing=False, delete_pos=False) : 
    """Clean one tweet: lowercase it, mask links and hashtags/mentions, drop punctuation and stop words.

    Parameters
    ----------
    text : str
        Raw tweet text.
    lemmatizing : bool, default False
        When True, the remaining tokens are passed to ``lemmatize`` and returned
        as a single space-separated string of lemmas.
    delete_pos : bool, default False
        Currently unused; kept so existing calls keep working.

    Returns
    -------
    list or str
        With ``lemmatizing=False``: the list of remaining spaCy tokens (not a string;
        the caller joins them afterwards). With ``lemmatizing=True``: a string.
    """
    # Emoji conversion and French-flag tagging used to run here and are currently disabled.
    # The former encode/decode UTF-8 round trip was dropped: it did not change the text
    # and only raised on strings containing lone surrogates.
    lowered = text.lower()

    # Links are masked before hashtags/mentions so that a "#" or "@" inside a URL
    # is swallowed by the link placeholder instead of being masked separately.
    links_masked = regexp_link.sub("desident_link", lowered)
    fully_masked = regexp_hashtags.sub("desident_hashtag", links_masked)

    # spaCy tokenisation, then token-level filtering
    doc = nlp(fully_masked)
    tokens = remove_stop_words(remove_punct(doc))
    if lemmatizing:
        return lemmatize(tokens)
    return tokens

In [None]:
# Clean every tweet and store the result in a new column, tweet_preprocess.
# This can take a while.
data_tweets["tweet_preprocess"] = data_tweets["tweet"].apply(preprocess_tweet)

In [None]:
# Join each token list into one space-separated string.
# Values that are already strings are kept as they are, so running this cell twice
# (or feeding it lemmatized output) no longer splits the text into single characters.
data_tweets["tweet_preprocess"] = data_tweets["tweet_preprocess"].apply(
    lambda tokens: tokens if isinstance(tokens, str) else " ".join(str(tok) for tok in tokens)
)

In [None]:
# Show the first cleaned tweet. Positional access is used because rows were filtered
# earlier without resetting the index, so the label 0 is not guaranteed to exist.
data_tweets["tweet_preprocess"].iloc[0]

In [None]:
# Inspect the result of the text cleaning
pd.set_option("max_colwidth", None)  # None disables column-width truncation, so full tweets are shown
data_tweets[["tweet", "tweet_preprocess"]].head(5)

In [None]:
# (rows, columns) after adding the tweet_preprocess column
data_tweets.shape

In [None]:
# Save the preprocessed frame to disk (pickle format, despite the .txt extension)
with open("data_tweets_preprocess.txt", "wb") as fp :   # binary mode for pickle
    pickle.dump(data_tweets, fp)

In [None]:
# Reload the preprocessed frame saved by the previous cell.
# NOTE(review): pickle.load can execute arbitrary code; only load files from a trusted source.
with open("data_tweets_preprocess.txt", "rb") as file :  
    data_tweets = pickle.load(file)

# Preview the first five rows
data_tweets.head(5)

### Filtre et échantillonnage

In [72]:
# Remove English / Italian tweets in two passes:
#  1. rows whose description says the media is English, American or Italian;
#  2. rows whose cleaned text contains "don' t".
data_tweets = (
    data_tweets
    .loc[lambda df: ~df["description"].str.contains('anglais|américain|italien', regex= True, na=False)]
    .loc[lambda df: ~df["tweet_preprocess"].str.contains("don' t", regex= True, na=False)]
)
data_tweets.shape

(759098, 13)

In [73]:
# Remove the rows where Twitter reports that the account was removed
# (7 accounts in the dataset were removed because of French laws).
data_tweets = data_tweets.loc[
    lambda df: ~df["tweet"].str.contains(r'local law', regex= True, na=False)
]
data_tweets.shape

(759098, 13)

In [74]:
N = 10000  # number of rows drawn from each class
RANDOM_SEED = 34564324  # passed as random_state to both draws below

# Draw N rows from each class.
# NOTE(review): "_fk" holds the "fiable" rows and "_nfk" the "pas fiable" rows. Confirm the suffixes read as intended.
# NOTE(review): sampling without replacement presumably fails if a class has fewer than N rows. Confirm the class sizes.
sample_tweets_fk = data_tweets.loc[data_tweets["fakenews"]=="fiable"].sample(n=N, random_state=RANDOM_SEED)
sample_tweets_nfk = data_tweets.loc[data_tweets["fakenews"]=="pas fiable"].sample(n=N, random_state=RANDOM_SEED)

# Stack both samples into one frame with a fresh index
sample_tweets = pd.concat([sample_tweets_fk, sample_tweets_nfk], ignore_index=True)

## Scattertext


- Mettre les mots en minuscule
- Transformation des liens internet en "desident_link"
- transformation des ! en "desident_exclamation"
- transformation des ? en "desident_question"
- Suppression des stop words (vous pouvez utiliser spacy)
- Suppression de la ponctuation
- Avec et sans lemmatisation (dans un premier temps, lancer scattertext sans, puis ensuite avec)

In [75]:
# Build the scattertext corpus from the sampled tweets:
# categories come from "fakenews", text from "tweet_preprocess".
corpus = sc.CorpusFromPandas(data_frame = sample_tweets,
                             category_col = "fakenews",
                             text_col = "tweet_preprocess",
                             nlp = nlp).build()#.compact(sc.AssociationCompactor(4000))

In [76]:
# Build the scattertext HTML, with "pas fiable" as the category of interest
# compared against "fiable".
html = sc.produce_scattertext_explorer(
    corpus,
    category="pas fiable",
    category_name="pas fiable",
    not_category_name="fiable",
    minimum_term_frequency=10,
    pmi_threshold_coefficient=1,
    term_ranker=sc.AbsoluteFrequencyRanker,
    transform=sc.Scalers.dense_rank,
    # Alternative scorer to try: sc.ScaledFScorePresets(beta=1, one_to_neg_one=True)
    term_scorer=sc.RankDifference(),
    width_in_pixels=1000,
)

# Save the HTML. The with-block closes the file handle once the bytes are written;
# the previous bare open(...).write(...) left it open.
with open("tweets_visualisation.html", "wb") as html_file:
    html_file.write(html.encode("utf-8"))

3210272