In [1]:
import ast
import math
import re
from collections import Counter

import pandas
import RAKE
from nltk.stem import PorterStemmer, WordNetLemmatizer
from rake_nltk import Rake

# Shared NLP helpers used by the tokenisation and keyword cells below.
stemmer, lemmatiser = PorterStemmer(), WordNetLemmatizer()
# RAKE extractor built with the package's SmartStopList() stop words.
Rk = RAKE.Rake(RAKE.SmartStopList())

In [10]:
# Read the train and test TSV files and stack them into one frame with a
# fresh 0..n-1 index.
df = pandas.read_csv("../train/storyzy_en_train.tsv", sep="\t")
df_test = pandas.read_csv("../test1-full/storyzy_en_test1_full.tsv", sep="\t")
df = pandas.concat([df, df_test], ignore_index=True)

# Row subsets per article type (taken before any derived column is added).
article_type = df["type"]
df_fake = df[article_type == "fakeNews"]
df_trusted = df[article_type == "trusted"]
df_satire = df[article_type == "satire"]

In [12]:
# Title and body joined into a single text per article. The separating space
# keeps the last title word from fusing with the first body word, and missing
# values become empty strings so the string methods below never receive NaN.
df['texts'] = df['title'].fillna("").map(str) + " " + df['text'].fillna("").map(str)
# Lower-case, replace every non-word character and digit with a space, then
# split on whitespace. Raw string: \W and \d are regex escapes, not Python ones.
df['words'] = df.texts.apply(lambda doc: re.sub(r"[\W\d]", " ", doc.lower().strip()).split())
# Stemmed and lemmatised variants of the token lists.
df['words_stem'] = df.words.apply(lambda tokens: [stemmer.stem(w) for w in tokens])
df['words_lem'] = df.words.apply(lambda tokens: [lemmatiser.lemmatize(w) for w in tokens])

In [13]:
# Keywords extracted with RAKE, called with maxWords=1.

def _rake_terms(text):
    """Run RAKE on `text` and return the first element of every result item."""
    return [hit[0] for hit in Rk.run(text, maxWords=1)]

# Once on the raw lower-cased text, once on the re-joined lemmatised tokens.
df["keyword_rake"] = df.texts.apply(lambda doc: _rake_terms(doc.lower().strip()))
df["keyword_rake_lem"] = df.words_lem.apply(lambda tokens: _rake_terms(" ".join(tokens)))

In [14]:
##### Keyword extraction with TF-IDF on the stemmed tokens
corpus_size = len(df)

# Bag of words: raw term counts for each document.
df['frequencies_stem'] = df.words_stem.apply(lambda words_stem: Counter(words_stem))

# TF: 1 + ln(count) for every term present in the document.
df['log_frequencies_stem'] = df.frequencies_stem.apply(
    lambda counts: {term: math.log(count) + 1 for term, count in counts.items()}
)

# Document frequency: in how many documents each term occurs. A single pass
# over the per-document counters gives the same numbers as scanning the whole
# frame once per vocabulary word, without the quadratic cost.
doc_counts = Counter()
for counts in df.frequencies_stem:
    doc_counts.update(counts.keys())

# Vocabulary of the corpus (set of unique stems).
corpus_vocab = set(doc_counts)

# Document-frequency lookup for one term (0 for a term not in the corpus).
df_2 = lambda word: doc_counts[word]

# IDF: ln(corpus size / document frequency) for every term.
corpus_vocab_dfs = {word: math.log(corpus_size / n_docs) for word, n_docs in doc_counts.items()}

# TF-IDF weight of each term of a document.
tfidf = lambda tfs: {k: v * corpus_vocab_dfs[k] for k, v in tfs.items()}
df['tfidf_stem'] = df.log_frequencies_stem.apply(tfidf)

# Keep the 50 highest-weighted terms of each document as its keywords.
df['keywords_stem'] = df.tfidf_stem.apply(lambda t: sorted(t, key=t.get, reverse=True)[0:50])

In [15]:
##### Keyword extraction with TF-IDF on the lemmatised tokens
corpus_size = len(df)

# Bag of words: raw term counts for each document.
df['frequencies_lem'] = df.words_lem.apply(lambda words_lem: Counter(words_lem))

# TF: 1 + ln(count) for every term present in the document.
df['log_frequencies_lem'] = df.frequencies_lem.apply(
    lambda counts: {term: math.log(count) + 1 for term, count in counts.items()}
)

# Document frequency: in how many documents each term occurs. A single pass
# over the per-document counters gives the same numbers as scanning the whole
# frame once per vocabulary word, without the quadratic cost.
doc_counts = Counter()
for counts in df.frequencies_lem:
    doc_counts.update(counts.keys())

# Vocabulary of the corpus (set of unique lemmas).
corpus_vocab = set(doc_counts)

# Document-frequency lookup for one term (0 for a term not in the corpus).
df_2 = lambda word: doc_counts[word]

# IDF: ln(corpus size / document frequency) for every term.
corpus_vocab_dfs = {word: math.log(corpus_size / n_docs) for word, n_docs in doc_counts.items()}

# TF-IDF weight of each term of a document.
tfidf = lambda tfs: {k: v * corpus_vocab_dfs[k] for k, v in tfs.items()}
df['tfidf_lem'] = df.log_frequencies_lem.apply(tfidf)

# Keep the 50 highest-weighted terms of each document as its keywords.
df['keywords_lem'] = df.tfidf_lem.apply(lambda t: sorted(t, key=t.get, reverse=True)[0:50])

In [16]:
##### Keyword extraction with TF-IDF on the raw (unnormalised) tokens
corpus_size = len(df)

# Bag of words: raw term counts for each document.
df['frequencies'] = df.words.apply(lambda words: Counter(words))

# TF: 1 + ln(count) for every term present in the document.
df['log_frequencies'] = df.frequencies.apply(
    lambda counts: {term: math.log(count) + 1 for term, count in counts.items()}
)

# Document frequency: in how many documents each term occurs. A single pass
# over the per-document counters gives the same numbers as scanning the whole
# frame once per vocabulary word, without the quadratic cost.
doc_counts = Counter()
for counts in df.frequencies:
    doc_counts.update(counts.keys())

# Vocabulary of the corpus (set of unique words).
corpus_vocab = set(doc_counts)

# Document-frequency lookup for one term (0 for a term not in the corpus).
df_2 = lambda word: doc_counts[word]

# IDF: ln(corpus size / document frequency) for every term.
corpus_vocab_dfs = {word: math.log(corpus_size / n_docs) for word, n_docs in doc_counts.items()}

# TF-IDF weight of each term of a document.
tfidf = lambda tfs: {k: v * corpus_vocab_dfs[k] for k, v in tfs.items()}
df['tfidf'] = df.log_frequencies.apply(tfidf)

# Keep the 50 highest-weighted terms of each document as its keywords.
df['keywords'] = df.tfidf.apply(lambda t: sorted(t, key=t.get, reverse=True)[0:50])

In [17]:
# Write the frame to CSV without the intermediate TF-IDF working columns.
intermediate_columns = [
    'log_frequencies', 'frequencies', 'tfidf',
    'frequencies_lem', 'log_frequencies_lem', 'tfidf_lem',
    'frequencies_stem', 'log_frequencies_stem', 'tfidf_stem',
]
df = df.drop(columns=intermediate_columns)
df.to_csv("../train/last_data_tfidf.csv")

In [None]:
# Word cloud

import matplotlib.pyplot as plt
from wordcloud import WordCloud
from collections import Counter

# Reload the saved keyword table. The module is imported as `pandas` in this
# notebook (no `pd` alias exists), so it is referenced by its full name.
df = pandas.read_csv("../train/last_data_tfidf.csv")

In [None]:
def _keywords_of(label):
    """Collect every keyword of the articles whose `type` equals `label`.

    After the CSV round trip the `keywords` column holds the text form of a
    Python list, so each cell is parsed with `ast.literal_eval`; cells that
    are already lists are used as they are. An empty list contributes nothing
    (slicing and splitting the text by hand produced a spurious '' keyword
    for "[]").
    """
    collected = []
    for cell in df.loc[df["type"] == label, "keywords"]:
        collected.extend(cell if isinstance(cell, list) else ast.literal_eval(cell))
    return collected


all_keywords_fake = _keywords_of("fakeNews")
all_keywords_trusted = _keywords_of("trusted")
all_keywords_satire = _keywords_of("satire")

In [None]:
# (keyword, occurrences) pairs per article type, most frequent first.
count_fake, count_trusted, count_satire = (
    Counter(keywords).most_common()
    for keywords in (all_keywords_fake, all_keywords_trusted, all_keywords_satire)
)

In [None]:
# Turn each (keyword, count) ranking into a keyword -> int count mapping.
dict_fake = {keyword: int(count) for keyword, count in count_fake}
dict_trusted = {keyword: int(count) for keyword, count in count_trusted}
dict_satire = {keyword: int(count) for keyword, count in count_satire}

In [None]:
# One word cloud per article type, each built from its keyword-count dict.
wordcloud_fake, wordcloud_trusted, wordcloud_satire = (
    WordCloud().generate_from_frequencies(frequencies)
    for frequencies in (dict_fake, dict_trusted, dict_satire)
)

In [None]:
# Show the fake-news word cloud on a 12x10 figure with the axes hidden.
figure_size = (12, 10)
plt.figure(figsize=figure_size)
plt.imshow(wordcloud_fake, interpolation="bilinear")
plt.axis("off")

In [None]:
# Show the trusted-news word cloud on a 12x10 figure with the axes hidden.
figure_size = (12, 10)
plt.figure(figsize=figure_size)
plt.imshow(wordcloud_trusted, interpolation="bilinear")
plt.axis("off")

In [None]:
# Show the satire word cloud on a 12x10 figure with the axes hidden.
figure_size = (12, 10)
plt.figure(figsize=figure_size)
plt.imshow(wordcloud_satire, interpolation="bilinear")
plt.axis("off")