Imports (pandas, seaborn, NLTK, matplotlib and supporting standard-library modules)

In [None]:
import pandas as pd
import seaborn as sns
from itertools import islice
import nltk
from nameparser.parser import HumanName
import time
import csv
import math
import matplotlib.pyplot as plt

In [None]:
# Plot defaults for every figure below: figure size (10, 8) and the "hls" colour palette.
sns.set(rc={'figure.figsize':(10,8)})
sns.set_palette("hls")

Download the necessary NLTK resources

In [None]:
# NLTK data needed by get_human_names further down.
# Tokenizer models used by nltk.tokenize.word_tokenize.
nltk.download('punkt')
# Part-of-speech tagger model used by nltk.pos_tag.
nltk.download('averaged_perceptron_tagger')
# Named-entity chunker model used by nltk.ne_chunk ...
nltk.download('maxent_ne_chunker')
# ... and the word list that chunker depends on.
nltk.download('words')

Read train dataset from csv file

In [None]:
# Load the training set; later cells read its `id`, `text` and `label` columns.
df = pd.read_csv("../data/kaggle/train.csv")

In [None]:
# Preview the first rows to check the columns loaded as expected.
df.head()

Statistical analysis about length of every "text" in dataset - For loop + Sorting by longest articles

Generate a graph of the distribution of article lengths, in buckets of 500 characters

In [None]:
# Bucket every article by its character count in steps of 500: bucket k holds
# lengths in (500*(k-1), 500*k], and an empty string lands in bucket 0.
# The length is taken on str(value), so a missing text is measured as the
# 3-character string "nan".
char_counts = df["text"].map(lambda article: len(str(article)))
df["text_length"] = (char_counts / 500).map(math.ceil)

In [None]:
# Number of articles per 500-character length bucket, over the whole dataset.
all_length_buckets = df["text_length"]
ax = sns.countplot(x=all_length_buckets)
# Ask matplotlib to thin the x ticks (target of 20) so bucket labels stay readable.
ax.locator_params(axis='x', nbins=20)
ax.set(
    xlabel='Length of article(in 500s steps)',
    ylabel='Number of articles',
    title='Distribution of (all) article length',
)

Statistical analysis about length of every fake news (label = 1) in dataset - For loop + Sorting by longest articles

Generate a graph of the distribution of fake-news article lengths, in buckets of 500 characters

In [None]:
# Number of fake-news articles (label == 1) per 500-character length bucket.
fake_length_buckets = df.loc[df["label"] == 1, "text_length"]
ax = sns.countplot(x=fake_length_buckets)
# Ask matplotlib to thin the x ticks (target of 20) so bucket labels stay readable.
ax.locator_params(axis='x', nbins=20)
ax.set(
    xlabel='Length of article(in 500s steps)',
    ylabel='Number of articles',
    title='Distribution of (fake-news) article length',
)

Statistical analysis about length of every non fake news (label = 0) in dataset - For loop + Sorting by longest articles

Generate a graph of the distribution of non-fake-news article lengths, in buckets of 500 characters

In [None]:
# Start from a clean figure, then count non-fake articles (label == 0)
# per 500-character length bucket.
plt.close()
non_fake_length_buckets = df.loc[df["label"] == 0, "text_length"]
ax = sns.countplot(x=non_fake_length_buckets)
# Ask matplotlib to thin the x ticks (target of 20) so bucket labels stay readable.
ax.locator_params(axis='x', nbins=20)
ax.set(
    xlabel='Length of article(in 500s steps)',
    ylabel='Number of articles',
    title='Distribution (non-fake-news) of article length',
)

Count the distinct person names in each article and save the result to a CSV file.

In [None]:
from tqdm.auto import tqdm
# Registers `progress_apply` on pandas objects; the name-counting apply below relies on it.
tqdm.pandas()

def get_human_names(text):
    """Count the distinct multi-token person names NLTK finds in ``text``.

    The text is tokenized, part-of-speech tagged and passed through NLTK's
    named-entity chunker. Every subtree labelled ``PERSON`` contributes its
    tokens joined by single spaces; entities made of a single token are
    ignored and repeated names are counted once.

    Args:
        text: The article text as a string.

    Returns:
        The number of distinct names found.
    """
    tagged_tokens = nltk.pos_tag(nltk.tokenize.word_tokenize(text))
    chunk_tree = nltk.ne_chunk(tagged_tokens, binary=False)

    def is_person(tree):
        return tree.label() == 'PERSON'

    distinct_names = set()
    for person_tree in chunk_tree.subtrees(filter=is_person):
        name_parts = [leaf[0] for leaf in person_tree.leaves()]
        if len(name_parts) < 2:
            # A single token is most likely a lone surname: skip it.
            continue
        distinct_names.add(' '.join(name_parts))

    return len(distinct_names)

start_time = time.time()

# Count the distinct multi-token person names in every article. This is slow:
# each article goes through NLTK tokenization, tagging and entity chunking.
# str(x) lets missing texts through as the string "nan".
df["nb_names"] = df["text"].progress_apply(lambda x: get_human_names(str(x)))

# Persist the counts in the layout the next cell reads back (columns
# `article_id` and `nb_names`). This replaces a stray `f.close()`, which raised
# NameError because no file handle `f` is ever opened in this notebook.
(
    df[["id", "nb_names"]]
    .rename(columns={"id": "article_id"})
    .to_csv("../data/transformed/name_output.csv", index=False)
)

print(f"Name extraction took {time.time() - start_time:.1f} s")

Generate graph for fake news.

In [None]:
df_names = pd.read_csv("../data/transformed/name_output.csv")

# Resolve every article's label in one vectorised lookup instead of filtering
# the whole training frame once per row and comparing its text rendering to "1".
# Articles whose id is not found in `df` get no label and, as before, are
# counted as non-fake. If an id appears several times, its first label is used.
label_by_id = df.drop_duplicates(subset="id").set_index("id")["label"]
is_fake = df_names["article_id"].map(label_by_id).eq(1)

fake_name_counts = df_names.loc[is_fake, "nb_names"]
non_fake_name_counts = df_names.loc[~is_fake, "nb_names"]

# Histograms: number of names in an article -> number of such articles.
fake = fake_name_counts.value_counts().to_dict()
non_fake = non_fake_name_counts.value_counts().to_dict()

fake_sum = int(fake_name_counts.sum())
non_fake_sum = int(non_fake_name_counts.sum())

total_fake = len(fake_name_counts)
total_non_fake = len(non_fake_name_counts)
total_all = total_fake + total_non_fake

# Report NaN rather than raising ZeroDivisionError when a group is empty.
non_fake_average = non_fake_sum / total_non_fake if total_non_fake else float("nan")
fake_average = fake_sum / total_fake if total_fake else float("nan")
all_average = (fake_sum + non_fake_sum) / total_all if total_all else float("nan")

print(f"Non fake news amount: {total_non_fake}  -  Added up number of proper names: {non_fake_sum}  -  Average: {non_fake_average}")
print(f"Fake news amount: {total_fake}  -  Added up number of proper names: {fake_sum}  -  Average: {fake_average}")
print(f"All news amount: {total_all}  -  Added up number of all proper names: {fake_sum+non_fake_sum}  -  Average: {all_average}")


In [None]:
plt.close()
# `df_wrangled` is not defined by any earlier cell, so this plot failed on a
# fresh kernel. Build the series from the name counts loaded above, keeping
# only the articles that `df` labels as fake (label == 1).
fake_article_ids = df.loc[df["label"] == 1, "id"]
fake_nb_names = df_names.loc[df_names["article_id"].isin(fake_article_ids), "nb_names"]
ax = sns.countplot(x=fake_nb_names)
# Ask matplotlib to thin the x ticks (target of 20) so labels stay readable.
ax.locator_params(axis='x', nbins=20)
ax.set(xlabel = 'Number of names in article', ylabel='Number of articles', title='Distribution of article name usage in fake news')

Generate graph for non fake news.

In [None]:
plt.close()
# `df_wrangled` is not defined by any earlier cell, so this plot failed on a
# fresh kernel. Build the series from the name counts loaded above, keeping
# only the articles that `df` labels as non-fake (label == 0).
non_fake_article_ids = df.loc[df["label"] == 0, "id"]
non_fake_nb_names = df_names.loc[df_names["article_id"].isin(non_fake_article_ids), "nb_names"]
ax = sns.countplot(x=non_fake_nb_names)
# Ask matplotlib to thin the x ticks (target of 20) so labels stay readable.
ax.locator_params(axis='x', nbins=20)
ax.set(xlabel = 'Number of names in article', ylabel='Number of articles', title='Distribution of article name usage in non-fake news')

Number of articles containing at least one exclamation mark, for fake and non-fake news

In [None]:
# Count the articles whose text contains at least one "!", split by label.
# Rows with missing text compare unequal to True and are therefore left out.
# Any label other than 0 is counted on the fake side.
has_exclamation = df['text'].str.contains(r'!') == True
labels_with_exclamation = df.loc[has_exclamation, "label"]

non_fake_exclamation_counter = int((labels_with_exclamation == 0).sum())
fake_exclamation_counter = len(labels_with_exclamation) - non_fake_exclamation_counter

print(f"Fake exclamation counter : {fake_exclamation_counter}")
print(f"Non Fake exclamation counter : {non_fake_exclamation_counter}")

Check word length in articles

In [29]:
def longer_than_(texte, nb):
    """Count the words of ``texte`` that are at least ``nb`` characters long.

    ``texte`` is converted with ``str()`` and split on single space characters
    only, so consecutive spaces produce empty "words" and text separated by
    newlines or tabs stays in one piece. A missing value is measured as the
    string "nan".

    Args:
        texte: The text to inspect; any object accepted by ``str()``.
        nb: Minimum length (inclusive) a word needs to be counted.

    Returns:
        The number of space-separated pieces whose length is >= ``nb``.
    """
    return sum(1 for word in str(texte).split(" ") if len(word) >= nb)

# For every threshold from 1 to 19 characters, add up over all articles how
# many words are at least that long.
word_length = {}
for number in range(1, 20):
    word_length[number] = sum(longer_than_(article, number) for article in df["text"])

print(word_length)

{1: 15768797, 2: 15180728, 3: 12782209, 4: 9849455, 5: 7529657, 6: 5725692, 7: 4271272, 8: 2899080, 9: 1895428, 10: 1130369, 11: 633088, 12: 354174, 13: 193437, 14: 101077, 15: 53451, 16: 32009, 17: 20771, 18: 14872, 19: 10727}
