In [1]:
! pip install plotly



In [2]:
! pip install --user pyenchant



In [3]:
import re

import gensim
from gensim.models import word2vec

import nltk
from nltk.corpus import stopwords
from nltk.corpus import wordnet as wn
from nltk.stem.wordnet import WordNetLemmatizer

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import naive_bayes

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import html

from wordcloud import WordCloud
import plotly.express as px
import plotly.graph_objects as go
from tqdm import tqdm
from plotly.subplots import make_subplots
import enchant




In [4]:
# NLTK resources used by this notebook. The 'stopwords' corpus is needed by the
# stopwords.words('english') call in the data-cleaning cell; without it a fresh
# environment fails with LookupError.
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\jurus\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\jurus\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [5]:
#DATA

# Directory that holds train.csv, test.csv and submit.csv; edit this single
# constant to run the notebook on another machine.
DATA_DIR = "C:/Users/jurus/Documents/AGH/Praca inżynierska/data_2"
# Fixed seed so that the shuffle below yields the same row order on every run.
SEED = 42

df_train = pd.read_csv(DATA_DIR + "/train.csv")
df_test = pd.read_csv(DATA_DIR + "/test.csv")
df_labels = pd.read_csv(DATA_DIR + "/submit.csv")

# Labels are attached by index alignment: row i of submit.csv labels row i of test.csv.
# NOTE(review): assumes both files list the same articles in the same order and
# that submit.csv holds ground-truth labels — confirm.
df_test['label'] = df_labels['label']

df = pd.concat([df_train, df_test]).reset_index(drop=True)

df.drop(columns=['author'], inplace=True)

# Combining 'title' and 'text' columns together
df['original'] = df['title'] + ' ' + df['text']

# Shuffling data (seeded, so the result is reproducible)
df = df.sample(frac=1, random_state=SEED)
# Dropping NaN rows first and duplicates second: if duplicates were removed
# first, a copy with a missing title could be the one kept and then be dropped
# itself, losing an article that has a complete copy elsewhere.
df.dropna(subset=["title", "text"], inplace=True)
df.drop_duplicates(subset=['text'], inplace=True)


In [6]:
#DATA CLEANING

# Obtaining additional stopwords from nltk.
# The corpus is reached through the `nltk` package instead of the bare name
# `stopwords`, because this notebook later defines a function that is also
# called `stopwords`; with the bare name, re-running this line after that
# definition raises AttributeError.
stop_words = nltk.corpus.stopwords.words('english')

def cleaning(text):
    """Reduce raw article text to word characters separated by single spaces.

    Steps, in order:
      1. strip HTML tags,
      2. decode HTML entities (``&amp;`` -> ``&``),
      3. remove URLs,
      4. remove contraction suffixes (the ``'s`` in ``it's``),
      5. replace every run of non-word characters with one space.

    Args:
        text: Value to clean; it is converted with ``str()`` first, so
            non-string input (e.g. ``None`` or NaN) is accepted.

    Returns:
        The cleaned string. Leading/trailing spaces are not stripped.
    """
    # removes HTML tags
    clean = re.sub(r'<.*?>', ' ', str(text))
    # Entities must be decoded before the \W+ step below: that step removes
    # '&' and ';', after which unescaping can no longer match anything and
    # entity names would survive as tokens ("amp", "quot", ...).
    clean = html.unescape(clean)
    # removes URLs
    clean = re.sub(r'http\S+', ' ', clean)
    # removes hanging letters after apostrophes (s in it's). The look-behind
    # restricts this to apostrophes that follow a word character, so the word
    # after an opening quote ('hello ...) is kept, and a contraction at the
    # very end of the text is handled too.
    clean = re.sub(r"(?<=\w)'\w*", ' ', clean)
    # replaces the non alphanumeric characters
    clean = re.sub(r'\W+', ' ', clean)
    return clean
# Clean every combined title + text string; the result is stored in a new column.
df['cleaned'] = df['original'].apply(cleaning)


def stopwords(text):
    """Tokenize `text` and keep only informative tokens.

    The text is tokenized with ``gensim.utils.simple_preprocess``. A token is
    kept when it has at least 4 characters and appears neither in gensim's
    STOPWORDS nor in the module-level ``stop_words`` list.

    Args:
        text: String to tokenize.

    Returns:
        List of the kept tokens, in their original order.
    """
    # NOTE: this function's name rebinds `stopwords`, which the import cell
    # bound to nltk.corpus.stopwords; once this definition has run, the bare
    # name no longer refers to the NLTK corpus.
    gensim_stopwords = gensim.parsing.preprocessing.STOPWORDS
    return [
        token
        for token in gensim.utils.simple_preprocess(text)
        if len(token) > 3
        and token not in gensim_stopwords
        and token not in stop_words
    ]
# Tokenize each cleaned article and filter its tokens; every cell of the new column holds a list.
df['nostopwords'] = df['cleaned'].apply(stopwords)

In [7]:
# Lemmatizing
wnl = WordNetLemmatizer()


def _lemmatize(word):
    """Lemmatize `word` as a verb, then as an adjective, then as a noun."""
    for pos in ("v", "a", "n"):
        word = wnl.lemmatize(word, pos=pos)
    return word


# One pass over the column: each token goes through the full v -> a -> n chain.
df['clean_lemm'] = df['nostopwords'].apply(lambda tokens: [_lemmatize(token) for token in tokens])

In [13]:
# Lemmatized tokens of each article joined into one space-separated string
df['clean_joined'] = df['clean_lemm'].apply(" ".join)

# Number of tokens in each article
joined_without_commas = df['clean_joined'].str.replace(',', '')
df['len'] = joined_without_commas.str.split().str.len()

# Extracting real (label 0) and fake (label 1) news to separate dataframes
is_true = df['label'] == 0
is_fake = df['label'] == 1
df_true = df.loc[is_true]
df_fake = df.loc[is_fake]

In [14]:
#NUMBER OF TOKENS IN EACH DATASET

# One box per class showing the distribution of per-article token counts.
fake_box = go.Box(y=list(df_fake['len']), name='Fake', marker_color='indianred')
real_box = go.Box(y=list(df_true['len']), name='Real', marker_color='lightseagreen')

fig = go.Figure(data=[fake_box, real_box])
fig.update_layout(title='All tokens')
fig.show()

In [21]:
#COUNT ONLY UNIQUE TOKENS IN BOTH DATASETS

def unique_tokens(df):
    """Collect the distinct tokens found in the 'clean_joined' column of `df`.

    Args:
        df: DataFrame whose 'clean_joined' column holds whitespace-separated
            token strings.

    Returns:
        A set with every token that occurs in at least one row.
    """
    vocabulary = set()
    # tqdm wraps the column so a progress bar is shown while iterating.
    for joined in tqdm(df['clean_joined']):
        vocabulary.update(joined.split())
    return vocabulary

# Vocabulary of each class; the fake set is computed first, then the true set.
unique_tokens_fake, unique_tokens_true = unique_tokens(df_fake), unique_tokens(df_true)

100%|██████████████████████████████████████████████████████████████████████████| 12098/12098 [00:01<00:00, 7371.62it/s]
100%|██████████████████████████████████████████████████████████████████████████| 12704/12704 [00:01<00:00, 7620.00it/s]


In [24]:
# Vocabulary size per class, keyed by the bar label shown on the x axis.
vocabulary_sizes = {
    'Fake': len(unique_tokens_fake),
    'True': len(unique_tokens_true),
}
fig = px.bar(
    x=list(vocabulary_sizes.keys()),
    y=list(vocabulary_sizes.values()),
    title='Unique tokens',
)
fig.show()

In [26]:
#RELEVANCE OF WORDS

def get_top_n_words(corpus, n=None):
    """Return the most frequent terms of `corpus` together with their counts.

    Args:
        corpus: Iterable of text documents accepted by CountVectorizer.
        n: Number of terms to return; ``None`` (default) returns every term.

    Returns:
        List of ``(term, total_count)`` tuples sorted by count, highest first.
        Terms with equal counts keep the order in which they appear in the
        vectorizer's vocabulary.
    """
    vec = CountVectorizer()
    # fit_transform learns the vocabulary and builds the count matrix in a
    # single pass, instead of tokenizing the corpus once for fit() and again
    # for transform().
    bag_of_words = vec.fit_transform(corpus)
    # Column sums give the total occurrences of each term; flattening to 1-D
    # lets the vocabulary index address the totals directly.
    totals = np.asarray(bag_of_words.sum(axis=0)).ravel()
    words_freq = [(word, totals[idx]) for word, idx in vec.vocabulary_.items()]
    words_freq.sort(key=lambda pair: pair[1], reverse=True)
    return words_freq[:n]

# Number of most frequent terms to extract per class.
TOP_N = 25

list_fake = get_top_n_words(df_fake['clean_joined'], TOP_N)
list_true = get_top_n_words(df_true['clean_joined'], TOP_N)

In [27]:
# Split the (term, count) pairs of the fake-news ranking into parallel lists.
new_list_words = [word for word, _ in list_fake]
new_list_values = [count for _, count in list_fake]

fake_bar = go.Bar(x=new_list_words, y=new_list_values, marker_color='indianred')
fig = go.Figure(data=[fake_bar])
# Fully transparent plot and paper backgrounds.
fig.update_layout(
    plot_bgcolor='rgba(0, 0, 0, 0)',
    paper_bgcolor='rgba(0, 0, 0, 0)',
    title='Fake news frequency words',
)
fig.show()

In [28]:
# Split the (term, count) pairs of the true-news ranking into parallel lists.
new_list_words = [word for word, _ in list_true]
new_list_values = [count for _, count in list_true]

true_bar = go.Bar(x=new_list_words, y=new_list_values, marker_color='lightseagreen')
fig = go.Figure(data=[true_bar])
# Fully transparent plot and paper backgrounds.
fig.update_layout(
    plot_bgcolor='rgba(0, 0, 0, 0)',
    paper_bgcolor='rgba(0, 0, 0, 0)',
    title='True news frequency words',
)
fig.show()