In [1]:
import pandas as pd

In [2]:
# first lets run clean_text on the 'content' column
from cleantext import clean
def clean_text(s):
    return clean(s,lower=True,                     # lowercase text
        no_urls=True,                  # replace all URLs with a special token
        no_emails=True,                # replace all email addresses with a special token
        no_numbers=True,               # replace all numbers with a special token
        replace_with_url="<URL>",
        replace_with_email="<EMAIL>",
        replace_with_number="<NUM>",
        lang="en"                   
    )



Since the GPL-licensed package `unidecode` is not installed, using Python's `unicodedata` package which yields worse results.


In [3]:
# split the data in chunks and run in parallel
from joblib import Parallel, delayed
from os import cpu_count

# run in parallel
def run_parallel(df, n_jobs, func):
    # call every element in the chunks in parallel
    results = Parallel(n_jobs=n_jobs)(delayed(func)(element) for element in df)
    return results

In [4]:
# clean the text
def clean_column(df):
    n_jobs = cpu_count()
    results = run_parallel(df, n_jobs, clean_text)
    # replace column with cleaned text
    return results

In [5]:
from nltk.tokenize import word_tokenize
# tokenize the text. run in parallel
def tokenize_column(df):
    # run the function on the data
    n_jobs = cpu_count()
    results = run_parallel(df['content'], n_jobs, word_tokenize)
    
    return results

In [6]:
from nltk.corpus import stopwords
# removing generic stopwords
def remove_stopwords(df):
    stop_words = set(stopwords.words('english'))

    # remove stopwords from the text
    def r(s):
        return [w for w in s if not w in stop_words]

    # run the function on the df
    n_jobs = cpu_count()
    results = run_parallel(df, n_jobs, r)

    return results

In [43]:
# loading a small part of the data for testing
df = pd.read_csv('D:/FakeNews_data/news.csv/news_cleaned_2018_02_13.csv', nrows=10000, index_col=0)

In [8]:
from os import path
from os import remove
# create a file to be used for storing the data
def intialize_file(name):
    # check if file exists
    if path.exists(name):
        # if the file exists, delete it
        remove(name)
    # create the file
    with open(name, 'w') as f:
        f.write('') # write an empty string to the file to create it

In [9]:
# append data to csv file
def append_to_file(name, data):
    with open(name, 'a') as f:
        # write the data to the file
        f.write(data)

In [10]:
# create a file to store the data
tokenized_file = 'tokenized_temp.csv'
intialize_file(tokenized_file)

# append header to the file
header = df.columns.values
append_to_file(tokenized_file, ','.join(header))
append_to_file(tokenized_file, '\n')

In [11]:
df = pd.read_csv('D:/FakeNews_data/news.csv/news_cleaned_2018_02_13.csv', chunksize=1000, index_col=0)

count = 0
for chunk in df:
    # process data
    chunk['content'] = clean_column(chunk['content'])
    chunk['content'] = tokenize_column(chunk)
    chunk['content'] = remove_stopwords(chunk['content'])
    # append data to file
    chunk.to_csv(tokenized_file, mode='a', header=False)

    count += 1
    print(count, 'chunks processed')
    if count == 10:
        break


1 chunks processed
2 chunks processed
3 chunks processed
4 chunks processed
5 chunks processed
6 chunks processed
7 chunks processed
8 chunks processed
9 chunks processed
10 chunks processed


In [42]:
# load from file
tokenized_file = 'tokenized_temp.csv'
df = pd.read_csv(tokenized_file)
# load list from string
from ast import literal_eval
df['content'] = df['content'].apply(literal_eval)

In [51]:

# count the tokens
def count(s):
    return Counter(s)

# count token frequency
from collections import Counter
def count_tokens(df):


    # run the function on the df
    n_jobs = cpu_count()
    results = run_parallel(df, n_jobs, count)

    # total token frequency
    total = {}
    for list in results:
        for k,v in list.items():
            if k in total:
                total[k] += v
            else:
                total[k] = v

    return total

In [52]:
token_freq = count_tokens(df['content'])

In [53]:
# sort the tokens by frequency
token_freq = sorted(token_freq.items(), key=lambda x: x[1], reverse=True)

In [77]:
# creating a set of stopwords from the frequency list
stop_words = set()

In [78]:
# remove top 10 tokens and add them to the stopword list
for i in range(10):
    stop_words.add(token_freq[i][0])

token_freq = token_freq[10:]


In [79]:

# find the 80% percentile
# first finding the total number of tokens
total = 0
for token in token_freq:
    total += token[1]
# then finding the 80% percentile
percentile = int(total * 0.8)
# then finding the token that corresponds to the 80% percentile
total = 0
for token in token_freq:
    total += token[1]
    if total > percentile:
        index = token_freq.index(token)
        break
# add all tokens after the 80% percentile to the stopword list
for token in token_freq[index:]:
    stop_words.add(token[0])

In [80]:
len(stop_words)

109435

In [86]:
# remove custom stopwords from the text
def remove_costume_stopwords(df):
    # remove stopwords from the text
    def r(s):
        return [w for w in s if not w in stop_words]

    # run the function on the df
    for i in range(len(df)):
        df[i] = r(df[i])

    return df


In [87]:
df['content'] = remove_costume_stopwords(df['content'])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[i] = r(df[i])


In [88]:
# save the data
df.to_csv('ready_data.csv', index=False, header=True)

In [89]:
df = pd.read_csv('ready_data.csv')
# load list from string
from ast import literal_eval
df['content'] = df['content'].apply(literal_eval)

In [123]:
# split the data into train, test and validation
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(df['content'], df['type'], test_size=0.2, random_state=42)
X_test , X_val, y_test, y_val = train_test_split(X_test, y_test, test_size=0.5, random_state=42)

In [124]:
y_test.value_counts()*8 / y_train.value_counts()

bias          1.190083
clickbait     1.027027
conspiracy    1.001184
fake          1.014581
hate          1.025641
junksci       0.709957
political     1.006693
reliable      1.006289
rumor         1.674419
satire        1.440000
unknown       1.263158
unreliable    0.970297
Name: type, dtype: float64

In [125]:
# making labels binary
y_train = y_train.apply(lambda x: 1 if x in ['fake', 'junksci', 'hate', 'clickbait',] else 0)
y_test = y_test.apply(lambda x: 1 if x in ['fake', 'junksci', 'hate', 'clickbait',] else 0)
y_val = y_val.apply(lambda x: 1 if x in ['fake', 'junksci', 'hate', 'clickbait',] else 0)

In [129]:
import numpy as np
# function which pads pandas series
def pad_series(s, max_len):
    # truncate the series if it is longer than max_len
    s = s.apply(lambda x: x[:max_len])

    # pad the series
    s = s.apply(lambda x: x + ['<pad>'] * (max_len - len(x)))

    # convert the series to np array
    s = np.array(s.tolist())
    return s

# pad the series and convert them to np arrays
X_test = pad_series(X_test, 1000)
X_train = pad_series(X_train, 1000)
X_val = pad_series(X_val, 1000)

In [133]:
# converting the strings to integers
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

# fit the encoder on the training data
le.fit(X_train.flatten())

# transform the data
X_train = le.transform(X_train.flatten()).reshape(X_train.shape)
X_test = le.transform(X_test.flatten()).reshape(X_test.shape)
X_val = le.transform(X_val.flatten()).reshape(X_val.shape)


In [130]:
# ratio of fake news in the data
y_train.value_counts() / len(y_train)

0    0.53075
1    0.46925
Name: type, dtype: float64

In [131]:
# making a baseline model
from sklearn.dummy import DummyClassifier
from sklearn.metrics import accuracy_score

# create a dummy classifier
dummy_clf = DummyClassifier(strategy="most_frequent")
# fit the classifier
dummy_clf.fit(X_train, y_train)

# predict the labels
y_pred = dummy_clf.predict(X_test)

# calculate the accuracy
accuracy_score(y_test, y_pred)


0.541

In [137]:
# making a baseline model
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# create a dummy classifier
dummy_clf = LogisticRegression(max_iter=10000)
# fit the classifier
dummy_clf.fit(X_train, y_train)

# predict the labels
y_pred = dummy_clf.predict(X_test)

# calculate the accuracy
accuracy_score(y_test, y_pred)

0.619

In [145]:
# baseline model
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score

# create a dummy classifier
dummy_clf = MLPClassifier(max_iter=10000, hidden_layer_sizes=(1000,200,))
# fit the classifier
dummy_clf.fit(X_train, y_train)

# predict the labels
y_pred = dummy_clf.predict(X_test)

# calculate the accuracy
accuracy_score(y_test, y_pred)


0.649

Making a vocab file:

In [82]:
# creating a vocabulary
vocab = {}
for token in token_freq:
    if token[0] not in stop_words:
        vocab[token[0]] = len(vocab)
# save the vocabulary
import pickle
with open('vocab.pickle', 'wb') as f:
    pickle.dump(vocab, f, protocol=pickle.HIGHEST_PROTOCOL)

In [83]:
# load the vocabulary
import pickle
with open('vocab.pickle', 'rb') as f:
    vocab = pickle.load(f)

vocab

{'(': 0,
 "n't": 1,
 '?': 2,
 'one': 3,
 'people': 4,
 '!': 5,
 'trump': 6,
 'would': 7,
 'us': 8,
 'new': 9,
 'said': 10,
 'government': 11,
 'president': 12,
 'time': 13,
 'also': 14,
 "'": 15,
 'like': 16,
 'obama': 17,
 'world': 18,
 ';': 19,
 'even': 20,
 'many': 21,
 'could': 22,
 'years': 23,
 '-': 24,
 'may': 25,
 'first': 26,
 'two': 27,
 '%': 28,
 '$': 29,
 'state': 30,
 'states': 31,
 'american': 32,
 'blockchain': 33,
 'america': 34,
 'get': 35,
 'u.s.': 36,
 'way': 37,
 'year': 38,
 'url': 39,
 'right': 40,
 'make': 41,
 'know': 42,
 'next': 43,
 'united': 44,
 'think': 45,
 '&': 46,
 'see': 47,
 'free': 48,
 'much': 49,
 'well': 50,
 'life': 51,
 'last': 52,
 'country': 53,
 'news': 54,
 'fact': 55,
 'back': 56,
 'day': 57,
 'take': 58,
 'political': 59,
 'bitcoin': 60,
 'every': 61,
 'war': 62,
 'since': 63,
 'need': 64,
 'power': 65,
 "'re": 66,
 'public': 67,
 'good': 68,
 'media': 69,
 'law': 70,
 'national': 71,
 'story': 72,
 'today': 73,
 'going': 74,
 'made': 75,
