In [41]:
import pandas as pd

In [42]:
# first lets run clean_text on the 'content' column
from cleantext import clean
def clean_text(s):
    return clean(s,lower=True,                     # lowercase text
        no_urls=True,                  # replace all URLs with a special token
        no_emails=True,                # replace all email addresses with a special token
        no_numbers=True,               # replace all numbers with a special token
        replace_with_url="<URL>",
        replace_with_email="<EMAIL>",
        replace_with_number="<NUM>",
        lang="en"                   
    )



Since the GPL-licensed package `unidecode` is not installed, using Python's `unicodedata` package which yields worse results.


In [43]:
# split the data in chunks and run in parallel
from joblib import Parallel, delayed
from os import cpu_count

# run in parallel
def run_parallel(df, n_jobs, func):
    # call every element in the chunks in parallel
    results = Parallel(n_jobs=n_jobs)(delayed(func)(element) for element in df)
    return results

In [44]:
# clean the text
def clean_column(df):
    n_jobs = cpu_count()
    results = run_parallel(df, n_jobs, clean_text)
    # replace column with cleaned text
    return results

In [45]:
from nltk.tokenize import word_tokenize
# tokenize the text. run in parallel
def tokenize_column(df):
    # run the function on the data
    n_jobs = cpu_count()
    results = run_parallel(df['content'], n_jobs, word_tokenize)
    
    return results

In [46]:
from nltk.corpus import stopwords
# removing generic stopwords
def remove_stopwords(df):
    stop_words = set(stopwords.words('english'))

    # remove stopwords from the text
    def r(s):
        return [w for w in s if not w in stop_words]

    # run the function on the df
    n_jobs = cpu_count()
    results = run_parallel(df, n_jobs, r)

    return results

In [47]:
# stemming the text
from nltk.stem import PorterStemmer
def stem_column(df):
    # create a stemmer
    ps = PorterStemmer()

    # stem the text
    def stem(s):
        return [ps.stem(w) for w in s]

    # run the function on the df
    n_jobs = cpu_count()
    results = run_parallel(df, n_jobs, stem)
    return results

In [48]:
# remove punctiuation
import string
def remove_punctuation(df):
    # remove punctuation
    def remove_punct(s):
        return [w for w in s if w not in string.punctuation]

    # run the function on the df
    n_jobs = cpu_count()
    results = run_parallel(df, n_jobs, remove_punct)
    return results

In [49]:
# loading a small part of the data for testing
df = pd.read_csv('D:/FakeNews_data/news.csv/news_cleaned_2018_02_13.csv', nrows=10000, index_col=0)

In [50]:
from os import path
from os import remove
# create a file to be used for storing the data
def intialize_file(name):
    # check if file exists
    if path.exists(name):
        # if the file exists, delete it
        remove(name)
    # create the file
    with open(name, 'w') as f:
        f.write('') # write an empty string to the file to create it

In [51]:
# append data to csv file
def append_to_file(name, data):
    with open(name, 'a') as f:
        # write the data to the file
        f.write(data)

In [52]:
# create a file to store the data
tokenized_file = 'tokenized_temp.csv'
#intialize_file(tokenized_file)

# append header to the file
header = df.columns.values
append_to_file(tokenized_file, ','.join(header))
append_to_file(tokenized_file, '\n')

In [53]:
df = pd.read_csv('D:/FakeNews_data/news.csv/news_cleaned_2018_02_13.csv', chunksize=1000, index_col=0)

count = 0
for chunk in df:
    # process data
    chunk['content'] = clean_column(chunk['content'])
    chunk['content'] = tokenize_column(chunk)
    chunk['content'] = remove_stopwords(chunk['content'])
    chunk['content'] = stem_column(chunk['content'])
    chunk['content'] = remove_punctuation(chunk['content'])
    # append data to file
    chunk.to_csv(tokenized_file, mode='a', header=False)

    count += 1
    print(count, 'chunks processed')
    if count == 100:
        break


1 chunks processed
2 chunks processed
3 chunks processed
4 chunks processed
5 chunks processed
6 chunks processed
7 chunks processed
8 chunks processed
9 chunks processed
10 chunks processed
11 chunks processed
12 chunks processed
13 chunks processed
14 chunks processed
15 chunks processed
16 chunks processed
17 chunks processed
18 chunks processed
19 chunks processed
20 chunks processed
21 chunks processed
22 chunks processed
23 chunks processed
24 chunks processed
25 chunks processed
26 chunks processed
27 chunks processed
28 chunks processed
29 chunks processed
30 chunks processed
31 chunks processed
32 chunks processed
33 chunks processed
34 chunks processed
35 chunks processed
36 chunks processed
37 chunks processed
38 chunks processed
39 chunks processed
40 chunks processed
41 chunks processed
42 chunks processed
43 chunks processed
44 chunks processed
45 chunks processed
46 chunks processed
47 chunks processed
48 chunks processed
49 chunks processed
50 chunks processed
51 chunks

In [54]:
# load from file
tokenized_file = 'tokenized_temp.csv'
df = pd.read_csv(tokenized_file)
# load list from string
from ast import literal_eval
df['content'] = df['content'].apply(literal_eval)

In [55]:

# count the tokens
def count(s):
    return Counter(s)

# count token frequency
from collections import Counter
def count_tokens(df):


    # run the function on the df
    n_jobs = cpu_count()
    results = run_parallel(df, n_jobs, count)

    # total token frequency
    total = {}
    for list in results:
        for k,v in list.items():
            if k in total:
                total[k] += v
            else:
                total[k] = v

    return total

In [56]:
token_freq = count_tokens(df['content'])

In [57]:
# sort the tokens by frequency
token_freq = sorted(token_freq.items(), key=lambda x: x[1], reverse=True)

In [58]:
# creating a set of stopwords from the frequency list
stop_words = set()

In [59]:
# find the 80% percentile
# first finding the total number of tokens
total = 0
for token in token_freq:
    total += token[1]
# then finding the 80% percentile
percentile = int(total * 0.8)
# then finding the token that corresponds to the 80% percentile
total = 0
for token in token_freq:
    total += token[1]
    if total > percentile:
        index = token_freq.index(token)
        break
# add all tokens after the 80% percentile to the stopword list
for token in token_freq[index:]:
    stop_words.add(token[0])

In [60]:
len(stop_words)

384865

In [63]:
# remove custom stopwords from the text
def remove_costume_stopwords(df):
    # remove stopwords from the text
    def r(s):
        return [w for w in s if not w in stop_words]

    # run the function on the df with multiple threads
    n_jobs = cpu_count()
    results = run_parallel(df, n_jobs, r)

    return results


In [66]:
df['content'] = [[w for w in s if not w in stop_words] for s in df['content']]

In [67]:
# save the data
df.to_csv('ready_data.csv', index=False, header=True)

In [87]:
import pandas as pd
df = pd.read_csv('ready_data.csv')
# load list from string
from ast import literal_eval
df['content'] = df['content'].apply(literal_eval)

In [69]:
# split the data into train, test and validation
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(df['content'], df['type'], test_size=0.2, random_state=42)
X_test , X_val, y_test, y_val = train_test_split(X_test, y_test, test_size=0.5, random_state=42)

In [70]:
y_test.value_counts()*8 / y_train.value_counts()

bias          0.993285
clickbait     0.970310
conspiracy    1.050946
fake          1.004703
hate          1.157447
junksci       0.991031
political     0.982002
reliable      0.548387
rumor         0.692308
satire        0.621908
unknown       1.056604
unreliable    1.195062
Name: type, dtype: float64

In [71]:
# making labels binary
y_train = y_train.apply(lambda x: 1 if x in ['fake', 'junksci', 'hate', 'clickbait',] else 0)
y_test = y_test.apply(lambda x: 1 if x in ['fake', 'junksci', 'hate', 'clickbait',] else 0)
y_val = y_val.apply(lambda x: 1 if x in ['fake', 'junksci', 'hate', 'clickbait',] else 0)

In [72]:
import numpy as np
# function which pads pandas series
def pad_series(s, max_len):
    # truncate the series if it is longer than max_len
    s = s.apply(lambda x: x[:max_len])

    # pad the series
    s = s.apply(lambda x: x + ['<pad>'] * (max_len - len(x)))

    # convert the series to np array
    s = np.array(s.tolist())
    return s

# pad the series and convert them to np arrays
X_test = pad_series(X_test, 750)
X_train = pad_series(X_train, 750)
X_val = pad_series(X_val, 750)

In [73]:
# converting the strings to integers
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

# fit the encoder on the training / testing / valdiation data
le.fit(np.concatenate((X_train.flatten(), X_test.flatten(), X_val.flatten())))

# transform the data
X_train = le.transform(X_train.flatten()).reshape(X_train.shape)
X_test = le.transform(X_test.flatten()).reshape(X_test.shape)
X_val = le.transform(X_val.flatten()).reshape(X_val.shape)


In [75]:
# making a baseline model
from sklearn.dummy import DummyClassifier
from sklearn.metrics import accuracy_score

# create a dummy classifier
dummy_clf = DummyClassifier(strategy="most_frequent")
# fit the classifier
dummy_clf.fit(X_train, y_train)

# predict the labels
y_pred = dummy_clf.predict(X_test)

# calculate the accuracy
accuracy_score(y_test, y_pred)


0.504

In [76]:
# making a baseline model
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# create a dummy classifier
dummy_clf = LogisticRegression(max_iter=10000, solver='saga')
# fit the classifier
dummy_clf.fit(X_train, y_train)

# predict the labels
y_pred = dummy_clf.predict(X_test)

# calculate the accuracy
accuracy_score(y_test, y_pred)

0.61

In [83]:
# baseline model
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score

# create a dummy classifier
dummy_clf = MLPClassifier(max_iter=10000, hidden_layer_sizes=(750,1000,50))
# fit the classifier
dummy_clf.fit(X_train, y_train)

# predict the labels
y_pred = dummy_clf.predict(X_test)

# calculate the accuracy
accuracy_score(y_test, y_pred)




0.6611

In [84]:
# using a word embedding
import gensim
#load model
model = gensim.models.KeyedVectors.load_word2vec_format('glove.6B.50d.txt', no_header=True)

In [85]:
model['<pad>'] = np.zeros(50)

In [98]:
# function which converts a series of tokens to a series of vectors
def convert_to_vectors(s):
    # function which converts a list of tokens to a list of vectors
    def convert(s):
        # convert the tokens to vectors if they are in the model
        s = [model[w] for w in s if w in model]
        # convert the list to np array
        s = np.array(s)
        # calculate the mean of the vectors
        s = np.mean(s, axis=0)
        return s

    # run the function on the df with multiple threads
    n_jobs = cpu_count()
    results = run_parallel(s, n_jobs, convert)

    return results

In [99]:
df = pd.read_csv('D:/FakeNews_data/news.csv/news_cleaned_2018_02_13.csv', chunksize=10000, index_col=0)

# process the data
df = next(df)
df['content'] = clean_column(df['content'])
df['content'] = tokenize_column(df)
df['content'] = remove_stopwords(df['content'])
df['content'] = remove_punctuation(df['content'])


In [101]:
# convert the data to vectors
df['content'] = [[model[w] for w in s if w in model] for s in df['content']]

0       [[0.51491, 0.88806, -0.71906, -0.5748, 0.85655...
1       [[0.27404, -0.25123, -0.020682, -0.27062, 0.14...
2       [[-0.0097114, 1.0479, -0.15266, 0.95792, -0.64...
3       [[-0.68652, 0.80125, -0.6124, -0.1512, 0.997, ...
4       [[0.12817, 0.15858, -0.38843, -0.39108, 0.6836...
                              ...                        
9995    [[0.50801, 0.67231, -0.85555, -0.55372, 0.6621...
9996    [[0.50801, 0.67231, -0.85555, -0.55372, 0.6621...
9997    [[0.52875, 0.12491, 1.1286, -0.79976, 0.62674,...
9998    [[0.088383, 0.64673, 1.1358, -0.41847, 0.24472...
9999    [[0.26382, 0.32453, 0.74185, -0.37095, 0.65957...
Name: content, Length: 10000, dtype: object

In [None]:
# pad the series and convert them to np arrays
X_test = pad_series(X_test, 750)