In [1]:
import pandas as pd

In [3]:
#pip install clean-text[GPL]

Collecting clean-text[GPL]
  Downloading clean_text-0.6.0-py3-none-any.whl (11 kB)
Collecting ftfy<7.0,>=6.0
  Downloading ftfy-6.1.1-py3-none-any.whl (53 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m53.1/53.1 kB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting emoji<2.0.0,>=1.0.0
  Downloading emoji-1.7.0.tar.gz (175 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m175.4/175.4 kB[0m [31m11.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting unidecode<2.0.0,>=1.1.1
  Downloading Unidecode-1.3.6-py3-none-any.whl (235 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m235.9/235.9 kB[0m [31m42.9 MB/s[0m eta [36m0:00:00[0m
Building wheels for collected packages: emoji
  Building wheel for emoji (setup.py) ... [?25ldone
[?25h  Created wheel for emoji: filename=emoji-1.7.0-py3-none-any.whl size=171032 sha256=0da060a50cc8f8b11680f12b43afbc6deed78cd812ae68c4aa3892daa2cfe

In [4]:
# first lets run clean_text on the 'content' column
from cleantext import clean
def clean_text(s):
    return clean(s,lower=True,                     # lowercase text
        no_urls=True,                  # replace all URLs with a special token
        no_emails=True,                # replace all email addresses with a special token
        no_numbers=True,               # replace all numbers with a special token
        replace_with_url="<URL>",
        replace_with_email="<EMAIL>",
        replace_with_number="<NUM>",
        lang="en"                   
    )



In [5]:
# split the data in chunks and run in parallel
from joblib import Parallel, delayed
from os import cpu_count

# run in parallel
def run_parallel(df, n_jobs, func):
    # call every element in the chunks in parallel
    results = Parallel(n_jobs=n_jobs)(delayed(func)(element) for element in df)
    return results

In [6]:
# clean the text
def clean_column(df):
    n_jobs = cpu_count()
    results = run_parallel(df, n_jobs, clean_text)
    # replace column with cleaned text
    return results

In [9]:
#pip install nltk

Collecting nltk
  Downloading nltk-3.8.1-py3-none-any.whl (1.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.5/1.5 MB[0m [31m10.1 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hCollecting regex>=2021.8.3
  Downloading regex-2022.10.31-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (772 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m772.3/772.3 kB[0m [31m92.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: regex, nltk
Successfully installed nltk-3.8.1 regex-2022.10.31
Note: you may need to restart the kernel to use updated packages.


In [10]:
from nltk.tokenize import word_tokenize
# tokenize the text. run in parallel
def tokenize_column(df):
    # run the function on the data
    n_jobs = cpu_count()
    results = run_parallel(df['content'], n_jobs, word_tokenize)
    
    return results

In [11]:
from nltk.corpus import stopwords
# removing generic stopwords
def remove_stopwords(df):
    stop_words = set(stopwords.words('english'))

    # remove stopwords from the text
    def r(s):
        return [w for w in s if not w in stop_words]

    # run the function on the df
    n_jobs = cpu_count()
    results = run_parallel(df, n_jobs, r)

    return results

In [12]:
# stemming the text
from nltk.stem import PorterStemmer
def stem_column(df):
    # create a stemmer
    ps = PorterStemmer()

    # stem the text
    def stem(s):
        return [ps.stem(w) for w in s]

    # run the function on the df
    n_jobs = cpu_count()
    results = run_parallel(df, n_jobs, stem)
    return results

In [13]:
# remove punctiuation
import string
def remove_punctuation(df):
    # remove punctuation
    def remove_punct(s):
        return [w for w in s if w not in string.punctuation]

    # run the function on the df
    n_jobs = cpu_count()
    results = run_parallel(df, n_jobs, remove_punct)
    return results

In [15]:
# loading a small part of the data for testing
df = pd.read_csv('news_sample.csv', nrows=10000, index_col=0)

In [16]:
from os import path
from os import remove
# create a file to be used for storing the data
def intialize_file(name):
    # check if file exists
    if path.exists(name):
        # if the file exists, delete it
        remove(name)
    # create the file
    with open(name, 'w') as f:
        f.write('') # write an empty string to the file to create it

In [17]:
# append data to csv file
def append_to_file(name, data):
    with open(name, 'a') as f:
        # write the data to the file
        f.write(data)

In [18]:
# create a file to store the data
tokenized_file = 'tokenized_temp.csv'
#intialize_file(tokenized_file)

# append header to the file
header = df.columns.values
append_to_file(tokenized_file, ','.join(header))
append_to_file(tokenized_file, '\n')

In [19]:
df = pd.read_csv('data/news_cleaned_2018_02_13.csv', chunksize=1000, index_col=0)

count = 0
for chunk in df:
    # process data
    chunk['content'] = clean_column(chunk['content'])
    chunk['content'] = tokenize_column(chunk)
    chunk['content'] = remove_stopwords(chunk['content'])
    chunk['content'] = stem_column(chunk['content'])
    chunk['content'] = remove_punctuation(chunk['content'])
    # append data to file
    chunk.to_csv(tokenized_file, mode='a', header=False)

    count += 1
    print(count, 'chunks processed')
    if count == 100:
        break


FileNotFoundError: [Errno 2] No such file or directory: 'D:/FakeNews_data/news.csv/news_cleaned_2018_02_13.csv'

In [54]:
# load from file
tokenized_file = 'tokenized_temp.csv'
df = pd.read_csv(tokenized_file)
# load list from string
from ast import literal_eval
df['content'] = df['content'].apply(literal_eval)

In [55]:

# count the tokens
def count(s):
    return Counter(s)

# count token frequency
from collections import Counter
def count_tokens(df):


    # run the function on the df
    n_jobs = cpu_count()
    results = run_parallel(df, n_jobs, count)

    # total token frequency
    total = {}
    for list in results:
        for k,v in list.items():
            if k in total:
                total[k] += v
            else:
                total[k] = v

    return total

In [56]:
token_freq = count_tokens(df['content'])

In [57]:
# sort the tokens by frequency
token_freq = sorted(token_freq.items(), key=lambda x: x[1], reverse=True)

In [58]:
# creating a set of stopwords from the frequency list
stop_words = set()

In [59]:
# find the 80% percentile
# first finding the total number of tokens
total = 0
for token in token_freq:
    total += token[1]
# then finding the 80% percentile
percentile = int(total * 0.8)
# then finding the token that corresponds to the 80% percentile
total = 0
for token in token_freq:
    total += token[1]
    if total > percentile:
        index = token_freq.index(token)
        break
# add all tokens after the 80% percentile to the stopword list
for token in token_freq[index:]:
    stop_words.add(token[0])

In [60]:
len(stop_words)

384865

In [63]:
# remove custom stopwords from the text
def remove_costume_stopwords(df):
    # remove stopwords from the text
    def r(s):
        return [w for w in s if not w in stop_words]

    # run the function on the df with multiple threads
    n_jobs = cpu_count()
    results = run_parallel(df, n_jobs, r)

    return results


In [66]:
df['content'] = [[w for w in s if not w in stop_words] for s in df['content']]

In [67]:
# save the data
df.to_csv('ready_data.csv', index=False, header=True)

In [87]:
import pandas as pd
df = pd.read_csv('ready_data.csv')
# load list from string
from ast import literal_eval
df['content'] = df['content'].apply(literal_eval)

In [69]:
# split the data into train, test and validation
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(df['content'], df['type'], test_size=0.2, random_state=42)
X_test , X_val, y_test, y_val = train_test_split(X_test, y_test, test_size=0.5, random_state=42)

In [70]:
y_test.value_counts()*8 / y_train.value_counts()

bias          0.993285
clickbait     0.970310
conspiracy    1.050946
fake          1.004703
hate          1.157447
junksci       0.991031
political     0.982002
reliable      0.548387
rumor         0.692308
satire        0.621908
unknown       1.056604
unreliable    1.195062
Name: type, dtype: float64

In [71]:
# making labels binary
y_train = y_train.apply(lambda x: 1 if x in ['fake', 'junksci', 'hate', 'clickbait',] else 0)
y_test = y_test.apply(lambda x: 1 if x in ['fake', 'junksci', 'hate', 'clickbait',] else 0)
y_val = y_val.apply(lambda x: 1 if x in ['fake', 'junksci', 'hate', 'clickbait',] else 0)

In [72]:
import numpy as np
# function which pads pandas series
def pad_series(s, max_len):
    # truncate the series if it is longer than max_len
    s = s.apply(lambda x: x[:max_len])

    # pad the series
    s = s.apply(lambda x: x + ['<pad>'] * (max_len - len(x)))

    # convert the series to np array
    s = np.array(s.tolist())
    return s

# pad the series and convert them to np arrays
X_test = pad_series(X_test, 750)
X_train = pad_series(X_train, 750)
X_val = pad_series(X_val, 750)

In [73]:
# converting the strings to integers
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

# fit the encoder on the training / testing / valdiation data
le.fit(np.concatenate((X_train.flatten(), X_test.flatten(), X_val.flatten())))

# transform the data
X_train = le.transform(X_train.flatten()).reshape(X_train.shape)
X_test = le.transform(X_test.flatten()).reshape(X_test.shape)
X_val = le.transform(X_val.flatten()).reshape(X_val.shape)


In [75]:
# making a baseline model
from sklearn.dummy import DummyClassifier
from sklearn.metrics import accuracy_score

# create a dummy classifier
dummy_clf = DummyClassifier(strategy="most_frequent")
# fit the classifier
dummy_clf.fit(X_train, y_train)

# predict the labels
y_pred = dummy_clf.predict(X_test)

# calculate the accuracy
accuracy_score(y_test, y_pred)


0.504

In [76]:
# making a baseline model
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# create a dummy classifier
dummy_clf = LogisticRegression(max_iter=10000, solver='saga')
# fit the classifier
dummy_clf.fit(X_train, y_train)

# predict the labels
y_pred = dummy_clf.predict(X_test)

# calculate the accuracy
accuracy_score(y_test, y_pred)

0.61

In [83]:
# baseline model
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score

# create a dummy classifier
dummy_clf = MLPClassifier(max_iter=10000, hidden_layer_sizes=(750,1000,50))
# fit the classifier
dummy_clf.fit(X_train, y_train)

# predict the labels
y_pred = dummy_clf.predict(X_test)

# calculate the accuracy
accuracy_score(y_test, y_pred)




0.6611

In [84]:
# using a word embedding
import gensim
#load model
model = gensim.models.KeyedVectors.load_word2vec_format('glove.6B.50d.txt', no_header=True)

In [85]:
model['<pad>'] = np.zeros(50)

In [98]:
# function which converts a series of tokens to a series of vectors
def convert_to_vectors(s):
    # function which converts a list of tokens to a list of vectors
    def convert(s):
        # convert the tokens to vectors if they are in the model
        s = [model[w] for w in s if w in model]
        # convert the list to np array
        s = np.array(s)
        # calculate the mean of the vectors
        s = np.mean(s, axis=0)
        return s

    # run the function on the df with multiple threads
    n_jobs = cpu_count()
    results = run_parallel(s, n_jobs, convert)

    return results

In [99]:
df = pd.read_csv('D:/FakeNews_data/news.csv/news_cleaned_2018_02_13.csv', chunksize=10000, index_col=0)

# process the data
df = next(df)
df['content'] = clean_column(df['content'])
df['content'] = tokenize_column(df)
df['content'] = remove_stopwords(df['content'])
df['content'] = remove_punctuation(df['content'])


In [101]:
# convert the data to vectors
df['content'] = [[model[w] for w in s if w in model] for s in df['content']]

0       [[0.51491, 0.88806, -0.71906, -0.5748, 0.85655...
1       [[0.27404, -0.25123, -0.020682, -0.27062, 0.14...
2       [[-0.0097114, 1.0479, -0.15266, 0.95792, -0.64...
3       [[-0.68652, 0.80125, -0.6124, -0.1512, 0.997, ...
4       [[0.12817, 0.15858, -0.38843, -0.39108, 0.6836...
                              ...                        
9995    [[0.50801, 0.67231, -0.85555, -0.55372, 0.6621...
9996    [[0.50801, 0.67231, -0.85555, -0.55372, 0.6621...
9997    [[0.52875, 0.12491, 1.1286, -0.79976, 0.62674,...
9998    [[0.088383, 0.64673, 1.1358, -0.41847, 0.24472...
9999    [[0.26382, 0.32453, 0.74185, -0.37095, 0.65957...
Name: content, Length: 10000, dtype: object

In [None]:
# pad the series and convert them to np arrays
X_test = pad_series(X_test, 750)

In [19]:
import pandas as pd
df = pd.read_parquet('fake_news_output.parquet')

In [20]:
# find all rows with value 500 in the column 'Unnamed: 0'
df[df['Unnamed: 0'] == 500]

Unnamed: 0.1,Unnamed: 0,id,domain,type,url,content,scraped_at,inserted_at,updated_at,title,authors,keywords,meta_keywords,meta_description,tags,summary,source
500,500,575,beforeitsnews.com,fake,http://beforeitsnews.com/arts/2018/01/chironom...,"b'[""chironomus"",""crassicaudatus"",""victor"",""moz...",2018-01-25 16:17:44.789555,2018-02-02 01:19:41.756632,2018-02-02 01:19:41.756664,Chironomus crassicaudatus by Victor Mozqueda,Baroness Photos,,[''],,,,
10500,500,11835,canadafreepress.com,conspiracy,http://canadafreepress.com/article/cable-monst...,"b'[""\'re"",""leading"",""audio/video"",""cable"",""com...",2018-01-25 16:17:44.789555,2018-02-02 01:19:41.756632,2018-02-02 01:19:41.756664,"Cable Monster Diversifies, Upgrades its Offerings","Jim Bray, Because Without America, There Is No...",,[''],,,,
20500,500,22913,beforeitsnews.com,fake,http://beforeitsnews.com/blogging-citizen-jour...,"b'[""hillary"",""clinton"",""driven"",""tears"",""kelly...",2018-01-25 16:17:44.789555,2018-02-02 01:19:41.756632,2018-02-02 01:19:41.756664,Hillary Clinton Driven to Tears By Kellyanne C...,John Ale,,[''],,,,
30500,500,34297,beforeitsnews.com,fake,http://beforeitsnews.com/survival/2017/10/tip-...,"b'[""``"",""tip"",""iceberg"",""\'\'"",""judge"",""jeanin...",2018-01-25 20:13:50.426130,2018-02-02 01:19:41.756632,2018-02-02 01:19:41.756664,“Tip of the Iceberg”: Judge Jeanine Destroys t...,,,[''],,,,
40500,500,45390,beforeitsnews.com,fake,http://beforeitsnews.com/alternative/2017/12/c...,"b'[""something"",""else"",""going-on"",""many"",""us"",""...",2018-01-25 20:13:50.426130,2018-02-02 01:19:41.756632,2018-02-02 01:19:41.756664,CP on Twitter – How to Report,Philosophers Stone,,[''],,,,
50500,500,55804,breitbart.com,political,http://www.breitbart.com/big-government/2015/0...,"b'[""black"",""activist"",""defended"",""confederate""...",2018-01-25 20:13:50.426130,2018-02-02 01:19:41.756632,2018-02-02 01:19:41.756664,"Anthony Hervey, Black Confederate Flag Support...",Lee Stranahan,,"['Anthony Hervey', 'Arlene Barnum', 'car accid...","Anthony Hervey, well known for wearing a Confe...","Anthony Hervey, Mississippi Flag, Confederate ...",,
60500,500,66272,beforeitsnews.com,fake,http://beforeitsnews.com/survival/2013/11/wher...,"b'[""decent"",""jews"",""headline"",""bitcoin"",""block...",2018-01-25 20:13:50.426130,2018-02-02 01:19:41.756632,2018-02-02 01:19:41.756664,Where Are The Decent Jews?,B Mans Revolt,,[''],,"Decent Jew Review, myths",,
70500,500,77408,beforeitsnews.com,fake,http://beforeitsnews.com/economy/2015/08/china...,"b'[""china"",""moves"",""devalue"",""yuan"",""readers"",...",2018-01-25 20:13:50.426130,2018-02-02 01:19:41.756632,2018-02-02 01:19:41.756664,China Moves to Devalue the Yuan,,,[''],,,,
80500,500,88342,beforeitsnews.com,fake,http://beforeitsnews.com/war-and-conflict/2011...,"b'[""aware"",""headline"",""bitcoin"",""blockchain"",""...",2018-01-25 20:13:50.426130,2018-02-02 01:19:41.756632,2018-02-02 01:19:41.756664,War and Conflict,Colonel Sixx,,[''],,,,
90500,500,100084,conservapedia.com,bias,http://www.conservapedia.com/Four_Modernizations,"b'[""conservapedia"",""four"",""modernizations"",""go...",2018-01-25 20:13:50.426130,2018-02-02 01:19:41.756632,2018-02-02 01:19:41.756664,Four Modernizations,,,[''],,,,


In [1]:
def tokenize_column(df):
    df['content'] = df['content'].apply(word_tokenize)
    return df

In [3]:
import pandas as pd
df = pd.read_csv('D:/FakeNews_data/news.csv/news_cleaned_2018_02_13.csv', nrows=100, index_col=0)

FileNotFoundError: [Errno 2] No such file or directory: 'D:/FakeNews_data/news.csv/news_cleaned_2018_02_13.csv'