In [1]:
from sklearn.metrics import f1_score
from sklearn.dummy import DummyClassifier
from sklearn.metrics import accuracy_score

def dummy_model(X_train, y_train, X_test, y_test):
    # making a baseline model

    # create a dummy classifier
    dummy_clf = DummyClassifier(strategy="most_frequent")
    # fit the classifier
    dummy_clf.fit(X_train, y_train)

    # predict the labels
    y_pred = dummy_clf.predict(X_test)

    # calculate the accuracy
    print(accuracy_score(y_test, y_pred))
    print(f1_score(y_test, y_pred, average='micro'))

    return dummy_clf

In [2]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
def logistic_regression(X_train, y_train, X_test, y_test, solver, penalty, c):
    # making a baseline model

    # create a dummy classifier
    dummy_clf = LogisticRegression(max_iter=10000, solver=solver, penalty=penalty, C=c)
    # fit the classifier
    dummy_clf.fit(X_train, y_train)

    # predict the labels
    y_pred = dummy_clf.predict(X_test)

    # calculate the accuracy
    print(f'Accuracy of {solver} with penalty {penalty} with c = {c}: {accuracy_score(y_test, y_pred)}')
    print(f1_score(y_test, y_pred, average='micro'))

    return dummy_clf

In [3]:
def logistic_regression_none(X_train, y_train, X_test, y_test, solver):
    # making a baseline model

    # create a dummy classifier
    dummy_clf = LogisticRegression(max_iter=10000, solver=solver)
    # fit the classifier
    dummy_clf.fit(X_train, y_train)

    # predict the labels
    y_pred = dummy_clf.predict(X_test)

    # calculate the accuracy
    print(f'Accuracy of {solver} without penalty: {accuracy_score(y_test, y_pred)}')
    print(f1_score(y_test, y_pred, average='micro'))

    return dummy_clf

In [4]:
import pandas as pd
import re
def load_data(input, nrows=None, columns=None):
    # load from csv
    df = pd.read_csv(input, nrows=nrows, usecols=columns)
    pattern = re.compile(r'(?:\'|")([a-z0-9"\.+-/_=:`|~Â©â„¢Â®â–ªâ™¦ðŸ™‚ ]*)(?:\'|"),')
    df['content'] = df['0'].apply(lambda x: pattern.findall(x))
    df.drop(['0'], axis=1, inplace=True)
    return df

In [5]:
from sklearn.model_selection import train_test_split
def split_data(input, answers, test_size=0.1, val_size=0.1):
    # split the data into train and test
    X_train, X_test, y_train, y_test = train_test_split(input, answers, test_size=test_size+val_size, random_state=42)
    # split the test data into test and validation
    X_test, X_val, y_test, y_val = train_test_split(X_test, y_test, test_size=val_size/(test_size+val_size), random_state=42)
    return X_train, X_test, y_train, y_test, X_val, y_val

In [6]:
def pad_series(series, maxlen):
    # truncate the series
    series = series.apply(lambda x: x[:maxlen])
    # pad the series
    return series.apply(lambda x: x + ['']*(maxlen-len(x)))

In [7]:
def load_vocab(input):
    vocab = pd.read_pickle('D:/Fake News Project/file.pkl')
    # the vocab is a list of tuples (word, count)
    # filter out words that appear less than 2000 times
    vocab = [word for word, count in vocab if count > 2000]
    return vocab

In [8]:
from sklearn.feature_extraction.text import CountVectorizer
bow_converter = CountVectorizer(vocabulary=load_vocab('D:/Fake News Project/file.pkl'))
def bow_transformer(series):
    # transform the series into bag of words
    return bow_converter.transform(series.apply(lambda x: ' '.join(x)))




In [46]:
#loading data
df = load_data('fake_news_cleaned_filtered.csv', nrows=1000)  


In [47]:
types_df =  pd.read_csv('fake_news_cleaned.csv', nrows=1000, usecols=['type'], dtype={'tags': 'object', 'type': pd.api.types.CategoricalDtype(['unreliable', 'fake', 'clickbait', 'conspiracy', 'reliable', 'bias', 'hate', 'junksci', 'political'])})
df['type'] = types_df['type']

In [48]:
def labelEncoder(label):
    if label in ['bias', 'clickbait', 'conspiracy', 'political', 'reliable', 'rumor', 'satire']:
        return(0)
    else:
        return(1)


In [49]:
# convert types to unique numbers
df = df.dropna(subset=['type'])
df['type2'] = df['type'].apply(labelEncoder)

In [50]:
bow = bow_transformer(df['content'])

In [51]:
#splitting data
X_train, X_test, y_train, y_test, X_val, y_val = split_data(bow, df['type2'], test_size=0.1, val_size=0.1)

In [52]:
dummy_model(X_train, y_train, X_test, y_test)

0.4659090909090909
0.4659090909090909


In [53]:
logistic_regression_none(X_train, y_train, X_val, y_val, 'lbfgs')
logistic_regression(X_train, y_train, X_val, y_val, 'lbfgs', 'l2', 0.001)
logistic_regression(X_train, y_train, X_val, y_val, 'lbfgs', 'l2', 0.01)
logistic_regression(X_train, y_train, X_val, y_val, 'lbfgs', 'l2', 0.1)
logistic_regression(X_train, y_train, X_val, y_val, 'lbfgs', 'l2', 1)
logistic_regression_none(X_train, y_train, X_val, y_val, 'liblinear')
logistic_regression(X_train, y_train, X_val, y_val, 'liblinear', 'l2', 0.001)
logistic_regression(X_train, y_train, X_val, y_val, 'liblinear', 'l2', 0.01)
logistic_regression(X_train, y_train, X_val, y_val, 'liblinear', 'l2', 0.1)
logistic_regression(X_train, y_train, X_val, y_val, 'liblinear', 'l2', 1)
logistic_regression_none(X_train, y_train, X_val, y_val, 'newton-cg')
logistic_regression(X_train, y_train, X_val, y_val, 'newton-cg', 'l2', 0.001)
logistic_regression(X_train, y_train, X_val, y_val, 'newton-cg', 'l2', 0.01)
logistic_regression(X_train, y_train, X_val, y_val, 'newton-cg', 'l2', 0.1)
logistic_regression(X_train, y_train, X_val, y_val, 'newton-cg', 'l2', 1)
logistic_regression_none(X_train, y_train, X_val, y_val, 'sag')
logistic_regression(X_train, y_train, X_val, y_val, 'sag', 'l2', 0.001)
logistic_regression(X_train, y_train, X_val, y_val, 'sag', 'l2', 0.01)
logistic_regression(X_train, y_train, X_val, y_val, 'sag', 'l2', 0.1)
logistic_regression(X_train, y_train, X_val, y_val, 'sag', 'l2', 1)
logistic_regression_none(X_train, y_train, X_val, y_val, 'saga')
logistic_regression(X_train, y_train, X_val, y_val, 'saga', 'l2', 0.001)
logistic_regression(X_train, y_train, X_val, y_val, 'saga', 'l2', 0.01)
logistic_regression(X_train, y_train, X_val, y_val, 'saga', 'l2', 0.1)
logistic_regression(X_train, y_train, X_val, y_val, 'saga', 'l2', 1)


Accuracy of lbfgs without penalty: 0.797752808988764
0.797752808988764
Accuracy of lbfgs with penalty l2 with c = 0.001: 0.7865168539325843
0.7865168539325842
Accuracy of lbfgs with penalty l2 with c = 0.01: 0.8426966292134831
0.842696629213483
Accuracy of lbfgs with penalty l2 with c = 0.1: 0.8314606741573034
0.8314606741573034
Accuracy of lbfgs with penalty l2 with c = 1: 0.797752808988764
0.797752808988764
Accuracy of liblinear without penalty: 0.797752808988764
0.797752808988764
Accuracy of liblinear with penalty l2 with c = 0.001: 0.8651685393258427
0.8651685393258428
Accuracy of liblinear with penalty l2 with c = 0.01: 0.8314606741573034
0.8314606741573034
Accuracy of liblinear with penalty l2 with c = 0.1: 0.8089887640449438
0.8089887640449437
Accuracy of liblinear with penalty l2 with c = 1: 0.797752808988764
0.797752808988764
Accuracy of newton-cg without penalty: 0.797752808988764
0.797752808988764
Accuracy of newton-cg with penalty l2 with c = 0.001: 0.7865168539325843
0.786

In [64]:
model_used_logit = logistic_regression(X_train, y_train, X_test, y_test, 'liblinear', 'l2', 0.001)
model_used_dummy = dummy_model(X_train, y_train, X_test, y_test)

Accuracy of liblinear with penalty l2 with c = 0.001: 0.7159090909090909
0.715909090909091
0.4659090909090909
0.4659090909090909


In [24]:
#Preprocessing:
#Text Cleaning
from cleantext import clean
#Text Tokenization
from nltk.tokenize import word_tokenize
import nltk
#Text Stopwords
from nltk.corpus import stopwords
#Text lemmetization
from nltk.stem import WordNetLemmatizer
#Text punctiuation
import string
#Plotting
import matplotlib.pyplot as plt

In [25]:
#Clean text:
def clean_text(text):
    return clean(text,lower=True,                     # lowercase text
        no_urls=True,                  # replace all URLs with a special token
        no_emails=True,                # replace all email addresses with a special token
        no_numbers=True,               # replace all numbers with a special token
        replace_with_url="<URL>",
        replace_with_email="<EMAIL>",
        replace_with_number="<NUM>",
        lang="en"                   
    )

In [26]:
#Remove stopwords text
def remove_stopwords(list_of_words):
    stop_words = set(stopwords.words('english'))

    return(list(filter(lambda word: word not in stop_words, list_of_words)))

In [27]:
#Lemmetize text
def lemmatize(list_of_words):
    lemmatizer = WordNetLemmatizer()

    return(list(lemmatizer.lemmatize(word) for word in list_of_words))

In [28]:
# remove punctiuation
def remove_punctuation(list_of_words):
    return(list(filter(lambda word: word not in string.punctuation, list_of_words)))

In [29]:
def labelEncoder2(label):
    if label in ['true', 'mostly-true', 'half-true']:
        return(0)
    else:
        return(1)

In [59]:
import dask.dataframe as dd
from dask.diagnostics import ProgressBar
ddf = dd.read_csv('test.tsv', sep='\t', blocksize="1GB", names=['json', 'type', 'content', 'a', 'b', 'c', 'd', 'e', 'a1', 'b1', 'c1', 'd1', 'e1', 'a2'], on_bad_lines='skip', engine='python')#dtype={'tags': 'object', 'type': pd.api.types.CategoricalDtype(['unreliable', 'fake', 'clickbait', 'conspiracy', 'reliable', 'bias', 'hate', 'junksci', 'political'])}, usecols=["type", "content"]
ddf = ddf.dropna(subset=['type'])
#Convert Type to Number
#ddf['y_data'] = ddf.type.cat.codes
ddf['y_data'] = ddf.type.apply(labelEncoder2, meta=('type', pd.api.types.CategoricalDtype(['unreliable', 'fake', 'clickbait', 'conspiracy', 'reliable', 'bias', 'hate', 'junksci', 'political'])))
#Clean content
ddf['content_clean'] = ddf.content.apply(clean_text, meta=('content', 'object'))
#Tokenize content
ddf['content_tokenized'] = ddf.content_clean.apply(word_tokenize, meta=('content_clean', 'object'))
#Stopwords content
ddf['content_stopwords'] = ddf.content_tokenized.apply(remove_stopwords, meta=('content_tokenized', 'object'))
#Lemmetize content
ddf['content_lemmetize'] = ddf.content_stopwords.apply(lemmatize, meta=('content_stopwords', 'object'))
#Punctuation content(Cleaned Content)
ddf['content_cleaned'] = ddf.content_lemmetize.apply(remove_punctuation, meta=('content_lemmetize', 'object'))
#x_data content(Cleaned Content)
ddf['x_data'] = ddf.content_cleaned.apply(lambda words: ' '.join(words), meta=('content_cleaned', 'object'))

ProgressBar().register()

In [69]:
result = ddf.compute()

[########################################] | 100% Completed | 755.45 ms


In [71]:
bow = bow_transformer(result.content_cleaned)

In [72]:
predictions_dummy = model_used_dummy.predict(bow)
predictions_logit = model_used_logit.predict(bow)
print(f1_score(result.y_data, predictions_dummy))
print(f1_score(result.y_data, predictions_logit))

0.6078323221180364
0.5464968152866242
