In [None]:
import re
import nltk
nltk.download('stopwords')
nltk.download('punkt')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
import numpy as np
import pandas as pd
import cleantext

In [None]:
# Reading the sample set
# Loads 'news_sample.csv' (path relative to the working directory) into `df`;
# the code below cleans its "content" column in place.
df = pd.read_csv('news_sample.csv')

# Text cleaning function used for the news sample
def clean_module(text):
    """Clean `text` with `cleantext.clean` using a fixed set of options.

    Options passed through:
      * clean_all=False  - do not run every cleaning operation, only the ones below
      * extra_spaces     - remove extra white spaces
      * lowercase        - convert to lowercase
      * numbers          - remove all digits
      * punct            - remove all punctuation
      * reg              - regex matching every character that is neither a
                           word character nor whitespace
                           (NOTE(review): presumably matches are stripped - confirm
                           against the cleantext documentation)

    Returns whatever `cleantext.clean` returns for these options.
    """
    cleaning_options = dict(
        clean_all=False,
        extra_spaces=True,
        lowercase=True,
        numbers=True,
        punct=True,
        reg=r'[^\w\s]',
    )
    return cleantext.clean(text, **cleaning_options)

# Helper that delegates stop-word removal to cleantext
def rem_stopwords(text):
    """Return `text` processed by `cleantext.clean` with `stopwords=True`.

    All other `cleantext.clean` options are left at the library defaults.
    """
    without_stopwords = cleantext.clean(text, stopwords=True)
    return without_stopwords
def stem(text):
    """Return `text` processed by `cleantext.clean` with `stemming=True`.

    All other `cleantext.clean` options are left at the library defaults.
    """
    stemmed = cleantext.clean(text, stemming=True)
    return stemmed
def count_words(text):
    """Return the number of whitespace-separated tokens in `text`."""
    words = text.split()
    return len(words)

# Cleaning the "content" column using our function and tokenizing it
def _vocabulary_size(texts):
    """Return the number of DISTINCT whitespace-separated tokens in `texts`.

    A vocabulary is a set of unique words, so tokens are de-duplicated across
    all documents (a plain token count would not change when stemming).
    """
    return len({token for document in texts for token in document.split()})


def _reduction_rate(before, after):
    """Return the fraction of the vocabulary removed going from `before` to `after`.

    Returns 0.0 when `before` is 0 so an empty corpus does not divide by zero.
    """
    return 1 - after / before if before else 0.0


df['content'] = df['content'].apply(clean_module)
vocab_sum = _vocabulary_size(df['content'])
# Token lists of the cleaned text (kept for the frequency plots further down)
tokenlist = df['content'].apply(word_tokenize).tolist()
# Removing stopwords
df['content'] = df['content'].apply(rem_stopwords)
filtered_sum = _vocabulary_size(df['content'])
# Token lists after stop-word removal (also used by the frequency plots)
stopwordlist = df['content'].apply(word_tokenize).tolist()
# Stemming
df['content'] = df['content'].apply(stem)
stemmed_sum = _vocabulary_size(df['content'])
# Calculating reduction rate when stopwords have been removed and when stemming
print("Vocabulary size:", vocab_sum)
print("Vocabulary size without stopwords:", filtered_sum,
      ", resulting in reduction rate:", _reduction_rate(vocab_sum, filtered_sum))
print("Vocabulary size after stemming with no stopwords:", stemmed_sum,
      ", resulting in reduction rate:", _reduction_rate(filtered_sum, stemmed_sum))


In [None]:
from collections import Counter
import matplotlib.pyplot as plt


def _top_words(token_lists, n=50):
    """Return (labels, frequencies) of the `n` most common tokens.

    `token_lists` is a list of per-document token lists. Tokens are streamed
    straight into the Counter instead of being concatenated with
    `sum(token_lists, [])`, which copies the growing list once per document.
    """
    counts = Counter(token for tokens in token_lists for token in tokens)
    most_common = counts.most_common(n)
    labels = [word for word, _ in most_common]
    freqs = [freq for _, freq in most_common]
    return labels, freqs


def _plot_top_words(ax, token_lists, title, n=50):
    """Draw a bar chart of the `n` most common tokens on the axes `ax`."""
    labels, freqs = _top_words(token_lists, n)
    ax.bar(labels, freqs)
    ax.set(title=title, xlabel='Words', ylabel='Frequency')
    ax.tick_params(axis='x', rotation=90)


# One figure with exactly two side-by-side axes (no leftover full-size axes)
fig, (ax_before, ax_after) = plt.subplots(1, 2, figsize=(25, 6))
_plot_top_words(ax_before, tokenlist, 'Top 50 words before processing')
_plot_top_words(ax_after, stopwordlist, 'Top 50 words after removing stopwords')

# Show the figure
fig.subplots_adjust(wspace=0.15)  # adjust the width space between subplots
plt.show()

In [None]:
import pandas as pd
# Loading the Fakenews dataset and extracting 10%

# Set the file path
file_path = 'news_cleaned_2018_02_13.csv'

# Rough size of the 10% sample: number of physical lines in the file divided
# by 10. The file is opened in a `with` block so the handle is closed again.
# NOTE(review): `num_rows` is not used by any other cell shown here.
with open(file_path, encoding='utf-8') as csv_file:
    num_rows = sum(1 for _ in csv_file) // 10

# Load the data, KEEPING only every 10th row: `skiprows` skips each row index
# for which the callable returns True, i.e. every index not divisible by 10.
# Index 0 is divisible by 10, so the first line is kept and becomes the header.
dff = pd.read_csv(file_path, skiprows=lambda i: i % 10 != 0, encoding='utf-8')


In [None]:
# Cleaning function for the big dataset (replaces the sample version above)
def clean_module(text):
    """Clean `text` with `cleantext.clean`, additionally stemming and dropping stop words.

    Compared with the cleaning used on the sample, this variant also enables
    `stemming` and `stopwords` and names the stop-word language explicitly
    (`stp_lang='english'`).

    Returns whatever `cleantext.clean` returns for these options.
    """
    cleaning_options = dict(
        clean_all=False,
        extra_spaces=True,
        stemming=True,
        stopwords=True,
        lowercase=True,
        numbers=True,
        punct=True,
        reg=r'[^\w\s]',
        stp_lang='english',
    )
    return cleantext.clean(text, **cleaning_options)

# Cleaning the reduced 10% dataset
# Drop rows whose content is a float (presumably NaN for missing text - confirm)
# to avoid errors in the cleaner. `.copy()` makes the filtered frame independent,
# so the column assignment below does not raise SettingWithCopyWarning.
dff = dff.loc[dff['content'].apply(lambda x: not isinstance(x, float))].copy()
dff['content'] = dff['content'].apply(clean_module)


In [None]:
# Saving the reduced dataset to avoid the long reading and cleaning process again.
# `dff` is the cleaned 10% dataset; `df` still holds the small news sample and
# must not be written here.
dff.to_csv('10percentcleandata.csv', index=False)

In [None]:
# Reload the CSV written by the previous cell so later cells can start from disk
df = pd.read_csv('10percentcleandata.csv')

In [None]:
# Collect the distinct labels that occur in the 'type' column
unique_values = pd.unique(df['type'])

# Show them so the label mapping in the next cell can be checked against them
print(unique_values)

In [None]:
# Map every original label to "fake" or "reliable":
# eight labels collapse into "fake", "political" counts as "reliable".
mapping = dict.fromkeys(
    ('rumor', 'hate', 'unreliable', 'conspiracy',
     'clickbait', 'satire', 'bias', 'junksci'),
    'fake',
)
mapping['political'] = 'reliable'

# Apply the mapping; labels that are not keys of the mapping stay unchanged
df['type'] = df['type'].replace(mapping)
# Keep only the rows that ended up as "reliable" or "fake"
df = df[df['type'].isin(['reliable', 'fake'])]
# Shuffle (fixed seed) to avoid having only a few domains in the test set,
# since the rows are ordered
df = df.sample(frac=1.0, random_state=42).reset_index(drop=True)

In [None]:
# Simple model with only content
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Splitting point for the (reduced) fakenews dataset: first 80% train, rest test
train_size = int(len(df) * .8)

# Create a CountVectorizer object to transform the documents into a bag of words
# representation. The vocabulary is learned from the TRAINING documents only so
# that no information about the test documents leaks into the features.
vectorizer = CountVectorizer()
vectorizer.fit(df['content'][:train_size])

# Transform all documents into a matrix of word counts using that vocabulary
wordvector = vectorizer.transform(df['content'])

X_train = wordvector[:train_size]
y_train = df['type'][:train_size]

X_test = wordvector[train_size:]
y_test = df['type'][train_size:]

# Train a logistic regression model on the training data
clf = LogisticRegression()
clf.fit(X_train, y_train)

In [None]:
# These two metrics are used below; importing them here keeps the cell working
# on a fresh kernel (previously they were only imported much further down).
from sklearn.metrics import confusion_matrix, f1_score

# Predict the labels for the fakenews test data
y_pred = clf.predict(X_test)

# Calculating accuracy and f1-score of the model on the fakenews test data
simple_accuracy_fakenews = accuracy_score(y_test, y_pred)
simple_fscore_fakenews = f1_score(y_test, y_pred, pos_label = "reliable")
print("Accuracy: ", simple_accuracy_fakenews)
print("F1-Score: ", simple_fscore_fakenews)
print(confusion_matrix(y_test, y_pred))

In [None]:
# Metrics used below; imported here so the cell runs on a fresh kernel
from sklearn.metrics import confusion_matrix, f1_score

# Reading in the LIAR test set
# The file has no header row. Reading it naively turns the first record into
# the column names and drops it from the data. To keep that record, first read
# only the column names pandas would infer, then re-read the file with
# header=None and those names: the columns keep the same (odd) labels, but the
# first record is now part of the data.
liar_columns = list(pd.read_csv('test.tsv', sep='\t', nrows=0).columns)
liar = pd.read_csv('test.tsv', sep='\t', header=None, names=liar_columns)

# Column holding the statement text (named after the first record, see above)
statement_col = 'Building a wall on the U.S.-Mexico border will take literally years.'

# Mapping the six LIAR labels onto the binary labels of the simple model
mapping = {'false': 'fake',
           'half-true': 'fake',
           'pants-fire': 'fake',
           'true': 'reliable',
           'barely-true': 'fake',
           'mostly-true': 'reliable'
           }
liar['true'] = liar['true'].replace(mapping)


# Cleaning the LIAR test set with our cleaning function.
# Rows whose statement is a float are dropped first; `.copy()` makes the
# filtered frame independent so the assignment below does not raise
# SettingWithCopyWarning.
liar = liar.loc[liar[statement_col].apply(lambda x: not isinstance(x, float))].copy()
liar[statement_col] = liar[statement_col].apply(clean_module)

# Using our simple model to predict labels
X_test = vectorizer.transform(liar[statement_col])
y_pred = clf.predict(X_test)

# Calculating accuracy and f1-score of the model on the LIAR test data
simple_fscore_liar = f1_score(liar["true"], y_pred, pos_label="reliable")
simple_accuracy_liar = accuracy_score(liar["true"], y_pred)
print("Accuracy:", simple_accuracy_liar)
print("F1-Score:", simple_fscore_liar)
print(confusion_matrix(liar["true"], y_pred))

In [None]:
# Simple model with Domain
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Create a CountVectorizer object to transform the documents into a bag of words representation
vectorizer = CountVectorizer()

# Concatenate the domain information with the content of each document.
# A space separates the two so the end of the domain is not glued onto the
# first content word. The result is kept in its own variable instead of
# overwriting df['content'], so re-running this cell does not prepend the
# domain a second time.
domain_content = df['domain'] + ' ' + df['content']

train_size = int(len(df) * .8)

# Learn the vocabulary from the training documents only (no test-set leakage),
# then transform all documents into a matrix of word counts
vectorizer.fit(domain_content[:train_size])
wordvector = vectorizer.transform(domain_content)

# Report the vocabulary size rather than dumping the whole vocabulary dict
print("Vocabulary size:", len(vectorizer.vocabulary_))

X_train = wordvector[:train_size]
y_train = df['type'][:train_size]

X_test = wordvector[train_size:]
y_test = df['type'][train_size:]

# Train a logistic regression classifier on the training data
clf = LogisticRegression()
clf.fit(X_train, y_train)

In [None]:
# Run the current classifier on the held-out test matrix
y_pred = clf.predict(X=X_test)

# Share of test documents whose predicted label equals the true label
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Accuracy:", accuracy)

In [None]:
# Data for advanced model

# Start again from the reduced, cleaned fakenews dataset on disk
df = pd.read_csv('10percentcleandata.csv')

# Target label -> original labels that are merged into it
label_groups = {
    'unreliable': ('rumor', 'unreliable', 'conspiracy', 'clickbait', 'bias'),
    'fake': ('hate', 'junksci'),
    'satire': ('satire',),
}
mapping = {
    source: target
    for target, sources in label_groups.items()
    for source in sources
}
df['type'] = df['type'].replace(mapping)

# Keep only rows carrying one of the four labels used by the advanced model
df = df[df['type'].isin(['reliable', 'fake', 'unreliable', 'satire'])]
# Shuffle (fixed seed) to avoid having only a few domains in the test set,
# since the rows are ordered
df = df.sample(frac=1.0, random_state=42).reset_index(drop=True)

We chose to split our articles into four types: "unreliable" (composed of "rumor", "conspiracy", "clickbait", "bias" and "unreliable" itself), "fake" (composed of "hate", "junksci" and "fake" itself), "satire" (composed only of itself) and "reliable" (composed only of itself). We chose to drop all articles flagged as "political", since that label does not fit into any of the other categories: by itself it says nothing about the truthfulness of the article.

In [None]:
# Finding the best NN parameters
import itertools
import os

%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf

from sklearn.preprocessing import LabelBinarizer, LabelEncoder
from sklearn.metrics import confusion_matrix

from tensorflow import keras
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout
from keras.preprocessing import text, sequence
from keras import utils

# Splitting into training and test set
train_size = int(len(df) * .8)
train_content = df['content'][:train_size]
train_type = df['type'][:train_size]
test_content = df['content'][train_size:]
test_type = df['type'][train_size:]

# Defining a range of initial neuron values to try
neuron_values = [128, 256, 512, 1024, 2048]

# Set the other hyperparameters
max_words = 2000
batch_size = 1024
epochs = 1
tokenize = text.Tokenizer(num_words=max_words, char_level=False)
tokenize.fit_on_texts(train_content)
x_train = tokenize.texts_to_matrix(train_content)
x_test = tokenize.texts_to_matrix(test_content)
encoder = LabelEncoder()
encoder.fit(train_type)
y_train = encoder.transform(train_type)
y_test = encoder.transform(test_type)
num_classes = np.max(y_train) + 1
y_train = utils.to_categorical(y_train, num_classes)
y_test = utils.to_categorical(y_test, num_classes)

# Train and evaluate the models for different numbers of neurons
for num_neurons in neuron_values:
    print(f"Training model with {num_neurons} neurons...")
    # Build the model
    model = Sequential()
    model.add(Dense(num_neurons, input_shape=(max_words,)))
    model.add(Activation('relu'))
    model.add(Dense(num_classes))
    model.add(Activation('softmax'))

    model.compile(loss='categorical_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])

    history = model.fit(x_train, y_train,
                        batch_size=batch_size,
                        epochs=epochs,
                        verbose=1,
                        validation_split=0.1)

    # Evaluate the model on the test set
    score = model.evaluate(x_test, y_test,
                           batch_size=batch_size,
                           verbose=1)
    print(f"Test accuracy for {num_neurons} neurons: {score[1]}")

We see that the best model was the one with 2048 neurons, so below we train a model with 2048 neurons using a smaller batch size (500 instead of 1024) and two epochs.

In [None]:
# Full model training:
from sklearn.metrics import f1_score

batch_size = 500
epochs = 2

# Same architecture as in the search above, with a 2048-unit hidden layer
model = Sequential()
model.add(Dense(2048, input_shape=(max_words,)))
model.add(Activation('relu'))
model.add(Dense(num_classes))
model.add(Activation('softmax'))

model.compile(loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'])
history = model.fit(x_train, y_train,
    batch_size=batch_size,
    epochs=epochs,
    verbose=1,
    validation_split=0.1)

# Evaluate the model on the test set (y_test is the one-hot label matrix)
score = model.evaluate(x_test, y_test,
                       batch_size=batch_size, verbose=1)
print('Test accuracy:', score[1])

# Predicted class index per test document
test_predictions = model.predict(x_test)
test_predicted_labels = np.argmax(test_predictions, axis=1)

# Integer-encoded true labels for the F1 score. They get their own name:
# overwriting `y_test` with integers would leave it in a different format from
# the one-hot matrix `model.evaluate` above was given, so re-running this cell
# would no longer evaluate the same data.
encoder = LabelEncoder()
encoder.fit(train_type)
test_true_labels = encoder.transform(test_type)

# Calculate F1 score
test_f1 = f1_score(test_true_labels, test_predicted_labels, average='weighted')
print("Test F1 score:", test_f1)

In [None]:
#Test on Liar set
# The file has no header row. Reading it naively turns the first record into
# the column names and drops it from the data. Read the inferred column names
# first, then re-read with header=None and those names: the column labels stay
# the same as before, but the first record is kept as data.
liar_columns = list(pd.read_csv('test.tsv', sep="\t", nrows=0).columns)
liar = pd.read_csv('test.tsv', sep="\t", header=None, names=liar_columns)
def clean_module(text):
    """Clean `text` with the same `cleantext.clean` options used for the training data.

    The options mirror the cleaning function applied to the big dataset,
    including `reg=r'[^\w\s]'`, so evaluation text is normalised in the same
    way as the text the models were trained on.

    Returns whatever `cleantext.clean` returns for these options.
    """
    return cleantext.clean(
        text,
        clean_all=False,
        extra_spaces=True,
        stemming=True,
        stopwords=True,
        lowercase=True,
        numbers=True,
        punct=True,
        reg=r'[^\w\s]',
        stp_lang='english',  # Language for stop words
    )
# Map the six LIAR labels onto the labels used by the advanced model
mapping = {'false': 'fake',
           'half-true': 'unreliable',
           'pants-fire': 'fake',
           'true': 'reliable',
           'barely-true': 'unreliable',
           'mostly-true': 'reliable'
           }
liar['true'] = liar['true'].replace(mapping)

# Keep only the rows whose label (column 'true') is one of the model's labels
liar = liar[liar['true'].isin(['reliable', 'fake', 'unreliable', 'satire'])]

# Column holding the statement text
statement_col = 'Building a wall on the U.S.-Mexico border will take literally years.'

# Drop rows whose statement is a float, then clean the remaining statements.
# `.copy()` makes the filtered frame independent so the assignment below does
# not raise SettingWithCopyWarning.
liar = liar.loc[liar[statement_col].apply(lambda x: not isinstance(x, float))].copy()
liar[statement_col] = liar[statement_col].apply(clean_module)

In [None]:
from sklearn.metrics import f1_score

# Preprocess the text data with the tokenizer fitted on the training texts
new_content = liar['Building a wall on the U.S.-Mexico border will take literally years.']
new_text = tokenize.texts_to_matrix(new_content)

# Predicted class index per statement
predictions = model.predict(new_text)
predicted_labels = np.argmax(predictions, axis=1)

# Integer-encode the true labels with the same label order as in training
encoder = LabelEncoder()
encoder.fit(train_type)
true_labels = encoder.transform(liar['true'])
print(np.shape(true_labels))

# Calculate F1 score
f1 = f1_score(true_labels, predicted_labels, average='weighted')

print("F1 score:", f1)

# The one-hot width is the number of classes the encoder learned from the
# training labels. Deriving it from the largest label present in LIAR would
# give too few columns whenever the highest-indexed class is absent.
num_classes = len(encoder.classes_)
# Separate name so the fakenews `y_test` is not replaced by LIAR labels
liar_y = utils.to_categorical(true_labels, num_classes)

score = model.evaluate(new_text, liar_y,
                       batch_size=batch_size, verbose=1)
print('Test accuracy:', score[1])


In [None]:
from sklearn.metrics import confusion_matrix

# Pass every class index explicitly so the matrix always has one row and one
# column per entry of `encoder.classes_`, in that order. Without `labels`,
# classes that occur in neither the true nor the predicted labels are left out
# and the rows no longer line up with the printed class names.
class_ids = np.arange(len(encoder.classes_))
confusion_mat = confusion_matrix(true_labels, predicted_labels, labels=class_ids)
print(encoder.classes_)
print(confusion_mat)
