In [34]:
# Imports

import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score
from sklearn.metrics.pairwise import cosine_similarity

In [14]:
# Load the real news dataset
real_data_frame = pd.read_csv('data/real.csv')
real_data_frame.head()

Unnamed: 0,title,text,subject,date
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017"
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,"December 29, 2017"
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politicsNews,"December 31, 2017"
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,politicsNews,"December 30, 2017"
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,politicsNews,"December 29, 2017"


In [15]:
# Load the fake news dataset
fake_data_frame = pd.read_csv('data/fake.csv')
fake_data_frame.head()

Unnamed: 0,title,text,subject,date
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017"
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017"
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017"
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017"
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017"


In [16]:
# Add labels to keep track of fake or real news
real_data_frame['label'] = 'real'
fake_data_frame['label'] = 'fake'

In [17]:
# Combine the datasets
data_frame = pd.concat([real_data_frame, fake_data_frame], ignore_index=True)
data_frame.head()

Unnamed: 0,title,text,subject,date,label
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017",real
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,"December 29, 2017",real
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politicsNews,"December 31, 2017",real
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,politicsNews,"December 30, 2017",real
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,politicsNews,"December 29, 2017",real


In [18]:
# Check if there are any null values
data_frame.isnull().sum()

title      0
text       0
subject    0
date       0
label      0
dtype: int64

In [19]:
# How many fake and how many real news we have?
data_frame['label'].value_counts()

label
fake    23481
real    21417
Name: count, dtype: int64

In [20]:
# Create a new column with text lengths and statistically describe it.
data_frame['length'] = data_frame['text'].str.len()  
data_frame['length'].describe()  

count    44898.000000
mean      2469.109693
std       2171.617091
min          1.000000
25%       1234.000000
50%       2186.000000
75%       3105.000000
max      51794.000000
Name: length, dtype: float64

In [21]:
# Split the dataset
x_train, x_test, y_train, y_test = train_test_split(data_frame['text'], data_frame['label'])

In [62]:
# Initialize a TfidfVectorizer without stop words and ignore terms that appear in more than 70% of the documents
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_df=0.7)

# Fit and transform train set, transform test set
tfidf_train = tfidf_vectorizer.fit_transform(x_train) 
tfidf_test = tfidf_vectorizer.transform(x_test)

# Initialize a LinearSVC
svc = LinearSVC(max_iter=1000) 
svc.fit(tfidf_train, y_train)

# Predict on the test set
y_pred = svc.predict(tfidf_test)

# Calculate accuracy
score = accuracy_score(y_test, y_pred)
print(f'Accuracy: {round(score*100, 2)}%')



Accuracy: 99.42%


In [38]:
# Define the label mapping
liar_label_mapping = {
    'pants-fire': 'fake',
    'false': 'fake',
    'barely-true': 'fake',
    'half-true': 'real',
    'mostly-true': 'real',
    'true': 'real'
}

# Load the train file and select the relevant columns
liar_train = pd.read_csv('data/train.tsv', sep='\t', header=None)

# Only keep the relevant columns (1 = label, 2 = statement)
liar_train = liar_train[[1, 2]]

# Assign column names
liar_train.columns = ['label', 'statement']

# Map the labels to 'fake' and 'real'
liar_train['label'] = liar_train['label'].map(liar_label_mapping)

# Filter only 'fake' and 'real' labels
liar_train = liar_train[liar_train['label'].isin(['fake', 'real'])]

In [39]:
# Load the validation file
liar_valid = pd.read_csv('data/valid.tsv', sep='\t', header=None)
liar_valid = liar_valid[[1, 2]]
liar_valid.columns = ['label', 'statement']
liar_valid['label'] = liar_valid['label'].map(liar_label_mapping)
liar_valid = liar_valid[liar_valid['label'].isin(['fake', 'real'])]

# Load the test file
liar_test = pd.read_csv('data/test.tsv', sep='\t', header=None)
liar_test = liar_test[[1, 2]]
liar_test.columns = ['label', 'statement']
liar_test['label'] = liar_test['label'].map(liar_label_mapping)
liar_test = liar_test[liar_test['label'].isin(['fake', 'real'])]

# Concatenate the train, validation, and test datasets
liar_data_frame = pd.concat([liar_train[['label', 'statement']], liar_valid[['label', 'statement']], liar_test[['label', 'statement']]], ignore_index=True)

In [27]:
print(f"Real & Fake dataset size: {data_frame.shape}")
print(f"LIAR dataset size: {liar_data_frame.shape}")

Real & Fake dataset size: (44898, 6)
LIAR dataset size: (12791, 2)


In [29]:
# Distribution of labels in the real & fake dataset
print(data_frame['label'].value_counts())

# Distribution of labels in the LIAR dataset
print(liar_data_frame['label'].value_counts())

label
fake    23481
real    21417
Name: count, dtype: int64
label
real    7134
fake    5657
Name: count, dtype: int64


In [42]:
# Now test on the LIAR dataset
tfidf_liar_train = tfidf_vectorizer_real_fake.transform(liar_data_frame['statement'])
y_pred_liar = svc_real_fake.predict(tfidf_liar_train)

# Evaluate on the LIAR dataset
liar_score = accuracy_score(liar_data_frame['label'], y_pred_liar)

In [60]:
# Compare accuray
print(f"Accuracy on LIAR dataset: {round(liar_score * 100, 2)}%")
print(f"Accuracy on real & fake dataset: {round(score_real_fake * 100, 2)}%")

Accuracy on LIAR dataset: 44.7%
Accuracy on real & fake dataset: 99.53%


In [35]:
# Combine the text from both datasets
combined_texts = pd.concat([data_frame['text'], liar_data_frame['statement']], ignore_index=True)

# Vectorize the combined texts
tfidf_combined = TfidfVectorizer(stop_words='english', max_df=0.7)
tfidf_combined_matrix = tfidf_combined.fit_transform(combined_texts)

# Calculate cosine similarity between the real/fake and liar dataset
cosine_sim = cosine_similarity(tfidf_combined_matrix[:len(data_frame)], tfidf_combined_matrix[len(data_frame):])

# Calculate the average cosine similarity
avg_cosine_sim = cosine_sim.mean()
print(f"Average cosine similarity between the datasets: {avg_cosine_sim}")

Average cosine similarity between the datasets: 0.007153156010399193
