In [1]:
pip install shap transformers torch




In [2]:
import shap
import torch
from transformers import BertTokenizer, BertForSequenceClassification
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
from torch.utils.data import DataLoader, TensorDataset

# Load datasets
df_true = pd.read_csv("../data/True.csv")
df_fake = pd.read_csv("../data/Fake.csv")

# Examine the shape of the dataframes
print("Shape of df_fake:", df_fake.shape)
print("Shape of df_true:", df_true.shape)

# Inspect the columns
print("\nColumns of df_fake:", df_fake.columns.values)
print("Columns of df_true:", df_true.columns.values)

# Check data types
print("\nData types of df_fake:\n", df_fake.dtypes)
print("\nData types of df_true:\n", df_true.dtypes)

# Investigate missing values
print("\nMissing values in df_fake:\n", df_fake.isnull().sum())
print("\nMissing values in df_true:\n", df_true.isnull().sum())

# Calculate descriptive statistics for numerical columns (text length)
print("\nAverage text length in df_fake:", df_fake['text'].str.len().mean())
print("Average text length in df_true:", df_true['text'].str.len().mean())

# Analyze subject distributions
print("\nSubject distribution in df_fake:\n", df_fake['subject'].value_counts())
print("\nSubject distribution in df_true:\n", df_true['subject'].value_counts())

# Analyze date ranges (basic check)
print("\nEarliest date in df_fake:", df_fake['date'].min())
print("Latest date in df_fake:", df_fake['date'].max())
print("\nEarliest date in df_true:", df_true['date'].min())
print("Latest date in df_true:", df_true['date'].max())



Shape of df_fake: (23481, 4)
Shape of df_true: (21417, 4)

Columns of df_fake: ['title' 'text' 'subject' 'date']
Columns of df_true: ['title' 'text' 'subject' 'date']

Data types of df_fake:
 title      object
text       object
subject    object
date       object
dtype: object

Data types of df_true:
 title      object
text       object
subject    object
date       object
dtype: object

Missing values in df_fake:
 title      0
text       0
subject    0
date       0
dtype: int64

Missing values in df_true:
 title      0
text       0
subject    0
date       0
dtype: int64

Average text length in df_fake: 2547.396235254035
Average text length in df_true: 2383.278517065882

Subject distribution in df_fake:
 News               9050
politics           6841
left-news          4459
Government News    1570
US_News             783
Middle-east         778
Name: subject, dtype: int64

Subject distribution in df_true:
 politicsNews    11272
worldnews       10145
Name: subject, dtype: int64

Earlies

In [3]:
# Handle the invalid dates in df_fake
# Given the large number, dropping rows might introduce bias
# Drop rows with invalid dates

# Remove duplicated rows from df_fake_cleaned and keep the first occurrence
df_fake_cleaned = df_fake.drop_duplicates(keep='first')

# Remove duplicated rows from df_true and keep the first occurrence
df_true = df_true.drop_duplicates(keep='first')

# Remove rows with empty strings in the 'text' column from df_fake_cleaned
df_fake_cleaned = df_fake_cleaned[df_fake_cleaned['text'].str.strip() != '']

# Remove rows with empty strings in the 'text' column from df_true
df_true = df_true[df_true['text'].str.strip() != '']


# Re-check the date range and missing values
print("\nDate range analysis after cleaning:")
print("Earliest date in df_fake_cleaned:", df_fake_cleaned['date'].min())
print("Latest date in df_fake_cleaned:", df_fake_cleaned['date'].max())

print("\nNull values in 'date' column after cleaning:")
print("df_fake_cleaned:", df_fake_cleaned['date'].isnull().sum())
print("df_true:", df_true['date'].isnull().sum())
# Explore other potential issues
# Check for duplicated rows
print("\nNumber of duplicated rows in df_fake_cleaned:", df_fake_cleaned.duplicated().sum())
print("Number of duplicated rows in df_true:", df_true.duplicated().sum())

# Further analysis
# Example: Check for empty strings in 'text' columns
print("\nEmpty strings in 'text' column:")
print("df_fake_cleaned:", df_fake_cleaned['text'].str.strip().eq('').sum())
print("df_true:", df_true['text'].str.strip().eq('').sum())

# Examine the shape of the dataframes
print("\nShape of df_fake:", df_fake_cleaned.shape)
print("Shape of df_true:", df_true.shape)


Date range analysis after cleaning:
Earliest date in df_fake_cleaned: 14-Feb-18
Latest date in df_fake_cleaned: https://fedup.wpengine.com/wp-content/uploads/2015/04/hillarystreetart.jpg

Null values in 'date' column after cleaning:
df_fake_cleaned: 0
df_true: 0

Number of duplicated rows in df_fake_cleaned: 0
Number of duplicated rows in df_true: 0

Empty strings in 'text' column:
df_fake_cleaned: 0
df_true: 0

Shape of df_fake: (22848, 4)
Shape of df_true: (21210, 4)


In [4]:
# Analyze subject distributions after cleaning df_fake
print("\nSubject distribution in df_fake_cleaned:\n", df_fake_cleaned['subject'].value_counts(normalize=True))
print("\nSubject distribution in df_true:\n", df_true['subject'].value_counts(normalize=True))


from collections import Counter

# Function to find top keywords
def find_top_keywords(df, top_n=10):
    all_words = []
    for text in df['text']:
      all_words.extend(text.lower().split())
    word_counts = Counter(all_words)
    return word_counts.most_common(top_n)


# Find top keywords in each dataset
top_keywords_fake = find_top_keywords(df_fake_cleaned)
top_keywords_true = find_top_keywords(df_true)

print("\nTop keywords in fake news:")
print(top_keywords_fake)

print("\nTop keywords in true news:")
print(top_keywords_true)


Subject distribution in df_fake_cleaned:
 News               0.396096
politics           0.281425
left-news          0.188594
Government News    0.065564
US_News            0.034270
Middle-east        0.034051
Name: subject, dtype: float64

Subject distribution in df_true:
 politicsNews    0.528949
worldnews       0.471051
Name: subject, dtype: float64

Top keywords in fake news:
[('the', 525499), ('to', 288545), ('of', 235142), ('and', 222323), ('a', 209585), ('in', 162835), ('that', 144891), ('s', 128330), ('is', 107714), ('for', 91063)]

Top keywords in true news:
[('the', 471986), ('to', 241454), ('of', 202253), ('a', 194234), ('and', 178653), ('in', 177291), ('on', 106376), ('that', 83948), ('for', 78206), ('said', 71167)]


In [None]:
df_fake_cleaned.to_csv('../data/fake_cleaned.csv', index=False)  # Save to a new file
df_true.to_csv('../data/true_cleaned.csv', index=False)  # Save to a new file

In [6]:
# new_true_df = pd.read_csv("fake_cleaned.csv")
# new_fake_df = pd.read_csv("true_cleaned.csv")