In [1]:
import pandas as pd
from langdetect import detect, LangDetectException
import numpy as np
from sklearn.model_selection import train_test_split
import dask.dataframe as dd

In [2]:
# Function to check if a value is a number
def is_number(value):
    try:
        float(value)
        return True
    except ValueError:
        return False
    
# Function to detect the language of the text
def detect_language(text):
    try:
        return detect(text)
    except LangDetectException:
        return 'unknown'
    
# Function to count the number of words in a text
def word_count(text):
    return len(text.split())

### Process FNC Data

In [3]:
# Load the CSV file
df_1 = pd.read_csv('Data/Fake News Competition/train.csv')
df_2 = pd.read_csv('Data/Fake News Competition/test.csv')
df_3 = pd.read_csv('Data/Fake News Competition/submit.csv')

df_2.set_index('id', inplace=True)
df_2['label'] = np.nan
df_3.set_index('id', inplace=True)

df_2['label'].fillna(df_3['label'], inplace=True)
df_2.reset_index(inplace=True)

df = pd.concat([df_1, df_2], axis=0)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_2['label'].fillna(df_3['label'], inplace=True)


In [4]:
# Remove rows with no labels
df_filtered = df[df['label'].apply(is_number)]

# Remove rows without text
df_filtered = df_filtered[~df_filtered['text'].isna()]

# Remove non-english entry
df_filtered['language'] = df_filtered['text'].apply(detect_language)
df_filtered = df_filtered[df_filtered['language'] == 'en']

# Remove rows where the 'text' column has fewer than 20 words
df_filtered = df_filtered[df_filtered['text'].apply(word_count) >= 20]

# Drop language column
df_filtered_final = df_filtered.drop('language', axis=1)

# Save the cleaned DataFrame to a new CSV file
# df_filtered_final.to_csv('FNC_data_clean.csv', index=False)

In [5]:
df_FNC_master = pd.read_csv('FNC_data_clean.csv')

# Split data into 70% training and 30% temporary sets
train_df, temp_df = train_test_split(df_FNC_master, test_size=0.3, random_state=42)

# Split the temporary set into 50% validation and 50% test sets (0.15 each of the original data)
val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)

# Verify the lengths of each set
print(f'Training set length: {len(train_df)}')
print(f'Validation set length: {len(val_df)}')
print(f'Test set length: {len(test_df)}')

# Save the dataset
# train_df.to_csv('Model/Data_indi/FNC/train_df.csv')
# val_df.to_csv('Model/Data_indi/FNC/val_df.csv')
# test_df.to_csv('Model/Data_indi/FNC/test_df.csv')

Training set length: 17361
Validation set length: 3720
Test set length: 3721


### Processing ISOT Data

In [None]:
# ISOT dataset
df_ISOT_True = pd.read_csv("Data/ISOT Dataset/True_manually_cleaned.csv")
df_ISOT_Fake = pd.read_csv("Data/ISOT Dataset/Fake_manually_cleaned_new.csv")

df_ISOT = pd.concat([df_ISOT_True, df_ISOT_Fake], axis=0)

In [None]:
# Combine ISOT and Fake News
df_FNC_master_clean = df_FNC_master[['title', 'text', 'label']]
df_ISOT_clean = df_ISOT[['title', 'text', 'label']]

df_master = pd.concat([df_ISOT_clean, df_FNC_master_clean], axis=0)

# Split data into 70% training and 30% temporary sets
train_df, temp_df = train_test_split(df_master, test_size=0.3, random_state=42)

# Split the temporary set into 50% validation and 50% test sets (0.15 each of the original data)
val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)

# Verify the lengths of each set
print(f'Training set length: {len(train_df)}')
print(f'Validation set length: {len(val_df)}')
print(f'Test set length: {len(test_df)}')

In [None]:
# Save the cleaned DataFrame to a new CSV file
df_master.to_csv('FNC+ISOT_data_clean.csv', index=False)

# Save the dataset
train_df.to_csv('Model/Data_indi/ISOT/train_df.csv')
val_df.to_csv('Model/Data_indi/ISOT/val_df.csv')
test_df.to_csv('Model/Data_indi/ISOT/test_df.csv')

### Processing FakeNewsCorpus Data

In [None]:
# Process the data in batch
i = 0
sampled_rows = []
for chunk in pd.read_csv('Data/news_cleaned_2018_02_13.csv',chunksize = 50000, usecols= ['id', 'type', 'content', 'title'],
                 lineterminator='\n'):
    i += 1
    print(i)
    sampled_chunk = chunk.sample(n=1000, random_state=i)
    sampled_rows.append(sampled_chunk)

sampled_df = pd.concat(sampled_rows, ignore_index=True) # Combine all sampled rows into a single DataFrame

In [None]:
# Save the cleaned DataFrame to a new CSV file
sampled_df.to_csv('news_data_raw.csv', index=False)

In [6]:
sampled_df = pd.read_csv('news_data_raw.csv')

In [None]:
# Check data classfication
pd.unique(sampled_df['type'])

In [None]:
# Remove rows without text
df_news_filtered = sampled_df[~sampled_df['type'].isna()]
df_news_filtered = df_news_filtered[~df_news_filtered['title'].isna()]
df_news_filtered = df_news_filtered[~df_news_filtered['content'].isna()]

# Normalise catagory
label_mapping = {'fake': 1, 
                 'political': 2, 
                 'unreliable': 2, 
                 'conspiracy': 1, 
                 'unknown': 2,
                 'bias': 2, 
                 'hate': 2, 
                 'junksci': 1, 
                 'reliable': 0, 
                 'clickbait': 2, 
                 'satire': 2,
                 'rumor': 1}

df_news_filtered['label'] = df_news_filtered['type'].replace(label_mapping)

In [None]:
# Extract the reliable entries
df_news_filtered_fake = df_news_filtered[df_news_filtered['label'] == 1]
# Remove non-english entry
df_news_filtered_fake['language'] = df_news_filtered_fake['content'].apply(detect_language)
df_news_filtered_fake = df_news_filtered_fake[df_news_filtered_fake['language'] == 'en']

# Remove rows where the 'content' column has fewer than 20 words
df_news_filtered_fake = df_news_filtered_fake[df_news_filtered_fake['content'].apply(word_count) >= 20]

In [9]:
# Extract the reliable entries
df_news_filtered_reliable = df_news_filtered[df_news_filtered['label'] == 0]
# Remove non-english entry
df_news_filtered_reliable['language'] = df_news_filtered_reliable['content'].apply(detect_language)
df_news_filtered_reliable = df_news_filtered_reliable[df_news_filtered_reliable['language'] == 'en']

# Remove rows where the 'content' column has fewer than 20 words
df_news_filtered_reliable = df_news_filtered_reliable[df_news_filtered_reliable['content'].apply(word_count) >= 20]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_news_filtered_reliable['language'] = df_news_filtered_reliable['content'].apply(detect_language)


In [None]:
# Rename the column
df_news_filtered_reliable = df_news_filtered_reliable.rename(columns={'content': 'text'})
df_news_filtered_fake = df_news_filtered_fake.rename(columns={'content': 'text'})

In [None]:
# Select 25,000 from both df
df_news_filtered_reliable_select = df_news_filtered_reliable.sample(n=25000, random_state=42)
df_news_filtered_fake_select = df_news_filtered_fake.sample(n=25000, random_state=42)

# Only keep columns needed
df_news_filtered_reliable_select_clean = df_news_filtered_reliable_select[['title', 'text', 'label']]
df_news_filtered_fake_select_clean = df_news_filtered_fake_select[['title', 'text', 'label']]

# Combine fake and reliable df
df_news_clean = pd.concat([df_news_filtered_reliable_select_clean, df_news_filtered_fake_select_clean], axis=0)

In [None]:
# Split data into 70% training and 30% temporary sets
train_df, temp_df = train_test_split(df_news_clean, test_size=0.3, random_state=42)

# Split the temporary set into 50% validation and 50% test sets (0.15 each of the original data)
val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)

# Verify the lengths of each set
print(f'Training set length: {len(train_df)}')
print(f'Validation set length: {len(val_df)}')
print(f'Test set length: {len(test_df)}')

# Save the dataset
train_df.to_csv('Model/Data_indi/FNC/train_df.csv')
val_df.to_csv('Model/Data_indi/FNC/val_df.csv')
test_df.to_csv('Model/Data_indi/FNC/test_df.csv')

### Combine all datasets

In [None]:
# Combine all df
df_master = pd.concat([df_ISOT_clean, df_FNC_master_clean, df_news_clean], axis=0)

df_master.replace('â€™', "'", regex=True, inplace=True)
df_master.replace('â€˜', "‘", regex=True, inplace=True)
df_master.replace('â€œ', "“", regex=True, inplace=True)
df_master.replace('â€¦', "...", regex=True, inplace=True)
df_master.replace('â€\x9d', "", regex=True, inplace=True)

# Split data into 70% training and 30% temporary sets
train_df, temp_df = train_test_split(df_master, test_size=0.3, random_state=42)

# Split the temporary set into 50% validation and 50% test sets (0.15 each of the original data)
val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)

In [None]:
# Write csv
train_df.to_csv('train_df.csv', index=False)
val_df.to_csv('val_df.csv', index=False)
test_df.to_csv('test_df.csv', index=False)

In [None]:
# Check data
df_test = pd.read_csv('val_df_clean.csv')