In [11]:
import re
import os
import json
import itertools
import pandas as pd
from sklearn.model_selection import train_test_split

### Reading Dataset

In [12]:
print(os.getcwd())
Data_Path = '../Fake_News_Corpora'

D:\MSAI\Thesis and Project\Mistral\Data\Cleaning_Code


In [13]:
# Read Original Romour-Eval Dataset
Article_Body = pd.read_csv(Data_Path + '/train_bodies.csv')
Claim_Stance = pd.read_csv(Data_Path + '/train_stances.csv')

In [14]:
Article_Body

Unnamed: 0,Body ID,articleBody
0,0,A small meteorite crashed into a wooded area i...
1,4,Last week we hinted at what was to come as Ebo...
2,5,(NEWSER) – Wonder how long a Quarter Pounder w...
3,6,"Posting photos of a gun-toting child online, I..."
4,7,At least 25 suspected Boko Haram insurgents we...
...,...,...
1678,2528,Intelligence agencies hunting for identity of ...
1679,2529,"While Daleks ""know no fear"" and ""must not fear..."
1680,2530,More than 200 schoolgirls were kidnapped in Ap...
1681,2531,A Guantanamo Bay prisoner released last year a...


In [15]:
# Body ID is same as the Body ID of dataset Article_Body
Claim_Stance

Unnamed: 0,Headline,Body ID,Stance
0,Police find mass graves with at least '15 bodi...,712,unrelated
1,Hundreds of Palestinians flee floods in Gaza a...,158,agree
2,"Christian Bale passes on role of Steve Jobs, a...",137,unrelated
3,HBO and Apple in Talks for $15/Month Apple TV ...,1034,unrelated
4,Spider burrowed through tourist's stomach and ...,1923,disagree
...,...,...,...
49967,Urgent: The Leader of ISIL 'Abu Bakr al-Baghda...,1681,unrelated
49968,Brian Williams slams social media for speculat...,2419,unrelated
49969,Mexico Says Missing Students Not Found In Firs...,1156,agree
49970,US Lawmaker: Ten ISIS Fighters Have Been Appre...,1012,discuss


In [16]:
# We only want the examples where the Stance is either Agree or Disagree
Claim_Stance['Stance'].value_counts()

Stance
unrelated    36545
discuss       8909
agree         3678
disagree       840
Name: count, dtype: int64

In [17]:
Claim_Stance_Filtered = Claim_Stance[Claim_Stance['Stance'].isin(['agree', 'disagree'])]

In [18]:
Claim_Stance_Filtered

Unnamed: 0,Headline,Body ID,Stance
1,Hundreds of Palestinians flee floods in Gaza a...,158,agree
4,Spider burrowed through tourist's stomach and ...,1923,disagree
5,'Nasa Confirms Earth Will Experience 6 Days of...,154,agree
8,Banksy 'Arrested & Real Identity Revealed' Is ...,1739,agree
11,Woman detained in Lebanon is not al-Baghdadi's...,1468,agree
...,...,...,...
49920,Tropical spider burrows under man's skin throu...,1883,agree
49922,"Tour, agent: Tiger Woods not banned",757,agree
49925,Meteorite leaves crater in Nicaraguan capital ...,1913,agree
49927,ESPN to save NFL's image with all-male domesti...,1500,disagree


### Merging Dataset

In [19]:
Final_Data = Claim_Stance_Filtered.merge(Article_Body[['Body ID', 'articleBody']], on='Body ID', how='left') # Left Join

In [20]:
Final_Data

Unnamed: 0,Headline,Body ID,Stance,articleBody
0,Hundreds of Palestinians flee floods in Gaza a...,158,agree,Hundreds of Palestinians were evacuated from t...
1,Spider burrowed through tourist's stomach and ...,1923,disagree,"Fear not arachnophobes, the story of Bunbury's..."
2,'Nasa Confirms Earth Will Experience 6 Days of...,154,agree,Thousands of people have been duped by a fake ...
3,Banksy 'Arrested & Real Identity Revealed' Is ...,1739,agree,If you’ve seen a story floating around on your...
4,Woman detained in Lebanon is not al-Baghdadi's...,1468,agree,An Iraqi official denied that a woman detained...
...,...,...,...,...
4513,Tropical spider burrows under man's skin throu...,1883,agree,A trip to Bali has turned Dylan Thomas into a ...
4514,"Tour, agent: Tiger Woods not banned",757,agree,"PALM BEACH GARDENS, Fla. -- A journeyman profe..."
4515,Meteorite leaves crater in Nicaraguan capital ...,1913,agree,A blast near the Nicaraguan capital city of Ma...
4516,ESPN to save NFL's image with all-male domesti...,1500,disagree,Tonight — finally! — ESPN is going to have an ...


In [21]:
Final_Data['articleBody'].isnull().sum() # No further cleaning required like removing nan values.

0

In [22]:
Final_Data.drop(columns=['Body ID'], inplace=True)

In [23]:
Final_Data

Unnamed: 0,Headline,Stance,articleBody
0,Hundreds of Palestinians flee floods in Gaza a...,agree,Hundreds of Palestinians were evacuated from t...
1,Spider burrowed through tourist's stomach and ...,disagree,"Fear not arachnophobes, the story of Bunbury's..."
2,'Nasa Confirms Earth Will Experience 6 Days of...,agree,Thousands of people have been duped by a fake ...
3,Banksy 'Arrested & Real Identity Revealed' Is ...,agree,If you’ve seen a story floating around on your...
4,Woman detained in Lebanon is not al-Baghdadi's...,agree,An Iraqi official denied that a woman detained...
...,...,...,...
4513,Tropical spider burrows under man's skin throu...,agree,A trip to Bali has turned Dylan Thomas into a ...
4514,"Tour, agent: Tiger Woods not banned",agree,"PALM BEACH GARDENS, Fla. -- A journeyman profe..."
4515,Meteorite leaves crater in Nicaraguan capital ...,agree,A blast near the Nicaraguan capital city of Ma...
4516,ESPN to save NFL's image with all-male domesti...,disagree,Tonight — finally! — ESPN is going to have an ...


In [24]:
Final_Data.rename(columns={'Headline': 'Claim', 'articleBody': 'Perspective'}, inplace=True)

In [25]:
Final_Data = Final_Data[['Claim', 'Perspective', 'Stance']]

### Cleaning Dataset

In [26]:
def clean_text(text):
    # Lowercase
    text = text.lower()
    # Remove URLs
    text = re.sub(r"http\S+", "<URL>", text)
    # Remove other invalid charecters
    text = re.sub(r'[^a-zA-Z@#<>\s]', '', text)
    # remove Spaces from begining and End
    text = text.strip()
    
    # Optional: Remove stopwords (consider if relevant for your task)
    # from nltk.corpus import stopwords
    # stop_words = stopwords.words('english')
    # text = [word for word in text.split() if word not in stop_words]
    return text

In [27]:
Final_Data['Perspective'] = Final_Data['Perspective'].apply(clean_text)
Final_Data['Claim'] = Final_Data['Claim'].apply(lambda x: (re.sub(r'[^a-zA-Z@#:\s]', '', x)).strip())

In [28]:
Final_Data

Unnamed: 0,Claim,Perspective,Stance
0,Hundreds of Palestinians flee floods in Gaza a...,hundreds of palestinians were evacuated from t...,agree
1,Spider burrowed through tourists stomach and u...,fear not arachnophobes the story of bunburys s...,disagree
2,Nasa Confirms Earth Will Experience Days of T...,thousands of people have been duped by a fake ...,agree
3,Banksy Arrested Real Identity Revealed Is The...,if youve seen a story floating around on your ...,agree
4,Woman detained in Lebanon is not alBaghdadis w...,an iraqi official denied that a woman detained...,agree
...,...,...,...
4513,Tropical spider burrows under mans skin throug...,a trip to bali has turned dylan thomas into a ...,agree
4514,Tour agent: Tiger Woods not banned,palm beach gardens fla a journeyman professio...,agree
4515,Meteorite leaves crater in Nicaraguan capital ...,a blast near the nicaraguan capital city of ma...,agree
4516,ESPN to save NFLs image with allmale domestic ...,tonight finally espn is going to have an ind...,disagree


### Generating Claim-Perspective Pairs

In [29]:
def generate_pairs(data):
    grouped_data = data.groupby(['Claim', 'Stance']) # Creates an Multi-Indexed Series where each index is an tuple of (Claim, Stance)
    perspective_extract = grouped_data['Perspective'].apply(list) # Result in another series where perspectives are grouped by Claim and Stance.
    stance_pivoted = perspective_extract.unstack(fill_value=[]) # Takes innermost index by default and pivotes it to create an DF (Here Stance).
    # print(type(stance_pivoted)) # un-comment to check datatype of stance_pivoted
    return stance_pivoted.to_dict('index') # convert dataframe to dictionary with keys as index(rows) of dataframe.

In [30]:
paired_data = generate_pairs(Final_Data)
pd.DataFrame.from_dict(paired_data).head()

Unnamed: 0,#Daysofdarkness: No There Wont Be Six Days Of Darkness In December,#Hairgate: Reports of iPhone pulling out hair beards likely exaggerated,#ISIS beheads photojournalist James Wright Foley in a massage to US to end its intervention in #Iraq,A Bogus Banksy,A Bunch Of Folks Are Passing Around This Hoax Video Of A Bird Pooping On Vladimir Putin,A Bunch Of People Thought Banksy Got Arrested But It Was A Hoax,A Husband Breaks Down In Tears After Lost Voicemail Message By His Late Wife Is Recovered,A Letter Lego Sent to Parents in Holds an Important Message for the Parents of,A Mass Grave Points to a Student Massacre in Mexico,A New iOS Bug Can Delete All Of Your iCloud Documents,...,aircraft are missing from terroristheld Tripoli airport ahead of anniversary,eBay is planning an Apple Watch app,iFixit Debunks iPhone Plus Reinforcement Claims,iOS bug could delete your iCloud Drive documents,iPhone Game Developer Quits Job By Hiding The Greatest Letter Of Resignation Ever Inside His Game,men cut their testicles off believing it would help them meet God,s Brat Pack star Judd Nelson forced to deny that hes dead after fake news story starts trending on social media,teen girls get pregnant on school trip Read more at http:wwwwndcomteengirlsgetpregnantonschooltrip#FfilOdyFpASPV,year old burger McDonalds burger purchased in hasnt aged a bit,yearold burger: McDonalds Quarter Pounder looks eerily new after decades
agree,[it may feel like winter is slowly squeezing a...,[late last month as apple fans were joyously s...,[in a video posted online tuesday isis beheads...,[this post was widely shared over the internet...,[russian president vladimir putin last friday ...,[fear not the street artist is still roaming f...,[after stan beatons wife ruby passed away in ...,[a picture of a letter to parents from a box o...,[],[it appears that there may be a serious bug wi...,...,[washington free beacon senior editor bill ger...,[at least one of the big boys is planning on d...,[according to an earlier report it was suggest...,[it appears that there may be a serious bug wi...,[this is possibly the best resignation letter ...,[multimillionaire religious guru gurmeet ram r...,[in a welcome break from a slow weekend of new...,[seven girls aged between and have fallen pr...,[a mcdonalds burger bought years ago has an u...,[two australian men think they may be in posse...
disagree,[],[],[],[],[vladimir putin spoke at the dedication of a n...,[],[],[],[the bodies found in a mass grave were confirm...,[],...,[claim eleven passenger planes are missing fro...,[],[],[],[weve all had jobs we absolutely cant stand an...,[],[],[on december a site called inserbiainfo publi...,[],[]


In [31]:
unique_pairs = []
for claim, perspectives in paired_data.items():
    similar_tweets = perspectives["agree"]
    contrastive_tweets = perspectives["disagree"]
    if similar_tweets and contrastive_tweets:
        for sim_tweet, cont_tweet in itertools.product(similar_tweets, contrastive_tweets):
            unique_pairs.append({
                "claim": claim,
                "perspectives": [
                    {"text": sim_tweet, "label": "agree"},
                    {"text": cont_tweet, "label": "disagree"}
                ]
            })
    elif similar_tweets:
        for sim_tweet in similar_tweets:
            unique_pairs.append({
                "claim": claim,
                "perspectives": [
                    {"text": sim_tweet, "label": "agree"}
                ]
            })
    elif contrastive_tweets:
        for cont_tweet in contrastive_tweets:
            unique_pairs.append({
                "claim": claim,
                "perspectives": [
                    {"text": cont_tweet, "label": "disagree"}
                ]
            })

### Splitting data into Train and Validation set.

In [32]:
training_pairs, validation_pairs = train_test_split(unique_pairs, test_size=0.2, random_state=42)

In [33]:
print("Length of Training Pairs: ", len(training_pairs))
print("Length of Validation Pairs: ", len(validation_pairs))

Length of Training Pairs:  4859
Length of Validation Pairs:  1215


### Output the final dataset as .jsonl file

In [34]:
def save_to_jsonl(data, directory, filename):
    with open(Store_Path+filename, 'w') as outfile:
        for entry in data:
            json.dump(entry, outfile)
            outfile.write('\n')

In [36]:
Store_Path = '../Cleaned_Data/'
save_to_jsonl(training_pairs, Store_Path, 'Claims_Perspectives-Training.jsonl')
save_to_jsonl(validation_pairs, Store_Path, 'Claims_Perspectives-Validation.jsonl')