# Loading the data and preprocessing the tweets


In [1]:
import pandas as pd
path = "/100Test.xlsx"
df = pd.read_excel(path)

sources = df['source']
preds = df['preds']
df['concatenated'] = df['headers'] + df['documents']
df.rename(columns={'processed_tweet': 'Headline', 'concatenated': 'articleBody', 'target': 'label'}, inplace=True)
df = df[['articleBody', 'Headline', 'label']]
df.shape

(100, 3)

In [2]:
import re

def clean(text):

    # Remove URLs
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    text = re.sub(r'\@\w+|\#','', text)
    text = re.sub(r'[^\w\s]', '', text)
    return text



# df['Headline'] = df['Headline'].apply(clean)
# df['articleBody'] = df['articleBody'].apply(clean)

df

Unnamed: 0,articleBody,Headline,label
0,What is the abomination of desolation? | GotQu...,( the abomination that maketh desolate the ant...,0
1,Country Reports on Terrorism 2019 - United Sta...,militant attack police post in udhampur 2 spos...,1
2,"Women's Handbags & Purses | Crossbody, Leather...",new lady shoulder tote handbag faux leather ...,0
3,Tornado Survivor StoriesNWSAll NOAA Safety Nat...,there wa a loud bang outside earlier and i che...,0
4,COVID denialism and policy clarifications : r/...,reddit 's new content policy go into effect ma...,0
...,...,...,...
95,"Confederate ship blown up by crew | August 6, ...",on thisdayinhistory in 1862 confederate ship b...,1
96,How to help kids and teens with their mental h...,mprnews 600 ! ! ! wow ! ! ! that 's a lot of t...,0
97,The Pomodoro Technique — Why It Works & How To...,the thing with rule is break it once it become...,0
98,The Ohio toxic train wreck was '100% preventab...,300k exotic car wrecked in train accident URL,1


In [3]:
from transformers import RobertaTokenizer, RobertaForSequenceClassification
# Path to your saved model
model_path = "/RoBERTa_Fever_7(Balanced)"

# Load the tokenizer and model
tokenizer = RobertaTokenizer.from_pretrained(model_path)
model = RobertaForSequenceClassification.from_pretrained(model_path)


  from .autonotebook import tqdm as notebook_tqdm


In [4]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import RobertaTokenizer, RobertaForSequenceClassification, AdamW
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from transformers import BertTokenizer

class StanceDataset(Dataset):
    def __init__(self, headlines, bodies, labels, tokenizer, max_len):
        self.headlines = headlines
        self.bodies = bodies
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.headlines)

    def __getitem__(self, item):
        headline = str(self.headlines[item])
        body = str(self.bodies[item])
        label = self.labels[item]

        # Combine headline and body for tokenization
        combined_text = headline + " " + body

        encoding = self.tokenizer.encode_plus(
            combined_text,
            # headline,
            # body,
            add_special_tokens=True,
            max_length=self.max_len,
            truncation=True,
            return_token_type_ids=False,
            pad_to_max_length=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }


In [5]:
test_dataset = StanceDataset(
    headlines = df.Headline.to_numpy(),
    bodies = df.articleBody.to_numpy(),
    labels = df.label.to_numpy(),
    tokenizer = tokenizer,
    max_len = 350
)

In [6]:
batch_size = 1
test_data_loader = DataLoader(test_dataset, batch_size=batch_size)

In [7]:
from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score
import numpy as np
# Initialize lists to store all predictions and labels
all_predictions = []
all_true_labels = []

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

# Evaluation
model.eval()

for batch in test_data_loader:
    input_ids = batch['input_ids'].to(device)
    attention_mask = batch['attention_mask'].to(device)
    labels = batch['labels'].to(device)

    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)

    logits = outputs.logits
    logits = logits.detach().cpu().numpy()
    label_ids = labels.to('cpu').numpy()

    # Flatten the outputs and labels and store them
    all_predictions.extend(np.argmax(logits, axis=1).flatten())
    all_true_labels.extend(label_ids.flatten())

# Compute metrics
accuracy = accuracy_score(all_true_labels, all_predictions)
precision = precision_score(all_true_labels, all_predictions)  
recall = recall_score(all_true_labels, all_predictions)  
f1_score = f1_score(all_true_labels, all_predictions)  

# Print metrics
print("Test Accuracy: {0:.2f}".format(accuracy))
print("Test Precision: {0:.2f}".format(precision))
print("Test Recall: {0:.2f}".format(recall))
print("Test F1-Score: {0:.2f}".format(f1_score))




Test Accuracy: 0.77
Test Precision: 0.80
Test Recall: 0.73
Test F1-Score: 0.76


In [8]:
import pandas as pd

# Let's assume 'original_dataset' is your initial DataFrame and 'predictions' is the list you obtained
print(all_predictions)
# First, convert the predictions list to a Pandas Series
predictions_series = pd.Series(all_predictions, name='Predictions')

# Now, concatenate this series to the original DataFrame
# Make sure the lengths of 'original_dataset' and 'predictions_series' match
if len(df) == len(predictions_series):
    enhanced_dataset = pd.concat([df, predictions_series], axis=1)
else:
    print("Error: The length of the dataset and the predictions do not match.")

# You can now work with 'enhanced_dataset' which includes your predictions.
enhanced_dataset = pd.DataFrame(enhanced_dataset)
print(enhanced_dataset)


[0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1]
                                          articleBody  \
0   What is the abomination of desolation? | GotQu...   
1   Country Reports on Terrorism 2019 - United Sta...   
2   Women's Handbags & Purses | Crossbody, Leather...   
3   Tornado Survivor StoriesNWSAll NOAA Safety Nat...   
4   COVID denialism and policy clarifications : r/...   
..                                                ...   
95  Confederate ship blown up by crew | August 6, ...   
96  How to help kids and teens with their mental h...   
97  The Pomodoro Technique — Why It Works & How To...   
98  The Ohio toxic train wreck was '100% preventab...   
99  Missing Malaysia plane MH370: What we know - B...   

              

In [9]:
enhanced_dataset


Unnamed: 0,articleBody,Headline,label,Predictions
0,What is the abomination of desolation? | GotQu...,( the abomination that maketh desolate the ant...,0,0
1,Country Reports on Terrorism 2019 - United Sta...,militant attack police post in udhampur 2 spos...,1,1
2,"Women's Handbags & Purses | Crossbody, Leather...",new lady shoulder tote handbag faux leather ...,0,1
3,Tornado Survivor StoriesNWSAll NOAA Safety Nat...,there wa a loud bang outside earlier and i che...,0,0
4,COVID denialism and policy clarifications : r/...,reddit 's new content policy go into effect ma...,0,1
...,...,...,...,...
95,"Confederate ship blown up by crew | August 6, ...",on thisdayinhistory in 1862 confederate ship b...,1,0
96,How to help kids and teens with their mental h...,mprnews 600 ! ! ! wow ! ! ! that 's a lot of t...,0,0
97,The Pomodoro Technique — Why It Works & How To...,the thing with rule is break it once it become...,0,0
98,The Ohio toxic train wreck was '100% preventab...,300k exotic car wrecked in train accident URL,1,1


In [10]:
enhanced_dataset['preds_old'] = preds
enhanced_dataset['Source'] = sources
preds_2 = enhanced_dataset['Predictions']

In [11]:
enhanced_dataset

Unnamed: 0,articleBody,Headline,label,Predictions,preds_old,Source
0,What is the abomination of desolation? | GotQu...,( the abomination that maketh desolate the ant...,0,0,0,https://www.gotquestions.org/abomination-desol...
1,Country Reports on Terrorism 2019 - United Sta...,militant attack police post in udhampur 2 spos...,1,1,1,https://www.state.gov/reports/country-reports-...
2,"Women's Handbags & Purses | Crossbody, Leather...",new lady shoulder tote handbag faux leather ...,0,1,0,https://tjmaxx.tjx.com/store/shop/womens-handb...
3,Tornado Survivor StoriesNWSAll NOAA Safety Nat...,there wa a loud bang outside earlier and i che...,0,0,1,https://www.weather.gov/safety/tornado-survivors
4,COVID denialism and policy clarifications : r/...,reddit 's new content policy go into effect ma...,0,1,0,https://www.reddit.com/r/redditsecurity/commen...
...,...,...,...,...,...,...
95,"Confederate ship blown up by crew | August 6, ...",on thisdayinhistory in 1862 confederate ship b...,1,0,1,https://www.history.com/this-day-in-history/co...
96,How to help kids and teens with their mental h...,mprnews 600 ! ! ! wow ! ! ! that 's a lot of t...,0,0,1,https://www.mprnews.org/episode/2023/03/01/how...
97,The Pomodoro Technique — Why It Works & How To...,the thing with rule is break it once it become...,0,0,0,https://todoist.com/productivity-methods/pomod...
98,The Ohio toxic train wreck was '100% preventab...,300k exotic car wrecked in train accident URL,1,1,1,https://www.cnn.com/2023/02/23/us/ohio-train-d...


In [12]:
from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score
# Print metrics

#Measuring accuracy precision and recall between predictions from stage 1 and stage 2

accuracy = accuracy_score(preds, preds_2)
precision = precision_score(preds, preds_2)  # Use 'binary' for binary classification
recall = recall_score(preds, preds_2)  # Use 'binary' for binary classification
f1_score = f1_score(preds, preds_2)  # Use 'binary' for binary classification

print("Test Accuracy: {0:.2f}".format(accuracy))
print("Test Precision: {0:.2f}".format(precision))
print("Test Recall: {0:.2f}".format(recall))
print("Test F1-Score: {0:.2f}".format(f1_score))

Test Accuracy: 0.68
Test Precision: 0.65
Test Recall: 0.65
Test F1-Score: 0.65


In [13]:
from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score
# Print metrics


#Measuring accuracy precision and recall between predictions from stage 1 and stage 2 when stage 1 predicted ones
enhanced_dataset2 = enhanced_dataset[enhanced_dataset['preds_old'] == 1]


accuracy = accuracy_score(enhanced_dataset2['Predictions'], enhanced_dataset2['preds_old'])
precision = precision_score(enhanced_dataset2['Predictions'], enhanced_dataset2['preds_old']) 
recall = recall_score(enhanced_dataset2['Predictions'], enhanced_dataset2['preds_old'])
f1_score = f1_score(enhanced_dataset2['Predictions'], enhanced_dataset2['preds_old']) 
print("Test Accuracy: {0:.2f}".format(accuracy))
print("Test Precision: {0:.2f}".format(precision))
print("Test Recall: {0:.2f}".format(recall))
print("Test F1-Score: {0:.2f}".format(f1_score))

Test Accuracy: 0.65
Test Precision: 0.65
Test Recall: 1.00
Test F1-Score: 0.79


In [14]:
from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score
# Print metrics


#Measuring accuracy precision and recall between predictions from stage 1 and stage 2 when stage 1 predictions were wrong
enhanced_dataset3 = enhanced_dataset[enhanced_dataset['preds_old'] != enhanced_dataset['label']]
print(enhanced_dataset3)

accuracy = accuracy_score(enhanced_dataset3['Predictions'], enhanced_dataset3['label']) 
precision = precision_score(enhanced_dataset3['Predictions'], enhanced_dataset3['label']) 
recall = recall_score( enhanced_dataset3['Predictions'],enhanced_dataset3['label'])
f1_score = f1_score(enhanced_dataset3['Predictions'], enhanced_dataset3['label']) 
print("Test Accuracy: {0:.2f}".format(accuracy))
print("Test Precision: {0:.2f}".format(precision))
print("Test Recall: {0:.2f}".format(recall))
print("Test F1-Score: {0:.2f}".format(f1_score))

                                          articleBody  \
3   Tornado Survivor StoriesNWSAll NOAA Safety Nat...   
24  Forgotten Hope: Secret Weapon mod for Battlefi...   
26  ASL American Sign LanguageThank you!Nice to me...   
28  Ouch! I've burned my hand on a hot pan — now w...   
30  Selena Gomez's Dating History: From Justin Bie...   
32  Nighttime Blood Pressure in Normotensive Subje...   
37  Ojuelegba accident: Family of four follow die ...   
41  COVID denialism and policy clarifications : r/...   
65  Why is it so important to close doors during a...   
66  Massachusetts Municipal AssociationOur members...   
71  UntitledAmbulance-based emergency medical syst...   
72  Israel-Hamas war updates: Biden says 20 aid tr...   
75  News Flash • Nassau County DA,NY • CivicEngage...   
78  Chronic Stress, Drug Use, and Vulnerability to...   
80  Discontinued Victorinox Swiss Army WatchesSwis...   
91  Revamping the Quarantine Function : r/announce...   
96  How to help kids and teens 

In [15]:
from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score
# Print metrics


#Measuring accuracy precision and recall between predictions from stage 1 and stage 2 when stage 1 predictions were wrong
enhanced_dataset4 = enhanced_dataset[enhanced_dataset['preds_old'] == enhanced_dataset['label']]


accuracy = accuracy_score(enhanced_dataset4['Predictions'], enhanced_dataset4['label']) 
precision = precision_score(enhanced_dataset4['Predictions'], enhanced_dataset4['label']) 
recall = recall_score( enhanced_dataset4['Predictions'],enhanced_dataset4['label'])
f1_score = f1_score(enhanced_dataset4['Predictions'], enhanced_dataset4['label']) 
print("Test Accuracy: {0:.2f}".format(accuracy))
print("Test Precision: {0:.2f}".format(precision))
print("Test Recall: {0:.2f}".format(recall))
print("Test F1-Score: {0:.2f}".format(f1_score))

Test Accuracy: 0.77
Test Precision: 0.75
Test Recall: 0.77
Test F1-Score: 0.76
