# Loading the data and preprocessing the tweets


In [1]:
import pandas as pd
# NOTE(review): absolute, user-specific path; the notebook only runs where
# this exact file exists.
path = "/home/george.ibrahim/Downloads/AI701/Project/110Train.xlsx"

df = pd.read_excel(path)

# Keep the 'source' column around before the frame is narrowed down below.
sources = df['source']

# Build the model-facing frame in one chain:
#   - 'concatenated' = headers + documents (plain concatenation, no separator)
#   - rename to the Headline / articleBody / label names used downstream
#   - keep only those three columns
df = (
    df.assign(concatenated=lambda frame: frame['headers'] + frame['documents'])
      .rename(columns={'processed_tweet': 'Headline', 'concatenated': 'articleBody', 'target': 'label'})
      .loc[:, ['articleBody', 'Headline', 'label']]
)
df.shape

(110, 3)

In [2]:
import re

def clean(text):
    """Strip URLs, @mentions, '#' characters and punctuation from ``text``.

    The substitutions run in order, each deleting its matches:
      1. tokens starting with ``http``, ``www`` or ``https`` up to the next
         whitespace,
      2. ``@`` followed by word characters, and any bare ``#``,
      3. every character that is neither a word character nor whitespace.

    Whitespace is left untouched, so removed tokens may leave runs of spaces.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string.
    """
    # (pattern, flags) pairs, applied top to bottom.
    substitutions = (
        (r'http\S+|www\S+|https\S+', re.MULTILINE),
        (r'\@\w+|\#', 0),
        (r'[^\w\s]', 0),
    )
    for pattern, flags in substitutions:
        text = re.sub(pattern, '', text, flags=flags)
    return text

# Display the DataFrame as this cell's output. Note: clean() defined above is
# not applied to any column in this cell.
df

Unnamed: 0,articleBody,Headline,label
0,A Family's Perspective – “The Brutality of Sep...,People who try to j-walk while an ambulance is...,0
1,Who's Fault was it really that Cell reached hi...,The episode where Trunks annihilated Freiza is...,0
2,How I almost won an NLP competition without kn...,marksmaponyane Hey ! Sundowns were annihilated...,0
3,The American Genocide of the Indians—Historica...,70 year since we annihilated 100000 people ins...,1
4,"Greatest Stories Ever Told - ""My Brother Esau""...",Shadow boxing the apocalypse,1
...,...,...,...
105,A Family's Perspective – “The Brutality of Sep...,Doing dialysis to my grandpa and oh lord this ...,0
106,my best employee quit on the spot because I wo...,My blood pressure is through the roof I do n't...,0
107,[Terrific Trainwreck Trio Rewatch] Kakumeiki V...,Guys . I have an Imouto Who Is n't Actually Re...,0
108,Blood and Ink Izuku - Dreams Do(n't) Come True...,If it wa n't for the Blood ! ? ? ? ?,0


In [3]:
from transformers import RobertaTokenizer, RobertaForSequenceClassification
# Path to your saved model
# NOTE(review): this is an absolute path at the filesystem root; confirm it
# matches where the fine-tuned checkpoint actually lives.
model_path = "/RoBERTa_Fever_7(Balanced)"

# Load the tokenizer and model
# Both are loaded from the same directory so the vocabulary and the weights
# come from the same saved checkpoint.
tokenizer = RobertaTokenizer.from_pretrained(model_path)
model = RobertaForSequenceClassification.from_pretrained(model_path)


  from .autonotebook import tqdm as notebook_tqdm


In [4]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import RobertaTokenizer, RobertaForSequenceClassification, AdamW
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from transformers import BertTokenizer

class StanceDataset(Dataset):
    """Dataset that turns (headline, body, label) triples into model inputs.

    Each item is tokenized on access: the headline and body are joined with a
    single space into one string, encoded as a single sequence, truncated to
    ``max_len`` tokens and padded up to ``max_len``.
    """

    def __init__(self, headlines, bodies, labels, tokenizer, max_len):
        """Store the raw columns and tokenization settings.

        Args:
            headlines: Indexable collection of headline texts.
            bodies: Indexable collection of body texts, indexed in parallel
                with ``headlines``.
            labels: Indexable collection of integer class labels, indexed in
                parallel with ``headlines``.
            tokenizer: Object exposing ``encode_plus`` (a Hugging Face
                tokenizer in this notebook).
            max_len: Fixed token length every example is truncated/padded to.
        """
        self.headlines = headlines
        self.bodies = bodies
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of examples (taken from ``headlines``)."""
        return len(self.headlines)

    def __getitem__(self, item):
        """Tokenize and return the example at index ``item``.

        Returns:
            dict with 1-D tensors ``input_ids`` and ``attention_mask``
            (flattened from the tokenizer's batched output) and a scalar
            ``labels`` tensor of dtype ``torch.long``.
        """
        # str() guards against non-string cells (e.g. NaN read from a sheet).
        headline = str(self.headlines[item])
        body = str(self.bodies[item])
        label = self.labels[item]

        # Combine headline and body for tokenization
        combined_text = headline + " " + body

        encoding = self.tokenizer.encode_plus(
            combined_text,
            add_special_tokens=True,
            max_length=self.max_len,
            truncation=True,
            return_token_type_ids=False,
            # `pad_to_max_length=True` is deprecated; this is its documented
            # replacement and pads every example to `max_length`.
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt',
        )
        return {
            # The tokenizer returns a leading batch axis of size 1; drop it so
            # the DataLoader can stack examples itself.
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }


In [5]:
# Wrap the evaluation frame's columns as a StanceDataset; every example is
# tokenized to a fixed length of 350 tokens.
test_dataset = StanceDataset(
    headlines=df["Headline"].to_numpy(),
    bodies=df["articleBody"].to_numpy(),
    labels=df["label"].to_numpy(),
    tokenizer=tokenizer,
    max_len=350,
)

In [6]:
# One example per batch; no shuffle argument is passed, so the loader uses
# DataLoader's default ordering.
batch_size = 1
test_data_loader = DataLoader(test_dataset, batch_size=batch_size)

In [7]:
from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score
import numpy as np
# Accumulate predicted and gold class ids over the whole loader.
all_predictions = []
all_true_labels = []

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

# Evaluation
model.eval()

# No gradients are needed for inference; wrap the whole loop once.
with torch.no_grad():
    for batch in test_data_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        # Labels are only compared on the CPU afterwards, so they never need
        # to be moved to the device.
        labels = batch['labels']

        # Labels are deliberately not passed: only the logits are used, so
        # there is no reason to have the model compute a loss.
        outputs = model(input_ids, attention_mask=attention_mask)

        logits = outputs.logits.cpu().numpy()

        # Predicted class = argmax over the class axis; store plain Python ints.
        all_predictions.extend(np.argmax(logits, axis=1).tolist())
        all_true_labels.extend(labels.tolist())

# Compute metrics
accuracy = accuracy_score(all_true_labels, all_predictions)
precision = precision_score(all_true_labels, all_predictions)
recall = recall_score(all_true_labels, all_predictions)
# Stored as `f1` so the imported `f1_score` function is not overwritten by a
# float (which would break any later call to it).
f1 = f1_score(all_true_labels, all_predictions)

# Print metrics
print("Test Accuracy: {0:.2f}".format(accuracy))
print("Test Precision: {0:.2f}".format(precision))
print("Test Recall: {0:.2f}".format(recall))
print("Test F1-Score: {0:.2f}".format(f1))




Test Accuracy: 0.75
Test Precision: 0.45
Test Recall: 0.58
Test F1-Score: 0.51


In [8]:
import pandas as pd

# Attach the model's predictions to the evaluation DataFrame `df` as a new
# 'Predictions' column.
print(all_predictions)

# Convert the predictions list to a named pandas Series.
predictions_series = pd.Series(all_predictions, name='Predictions')

# Fail loudly on a length mismatch instead of printing a message and then
# continuing with an undefined (or stale) `enhanced_dataset`.
if len(df) != len(predictions_series):
    raise ValueError("Error: The length of the dataset and the predictions do not match.")

# Assign by position (via .to_numpy()) so the result does not depend on `df`
# having a default 0..n-1 index, as an index-aligned concat would.
enhanced_dataset = df.assign(Predictions=predictions_series.to_numpy())
print(enhanced_dataset)


[0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1]
                                           articleBody  \
0    A Family's Perspective – “The Brutality of Sep...   
1    Who's Fault was it really that Cell reached hi...   
2    How I almost won an NLP competition without kn...   
3    The American Genocide of the Indians—Historica...   
4    Greatest Stories Ever Told - "My Brother Esau"...   
..                                                 ...   
105  A Family's Perspective – “The Brutality of Sep...   
106  my best employee quit on the spot because I wo...   
107  [Terrific Trainwreck Trio Rewatch] Kakumeiki V...   
108  Blood and Ink Izuku - Dreams Do(n't) Come True...   
109  Is it just me, or does Red