In [1]:
import pandas as pd
import torch
from transformers import RobertaForSequenceClassification, RobertaTokenizer, Trainer, TrainingArguments
from datasets import Dataset, DatasetDict
from sklearn.model_selection import train_test_split
import random
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm
2024-12-07 10:20:22.852260: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1733563223.227070  833060 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1733563223.327716  833060 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-12-07 10:20:24.304303: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [None]:
# Restore the tokenizer and the fine-tuned classifier from the local checkpoint directory
tokenizer = RobertaTokenizer.from_pretrained('./saved_model')
model = RobertaForSequenceClassification.from_pretrained('./saved_model')

# Put the model in evaluation mode for inference; as the cell's last
# expression this call also displays the module summary
model.eval()

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
         

In [None]:
# Run on the GPU when one is available; otherwise fall back to the CPU so the
# notebook still executes on machines without CUDA. Later cells move each batch
# to `model.device`, so they follow whichever device is chosen here.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
print(model.device)

cuda:0


In [None]:
# Read the cleaned tweets and discard every row that contains a missing value
filtered_df = pd.read_csv('cleaned_llm_tweets.csv').dropna()

In [None]:
import random
from tqdm import tqdm

# Token budget shared across the notebook: it is passed to create_aggregates as
# the per-aggregate token cap and is the max_length used by tokenize_function
TOKEN_LIMIT = 512

# Function to preprocess and aggregate tweets with a progress bar
def create_aggregates(df, tokenizer, token_limit, num_aggregates=30, eval=False, seed=None):
    """Build randomized, token-bounded concatenations of tweets per (MatchID, PeriodID) group.

    For every group, ``num_aggregates`` texts are produced. Each text is made by
    shuffling the group's tweets and joining them with single spaces, in the
    shuffled order, until adding the next tweet would push the running token
    count above ``token_limit``.

    Args:
        df: DataFrame with ``MatchID``, ``PeriodID`` and ``Tweet`` columns, plus
            ``EventType`` when ``eval`` is False.
        tokenizer: Object exposing ``encode(text, add_special_tokens=False)``
            that returns a sized sequence of tokens; it is used only to count
            tokens per tweet.
        token_limit: Maximum summed token count of the tweets in one aggregate.
            Counts are taken per tweet without special tokens.
        num_aggregates: Number of aggregates generated for each group.
        eval: When True, ``EventType`` is not read and no ``label`` column is
            emitted. The name shadows the builtin but is kept because callers
            pass it by keyword.
        seed: Optional seed. When given, shuffling uses a private
            ``random.Random(seed)`` so the result is reproducible and the
            global ``random`` state is left untouched. When None (default),
            the global ``random`` module is used, as before.

    Returns:
        DataFrame with one row per aggregate and columns ``text``,
        ``label`` (omitted when ``eval`` is True; taken from the first
        ``EventType`` value of the group), ``match_id`` and ``period_id``.
    """
    rng = random if seed is None else random.Random(seed)
    aggregates = []

    # Group by MatchID and PeriodID
    for (match_id, period_id), group in tqdm(df.groupby(['MatchID', 'PeriodID']), desc="Aggregating Tweets"):
        tweets = group['Tweet'].tolist()
        event_type = None if eval else group['EventType'].iloc[0]

        # A tweet's token count does not depend on the shuffle, so remember it
        # per group instead of re-tokenizing the same tweet on every pass.
        # Counts are filled in lazily: only tweets actually visited before the
        # limit is hit are ever tokenized.
        token_counts = {}

        for _ in range(num_aggregates):
            rng.shuffle(tweets)  # In-place shuffle of the group's local list
            selected = []
            token_count = 0

            for tweet in tweets:
                n_tokens = token_counts.get(tweet)
                if n_tokens is None:
                    n_tokens = len(tokenizer.encode(tweet, add_special_tokens=False))
                    token_counts[tweet] = n_tokens

                if token_count + n_tokens > token_limit:
                    break  # Stop at the first tweet that does not fit

                selected.append(tweet)
                token_count += n_tokens

            # Key order matches the column order of the returned frame
            record = {'text': " ".join(selected).strip()}
            if not eval:
                record['label'] = event_type
            record['match_id'] = match_id
            record['period_id'] = period_id
            aggregates.append(record)

    return pd.DataFrame(aggregates)


In [6]:
# Create aggregates
# create_aggregates shuffles tweets with the `random` module; seed it first so
# the aggregates (and every prediction/metric derived from them) are the same
# on each Restart & Run All
random.seed(42)
aggregates_df = create_aggregates(filtered_df, tokenizer, TOKEN_LIMIT)

Aggregating Tweets: 100%|██████████| 2137/2137 [03:00<00:00, 11.82it/s]


In [None]:
# Tokenize function
def tokenize_function(examples):
    """Tokenize the ``text`` field of a batch, padded and truncated to TOKEN_LIMIT.

    Uses the notebook-level ``tokenizer`` and ``TOKEN_LIMIT``; returns whatever
    the tokenizer call returns.
    """
    texts = examples['text']
    encoding_options = {
        "padding": "max_length",
        "truncation": True,
        "max_length": TOKEN_LIMIT,
    }
    return tokenizer(texts, **encoding_options)

In [8]:
# Wrap the aggregates in a Hugging Face Dataset, tokenize them in batches, and
# have the dataset return torch tensors for the two columns fed to the model
dataset = Dataset.from_pandas(aggregates_df)
tokenized_dataset = dataset.map(function=tokenize_function, batched=True)
tokenized_dataset.set_format('torch', columns=['input_ids', 'attention_mask'])


Map: 100%|██████████| 64110/64110 [01:29<00:00, 720.02 examples/s]


In [9]:
from torch.utils.data import DataLoader

# Serve the tokenized aggregates to the model in batches
dataloader = DataLoader(tokenized_dataset, batch_size=16)

# Inference pass with gradient tracking disabled: for every batch, keep the
# index of the highest logit as the predicted class
predictions = []
with torch.no_grad():
    for batch in tqdm(dataloader, desc="Making Predictions"):
        input_ids = batch['input_ids'].to(model.device)
        attention_mask = batch['attention_mask'].to(model.device)
        logits = model(input_ids=input_ids, attention_mask=attention_mask).logits
        predictions += logits.argmax(dim=-1).cpu().tolist()

# Store the predicted class next to each aggregate
aggregates_df['predicted_label'] = predictions


Making Predictions: 100%|██████████| 4007/4007 [23:43<00:00,  2.81it/s]


In [None]:
# Persist the aggregates together with their predicted labels (row index omitted)
aggregates_df.to_csv(path_or_buf='llm_predictions.csv', index=False)

In [None]:
# Persist a second copy of the predictions that leaves out the aggregate text column
predictions_without_text = aggregates_df.drop(columns='text')
predictions_without_text.to_csv(path_or_buf='llm_predictions_no_text.csv', index=False)

In [12]:
from sklearn.metrics import accuracy_score

# Share of aggregates whose predicted label matches the true label
accuracy = accuracy_score(
    y_true=aggregates_df['label'],
    y_pred=aggregates_df['predicted_label'],
)
print(f'Overall Accuracy: {accuracy:.2%}')

Overall Accuracy: 86.96%


### Calculating predictions over the Kaggle dataset

In [None]:
# Load evaluation set
import os

# Read the cleaned evaluation tweets and discard rows with missing values
eval_df = pd.read_csv('cleaned_eval_tweets.csv').dropna()

In [None]:
# Calculate aggregates for evaluation tweets
# create_aggregates shuffles tweets with the `random` module; seed it first so
# the evaluation aggregates and the predictions built on them are reproducible
random.seed(42)
eval_aggregates_df = create_aggregates(eval_df, tokenizer, TOKEN_LIMIT, eval=True)

Aggregating Tweets: 100%|██████████| 516/516 [00:42<00:00, 12.09it/s]


In [19]:
# Turn the evaluation aggregates into a tokenized Hugging Face Dataset that
# returns torch tensors for the two columns fed to the model
eval_dataset = Dataset.from_pandas(eval_aggregates_df)
tokenized_eval_dataset = eval_dataset.map(function=tokenize_function, batched=True)
tokenized_eval_dataset.set_format('torch', columns=['input_ids', 'attention_mask'])

Map: 100%|██████████| 15480/15480 [00:21<00:00, 707.71 examples/s]


In [20]:
from torch.utils.data import DataLoader

# Serve the tokenized evaluation aggregates to the model in batches
dataloader = DataLoader(tokenized_eval_dataset, batch_size=32)

# Inference pass with gradient tracking disabled; the class with the highest
# logit is kept for every aggregate
predictions = []
with torch.no_grad():
    for batch in tqdm(dataloader, desc="Making Predictions"):
        inputs = {
            'input_ids': batch['input_ids'].to(model.device),
            'attention_mask': batch['attention_mask'].to(model.device),
        }
        predicted_classes = model(**inputs).logits.argmax(dim=-1)
        predictions.extend(predicted_classes.cpu().tolist())



Making Predictions: 100%|██████████| 484/484 [05:42<00:00,  1.41it/s]


In [21]:
# Add predictions to the evaluation aggregates DataFrame.
# `predictions` is the list filled by the evaluation prediction loop; the same
# variable name is also used by the earlier training-set loop, so that cell
# must have been run last for the values to line up with these rows.
eval_aggregates_df['predicted_label'] = predictions

In [None]:
# Persist the evaluation aggregates with their predicted labels (row index omitted)
eval_aggregates_df.to_csv(path_or_buf='eval_aggregates_predictions.csv', index=False)