In [None]:
import pandas as pd

# Load the aggregate-level predictions used to tune the period threshold.
# Later cells read the 'match_id', 'period_id', 'predicted_label' and 'label'
# columns from this frame.
aggregates_df = pd.read_csv('llm_predictions.csv')

In [None]:
# Calculate period predictions using threshold method
def calculate_period_predictions(aggregates_df, threshold=0.5):
    """Collapse aggregate-level predictions into one prediction per period.

    Rows are grouped by ('match_id', 'period_id'); a period is predicted as 1
    when the mean of its 'predicted_label' values is strictly greater than
    ``threshold`` and as 0 otherwise.

    Args:
        aggregates_df: DataFrame with 'match_id', 'period_id' and
            'predicted_label' columns.
        threshold: Cut-off on the per-period mean of 'predicted_label'.

    Returns:
        DataFrame with one row per period, sorted by the group keys, and the
        columns 'match_id', 'period_id', 'proportion_of_ones' and
        'period_prediction'. The same columns are returned for empty input,
        so downstream merges on the key columns keep working.
    """
    period_predictions = (
        aggregates_df
        .groupby(['match_id', 'period_id'])['predicted_label']
        .mean()
        .rename('proportion_of_ones')
        .reset_index()
    )

    # Strict '>' on purpose: a proportion exactly equal to the threshold maps to 0.
    period_predictions['period_prediction'] = (
        period_predictions['proportion_of_ones'] > threshold
    ).astype('int64')

    return period_predictions

In [None]:
# Find optimal threshold
from sklearn.metrics import accuracy_score

def find_optimal_threshold(aggregates_df, thresholds):
    """Pick the threshold that maximises period-level accuracy.

    Args:
        aggregates_df: DataFrame with 'match_id', 'period_id',
            'predicted_label' and 'label' columns.
        thresholds: Iterable of candidate thresholds, tried in order.

    Returns:
        Tuple ``(best_threshold, best_accuracy)``. Because the comparison is
        strict, the first candidate reaching the best accuracy wins; ``(0, 0)``
        is returned when no candidate scores above zero (or none are given).
    """
    # The reference label of a period (first non-null 'label' in its group)
    # does not depend on the threshold, so build this table once.
    period_labels_df = (
        aggregates_df
        .groupby(['match_id', 'period_id'])['label']
        .first()
        .reset_index()
    )

    best_threshold = 0
    best_accuracy = 0

    for threshold in thresholds:
        period_predictions_df = calculate_period_predictions(aggregates_df, threshold)

        # Line predictions up with their reference labels per period
        merged_df = period_predictions_df.merge(period_labels_df, on=['match_id', 'period_id'])

        accuracy = accuracy_score(merged_df['label'], merged_df['period_prediction'])

        if accuracy > best_accuracy:
            best_accuracy = accuracy
            best_threshold = threshold

    return best_threshold, best_accuracy

# 101 candidate thresholds from 0 to 1 in steps of 0.01
thresholds = [step * 0.01 for step in range(101)]

# Sweep the candidates and report the best-scoring one
optimal_threshold, optimal_accuracy = find_optimal_threshold(
    aggregates_df=aggregates_df,
    thresholds=thresholds,
)
print(f'Optimal Threshold: {optimal_threshold}')
print(f'Optimal Accuracy: {optimal_accuracy:.2%}')

### Calculating predictions over the evaluation set using the optimal threshold

In [None]:
# Load the evaluation set. create_aggregates() groups this frame by
# 'MatchID' and 'PeriodID' and reads its 'Tweet' column.
# NOTE(review): `os` is not referenced in the cells visible here — confirm it is still needed.
import os

evaluation_df = pd.read_csv('cleaned_eval_tweets.csv')

In [None]:
import torch
from transformers import RobertaForSequenceClassification, RobertaTokenizer, Trainer, TrainingArguments
from datasets import Dataset, DatasetDict

# Load the fine-tuned classifier and its tokenizer from the local checkpoint directory
model = RobertaForSequenceClassification.from_pretrained('./saved_model')
tokenizer = RobertaTokenizer.from_pretrained('./saved_model')

# Inference only: switch the model to evaluation mode
model.eval()

# Prefer the GPU, but fall back to the CPU instead of crashing when CUDA is unavailable.
# Later cells move their batches to `model.device`, so they follow this choice automatically.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
print(model.device)

In [None]:
# Calculate aggregates over the evaluation set

# Token budget shared by two steps: create_aggregates() receives it as the cap on the
# summed per-tweet token counts, and tokenize_function() uses it as max_length.
TOKEN_LIMIT = 512

# Function to preprocess and aggregate tweets with a progress bar
def create_aggregates(df, tokenizer, token_limit, num_aggregates=30, eval=False):
    """Build randomly shuffled tweet aggregates for every (MatchID, PeriodID) group.

    For each period, ``num_aggregates`` texts are produced. Each one is made by
    shuffling the period's tweets and joining them with single spaces, in that
    order, until the next tweet would push the summed per-tweet token count
    above ``token_limit``. Shuffling uses Python's global ``random`` state, so
    seed it beforehand for reproducible output.

    Args:
        df: DataFrame with 'MatchID', 'PeriodID' and 'Tweet' columns, plus
            'EventType' when ``eval`` is False.
        tokenizer: Object exposing ``encode(text, add_special_tokens=False)``;
            only the length of the returned sequence is used.
        token_limit: Maximum summed per-tweet token count of one aggregate.
        num_aggregates: Number of aggregates generated per period.
        eval: When True, no label is read or emitted. The name shadows the
            builtin but is kept so existing keyword callers keep working.

    Returns:
        DataFrame with the columns 'text', 'match_id' and 'period_id'; when
        ``eval`` is False a 'label' column (the period's first 'EventType'
        value) follows 'text'. The columns are present even for empty input.
    """
    import random
    from tqdm import tqdm

    if eval:
        columns = ['text', 'match_id', 'period_id']
    else:
        columns = ['text', 'label', 'match_id', 'period_id']

    aggregates = []

    # Group by MatchID and PeriodID
    for (match_id, period_id), group in tqdm(df.groupby(['MatchID', 'PeriodID']), desc="Aggregating Tweets"):
        if not eval:
            event_type = group['EventType'].iloc[0]  # Binary target for the period

        # Tokenize every tweet once per period; the counts are reused by all
        # shuffles below instead of re-encoding the same text num_aggregates times.
        # NOTE: counts exclude special tokens and are taken per tweet, so a later
        # tokenization of the joined text may be slightly longer than token_limit.
        sized_tweets = [
            (tweet, len(tokenizer.encode(tweet, add_special_tokens=False)))
            for tweet in group['Tweet'].tolist()
        ]

        for _ in range(num_aggregates):
            random.shuffle(sized_tweets)  # New random tweet order for each aggregate
            selected = []
            token_count = 0

            for tweet, tweet_tokens in sized_tweets:
                if token_count + tweet_tokens > token_limit:
                    break  # Stop at the first tweet that no longer fits
                selected.append(tweet)
                token_count += tweet_tokens

            record = {'text': " ".join(selected).strip()}
            if not eval:
                record['label'] = event_type
            record['match_id'] = match_id
            record['period_id'] = period_id
            aggregates.append(record)

    return pd.DataFrame(aggregates, columns=columns)

# create_aggregates() shuffles tweets with Python's global RNG. Seed it so the
# aggregates, and every prediction derived from them, are reproducible across
# kernel restarts.
import random

AGGREGATION_SEED = 42
random.seed(AGGREGATION_SEED)

eval_aggregates_df = create_aggregates(evaluation_df, tokenizer, TOKEN_LIMIT, eval=True)

In [None]:
# Persist the evaluation aggregates (columns 'text', 'match_id', 'period_id'), without the index.
# NOTE(review): despite the file name, no 'predicted_label' column exists at this point —
# it is only attached to eval_aggregates_df in a later cell. Confirm this is intended.
eval_aggregates_df.to_csv('eval_aggregates_predictions.csv', index=False)

In [None]:
# Tokenize function
def tokenize_function(examples):
    """Tokenize the 'text' field of a batch with the notebook-level tokenizer.

    Every sequence is padded to, and truncated at, TOKEN_LIMIT tokens.
    """
    texts = examples['text']
    encoded = tokenizer(
        texts,
        max_length=TOKEN_LIMIT,
        truncation=True,
        padding="max_length",
    )
    return encoded

In [None]:
# Calculate predictions using model

# Wrap the aggregates in a `datasets.Dataset` and tokenize the 'text' column in batches
eval_dataset = Dataset.from_pandas(eval_aggregates_df)
tokenized_eval_dataset = eval_dataset.map(tokenize_function, batched=True)
# Return only the two model inputs, formatted as torch tensors; the prediction loop reads exactly these keys
tokenized_eval_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask'])

In [None]:
from torch.utils.data import DataLoader
# Needed at notebook level: create_aggregates() imports tqdm only into its own
# local scope, so without this line the loop below raises NameError on a fresh kernel.
from tqdm import tqdm

# Create DataLoader for batch processing (default order, so predictions stay
# aligned with the rows of the tokenized dataset)
dataloader = DataLoader(tokenized_eval_dataset, batch_size=32)

# Make predictions
predictions = []
with torch.no_grad():  # inference only, no gradients required
    for batch in tqdm(dataloader, desc="Making Predictions"):
        # Move just the model inputs to whichever device the model lives on
        inputs = {key: batch[key].to(model.device) for key in ['input_ids', 'attention_mask']}
        outputs = model(**inputs)
        logits = outputs.logits
        # Predicted class = index of the largest logit for each row
        batch_predictions = torch.argmax(logits, dim=-1).cpu().tolist()
        predictions.extend(batch_predictions)

In [None]:
# Attach the per-aggregate class predictions as a new 'predicted_label' column.
# `predictions` was filled in dataloader order, so it is assumed to line up
# row-for-row with eval_aggregates_df.
eval_aggregates_df['predicted_label'] = predictions

In [None]:
# Reduce the aggregate predictions to one prediction per (match_id, period_id),
# using the threshold tuned earlier on aggregates_df
eval_period_predictions_df = calculate_period_predictions(eval_aggregates_df, optimal_threshold)

In [None]:
# Create a new DataFrame with the required columns
# Build the two-column output table: 'ID' is "<match_id>_<period_id>" and
# 'EventType' carries the period-level prediction
new_df = (
    eval_period_predictions_df
    .assign(ID=lambda frame: frame['match_id'].astype(str) + '_' + frame['period_id'].astype(str))
    .rename(columns={'period_prediction': 'EventType'})
    [['ID', 'EventType']]
)

print(new_df.head())

In [None]:
# Write the 'ID' / 'EventType' table to CSV, leaving out the DataFrame index
new_df.to_csv('eval_set_predictions_threshold.csv', index=False)