In [None]:
import pandas as pd

# Load aggregates dataframe with model predictions
llm_results = pd.read_csv('llm_predictions_no_text.csv')

In [25]:
llm_results.head()

Unnamed: 0,label,match_id,period_id,predicted_label
0,0,0,0,0
1,0,0,0,0
2,0,0,0,0
3,0,0,0,0
4,0,0,0,0


In [None]:
# Inspect MatchIDs present in the dataset
set(llm_results['match_id'].values.tolist())

{0, 1, 2, 3, 4, 5, 7, 8, 10, 11, 12, 13, 14, 17, 18, 19}

In [None]:
# Matches from 0 to 17 will be our training data, while matches 18 and 19 will be our test data

train_llm_results = llm_results[llm_results['match_id'] < 18]
test_llm_results = llm_results[llm_results['match_id'] >= 18]

In [None]:
# Create features dataframe
features_df = (
    train_llm_results.groupby(["match_id", "period_id", "label"])["predicted_label"]
    .value_counts()
    .unstack(fill_value=0)
    .reset_index()
)

# Renaming columns for clarity
features_df.columns = ["match_id", "period_id", "label", "count_0", "count_1"]

In [41]:
test_features_df = (
    test_llm_results.groupby(["match_id", "period_id", "label"])["predicted_label"]
    .value_counts()
    .unstack(fill_value=0)
    .reset_index()
)

# Renaming columns for clarity
test_features_df.columns = ["match_id", "period_id", "label", "count_0", "count_1"]

In [34]:
features_df.head()

Unnamed: 0,match_id,period_id,label,count_0,count_1
0,0,0,0,29,1
1,0,1,0,30,0
2,0,2,0,30,0
3,0,3,0,30,0
4,0,4,0,30,0


In [None]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset

# Dataset class
class MatchDataset(Dataset):
    def __init__(self, df, has_labels=True):
        self.data = df[["period_id", "count_0", "count_1"]].values
        self.has_labels = has_labels
        if has_labels:
            self.labels = df["label"].values

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        if self.has_labels:
            return torch.tensor(self.data[idx], dtype=torch.float32), torch.tensor(self.labels[idx], dtype=torch.float32)
        else:
            return torch.tensor(self.data[idx], dtype=torch.float32)


# Define the LSTM model
class LSTMModel(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim, num_layers):
        super(LSTMModel, self).__init__()
        self.lstm = nn.LSTM(input_dim, hidden_dim, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)
    
    def forward(self, x):
        _, (h_n, _) = self.lstm(x)  # Only use the last hidden state
        output = self.fc(h_n[-1])  # h_n[-1] is the hidden state of the last LSTM layer
        return output

# Training function
def train_lstm_on_matches(model, criterion, optimizer, features_df, num_epochs=5):
    model.train()
    match_ids = features_df["match_id"].unique()
    
    for match_id in match_ids: # Iterate over each match during training
        print(f"Training on match_id: {match_id}")
        match_data = features_df[features_df["match_id"] == match_id]
        dataset = MatchDataset(match_data)
        dataloader = DataLoader(dataset, batch_size=1, shuffle=False)

        for epoch in range(num_epochs):
            total_loss = 0.0
            for features, labels in dataloader:
                features = features.unsqueeze(0)  # Add batch dimension
                labels = labels.unsqueeze(1)  # Ensure labels match output shape (batch_size, 1)

                optimizer.zero_grad()
                outputs = model(features)
                loss = criterion(outputs, labels.view_as(outputs))  # Match target shape to model output
                loss.backward()
                optimizer.step()

                total_loss += loss.item()
            if (epoch + 1) % 10 == 0:
                print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {total_loss:.4f}")

# Function to calculate accuracy
def calculate_accuracy(model, dataloader):
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for features, labels in dataloader:
            features = features.unsqueeze(0)  # Add batch dimension
            outputs = model(features)
            predictions = torch.sigmoid(outputs).round()  # Convert logits to binary predictions
            correct += (predictions.view_as(labels).long() == labels.long()).sum().item()
            total += labels.size(0)
    return correct / total if total > 0 else 0

# Main workflow
def main_workflow(features_df, test_features_df, num_epochs=50):
    input_dim = 3  # period_id, count_0, count_1
    hidden_dim = 16
    output_dim = 1  # Predicting the label
    num_layers = 1

    # Initialize the model, loss function, and optimizer
    model = LSTMModel(input_dim, hidden_dim, output_dim, num_layers)
    criterion = nn.BCEWithLogitsLoss()  # Binary Cross-Entropy for binary classification
    optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

    # Train the model on the training data
    train_lstm_on_matches(model, criterion, optimizer, features_df, num_epochs)

    # Evaluate accuracy on the entire training set
    train_dataset = MatchDataset(features_df)
    train_dataloader = DataLoader(train_dataset, batch_size=1, shuffle=False)
    train_accuracy = calculate_accuracy(model, train_dataloader)
    print(f"\nTraining Accuracy: {train_accuracy:.2f}")

    # Evaluate accuracy on the test set
    test_dataset = MatchDataset(test_features_df)
    test_dataloader = DataLoader(test_dataset, batch_size=1, shuffle=False)
    test_accuracy = calculate_accuracy(model, test_dataloader)
    print(f"Testing Accuracy: {test_accuracy:.2f}")

    return model, train_accuracy, test_accuracy

# Execute the workflow
model, train_accuracy, test_accuracy = main_workflow(features_df, test_features_df, num_epochs=5)
print(f"Final Training Accuracy: {train_accuracy:.2f}")
print(f"Final Testing Accuracy: {test_accuracy:.2f}")


Training on match_id: 0
Training on match_id: 1
Training on match_id: 2
Training on match_id: 3
Training on match_id: 4
Training on match_id: 5
Training on match_id: 7
Training on match_id: 8
Training on match_id: 10
Training on match_id: 11
Training on match_id: 12
Training on match_id: 13
Training on match_id: 14
Training on match_id: 17

Training Accuracy: 0.87
Testing Accuracy: 0.91
Final Training Accuracy: 0.87
Final Testing Accuracy: 0.91


### Calculate predictions over evaluation data using the meta-model we trained

In [56]:
eval_df = pd.read_csv('eval_aggregates_predictions.csv').drop(columns=['text'])
eval_df.head()

Unnamed: 0,match_id,period_id,predicted_label
0,6,0,0
1,6,0,0
2,6,0,0
3,6,0,0
4,6,0,0


In [57]:
eval_features_df = (
    eval_df.groupby(["match_id", "period_id"])["predicted_label"]
    .value_counts()
    .unstack(fill_value=0)
    .reset_index()
)

# Renaming columns for clarity
eval_features_df.columns = ["match_id", "period_id", "count_0", "count_1"]

In [58]:
eval_features_df.head()

Unnamed: 0,match_id,period_id,count_0,count_1
0,6,0,27,3
1,6,1,22,8
2,6,2,29,1
3,6,3,22,8
4,6,4,26,4


In [None]:
# Function to calculate predictions and store them in a new DataFrame
def generate_predictions_dataframe(model, eval_df):
    model.eval()
    predictions = []
    ids = []

    # Create dataset and dataloader without labels
    eval_dataset = MatchDataset(eval_df, has_labels=False)
    eval_dataloader = DataLoader(eval_dataset, batch_size=1, shuffle=False)

    with torch.no_grad():
        for idx, features in enumerate(eval_dataloader):
            features = features.unsqueeze(0)  # Add batch dimension
            outputs = model(features)
            predicted_label = torch.sigmoid(outputs).round().item()  # Convert logits to binary prediction

            # Generate the ID in the format MatchID_PeriodID
            match_id = eval_df.iloc[idx]["match_id"]
            period_id = eval_df.iloc[idx]["period_id"]
            id_str = f"{match_id}_{period_id}"

            ids.append(id_str)
            predictions.append(int(predicted_label))

    # Create a new DataFrame with the results
    result_df = pd.DataFrame({"ID": ids, "EventType": predictions})
    return result_df

# Generate predictions
predictions_df = generate_predictions_dataframe(model, eval_features_df)

print(predictions_df.head())


    ID  EventType
0  6_0          0
1  6_1          0
2  6_2          0
3  6_3          0
4  6_4          0


In [69]:
predictions_df["EventType"].value_counts()

EventType
1    301
0    215
Name: count, dtype: int64

In [70]:
predictions_df.to_csv('predictions_llm_plus_lstm.csv', index=False)