In [None]:
import gc
import math
import os

import joblib
import lightgbm as lgb
import numpy as np
import pandas as pd
import xgboost as xgb
from lightgbm import early_stopping
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import torch
from tqdm import tqdm
from collections import Counter, defaultdict
from sklearn.model_selection import GridSearchCV

### Aggregating Preprocessed Data for Training and Testing - Simple Mean Aggregation of Embeddings

In [None]:
def string_to_array(embedding_str):
    """
    Parse the textual form of an embedding vector back into a numpy float array.

    The input is a string of whitespace-separated numbers, optionally wrapped in
    square brackets and possibly spread over several lines. Each token is passed
    through ``float``, so scientific notation is accepted.
    """
    # Drop the enclosing brackets, then treat line breaks as ordinary separators
    without_brackets = embedding_str.strip('[]')
    single_line = without_brackets.replace('\n', ' ')
    tokens = single_line.split()
    return np.array(list(map(float, tokens)))


def expand_embeddings(aggregated_df, loaded_df_from_csv=False):
    """
    Expand the 'BERT_Embedding' column into one numeric column per embedding dimension.

    Args:
        aggregated_df (pd.DataFrame): Frame with a 'BERT_Embedding' column holding one
            1-D numpy array per row, or the string form of such an array when the frame
            was read back from CSV. Must contain at least one row, because the embedding
            width is read from the first row.
        loaded_df_from_csv (bool): When True, every embedding is first parsed with
            ``string_to_array``.

    Returns:
        tuple: ``(expanded_df, bert_columns)``. ``expanded_df`` is the input without
        'BERT_Embedding' plus the columns 'BERT_0' ... 'BERT_{d-1}', keeping the input's
        index. ``bert_columns`` is the list of the new column names. The input frame is
        not modified.
    """
    print("Expanding embeddings...")

    embeddings = aggregated_df['BERT_Embedding']
    if loaded_df_from_csv:
        # Parse into a new Series rather than writing back into the caller's frame
        embeddings = embeddings.apply(string_to_array)

    # Getting embedding dimensions from the first row
    bert_embedding_dim = embeddings.iloc[0].shape[0]

    # Creating column names
    bert_columns = [f'BERT_{i}' for i in range(bert_embedding_dim)]

    # Reuse the source index: pd.concat(axis=1) aligns on index labels, so a default
    # RangeIndex here would misalign rows (and introduce NaN rows) for any frame that
    # is not indexed 0..n-1, e.g. after filtering or sorting.
    bert_features = pd.DataFrame(
        np.stack(embeddings.values),
        columns=bert_columns,
        index=aggregated_df.index
    )

    # Combining DataFrames
    expanded_df = pd.concat([
        aggregated_df.drop(columns=['BERT_Embedding']),
        bert_features
    ], axis=1)

    return expanded_df, bert_columns

In [None]:
def process_large_dataset(filepath, chunk_size=10000, columns_to_load=None, mode='train'):
    """
    Aggregate a large preprocessed tweet CSV to one row per (ID, PeriodID), reading it in chunks.

    Running sums are kept per group so that a group split across chunks is still
    aggregated correctly. In the result, 'Exclamation_Count', 'Question_Count' and
    'Gives_Score' are group totals; the remaining numeric features and the BERT
    embedding are group means. 'Is_Key_Period' and 'EventType' are taken from the
    first row seen for the group.

    Args:
        filepath (str): Path to the CSV file
        chunk_size (int): Number of rows to process in each chunk
        columns_to_load (list): Columns to load from the CSV file
        mode (str): 'train' or 'test'. In 'train' mode 'EventType' is read from the
            data; otherwise the 'EventType' column of the result is filled with None.

    Returns:
        pd.DataFrame: One row per (ID, PeriodID) with 'Tweet_Count', the aggregated
        features and a 'BERT_Embedding' column of numpy arrays.
    """
    # Per-tweet numeric features accumulated as running sums, in output column order
    summed_features = (
        'Sentiment_joy', 'Sentiment_anger', 'Sentiment_fear', 'Sentiment_sadness',
        'Sentiment_surprise', 'Sentiment_Score', 'Exclamation_Count', 'Question_Count',
        'Uppercase_Ratio', 'Repeated_Char_Word_Ratio', 'Gives_Score',
    )
    # These are reported as totals; every other summed feature is reported as a mean
    total_features = {'Exclamation_Count', 'Question_Count', 'Gives_Score'}

    # Probing the first row to learn the embedding width
    probe = next(pd.read_csv(filepath, nrows=1, usecols=['BERT_Embedding'], chunksize=1))
    bert_dim = len(string_to_array(probe['BERT_Embedding'].iloc[0]))

    sums = {}          # (ID, PeriodID) -> running sums per feature
    total_counts = {}  # (ID, PeriodID) -> number of tweets seen
    first_values = {}  # (ID, PeriodID) -> values taken from the first row of the group

    print("Processing chunks...")
    for chunk in tqdm(pd.read_csv(filepath, chunksize=chunk_size, usecols=columns_to_load)):

        # Embeddings arrive as strings and must be arrays before they can be summed
        chunk['BERT_Embedding'] = chunk['BERT_Embedding'].apply(string_to_array)

        for key, rows in chunk.groupby(['ID', 'PeriodID']):
            if key not in sums:
                fresh = {name: 0 for name in summed_features}
                fresh['BERT_Embedding'] = np.zeros(bert_dim)
                sums[key] = fresh
                first_values[key] = {
                    'Is_Key_Period': rows['Is_Key_Period'].iloc[0],
                    'EventType': rows['EventType'].iloc[0] if mode == 'train' else None,
                    'PeriodID': rows['PeriodID'].iloc[0],
                    'ID': rows['ID'].iloc[0]
                }
                total_counts[key] = 0

            total_counts[key] += len(rows)

            group_sums = sums[key]
            for name in summed_features:
                group_sums[name] += rows[name].sum()
            group_sums['BERT_Embedding'] += np.sum(np.vstack(rows['BERT_Embedding']), axis=0)

        # Releasing the chunk's temporaries before reading the next one
        gc.collect()

    print("Computing final aggregations...")
    result_data = []

    for key, count in total_counts.items():
        firsts = first_values[key]
        group_sums = sums[key]

        record = {
            'ID': firsts['ID'],
            'PeriodID': firsts['PeriodID'],
            'Tweet_Count': count,
            'Is_Key_Period': firsts['Is_Key_Period'],
            'EventType': firsts['EventType'],
        }
        for name in summed_features:
            if name in total_features:
                record[name] = group_sums[name]
            else:
                record[name] = group_sums[name] / count
        record['BERT_Embedding'] = group_sums['BERT_Embedding'] / count

        result_data.append(record)

    return pd.DataFrame(result_data)

In [None]:
def compute_lagged_features(aggregated_df, loaded_df_from_csv=False):
    """
    Expand embeddings and add per-match lag-1 / first-difference features.

    A 'Match_ID' column is derived as the integer before the first underscore in 'ID'.
    Rows are ordered by 'Match_ID' then 'PeriodID', and for every column in
    ``columns_to_lag`` two columns are added: ``<col>_lag1`` (value in the previous
    period of the same match) and ``<col>_diff`` (current minus previous).
    'BERT_embedding_change' is the squared L2 norm of the change in the embedding
    relative to the previous period of the same match. The first period of each
    match has no predecessor, so all of its lag features are 0.

    Args:
        aggregated_df (pd.DataFrame): Aggregated frame with a 'BERT_Embedding' column.
        loaded_df_from_csv (bool): Forwarded to ``expand_embeddings``; set True when
            the embeddings are still in string form.

    Returns:
        tuple: ``(features_df, bert_columns)`` where ``features_df`` is sorted by
        match and period with a fresh 0..n-1 index.
    """
    aggregated_df, bert_columns = expand_embeddings(aggregated_df, loaded_df_from_csv=loaded_df_from_csv)
    # Adding a column for Match ID
    aggregated_df['Match_ID'] = aggregated_df['ID'].str.split('_').str[0].astype(int)

    # List of relevant columns for lagged features
    columns_to_lag = [
        'Sentiment_joy', 'Sentiment_anger', 'Sentiment_fear', 'Sentiment_sadness', 'Sentiment_surprise',
        'Sentiment_Score', 'Exclamation_Count', 'Question_Count', 'Uppercase_Ratio',
        'Repeated_Char_Word_Ratio', 'Gives_Score', 'Tweet_Count'
    ]

    # Sort once, up front. A plain multi-key sort gives the same match/period ordering
    # as groupby('Match_ID').apply(sort_values) without depending on grouping columns
    # being passed through apply, and without repeating the sort for every column.
    aggregated_df = (aggregated_df
                     .sort_values(['Match_ID', 'PeriodID'], kind='stable')
                     .reset_index(drop=True))

    # Previous-period values within each match (NaN for the first period of a match)
    previous = aggregated_df.groupby('Match_ID')[columns_to_lag].shift(1)

    lagged_features = []
    for col in columns_to_lag:
        aggregated_df[f'{col}_lag1'] = previous[col]
        aggregated_df[f'{col}_diff'] = aggregated_df[col] - previous[col]
        lagged_features.extend([f'{col}_lag1', f'{col}_diff'])

    prefix = 'BERT_'
    # Squared L2 norm of the embedding change, differenced within each match so the
    # first period of a match is not compared with the last period of another match.
    aggregated_df[f'{prefix}embedding_change'] = (
        aggregated_df.groupby('Match_ID')[bert_columns]
        .diff(periods=1)
        .pow(2)
        .sum(axis=1)
        .fillna(0)
    )
    lagged_features.extend([f'{prefix}embedding_change'])

    aggregated_df[lagged_features] = aggregated_df[lagged_features].fillna(0)

    return aggregated_df, bert_columns

### Sophisticated Embedding Aggregation Methods (Attention, Similarity and Adaptive Temperature Similarity)

In [None]:
class OptimizedEmbeddingAggregator:
    """
    Memory-efficient GPU-accelerated embedding aggregation.

    Each ``*_aggregate`` method takes a 2-D tensor of shape (n_tweets, embedding_dim)
    and returns a 1-D tensor of length embedding_dim. If a RuntimeError is raised
    while aggregating, the method prints it and falls back to the plain mean of the
    embeddings.
    """

    def __init__(self, batch_size=5000):
        """Pick the compute device (CUDA, then MPS, then CPU) and store `batch_size`."""
        self.device = torch.device(
            'cuda' if torch.cuda.is_available() else 'mps' if torch.backends.mps.is_available() else 'cpu')
        self.batch_size = batch_size
        print(f"Using device: {self.device}")

    @staticmethod
    def _empty_cache():
        """Release cached accelerator memory (CUDA if available, otherwise MPS); no-op on CPU."""
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
        elif torch.backends.mps.is_available():
            torch.mps.empty_cache()

    def _chunked_matrix_multiply(self, mat1, mat2, chunk_size=128):
        """
        Compute ``mat1 @ mat2`` by multiplying `chunk_size` rows of `mat1` at a time.

        The result is allocated with the dtype and device of `mat1`, so the product is
        neither silently cast to float32 nor placed on a device other than the operands'.
        """
        rows = mat1.size(0)
        cols = mat2.size(1)
        result = torch.zeros(rows, cols, dtype=mat1.dtype, device=mat1.device)

        for i in range(0, rows, chunk_size):
            end = min(i + chunk_size, rows)
            result[i:end] = torch.mm(mat1[i:end], mat2)
            self._empty_cache()

        return result

    def attention_aggregate(self, embeddings_tensor):
        """
        Memory-efficient attention-based aggregation.

        Scaled dot-product scores between all pairs of tweets are softmax-normalised per
        row; the column-wise mean of those weights is used to average the embeddings.
        Groups of more than 128 tweets use the chunked matrix product.
        """
        try:
            n_tweets, embedding_dim = embeddings_tensor.shape
            print(f"Processing attention aggregation for {n_tweets} tweets")

            if n_tweets <= 128:
                scores = torch.mm(embeddings_tensor, embeddings_tensor.t())
                scores = scores / math.sqrt(embedding_dim)
                weights = torch.softmax(scores, dim=-1)
                return torch.mm(weights.mean(dim=0).unsqueeze(0), embeddings_tensor).squeeze(0)

            scores = self._chunked_matrix_multiply(
                embeddings_tensor,
                embeddings_tensor.t()
            ) / math.sqrt(embedding_dim)

            # Row-wise softmax, applied in blocks of rows
            weights = torch.zeros_like(scores)
            for i in range(0, n_tweets, 128):
                end = min(i + 128, n_tweets)
                weights[i:end] = torch.softmax(scores[i:end], dim=-1)

            avg_weights = weights.mean(dim=0)
            result = torch.mm(avg_weights.unsqueeze(0), embeddings_tensor).squeeze(0)

            self._empty_cache()

            return result

        except RuntimeError as e:
            print(f"Error in attention aggregate: {str(e)}")
            return embeddings_tensor.mean(dim=0)

    def similarity_aggregate(self, embeddings_tensor):
        """
        Memory-efficient similarity-based aggregation.

        Each tweet is weighted by the softmax of its mean cosine similarity to all
        tweets in the group. Groups of more than 128 tweets use the chunked matrix
        product.
        """
        try:
            n_tweets, embedding_dim = embeddings_tensor.shape
            print(f"Processing similarity aggregation for {n_tweets} tweets")

            # The epsilon keeps all-zero embeddings from dividing by zero
            normalized = embeddings_tensor / (embeddings_tensor.norm(dim=1, keepdim=True) + 1e-8)

            if n_tweets <= 128:
                similarities = torch.mm(normalized, normalized.t())
                weights = torch.softmax(similarities.mean(dim=1), dim=0)
                return torch.mm(weights.unsqueeze(0), embeddings_tensor).squeeze(0)

            similarities = self._chunked_matrix_multiply(normalized, normalized.t())

            mean_similarities = similarities.mean(dim=1)
            weights = torch.softmax(mean_similarities, dim=0)

            result = torch.mm(weights.unsqueeze(0), embeddings_tensor).squeeze(0)

            self._empty_cache()

            return result

        except RuntimeError as e:
            print(f"Error in similarity aggregate: {str(e)}")
            return embeddings_tensor.mean(dim=0)

    def adaptive_similarity_aggregate(self, embeddings_tensor):
        """
        Memory-efficient adaptive similarity-based aggregation.

        Like ``similarity_aggregate``, but the mean similarities are divided by a
        temperature before the softmax. The temperature is derived from the spread and
        skewness of the similarities and the group size, clamped to [0.01, 0.5]. The
        weighted result is L2-normalised. Note that the RuntimeError fallback returns
        the un-normalised mean.
        """
        try:
            n_tweets, embedding_dim = embeddings_tensor.shape
            print(f"Processing adaptive similarity aggregation for {n_tweets} tweets")

            normalized = embeddings_tensor / (embeddings_tensor.norm(dim=1, keepdim=True) + 1e-8)
            similarities = self._chunked_matrix_multiply(normalized, normalized.t())

            mean_similarities = similarities.mean(dim=1)
            if n_tweets > 1:
                sim_std = mean_similarities.std()
            else:
                # The sample std of a single value is NaN, which would propagate through
                # the temperature and the softmax into an all-NaN embedding.
                sim_std = mean_similarities.new_zeros(())
            sim_mean = mean_similarities.mean()

            sim_centered = mean_similarities - sim_mean
            sim_skewness = torch.mean(torch.pow(sim_centered, 3)) / (torch.pow(sim_std, 3) + 1e-8)

            base_temp = 0.1
            variance_factor = 1.0 / (1.0 + sim_std)
            size_factor = torch.log1p(torch.tensor(n_tweets, device=embeddings_tensor.device)) / 10.0
            skew_factor = 1.0 / (1.0 + torch.abs(sim_skewness))

            adaptive_temp = base_temp * variance_factor * (1 + size_factor) * skew_factor
            adaptive_temp = torch.clamp(adaptive_temp, min=0.01, max=0.5)

            scaled_similarities = mean_similarities / adaptive_temp
            weights = torch.softmax(scaled_similarities, dim=0)

            result = torch.mm(weights.unsqueeze(0), embeddings_tensor).squeeze(0)
            final_embedding = result / (result.norm() + 1e-8)

            self._empty_cache()

            return final_embedding

        except RuntimeError as e:
            print(f"Error in adaptive similarity aggregate: {str(e)}")
            return embeddings_tensor.mean(dim=0)

In [None]:
class OptimizedDataProcessor:
    """
    A class that efficiently processes large datasets with embeddings.

    Non-embedding features are aggregated per (ID, PeriodID) from the CSV in chunks,
    while embeddings are read from a memory-mapped ``.npy`` file and combined per group
    with the configured method of ``OptimizedEmbeddingAggregator``.
    """

    # Maps each supported `aggregation_method` to the aggregator method implementing it
    _AGGREGATION_METHODS = {
        'attention': 'attention_aggregate',
        'similarity': 'similarity_aggregate',
        'adaptive temperature similarity': 'adaptive_similarity_aggregate',
    }

    # Numeric per-tweet features accumulated per (ID, PeriodID)
    _FEATURE_COLUMNS = (
        'Sentiment_joy', 'Sentiment_anger', 'Sentiment_fear',
        'Sentiment_sadness', 'Sentiment_surprise', 'Sentiment_Score',
        'Exclamation_Count', 'Question_Count', 'Uppercase_Ratio',
        'Repeated_Char_Word_Ratio', 'Gives_Score',
    )

    def __init__(self, batch_size=3000, aggregation_method='attention'):
        """
        :param batch_size: Number of CSV rows read per chunk while accumulating embeddings.
        :type batch_size: int, optional
        :param aggregation_method: One of 'attention', 'similarity' or
            'adaptive temperature similarity'.
        :type aggregation_method: str, optional
        """
        self.batch_size = batch_size
        self.aggregation_method = aggregation_method
        self.aggregator = OptimizedEmbeddingAggregator(batch_size=batch_size)

    def process_large_dataset(self, filepath, emb_filepath, chunk_size=10000, columns_to_load=None, mode='train'):
        """
        Process a large dataset to aggregate embeddings using the specified method.
        Assumes pre-computed embeddings are stored as numpy arrays whose rows are in the
        same order as the rows of the CSV file.

        :param filepath: Path to the CSV file containing the dataset.
        :type filepath: str
        :param emb_filepath: Path to the file containing pre-computed embeddings as a numpy array.
        :type emb_filepath: str
        :param chunk_size: Number of rows to read at a time from the CSV file;
            defaults to 10000.
        :type chunk_size: int, optional
        :param columns_to_load: List of column names to load from the CSV file; if None,
            all columns except 'BERT_Embedding' are loaded.
        :type columns_to_load: list of str, optional
        :param mode: Operating mode of the function, 'train' or 'test', which
            affects the inclusion of the 'EventType' column; defaults to 'train'.
        :type mode: str, optional

        :return: A pandas DataFrame containing combined statistics of non-embedding features
            along with processed embeddings based on the specified aggregation method.
            Groups whose embeddings could not be aggregated are left out.
        :rtype: pd.DataFrame
        :raises ValueError: If ``self.aggregation_method`` is not a supported method.
        """
        # Fail fast: inside the per-group loop below this error would be swallowed by the
        # best-effort exception handler, skipping every group and returning an empty frame.
        if self.aggregation_method not in self._AGGREGATION_METHODS:
            raise ValueError(f"Unknown aggregation method: {self.aggregation_method}")
        aggregate = getattr(self.aggregator, self._AGGREGATION_METHODS[self.aggregation_method])

        # Initial stats containers
        running_stats = {
            'sums': defaultdict(lambda: defaultdict(float)),
            'total_counts': defaultdict(int),
            'first_values': {}
        }

        # Loading embeddings
        print("Memory-mapping embeddings file...")
        embeddings = np.load(emb_filepath, mmap_mode='r')

        # Processing non-embedding features first
        print("Processing non-embedding features...")
        if columns_to_load is None:
            # Load everything except the (string) embedding column, which is not used here
            non_embedding_columns = lambda col: col != 'BERT_Embedding'
        else:
            non_embedding_columns = [col for col in columns_to_load if col not in ['BERT_Embedding']]
        chunks_iterator = pd.read_csv(filepath, chunksize=chunk_size, usecols=non_embedding_columns)

        for chunk in tqdm(chunks_iterator):
            # Processing each ID in the chunk
            for id_group, group in chunk.groupby(['ID', 'PeriodID']):
                if id_group not in running_stats['first_values']:
                    running_stats['first_values'][id_group] = {
                        'Is_Key_Period': group['Is_Key_Period'].iloc[0],
                        'EventType': group['EventType'].iloc[0] if mode == 'train' else None,
                        'PeriodID': group['PeriodID'].iloc[0],
                        'ID': group['ID'].iloc[0]
                    }

                running_stats['total_counts'][id_group] += len(group)

                # Updating sums for non-embedding features
                for col in self._FEATURE_COLUMNS:
                    if col in group.columns:
                        running_stats['sums'][id_group][col] += group[col].sum()

            gc.collect()

        # Process embeddings
        print("\nProcessing embeddings...")
        rows_seen = 0
        accumulated_embeddings = defaultdict(list)

        # First pass: Accumulating embeddings
        print("Accumulating embeddings...")
        chunks_iterator = pd.read_csv(filepath, chunksize=self.batch_size,
                                      usecols=['ID', 'PeriodID'])

        for chunk in tqdm(chunks_iterator):
            n_rows = len(chunk)
            # Embedding rows are matched to CSV rows purely by position
            chunk_embeddings = embeddings[rows_seen:rows_seen + n_rows]

            for id_val, period_id, emb in zip(chunk['ID'], chunk['PeriodID'], chunk_embeddings):
                accumulated_embeddings[(id_val, period_id)].append(emb)

            rows_seen += n_rows
            gc.collect()

        if rows_seen != len(embeddings):
            print(f"Warning: CSV has {rows_seen} rows but embeddings file has {len(embeddings)} rows")

        # Second pass: Processing accumulated embeddings
        print("\nProcessing accumulated embeddings...")
        bert_results = {}

        for id_key, emb_list in tqdm(accumulated_embeddings.items()):
            try:
                # Stacking all embeddings for this ID
                print("Stacking embeddings...")
                stacked_embeddings = np.stack(emb_list)
                print(f"Stacked embeddings shape: {stacked_embeddings.shape}")
                print("Creating embeddings tensor...")
                embeddings_tensor = torch.tensor(stacked_embeddings, dtype=torch.float32,
                                                 device=self.aggregator.device)
                print("Finished creating embeddings tensor")

                # Processing using specified aggregation method
                result = aggregate(embeddings_tensor)

                bert_results[id_key] = result.cpu().numpy()

                # Clearing memory
                del embeddings_tensor
                if torch.cuda.is_available():
                    torch.cuda.empty_cache()
                elif torch.backends.mps.is_available():
                    torch.mps.empty_cache()

            except Exception as e:
                # Best effort: a failing group is reported and left out of the result
                print(f"Error processing ID {id_key}: {str(e)}")
                continue

        # Clearing accumulated embeddings
        del accumulated_embeddings
        gc.collect()

        # Combining all features
        print("\nCombining features...")
        result_data = []

        for id_group in running_stats['total_counts'].keys():
            count = running_stats['total_counts'][id_group]
            id_val, period_id = id_group

            result_dict = {
                'ID': id_val,
                'PeriodID': period_id,
                'Tweet_Count': count,
                'Is_Key_Period': running_stats['first_values'][id_group]['Is_Key_Period'],
                'EventType': running_stats['first_values'][id_group]['EventType'],
            }

            # Adding averaged non-embedding features
            # NOTE(review): every feature is averaged here, whereas the module-level
            # process_large_dataset keeps Exclamation_Count, Question_Count and
            # Gives_Score as totals - confirm which is intended.
            for metric, sum_value in running_stats['sums'][id_group].items():
                result_dict[metric] = sum_value / count

            # Adding embeddings if available
            if id_group in bert_results:
                result_dict['BERT_Embedding'] = bert_results[id_group]
            else:
                print(f"Warning: Missing embeddings for ID: {id_val}, Period: {period_id}")
                continue

            result_data.append(result_dict)

        return pd.DataFrame(result_data)

    def prepare_full_pipeline(self, train_filepath, test_filepath, train_emb_filepath,
                              test_emb_filepath, columns_to_load, **kwargs):
        """
        Method that is running the complete new embeddings aggregation pipeline.

        Aggregates the train and test data, writes both aggregated frames to
        ``backup_data/``, computes lagged features and prepares the model inputs.

        :param columns_to_load: Columns to read from both CSV files; 'EventType' is
            added for the training file.
        :param kwargs: Forwarded to ``prepare_train_and_test_data`` (e.g. ``n_components``).
        :return: Dict with keys 'X_train_val', 'X_val', 'y_train_val', 'y_val',
            'X_train_full', 'y_train_full', 'X_test' and 'Test IDs', in that order.
        :rtype: dict
        """
        # Processing train and test data
        print("Processing training data...")
        train_df = self.process_large_dataset(
            train_filepath,
            train_emb_filepath,
            mode='train',
            columns_to_load=columns_to_load + ['EventType']
        )
        train_df.to_csv(f"backup_data/aggregated_df_train_{self.aggregation_method}.csv", index=False)

        print("\nProcessing test data...")
        test_df = self.process_large_dataset(
            test_filepath,
            test_emb_filepath,
            mode='test',
            columns_to_load=columns_to_load
        )
        test_df.to_csv(f"backup_data/aggregated_df_test_{self.aggregation_method}.csv", index=False)

        # Computing lagged features
        print("\nComputing lagged features...")
        train_df_with_lags, _ = compute_lagged_features(train_df)
        test_df_with_lags, _ = compute_lagged_features(test_df)

        # Preparing final datasets
        print("\nPreparing final datasets...")
        # prepare_train_and_test_data takes (train, test, n_components=...); passing the
        # BERT column list positionally would collide with n_components from kwargs.
        X_train_val, X_val, y_train_val, y_val, X_train_full, y_train_full, X_test = (
            prepare_train_and_test_data(
                train_df_with_lags, test_df_with_lags, **kwargs
            )
        )

        return {
            'X_train_val': X_train_val,
            'X_val': X_val,
            'y_train_val': y_train_val,
            'y_val': y_val,
            'X_train_full': X_train_full,
            'y_train_full': y_train_full,
            'X_test': X_test,
            'Test IDs': test_df_with_lags['ID']
        }

In [None]:
def run_optimized_pipeline(train_filepath, test_filepath, train_emb_filepath, test_emb_filepath, columns_to_load,
                           aggregation_method='attention'):
    """
    Wrapper function to run the new embeddings aggregation pipeline.

    Builds an ``OptimizedDataProcessor`` with a batch size of 10000 and runs its full
    pipeline with 46 PCA components. If a RuntimeError mentioning "out of memory" is
    raised, the pipeline is run once more with the batch size lowered to 1500; any
    other RuntimeError is re-raised.
    """
    processor = OptimizedDataProcessor(batch_size=10000, aggregation_method=aggregation_method)

    # Shared by the first attempt and the reduced-batch retry
    pipeline_kwargs = dict(
        train_filepath=train_filepath,
        test_filepath=test_filepath,
        train_emb_filepath=train_emb_filepath,
        test_emb_filepath=test_emb_filepath,
        columns_to_load=columns_to_load,
        n_components=46,
    )

    try:
        results = processor.prepare_full_pipeline(**pipeline_kwargs)

        if torch.cuda.is_available():
            torch.cuda.empty_cache()
        elif torch.backends.mps.is_available():
            torch.mps.empty_cache()

        return results

    except RuntimeError as e:
        if "out of memory" not in str(e):
            raise e

        print("\nGPU out of memory. Trying with smaller batch size...")
        # Retrying with smaller batch size
        processor.batch_size = 1500
        return processor.prepare_full_pipeline(**pipeline_kwargs)

### Preparing Training, Validation and Testing Data

In [None]:
def _pca_reduce(fit_df, apply_df, feature_columns, n_components, random_state):
    """Fit PCA on `fit_df[feature_columns]`; return both frames with those columns replaced by PCA_i columns."""
    pca = PCA(n_components=n_components, random_state=random_state)
    fit_values = pca.fit_transform(fit_df[feature_columns])
    apply_values = pca.transform(apply_df[feature_columns])
    pca_columns = [f'PCA_{i}' for i in range(len(pca.explained_variance_ratio_))]

    def _replace(source_df, values):
        # Keep the source index so the reduced columns line up with the untouched ones
        reduced = pd.DataFrame(values, columns=pca_columns, index=source_df.index)
        return pd.concat([source_df.drop(columns=feature_columns), reduced], axis=1)

    return _replace(fit_df, fit_values), _replace(apply_df, apply_values)


def _standardize(fit_df, apply_df, columns):
    """Fit a StandardScaler on `fit_df[columns]` and apply it to both frames in place."""
    scaler = StandardScaler()
    fit_df[columns] = scaler.fit_transform(fit_df[columns])
    apply_df[columns] = scaler.transform(apply_df[columns])


def prepare_train_and_test_data(expanded_df_train, expanded_df_test, n_components=46):
    """
    Preparing data for model training with dimensionality reduction and scaling.

    Two independent feature pipelines are fitted:
      * train/validation: PCA and scaler fitted on a stratified 70% split of the
        training data and applied to the remaining 30%;
      * full/test: PCA and scaler fitted on all training data and applied to the test data.
    Every feature except 'Is_Key_Period' is replaced by standardised PCA components.
    PCA is seeded, like the train/validation split, so repeated runs give the same
    components even when scikit-learn selects its randomized solver.

    Args:
        expanded_df_train (pd.DataFrame): Training frame with features, 'ID',
            'Match_ID' and the label column 'EventType'.
        expanded_df_test (pd.DataFrame): Test frame with the same columns.
        n_components: Passed to ``sklearn.decomposition.PCA``.

    Returns:
        tuple: ``(X_train_val, X_val, y_train_val, y_val, X_train_full, y_train_full, X_test)``
    """
    random_state = 42

    print("Preparing data for training...")
    # Here, because of the processing function, X_test has 'EventType' column, but it's filled with NaNs
    columns_to_drop = ['ID', 'EventType', 'Match_ID', 'Sentiment_fear'] + (
        ['GloVe_Embedding'] if 'GloVe_Embedding' in expanded_df_train.columns else [])
    X_test = expanded_df_test.drop(columns=columns_to_drop)

    X_train_full = expanded_df_train.drop(columns=columns_to_drop)
    y_train_full = expanded_df_train['EventType']

    X_train_val, X_val, y_train_val, y_val = train_test_split(
        X_train_full, y_train_full, test_size=0.3, random_state=random_state, stratify=y_train_full
    )

    # NOTE(review): PCA is applied to unscaled features of very different magnitudes
    # (e.g. 'Tweet_Count' next to embedding dimensions); scaling only happens afterwards.
    columns_to_reduce = [col for col in X_train_val.columns if col != 'Is_Key_Period']

    print("Reducing dimensionality...")
    X_train_val, X_val = _pca_reduce(X_train_val, X_val, columns_to_reduce, n_components, random_state)
    X_train_full, X_test = _pca_reduce(X_train_full, X_test, columns_to_reduce, n_components, random_state)

    # Scaling Features
    print("Scaling features...")
    columns_to_scale = [col for col in X_train_val.columns if col != 'Is_Key_Period']

    _standardize(X_train_full, X_test, columns_to_scale)
    _standardize(X_train_val, X_val, columns_to_scale)

    return X_train_val, X_val, y_train_val, y_val, X_train_full, y_train_full, X_test

In [None]:
# Input files for the aggregation steps below. The OptimizedDataProcessor path pairs
# each .npy embedding row with the CSV row at the same position.
train_filepath = "backup_data/train_preprocessed_data.csv"  # Path to the preprocessed training data
train_emb_filepath = "backup_data/train_bert_embeddings.npy"  # Path to the precomputed BERT embeddings for training data
test_filepath = "backup_data/test_preprocessed_data.csv"  # Path to the preprocessed test data
test_emb_filepath = "backup_data/test_bert_embeddings.npy"  # Path to the precomputed BERT embeddings for test data

In [None]:
# Columns read from the preprocessed CSVs. 'EventType' (the label) is appended for the
# training file only. OptimizedDataProcessor drops 'BERT_Embedding' from this list and
# reads embeddings from the .npy files instead.
columns_to_load = ['ID', 'PeriodID', 'Sentiment_Score', 'Sentiment_anger',
                   'Sentiment_fear', 'Sentiment_joy', 'Sentiment_sadness', 'Sentiment_surprise',
                   'Exclamation_Count', 'Question_Count', 'Uppercase_Ratio', 'Repeated_Char_Word_Ratio',
                   'Is_Key_Period', 'Gives_Score', 'BERT_Embedding']

#### Simple Mean Aggregation of Embeddings

In [None]:
# Aggregate the training tweets per (ID, PeriodID); 'EventType' is loaded too because it is the label
aggregated_df_train = process_large_dataset(train_filepath, columns_to_load=columns_to_load + ['EventType'])

In [None]:
# Same aggregation for the test tweets; with mode='test' the 'EventType' column is filled with None
aggregated_df_test = process_large_dataset(test_filepath, columns_to_load=columns_to_load, mode='test')

In [None]:
# Cache the aggregated training frame. The 'BERT_Embedding' arrays are stored as text,
# so a reload has to go through compute_lagged_features(..., loaded_df_from_csv=True).
aggregated_df_train.to_csv("backup_data/aggregated_df_train.csv", index=False)

In [None]:
# Cache the aggregated test frame (embeddings are stored as text, as for the training frame)
aggregated_df_test.to_csv("backup_data/aggregated_df_test.csv", index=False)

#### Uncomment the following cell to load the aggregated data from CSV files

In [None]:
# When loading from these CSVs, also set loaded_from_csv = True in the next cell so the
# embedding strings are parsed back into arrays.
# aggregated_df_train = pd.read_csv("backup_data/aggregated_df_train.csv")
# aggregated_df_test = pd.read_csv("backup_data/aggregated_df_test.csv")

In [None]:
# Must be True when the aggregated frames were read back from CSV (embeddings are then strings)
loaded_from_csv = False  # Flag to indicate whether the data was loaded from CSV

In [None]:
# Expands the embeddings and adds per-match lag/diff features. This rebinds
# aggregated_df_train to a frame without 'BERT_Embedding', so the cell cannot be
# re-run without first re-creating the aggregated frame.
aggregated_df_train, bert_columns = compute_lagged_features(aggregated_df_train, loaded_df_from_csv=loaded_from_csv)

In [None]:
# Same feature engineering for the test frame; the returned column list is not needed here
aggregated_df_test, _ = compute_lagged_features(aggregated_df_test, loaded_df_from_csv=loaded_from_csv)

In [None]:
# Quick look at the engineered training features
aggregated_df_train.head()

In [None]:
# Preparing data for training: stratified train/validation split, then PCA reduction
# and standardisation of the train/validation and full-train/test feature sets
X_train_val, X_val, y_train_val, y_val, X_train_full, y_train_full, X_test = prepare_train_and_test_data(
    aggregated_df_train, aggregated_df_test, n_components=46
)

#### Uncomment the following cell to try the other embeddings aggregation pipeline (Attention, Similarity or Adaptive Temperature Similarity)

In [None]:
#aggregation_method = 'attention'

In [None]:
# datasets = run_optimized_pipeline(train_filepath, test_filepath, train_emb_filepath, test_emb_filepath,
#                                             columns_to_load, aggregation_method)

In [None]:
#X_train_val, X_val, y_train_val, y_val, X_train_full, y_train_full, X_test, test_ids = datasets.values()

In [None]:
X_train_full.head()

In [None]:
X_train_val.shape, X_val.shape, X_train_full.shape, X_test.shape

## Baseline Models

In [None]:
def get_cross_val_scores(model, X, y, cv=5, scoring='accuracy'):
    """
    Print cross-validation scores for a given model.

    Args:
        model: Unfitted estimator passed to `cross_val_score`
        X: Feature matrix
        y: Target labels
        cv: Number of cross-validation folds (or any value accepted by `cross_val_score`)
        scoring: Scoring metric name or callable forwarded to `cross_val_score`

    Returns:
        None. The mean score and the per-fold scores are printed.
    """
    scores = cross_val_score(model, X, y, cv=cv, scoring=scoring)

    # Label the summary with the metric actually used; the default yields "Accuracy",
    # which keeps the printed output identical to the previous hard-coded message.
    metric_label = scoring.capitalize() if isinstance(scoring, str) else "Score"

    print(f"Mean Cross validation {metric_label}: {scores.mean()}")
    print("Individual Cross validation scores: ", scores)


def evaluate_model(model, X, y):
    """
    Print a classification report for an already-fitted model.

    Args:
        model: Fitted estimator exposing `predict`
        X: Features to predict on
        y: Ground-truth labels compared against the predictions
    """
    y_pred = model.predict(X)

    # Header first, then the report, each on its own print call
    print("Results:")
    report = classification_report(y, y_pred)
    print(report)

### Logistic Regression

In [None]:
# Baseline logistic regression: L2 penalty with default strength (C=1.0),
# 'balanced' class weights, and a fixed seed for reproducibility.
logistic_model = LogisticRegression(
    max_iter=1000,
    penalty='l2',
    C=1.0,
    random_state=42,
    class_weight='balanced'
)

In [None]:
get_cross_val_scores(logistic_model, X_train_val, y_train_val)

In [None]:
logistic_model.fit(X_train_val, y_train_val)

In [None]:
evaluate_model(logistic_model, X_train_val, y_train_val)

In [None]:
evaluate_model(logistic_model, X_val, y_val)

### Random Forest

In [None]:
# Baseline random forest: 500 depth-limited trees with minimum split/leaf sizes,
# 'balanced' class weights, and a fixed seed for reproducibility.
rf_model = RandomForestClassifier(
    n_estimators=500,
    max_depth=10,
    min_samples_split=10,
    min_samples_leaf=5,
    max_features='sqrt',
    class_weight='balanced',
    random_state=42
)

In [None]:
get_cross_val_scores(rf_model, X_train_val, y_train_val)

In [None]:
rf_model.fit(X_train_val, y_train_val)

In [None]:
evaluate_model(rf_model, X_train_val, y_train_val)

In [None]:
evaluate_model(rf_model, X_val, y_val)

### LightGBM

In [None]:
# Baseline LightGBM classifier with a small learning rate and L1/L2 regularisation.
# feature_fraction / bagging_fraction / bagging_freq are LightGBM-native parameter names
# (aliases of the scikit-learn style colsample_bytree / subsample / subsample_freq).
lgb_model = lgb.LGBMClassifier(
    n_estimators=500,
    learning_rate=0.02,
    max_depth=6,
    num_leaves=31,
    min_child_samples=30,
    min_child_weight=1e-3,
    reg_alpha=0.1,
    reg_lambda=0.3,
    feature_fraction=0.8,
    bagging_fraction=0.8,
    bagging_freq=1,
    random_state=42
)

In [None]:
get_cross_val_scores(lgb_model, X_train_val, y_train_val)

In [None]:
# Fit on the training split; the validation split is the eval_set that drives early stopping.
# NOTE(review): the same X_val / y_val are scored in a later cell, so those validation
# metrics come from data that also selected the stopping iteration.
lgb_model.fit(
    X_train_val, y_train_val,
    eval_set=[(X_val, y_val)],
    eval_metric='logloss',
    callbacks=[early_stopping(stopping_rounds=50)]  # Stop after 50 rounds without improvement on the eval_set
)

In [None]:
evaluate_model(lgb_model, X_train_val, y_train_val)

In [None]:
evaluate_model(lgb_model, X_val, y_val)

### XGBoost

In [None]:
# Baseline XGBoost classifier with row/column subsampling and L1/L2 regularisation.
# No early_stopping_rounds is configured here, so passing an eval_set at fit time
# only records the log-loss metric.
xgb_model = xgb.XGBClassifier(
    n_estimators=500,
    learning_rate=0.02,
    max_depth=6,
    min_child_weight=5,
    colsample_bytree=0.8,
    subsample=0.8,
    reg_alpha=0.1,
    reg_lambda=0.3,
    gamma=1,
    eval_metric='logloss',
    random_state=42
)

In [None]:
get_cross_val_scores(xgb_model, X_train_val, y_train_val)

In [None]:
xgb_model.fit(
    X_train_val, y_train_val,
    eval_set=[(X_val, y_val)],  # Monitored only: xgb_model sets no early_stopping_rounds, so no early stopping happens
    verbose=False
)

In [None]:
evaluate_model(xgb_model, X_train_val, y_train_val)

In [None]:
evaluate_model(xgb_model, X_val, y_val)

## Hyperparameter Optimization

In [None]:
def optimize_logistic_regression(X_train, y_train, cv=5, n_jobs=-1, custom_weights=None):
    """
    Performing hyperparameter optimization for Logistic Regression.

    Args:
        X_train: Training features
        y_train: Training labels (binary, encoded as 0 / 1)
        cv: Number of cross-validation folds
        n_jobs: Number of parallel jobs (-1 for all processors)
        custom_weights: Optional class-weight dict added to the `class_weight`
            candidates. When None, it is derived from `y_train` as
            {0: n_class1 / n_class0, 1: 1}, so the function does not depend on a
            notebook-level `custom_weights` variable existing before the call.

    Returns:
        Tuple (best_estimator, results_df): the estimator refit on the best
        accuracy configuration, and a DataFrame built from `cv_results_`.
    """
    # Deriving the manual class weights from the labels when none are supplied.
    # Skipped when a class is missing, which would otherwise divide by zero.
    if custom_weights is None:
        label_counts = Counter(y_train)
        if label_counts[0] and label_counts[1]:
            custom_weights = {0: label_counts[1] / label_counts[0], 1: 1}

    class_weight_options = ['balanced', None]
    if custom_weights is not None:
        class_weight_options.append(custom_weights)

    # Defining scoring metrics
    scoring = {
        'f1': 'f1_weighted',
        'accuracy': 'accuracy'
    }

    # Settings shared by every solver-specific grid
    shared_grid = {
        'C': np.logspace(-4, 4, 20),
        'class_weight': class_weight_options,
        'max_iter': [2000],
        'tol': [1e-4]
    }

    # Defining parameter grid with valid solver-penalty combinations
    param_grid = [
        # SAGA solver with pure l1 / l2 penalties. l1_ratio is left out on purpose:
        # it is ignored for these penalties and would only multiply identical fits.
        {'solver': ['saga'], 'penalty': ['l1', 'l2'], **shared_grid},
        # SAGA solver with elasticnet, the only penalty that uses l1_ratio
        {
            'solver': ['saga'],
            'penalty': ['elasticnet'],
            'l1_ratio': [0.1, 0.3, 0.5, 0.7, 0.9],
            **shared_grid
        },
        # LBFGS solver, only supports l2 penalty
        {'solver': ['lbfgs'], 'penalty': ['l2'], **shared_grid},
        # LIBLINEAR solver, supports l1 and l2 penalties
        {'solver': ['liblinear'], 'penalty': ['l1', 'l2'], **shared_grid}
    ]

    grid_search = GridSearchCV(
        estimator=LogisticRegression(random_state=42),
        param_grid=param_grid,
        cv=cv,
        n_jobs=n_jobs,
        scoring=scoring,
        return_train_score=True,
        refit='accuracy'
    )

    grid_search.fit(X_train, y_train)

    print("\nBest parameters found:")
    for param, value in grid_search.best_params_.items():
        print(f"{param}: {value}")
    print(f"\nBest cross-validation score: {grid_search.best_score_:.4f}")

    results_df = pd.DataFrame(grid_search.cv_results_)

    return grid_search.best_estimator_, results_df

In [None]:
logistic_model_optimized, logistic_results_df = optimize_logistic_regression(X_train_val, y_train_val)

In [None]:
counter_y_train_full = Counter(y_train_full)
counter_y_train_full

In [None]:
# Manual class weights: class 0 is weighted by the ratio (count of class 1) / (count of class 0)
# in the full training labels, while class 1 keeps weight 1.
# The hyperparameter grids read this notebook-level variable, so it must be defined before they are built.
custom_weights = {
    0: counter_y_train_full[1] / counter_y_train_full[0],
    1: 1
}

In [None]:
def get_param_grid(model_name):
    """
    Returning the grid-search space for one of the tree-based models.

    Reads the notebook-level `custom_weights` dict, which must be defined
    before this function is called.

    Args:
        model_name: Name of the model, one of 'rf', 'lgb' or 'xgb'
    Returns:
        Parameter grid dictionary mapping hyperparameter names to candidate values
    Raises:
        KeyError: If `model_name` is not one of the supported names
    """
    # Class-weight candidates explored for LightGBM
    lgb_class_weights = [custom_weights, None, 'balanced']

    random_forest_grid = dict(
        n_estimators=[50, 100],
        max_depth=[3, 4, 5],
        min_samples_split=[10, 20, 30],
        min_samples_leaf=[8, 16, 24],
        max_features=['sqrt', 'log2'],
        class_weight=['balanced', custom_weights, None],
    )

    lightgbm_grid = dict(
        n_estimators=[50, 100],
        max_depth=[3, 4],
        learning_rate=[0.01],
        num_leaves=[7, 15],
        min_child_samples=[40, 80],
        colsample_bytree=[0.6, 0.8],
        is_unbalance=[True, False],
        class_weight=lgb_class_weights,
        reg_alpha=[0.1, 0.5],
        reg_lambda=[0.1, 0.5],
    )

    xgboost_grid = dict(
        n_estimators=[50, 100],
        max_depth=[3, 4],
        learning_rate=[0.01],
        min_child_weight=[3, 5],
        subsample=[0.6, 0.8],
        gamma=[0.3, 0.5],
        # Either the library default or the inverse of the class-0 custom weight
        scale_pos_weight=[None, 1 / custom_weights[0]],
        reg_alpha=[0.1, 0.5],
        reg_lambda=[0.1, 0.5],
    )

    grids_by_model = {'rf': random_forest_grid, 'lgb': lightgbm_grid, 'xgb': xgboost_grid}
    return grids_by_model[model_name]


def optimize_model(X_train, y_train, model_type, cv=5, n_jobs=-1):
    """
    Grid-searching the hyperparameters of a tree-based classifier.

    Args:
        X_train: Training features
        y_train: Training labels
        model_type: Type of model ('rf', 'lgb' or 'xgb')
        cv: Number of cross-validation folds
        n_jobs: Number of parallel jobs (-1 for all processors)

    Returns:
        Tuple (best_estimator, results_df): the estimator refit on the best
        accuracy configuration, and a DataFrame built from `cv_results_`.
    """
    # One unfitted, seeded estimator per supported model key
    base_estimators = dict(
        rf=RandomForestClassifier(random_state=42),
        lgb=lgb.LGBMClassifier(random_state=42),
        xgb=xgb.XGBClassifier(random_state=42),
    )

    search_space = get_param_grid(model_type)

    # Both metrics are tracked; accuracy decides which configuration is refit
    metrics = dict(f1='f1_weighted', accuracy='accuracy')

    search = GridSearchCV(
        estimator=base_estimators[model_type],
        param_grid=search_space,
        scoring=metrics,
        refit='accuracy',
        cv=cv,
        n_jobs=n_jobs,
        return_train_score=True,
    )
    search.fit(X_train, y_train)

    print(f"\nBest {model_type.upper()} parameters found:")
    for name, chosen in search.best_params_.items():
        print(f"{name}: {chosen}")
    print(f"\nBest cross-validation score: {search.best_score_:.4f}")

    results_df = pd.DataFrame(search.cv_results_)
    return search.best_estimator_, results_df

In [None]:
# Grid-search each tree-based model in turn and keep its outcome.
models_to_optimize = ['rf', 'lgb', 'xgb']
model_results = {}  # model key -> (best estimator, DataFrame of cv_results_)

for model_type in models_to_optimize:
    print(f"\nOptimizing {model_type.upper()}...")
    # NOTE: `best_model` is a notebook-level name that the submission section reassigns later
    best_model, results = optimize_model(X_train_val, y_train_val, model_type)
    model_results[model_type] = (best_model, results)

In [None]:
comparison_df = pd.concat(
    [model_results[model_type][1].assign(model=model_type.upper()) for model_type in models_to_optimize]
)

In [None]:
comparison_df

### Random Forest

In [None]:
rf_model_optimized = model_results['rf'][0]

In [None]:
get_cross_val_scores(rf_model_optimized, X_train_val, y_train_val)

In [None]:
rf_model_optimized.fit(X_train_val, y_train_val)

In [None]:
evaluate_model(rf_model_optimized, X_train_val, y_train_val)

In [None]:
evaluate_model(rf_model_optimized, X_val, y_val)

### LightGBM

In [None]:
lgb_model_optimized = model_results['lgb'][0]

In [None]:
get_cross_val_scores(lgb_model_optimized, X_train_val, y_train_val)

In [None]:
lgb_model_optimized.fit(
    X_train_val, y_train_val,
    eval_set=[(X_val, y_val)],
    eval_metric='logloss',
    callbacks=[early_stopping(stopping_rounds=50)]
)

In [None]:
evaluate_model(lgb_model_optimized, X_train_val, y_train_val)

In [None]:
evaluate_model(lgb_model_optimized, X_val, y_val)

### XGBoost

In [None]:
xgb_model_optimized = model_results['xgb'][0]

In [None]:
get_cross_val_scores(xgb_model_optimized, X_train_val, y_train_val)

In [None]:
xgb_model_optimized.fit(
    X_train_val, y_train_val,
    eval_set=[(X_val, y_val)],
    verbose=False
)

In [None]:
evaluate_model(xgb_model_optimized, X_train_val, y_train_val)

In [None]:
evaluate_model(xgb_model_optimized, X_val, y_val)

### Submission

#### Training on the whole training set before submission

In [None]:
# NOTE(review): this selects the baseline `logistic_model`, not `logistic_model_optimized`
# or any tuned estimator from `model_results` — confirm this is intended.
# `best_model` is the same object as `logistic_model`, so fitting it on the full
# training set in the next cell also refits `logistic_model`.
best_model = logistic_model

In [None]:
best_model.fit(
    X_train_full, y_train_full,
)

In [None]:
predictions = best_model.predict(X_test)

In [None]:
pred_df = pd.DataFrame({'ID': aggregated_df_test['ID'], 'EventType': predictions})

In [None]:
# Ordering the predictions numerically by the two '_'-separated components of ID.
# The helper sort keys are added, used for sorting, and dropped again in one chain.
id_parts = pred_df['ID'].str.split('_')

pred_df = (
    pred_df
    .assign(
        ID_First=id_parts.str[0].astype(int),
        ID_Second=id_parts.str[1].astype(int),
    )
    .sort_values(by=['ID_First', 'ID_Second'])
    .reset_index(drop=True)
    .drop(columns=['ID_First', 'ID_Second'])
)

In [None]:
pred_df

In [None]:
submission_file = "best_logistic_predictions.csv"
pred_df.to_csv(submission_file, index=False)

In [None]:
def save_model(model, filepath, create_dir=True):
    """
    Persisting a trained model to disk with joblib.

    Args:
        model: Object to serialise
        filepath: Destination path of the dump
        create_dir: When True, the parent directory of `filepath` is created if missing
    """
    parent_dir = os.path.split(filepath)[0]

    # A bare filename has no parent directory, so there is nothing to create
    if create_dir and parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    joblib.dump(model, filepath)


def load_model(filepath):
    """
    Restoring a model previously written with `save_model`.

    Note: joblib deserialisation is pickle-based, so only load files from a trusted source.

    Args:
        filepath: Path of the joblib dump to read
    Returns:
        The deserialised object
    """
    restored_model = joblib.load(filepath)
    return restored_model

In [None]:
# Saving the best model
model_filepath = 'models/best_logistic_regression_model.joblib'
save_model(best_model, model_filepath)