# PREPROCESSING

This notebook creates preprocessed versions of the data for the modelling stage. It covers the treatment of missing values and inconsistent values, as well as feature engineering, both elementary and advanced. It does not include scaling, outlier removal, or feature selection of any kind.

Each transformation is applied to both the train and the test data, wherever applicable. This script was written with the assistance of artificial intelligence tools to generate boilerplate and correct grammar; the authors attest to its correctness under penalty of grade.

DISCLAIMER: This notebook does not attempt to justify or demonstrate the steps it takes to treat the data. Refer instead to the Data Exploration notebook, which goes into lengthy detail about the reasoning behind each preprocessing choice.

## Libraries

Libraries used in the elaboration of this notebook.

In [1]:
# STL
import os 
import re
import random
import math
from collections import Counter

# Ingestion and Manipulation
import json
import numpy as np
import pandas as pd

# fastText
import fasttext 

# Display options
pd.set_option('display.max_rows', 1000)
pd.set_option('display.max_columns', 1000)
pd.set_option('display.width', 1000)

## Helper Functions

Assist in the inspection and execution of preprocessment.

### to_snake_case

Converts a string to snake_case.

In [2]:
def to_snake_case(input_string):
    """
    Normalise a label into snake_case.

    Every run of whitespace, hyphens or forward slashes collapses into one
    underscore, an underscore is inserted wherever a lower-case letter is
    directly followed by an upper-case letter, and the result is lower-cased.

    Args:
        input_string (str): The text to normalise.

    Returns:
        str: The snake_case form of ``input_string``.
    """
    # Separators (whitespace, "-", "/") -> a single underscore per run
    with_underscores = re.sub(r"[\s\-\/]+", "_", input_string)

    # Split camelCase / PascalCase boundaries
    split_on_case = re.sub(r"([a-z])([A-Z])", r"\1_\2", with_underscores)

    return split_on_case.lower()

## Data Ingestion

The data was loaded and our preferred naming convention (PEP 8 snake_case) was applied to the column names.

In [3]:
# Load the training set (path is relative to the working directory)
train = pd.read_csv('../data/train_data.csv', low_memory=False)

# Normalise every column name to snake_case
train = train.rename(columns=to_snake_case)

In [4]:
# Load the test set (path is relative to the working directory)
test = pd.read_csv('../data/test_data.csv', low_memory=False)

# Normalise every column name to snake_case
test = test.rename(columns=to_snake_case)

## Reindexing the dataframe with Claim Identifier

The claim identifier was converted to the index, as it is the natural index for the data. To this end, both duplicated and inconsistent identifier values were dropped first.

In [5]:
# Keep only the first row for each claim_identifier
train = train.drop_duplicates(subset=['claim_identifier'])

In [6]:
# Rows whose claim_identifier, read as text, is not exactly 7 characters long
id_lengths = train['claim_identifier'].astype(str).str.len()
id_mask_df = train.loc[id_lengths != 7]

# Remove those rows from the training set
train = train.drop(index=id_mask_df.index)

print(f'Droping: {id_mask_df.shape[0]} rows')

Droping: 19444 rows


In [7]:
# Promote claim_identifier from a column to the index of the train dataframe
train = train.set_index('claim_identifier')

In [8]:
# Promote claim_identifier from a column to the index of the test dataframe
test = test.set_index('claim_identifier')

## Dropping useless columns

Columns that were immediately evident to serve no purpose were dropped.

In [9]:
# Dropped because: not present in test, empty column, unary
train = train.drop(
    ['agreement_reached', 'oiics_nature_of_injury_description', 'wcb_decision'],
    axis=1,
)

In [10]:
# Dropped because: empty column, unary
test = test.drop(['oiics_nature_of_injury_description'], axis=1)

In [11]:
# Store claim_injury_type with the pandas 'category' dtype
train = train.astype({'claim_injury_type': 'category'})

In [12]:
# Print a summary of the train dataframe: index range, columns, non-null counts and dtypes
train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 574026 entries, 5393875 to 6165075
Data columns (total 29 columns):
 #   Column                             Non-Null Count   Dtype   
---  ------                             --------------   -----   
 0   accident_date                      570337 non-null  object  
 1   age_at_injury                      574026 non-null  float64 
 2   alternative_dispute_resolution     574026 non-null  object  
 3   assembly_date                      574026 non-null  object  
 4   attorney_representative            574026 non-null  object  
 5   average_weekly_wage                545375 non-null  float64 
 6   birth_year                         544948 non-null  float64 
 7   c_2_date                           559466 non-null  object  
 8   c_3_date                           187245 non-null  object  
 9   carrier_name                       574026 non-null  object  
 10  carrier_type                       574026 non-null  object  
 11  claim_injury_type       

# WordEmbedding

Using Fasttext and Optuna

## SMOTING

Trying to use smote to create a more balanced dataset

##### We first build our corpus

In [14]:
import fasttext
import numpy as np
import re
import pandas as pd

# 1. Preprocessing Functions
def remove_numbers_punctuation(text):
    """Strip every character that is not an ASCII letter, whitespace or underscore."""
    disallowed = re.compile(r'[^a-zA-Z\s_]')
    return disallowed.sub('', text)

def to_lowercase(text):
    """Return ``text`` with its cased characters lower-cased."""
    lowered = text.lower()
    return lowered

def to_snake_case(text):
    """Join whitespace-separated tokens with underscores and drop leading underscores.

    NOTE(review): this reuses the name of the column-renaming helper defined
    earlier in the file and replaces it once this cell has run. Unlike that
    helper, it neither lower-cases nor splits on hyphens, slashes or
    camelCase boundaries - confirm the shadowing is intended.
    """
    joined = '_'.join(text.split())
    return joined.lstrip('_')

# 2. FastText Embedding Functions
def extract_embeddings(sentence, model):
    """Return the stacked word vectors of a whitespace-tokenised sentence.

    Each token is passed to ``model.get_word_vector``; a token for which that
    call raises ``KeyError`` is left out.

    Args:
        sentence (str): Text to tokenise on whitespace.
        model: Object exposing ``get_word_vector(word)``.

    Returns:
        numpy.ndarray with one row per embedded token, or ``None`` when no
        token produced a vector (for example, an empty sentence).
    """
    vectors = []
    for token in sentence.split():
        try:
            vector = model.get_word_vector(token)
        except KeyError:
            # Token rejected by the model: skip it
            continue
        vectors.append(vector)

    return np.array(vectors) if vectors else None

def compute_embedding_features(word_vectors, dimension=300):
    """Summarise a stack of word vectors into sentence-level features.

    Args:
        word_vectors: 2-D array-like with one row per word, or ``None``.
        dimension (int): Length of the zero vectors returned when there are
            no word vectors. Defaults to 300, the size previously hard-coded
            here; pass the model's own dimension when it differs.

    Returns:
        tuple: ``(average_embedding, variance_embedding, euclidean_norm)``
        where the first two are per-dimension arrays (mean and variance
        across words) and the last is the Euclidean norm of the average
        embedding. When ``word_vectors`` is ``None`` or empty, the result is
        two zero vectors of length ``dimension`` and a norm of ``0.0``.
    """
    # len() rather than truthiness: the truth value of a numpy array is ambiguous
    if word_vectors is None or len(word_vectors) == 0:
        return np.zeros(dimension), np.zeros(dimension), 0.0

    # Mean and variance are taken across words (axis 0), one value per dimension
    average_embedding = np.mean(word_vectors, axis=0)
    variance_embedding = np.var(word_vectors, axis=0)

    # Magnitude of the averaged sentence vector
    euclidean_norm = np.linalg.norm(average_embedding)

    return average_embedding, variance_embedding, euclidean_norm

# 3. Main Processing Pipeline
def process_data(train, columns_to_embed, model_path=r'../models/dbpedia.bin'):
    """Build FastText embedding features from the text columns of ``train``.

    For every row, the non-missing values of ``columns_to_embed`` are cleaned
    (``remove_numbers_punctuation`` then ``to_lowercase``), joined with spaces
    into one sentence, and embedded word by word with the FastText model
    loaded from ``model_path``.

    Args:
        train (pd.DataFrame): Source frame; must contain ``columns_to_embed``.
        columns_to_embed (list[str]): Names of the text columns to embed.
        model_path (str): Path of the FastText model file to load.

    Returns:
        pd.DataFrame: Indexed like ``train``, with the columns
        ``avg_word_emb_dim_<i>`` and ``var_word_emb_dim_<i>`` (one per model
        dimension) followed by ``euclidean_norm``. Rows with no usable text,
        or whose embedding failed, hold zeros.
    """
    model = fasttext.load_model(model_path)
    dimension = model.get_dimension()
    columns_to_embed = list(columns_to_embed)

    def clean_value(value, col):
        # Best effort: missing values pass through, unprocessable ones become NaN
        try:
            if pd.notna(value):
                return to_lowercase(remove_numbers_punctuation(value))
            return value
        except Exception as e:
            print(f"Error processing value '{value}' in column '{col}': {e}")
            return np.nan

    # Clean positionally (not via label-based .at writes) so duplicated index
    # labels cannot overwrite one another
    cleaned_columns = [
        [clean_value(value, col) for value in train[col]]
        for col in columns_to_embed
    ]

    # One sentence per row, built ONLY from the embedded columns. The claim
    # identifier is deliberately kept out: it used to be joined into the text
    # and embedded as if it were a word.
    rows = zip(*cleaned_columns) if cleaned_columns else [()] * len(train)
    sentences = [
        ' '.join(str(value) for value in row if pd.notna(value))
        for row in rows
    ]

    # Zero features sized from the loaded model, used for empty or failed rows
    empty_features = (np.zeros(dimension), np.zeros(dimension), 0.0)

    # Many rows share the same sentence; embed each distinct sentence once
    feature_cache = {}

    def features_for(sentence):
        if sentence not in feature_cache:
            try:
                word_vectors = extract_embeddings(sentence, model)
                if word_vectors is None:
                    feature_cache[sentence] = empty_features
                else:
                    feature_cache[sentence] = compute_embedding_features(word_vectors)
            except Exception as e:
                print(f"Error computing features for row: {e}")
                feature_cache[sentence] = empty_features
        return feature_cache[sentence]

    features = [features_for(sentence) for sentence in sentences]
    n_rows = len(features)

    # reshape keeps the (n_rows, dimension) shape even when there are no rows
    averages = np.array([f[0] for f in features]).reshape(n_rows, dimension)
    variances = np.array([f[1] for f in features]).reshape(n_rows, dimension)
    norms = [f[2] for f in features]

    avg_dim_columns = [f'avg_word_emb_dim_{i}' for i in range(dimension)]
    var_dim_columns = [f'var_word_emb_dim_{i}' for i in range(dimension)]

    # Assemble the result in one concat instead of inserting hundreds of
    # columns into an existing frame
    return pd.concat(
        [
            pd.DataFrame(averages, index=train.index, columns=avg_dim_columns),
            pd.DataFrame(variances, index=train.index, columns=var_dim_columns),
            pd.Series(norms, index=train.index, name='euclidean_norm', dtype=float),
        ],
        axis=1,
    )

# 4. Deploy
# Text columns whose content is turned into embedding features
columns_to_embed = [
    'wcio_cause_of_injury_description',
    'wcio_part_of_body_description',
    'wcio_nature_of_injury_description',
    'industry_code_description',
    'carrier_type',
]

# Compute the embedding features for the training set
processed_data = process_data(train, columns_to_embed)

# Append the new features to `train`, then discard the raw text columns
train = pd.concat([train, processed_data], axis=1).drop(columns=columns_to_embed)

train.head()

