### Dependencies

https://drivendata.co/blog/automated-abstraction-benchmark

In [11]:
import json
import os
from pathlib import Path

import pandas as pd
import numpy as np
from loguru import logger
import accelerate
from transformers import AutoModelForCausalLM, BitsAndBytesConfig, AutoTokenizer
import bitsandbytes as bnb
import torch
from huggingface_hub import login

### Setting dirs 

In [3]:
# Prefer the GPU when torch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    DEVICE = torch.device("cuda")
else:
    DEVICE = torch.device("cpu")
print(f"Using device: {DEVICE}")

Using device: cpu


In [4]:
# Folder holding the competition CSVs -- replace with path to your data.
# Written as a raw string: in a regular literal the backslash sequences in this
# Windows path are invalid escapes, which Python warns about and plans to reject.
DATA_DIR = Path(r"D:\Donnees\Desktop\AI\DrivenDataComp2\data")

# Each file is read with its "uid" column as the index.
features = pd.read_csv(DATA_DIR / "train_features.csv", index_col="uid")
labels = pd.read_csv(DATA_DIR / "train_labels.csv", index_col="uid")
submission_format = pd.read_csv(DATA_DIR / "submission_format.csv", index_col="uid")

From the competition's blog post: In this notebook, we'll ignore NarrativeCME and use only NarrativeLE for simplicity. You may want to explore how better to consolidate information across these fields.


### Data Exploration

In [5]:
# explore feature data: (number of rows, number of columns) of the features frame
features.shape

(4000, 2)

In [6]:
# Summary statistics of the character length of each NarrativeLE text
features["NarrativeLE"].str.len().describe()


count    4000.000000
mean      941.545750
std       692.546272
min       183.000000
25%       497.000000
50%       774.000000
75%      1174.250000
max      7487.000000
Name: NarrativeLE, dtype: float64

In [7]:
# explore labels: keep four summary statistics, then show one row per label column
labels.describe().loc[["mean", "50%", "min", "max"]].T

Unnamed: 0,mean,50%,min,max
DepressedMood,0.328,0.0,0.0,1.0
MentalIllnessTreatmentCurrnt,0.2585,0.0,0.0,1.0
HistoryMentalIllnessTreatmnt,0.3725,0.0,0.0,1.0
SuicideAttemptHistory,0.2095,0.0,0.0,1.0
SuicideThoughtHistory,0.4095,0.0,0.0,1.0
SubstanceAbuseProblem,0.229,0.0,0.0,1.0
MentalHealthProblem,0.48725,0.0,0.0,1.0
DiagnosisAnxiety,0.13375,0.0,0.0,1.0
DiagnosisDepressionDysthymia,0.36225,0.0,0.0,1.0
DiagnosisBipolar,0.0655,0.0,0.0,1.0


### Model

We will use the relatively lightweight Mistral-7B-Instruct-v0.2 LLM for our solution.

In [8]:
# Local folder for the saved model and tokenizer files. It is created together
# with any missing parent folders; an already existing folder is not an error.
MODEL_DIR = Path("assets")
MODEL_DIR.mkdir(parents=True, exist_ok=True)

In [23]:
def save_model(device, model_name="mistralai/Mistral-7B-Instruct-v0.2"):
    """
    Download a causal language model plus its tokenizer and save both to MODEL_DIR

    The model is loaded without a quantization config. Loading in 4-bit by
    passing a BitsAndBytesConfig(load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16) as quantization_config is an
    alternative that is left disabled here.

    Args:
        device: forwarded to from_pretrained as device_map.
        model_name: model identifier handed to from_pretrained.
    """
    logger.info(f"Using device {device} to save model to {MODEL_DIR}")

    logger.info("Downloading model")
    pretrained_model = AutoModelForCausalLM.from_pretrained(model_name, device_map=device)
    pretrained_tokenizer = AutoTokenizer.from_pretrained(model_name)

    logger.info(f"Saving model to {MODEL_DIR}")
    # Model first, tokenizer second, both written into the same folder
    for artifact in (pretrained_model, pretrained_tokenizer):
        artifact.save_pretrained(MODEL_DIR)
    logger.success("Model and tokenizer saved")

In [24]:
# Authenticate with the Hugging Face Hub. The token is read from the HF_TOKEN
# environment variable so that it is never written into the notebook. When the
# variable is not set, login() receives None and asks for the token instead.
login(token=os.environ.get("HF_TOKEN"))

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: fineGrained).
Your token has been saved to C:\Users\Lenovo\.cache\huggingface\token
Login successful


In [25]:
# A config.json inside MODEL_DIR is taken as the sign that the model was
# already saved locally; only download when it is missing.
if (MODEL_DIR / "config.json").exists():
    logger.info("Using existing local model")
else:
    logger.info("Downloading model")
    save_model(DEVICE)

[32m2024-09-20 13:50:42.780[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m2[0m - [1mDownloading model[0m
[32m2024-09-20 13:50:42.784[0m | [1mINFO    [0m | [36m__main__[0m:[36msave_model[0m:[36m2[0m - [1mUsing device cpu to save model to assets[0m
[32m2024-09-20 13:50:42.786[0m | [1mINFO    [0m | [36m__main__[0m:[36msave_model[0m:[36m10[0m - [1mDownloading model[0m
Downloading shards: 100%|██████████| 3/3 [1:11:03<00:00, 1421.22s/it]
Loading checkpoint shards: 100%|██████████| 3/3 [01:03<00:00, 21.20s/it]
[32m2024-09-20 15:02:53.731[0m | [1mINFO    [0m | [36m__main__[0m:[36msave_model[0m:[36m19[0m - [1mSaving model to assets[0m
[32m2024-09-20 15:11:42.632[0m | [32m[1mSUCCESS [0m | [36m__main__[0m:[36msave_model[0m:[36m22[0m - [32m[1mModel and tokenizer saved[0m


In [26]:
# Load the saved model (local files only) onto DEVICE, then the tokenizer from the same folder
logger.info(f"Loading model from {MODEL_DIR}, {MODEL_DIR.exists()}")
model = AutoModelForCausalLM.from_pretrained(MODEL_DIR, local_files_only=True, device_map=DEVICE)
tokenizer = AutoTokenizer.from_pretrained(MODEL_DIR)

[32m2024-09-20 15:13:18.250[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m1[0m - [1mLoading model from assets, True[0m
Loading checkpoint shards: 100%|██████████| 6/6 [00:02<00:00,  2.78it/s]


### Building prompt

We've provided a basic prompt here that defines:

A role - an abstractor that takes narratives and returns values for the binary and categorical variables 

Lists of the variables we expect output for

Options for the variable values

Expected output format

Example input and output

In [27]:
# Prompt sent to the model for every narrative. It is filled in with
# str.format(): the single "{}" under INPUT receives the narrative text, and
# the doubled braces around the example JSON are escapes that render as
# literal "{" and "}".
PROMPT_TEMPLATE = """You are an expert abstractor who reads law enforcement narratives about youth suicide and extracts variables that represent common patterns. The variables you are extracting are either binary (0 or 1) or categorical. Use the example input and output for the list of all variables to return.

There are two categorical variables, specified below. For categorical variables, return ONE of the possible values specified in the semicolon-separated list.
VARIABLE: InjuryLocationType
- POSSIBLE VALUES: House, apartment; Motor vehicle (excluding school bus and public transportation); Natural area (e.g., field, river, beaches, woods); Street/road, sidewalk, alley; Park, playground, public use area; Other
VARIABLE: WeaponType1
- POSSIBLE VALUES: Firearm; Hanging, strangulation, suffocation; Poisoning; Fall; Other transport vehicle, eg, trains, planes, boats; Motor vehicle including buses, motorcycles; Drowning; Sharp instrument; Fire or burns; Blunt instrument; Unknown; Other (e.g. taser, electrocution, nail gun) 

All other variables are binary. For binary variables, Return a 0 if the item represented by the variable is absent and 1 if the item represented by the variable is present. The binary variables are:
- DepressedMood
- MentalIllnessTreatmentCurrnt
- HistoryMentalIllnessTreatmnt
- SuicideAttemptHistory
- SuicideThoughtHistory
- SubstanceAbuseProblem
- MentalHealthProblem
- DiagnosisAnxiety
- DiagnosisDepressionDysthymia
- DiagnosisBipolar
- DiagnosisAdhd
- IntimatePartnerProblem
- FamilyRelationship
- Argument
- SchoolProblem
- RecentCriminalLegalProblem
- SuicideNote
- SuicideIntentDisclosed
- DisclosedToIntimatePartner
- DisclosedToOtherFamilyMember
- DisclosedToFriend

You should output properly formatted json object where the keys are variable names and the values are predicted values for the given narrative. Do NOT output anything other than the JSON object. Do not include any explanation or summaries. Do not include any keys in this json object that aren't specified in the list.
-------------
EXAMPLE INPUT:
XX XX V found deceased at home by his grandparents, hanging from a basketball hoop in his basement family room. According to LE, a check of V's cell phone revealed that V had made suicidal statements by phone earlier. In the text message V sent to his girlfriend, he had stated that he was going to hang himself.

EXAMPLE OUTPUT:
{{"DepressedMood": 0,
 "MentalIllnessTreatmentCurrnt": 0,
 "HistoryMentalIllnessTreatmnt": 0,
 "SuicideAttemptHistory": 0,
 "SuicideThoughtHistory": 0,
 "SubstanceAbuseProblem": 0,
 "MentalHealthProblem": 0,
 "DiagnosisAnxiety": 0,
 "DiagnosisDepressionDysthymia": 0,
 "DiagnosisBipolar": 0,
 "DiagnosisAdhd": 0,
 "IntimatePartnerProblem": 0,
 "FamilyRelationship": 0,
 "Argument": 0,
 "SchoolProblem": 0,
 "RecentCriminalLegalProblem": 0,
 "SuicideNote": 0,
 "SuicideIntentDisclosed": 1,
 "DisclosedToIntimatePartner": 1,
 "DisclosedToOtherFamilyMember": 0,
 "DisclosedToFriend": 0,
 "InjuryLocationType": "House, apartment",
 "WeaponType1": "Hanging, strangulation, suffocation"
}}
-------------
INPUT:
{}

OUTPUT:
"""

### Data Preprocessing

We'll batch our inputs to speed up prediction time (not all LLM pipelines use batching).

In [None]:
def process_features(features):
    """
    Order features by ascending string length

    Rows are sorted by the character length of their NarrativeLE text. The
    frame passed in is left untouched: the helper length column only exists on
    a temporary copy, so passing a slice raises no SettingWithCopyWarning and
    the caller never ends up with a stray "str_len" column.

    Args:
        features: DataFrame with a NarrativeLE text column.

    Returns:
        A new DataFrame with the same columns, shortest narrative first.
    """
    return (
        features.assign(str_len=features.NarrativeLE.str.len())
        .sort_values(by="str_len")
        .drop(columns=["str_len"])
    )

In [None]:
def batch_features(features, batch_size: int):
    """
    Batch features together

    Splits the rows, in their current order, into the smallest number of
    near-equal batches such that no batch holds more than batch_size rows.

    Args:
        features: DataFrame to split.
        batch_size: maximum number of rows per batch; must be at least 1.

    Returns:
        List of DataFrames. A frame that already fits in one batch (including
        an empty one) is returned as the single element of the list.

    Raises:
        ValueError: if batch_size is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be at least 1, got {batch_size}")

    n_rows = len(features)
    if n_rows <= batch_size:
        return [features]

    # Round the batch count UP: rounding down would let batches grow past
    # batch_size (e.g. 15 rows with batch_size 10 would become one batch of 15).
    n_batches = -(-n_rows // batch_size)

    # Split row positions rather than the DataFrame itself, then slice by position.
    position_groups = np.array_split(np.arange(n_rows), n_batches)
    return [features.iloc[positions] for positions in position_groups]

A padding token is used to fill shorter inputs in a batch to match the longest input's size, which ensures consistency in input size. Here, we're just using the end-of-sequence token as the padding token.

In [None]:
# Reuse the end-of-sequence token for padding, on the model config and on the tokenizer
model.config.pad_token_id = model.config.eos_token_id
tokenizer.pad_token = tokenizer.eos_token

### Generating model output

In [None]:
BATCH_SIZE = 10  # batch size handed to batch_features
MAX_NEW_TOKENS = 300  # passed to model.generate as max_new_tokens

In [None]:
def predict_on_batch(feature_batch, model, tokenizer):
    """
    Tokenize input batches, generate and decode outputs

    Args:
        feature_batch: DataFrame with a NarrativeLE column, one narrative per row.
        model: model whose generate() produces the output token ids.
        tokenizer: tokenizer used to encode the prompts and decode the outputs.

    Returns:
        List of strings, one per row of feature_batch: the decoded output with
        the first len(prompt) characters (the prompt itself) cut off.
    """
    # Tokenize input narratives (NarrativeLE) in batch
    prompts = [PROMPT_TEMPLATE.format(nar) for nar in feature_batch.NarrativeLE]
    inputs = tokenizer(prompts, return_tensors="pt", padding=True)
    # Put the inputs on the device the model is on. A hard-coded "cuda" here
    # breaks on machines without a GPU, where the model is loaded on the CPU.
    inputs = inputs.to(model.device)

    # Generate outputs for variables
    outputs = model.generate(
        inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        max_new_tokens=MAX_NEW_TOKENS,
    )
    decoded = tokenizer.batch_decode(outputs, skip_special_tokens=True)

    # Remove prompt from output
    decoded = [resp[len(prompt) :] for resp, prompt in zip(decoded, prompts)]

    return decoded

In [None]:
# Batch inputs - note - we're running an example of 5 in this notebook
df = process_features(features.iloc[:5])
data_batches = batch_features(df, BATCH_SIZE)

# Raw model outputs and the uids they belong to, collected in batch order
responses, idxs = [], []
logger.info(f"Iterating over {len(data_batches)} batches")
for ix, data_batch in enumerate(data_batches):
    logger.info(f"Generating predictions on batch {ix}, with {len(data_batch)} samples")
    responses.extend(predict_on_batch(data_batch, model, tokenizer))
    idxs.extend(data_batch.index)
logger.info("Finished inference")

# One row per narrative, holding the undecoded model text
interim_preds = pd.DataFrame({"string_output": responses}, index=df.index)

In [None]:
# Show the raw generated text for the first narrative that was processed
print(responses[0])

### Parse model outputs into submission-ready format

In [None]:
def parse_response(output):
    """
    Transform response into a json object using minimal cleaning

    Several progressively trimmed versions of the text are tried in order and
    the first one that parses as JSON wins:

    1. the raw string
    2. everything before the first "--"
    3. everything up to and including the first "}"
    4. the span from the first "{" to the first "}" after it (drops any text
       or code fence in front of the object)
    5. the span from the first "{" to the last "}" (objects with nested braces)

    Args:
        output: text generated by the model for one narrative.

    Returns:
        The parsed JSON value, or None (after logging a warning) when no
        candidate parses.
    """

    def _candidates():
        # Try loading the raw string as is
        yield output
        # Get rid of extra trailing sections that follow "--"
        yield output.split("--")[0]
        # Get rid of sections that follow the first closing bracket "}"
        yield output.split("}")[0] + "}"
        # Also get rid of anything in front of the first opening bracket "{"
        start = output.find("{")
        if start != -1:
            first_close = output.find("}", start)
            if first_close != -1:
                yield output[start : first_close + 1]
            last_close = output.rfind("}")
            if last_close > start:
                yield output[start : last_close + 1]

    for candidate in _candidates():
        try:
            return json.loads(candidate)
        except json.JSONDecodeError:
            continue

    logger.warning(f"Failed to parse {output} into valid json")
    return None

In [None]:
def process_injury_location(data: pd.Series):
    """
    Transform InjuryLocationType model output to integers,
    fill in default for invalid outputs

    String values are matched against the known labels after stripping
    surrounding whitespace and ignoring case, so small formatting differences
    in the model output still map to the intended code. Anything that still
    does not match (including non-string values) is logged and set to 6
    ("Other").

    Args:
        data: Series of InjuryLocationType values as produced by the model.

    Returns:
        Series of integer codes from 1 to 6.
    """
    codes = {
        "House, apartment": 1,
        "Motor vehicle (excluding school bus and public transportation)": 2,
        "Natural area (e.g., field, river, beaches, woods)": 3,
        "Park, playground, public use area": 4,
        "Street/road, sidewalk, alley": 5,
        "Other": 6,
        # Shorter variants accepted as "House, apartment"
        "Residence": 1,
        "Apartment": 1,
    }
    lookup = {label.casefold(): code for label, code in codes.items()}

    # Normalise strings only; other values pass through and end up unmatched
    normalized = data.map(
        lambda value: value.strip().casefold() if isinstance(value, str) else value
    )
    ilt = normalized.map(lookup)
    if ilt.isna().any():
        logger.warning(
            f"There are unexpected values in injury location: {data[ilt.isna()].unique()} "
        )
        ilt = ilt.fillna(6)  # Fill with other

    return ilt.astype(int)


def process_weapon_type(data: pd.Series):
    """
    Transform WeaponType1 model output to integers,
    fill in default for invalid outputs

    String values are matched against the known labels after stripping
    surrounding whitespace and ignoring case, so small formatting differences
    in the model output still map to the intended code. Anything that still
    does not match (including non-string values) is logged and set to 11
    ("Other").

    Args:
        data: Series of WeaponType1 values as produced by the model.

    Returns:
        Series of integer codes from 1 to 12.
    """
    codes = {
        "Blunt instrument": 1,
        "Drowning": 2,
        "Fall": 3,
        "Fire or burns": 4,
        "Firearm": 5,
        "Hanging, strangulation, suffocation": 6,
        "Motor vehicle including buses, motorcycles": 7,
        "Other transport vehicle, eg, trains, planes, boats": 8,
        "Poisoning": 9,
        "Sharp instrument": 10,
        "Other (e.g. taser, electrocution, nail gun)": 11,
        "Unknown": 12,
    }
    lookup = {label.casefold(): code for label, code in codes.items()}

    # Normalise strings only; other values pass through and end up unmatched
    normalized = data.map(
        lambda value: value.strip().casefold() if isinstance(value, str) else value
    )
    wt = normalized.map(lookup)
    if wt.isna().any():
        logger.warning(
            f"There are unexpected values in weapon type: {data[wt.isna()].unique()} "
        )
        wt = wt.fillna(11)  # Fill with other

    return wt.astype(int)

In [None]:
idxs = []
parsed_resps = []
could_not_parse = []

# Placeholder prediction used for any row whose model output could not be parsed
PLACEHOLDER_RESPONSE = {
    "DepressedMood": 0,
    "IntimatePartnerProblem": 0,
    "FamilyRelationship": 0,
    "Argument": 0,
    "MentalIllnessTreatmentCurrnt": 0,
    "HistoryMentalIllnessTreatmnt": 0,
    "SuicideAttemptHistory": 0,
    "SuicideThoughtHistory": 0,
    "SuicideNote": 0,
    "SubstanceAbuseProblem": 0,
    "SchoolProblem": 0,
    "RecentCriminalLegalProblem": 0,
    "SuicideIntentDisclosed": 0,
    "DisclosedToIntimatePartner": 0,
    "DisclosedToOtherFamilyMember": 0,
    "DisclosedToFriend": 0,
    "MentalHealthProblem": 0,
    "DiagnosisAnxiety": 0,
    "DiagnosisDepressionDysthymia": 0,
    "DiagnosisBipolar": 0,
    "DiagnosisAdhd": 0,
    "WeaponType1": "Unknown",
    "InjuryLocationType": "Other",
}

for row in interim_preds.itertuples():
    parsed = parse_response(row.string_output)
    idxs.append(row.Index)
    if isinstance(parsed, dict):
        parsed_resps.append(parsed)
    else:
        could_not_parse.append(row.Index)
        # Fill any we couldn't parse with placeholder values for now
        parsed_resps.append(dict(PLACEHOLDER_RESPONSE))

if len(could_not_parse) > 0:
    logger.warning(
        f"Could not parse {len(could_not_parse)} rows. Indices: {could_not_parse}"
    )

# Align to the submission columns (same names, same order) BEFORE filling gaps:
# a variable that no response mentioned becomes a column of missing values
# instead of raising later, and keys outside the submission format are dropped.
parsed_preds = (
    pd.DataFrame(parsed_resps, index=pd.Index(idxs, name="uid"))
    .reindex(columns=submission_format.columns)
    .fillna(0)
)
parsed_preds["InjuryLocationType"] = process_injury_location(
    parsed_preds.InjuryLocationType
)
parsed_preds["WeaponType1"] = process_weapon_type(parsed_preds.WeaponType1)

# Put the rows back in the order they have in `features`, for exactly the rows
# that were predicted (no separately hard-coded sample size to keep in sync)
parsed_preds = parsed_preds.loc[features.index[features.index.isin(parsed_preds.index)]]

# Make sure all values are int
parsed_preds = parsed_preds.round().astype(int)

In [None]:
# Columns are in the correct order.
# Index.equals compares elements and their order and simply returns False when
# the two indexes differ in length, where an elementwise == would raise.
assert submission_format.columns.equals(
    parsed_preds.columns
), "parsed_preds columns do not match the submission format"

In [None]:
# All columns are of type int
assert all(column_dtype == int for column_dtype in parsed_preds.dtypes)

In [None]:
# Variables have values within the expected range:
# every column except the last two holds only 0 or 1 ...
assert parsed_preds.iloc[:, :-2].isin([0, 1]).all().all()
# ... InjuryLocationType holds codes 1 to 6 and WeaponType1 codes 1 to 12
assert parsed_preds["InjuryLocationType"].isin(range(1, 7)).all()
assert parsed_preds["WeaponType1"].isin(range(1, 13)).all()