### Dependencies

In [1]:
import json
from pathlib import Path

import pandas as pd
import numpy as np
from loguru import logger
import accelerate
from transformers import AutoModelForCausalLM, BitsAndBytesConfig, AutoTokenizer
import bitsandbytes as bnb
import torch

  from .autonotebook import tqdm as notebook_tqdm
The installed version of bitsandbytes was compiled without GPU support. 8-bit optimizers, 8-bit multiplication, and GPU quantization are unavailable.


### Setting device and data directories

In [2]:
# Run on the GPU when CUDA is available; otherwise fall back to the CPU.
DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"Using device: {DEVICE}")

Using device: cpu


In [6]:
# Raw string literal: the Windows path contains backslash sequences such as
# "\D", "\A" and "\d", which are invalid escapes in a regular string and make
# Python emit a warning. The resulting path value is unchanged.
DATA_DIR = Path(r"D:\Donnees\Desktop\AI\DrivenDataComp2\data")  # replace with path to your data

# Each CSV is read with its "uid" column used as the DataFrame index.
features = pd.read_csv(DATA_DIR / "train_features.csv", index_col="uid")
labels = pd.read_csv(DATA_DIR / "train_labels.csv", index_col="uid")
submission_format = pd.read_csv(DATA_DIR / "submission_format.csv", index_col="uid")

From the competition's blog post: "In this notebook, we'll ignore NarrativeCME and use only NarrativeLE for simplicity. You may want to explore how better to consolidate information across these fields."


### Data Preprocessing

In [7]:
# Inspect the dimensions of the feature table as a (rows, columns) tuple.
features.shape

(4000, 2)

In [9]:
# Summary statistics of the character length of each NarrativeLE entry.
features["NarrativeLE"].str.len().describe()


count    4000.000000
mean      941.545750
std       692.546272
min       183.000000
25%       497.000000
50%       774.000000
75%      1174.250000
max      7487.000000
Name: NarrativeLE, dtype: float64

In [10]:
# Summarise every label column: keep only the mean, median (50%), min and max
# statistics, then transpose so each label is a row.
labels.describe().loc[["mean", "50%", "min", "max"]].T

Unnamed: 0,mean,50%,min,max
DepressedMood,0.328,0.0,0.0,1.0
MentalIllnessTreatmentCurrnt,0.2585,0.0,0.0,1.0
HistoryMentalIllnessTreatmnt,0.3725,0.0,0.0,1.0
SuicideAttemptHistory,0.2095,0.0,0.0,1.0
SuicideThoughtHistory,0.4095,0.0,0.0,1.0
SubstanceAbuseProblem,0.229,0.0,0.0,1.0
MentalHealthProblem,0.48725,0.0,0.0,1.0
DiagnosisAnxiety,0.13375,0.0,0.0,1.0
DiagnosisDepressionDysthymia,0.36225,0.0,0.0,1.0
DiagnosisBipolar,0.0655,0.0,0.0,1.0


### Model

We will use the relatively lightweight Mistral-7B-Instruct-v0.2 LLM for our solution.

In [None]:
# Directory (relative to the working directory) where the model and tokenizer
# files are stored; create it, including any missing parents, if it is absent.
MODEL_DIR = Path("assets")
MODEL_DIR.mkdir(parents=True, exist_ok=True)

In [None]:
def save_model(device, model_name="mistralai/Mistral-7B-Instruct-v0.2"):
    """Download a causal language model and its tokenizer and save both to MODEL_DIR.

    The model is loaded with a 4-bit ``BitsAndBytesConfig`` (float16 compute
    dtype) and then written, together with the tokenizer, into ``MODEL_DIR``.

    Args:
        device: Value forwarded to ``from_pretrained`` as ``device_map``.
        model_name: Identifier passed to ``from_pretrained`` for both the
            model and the tokenizer.

    Returns:
        None.
    """
    logger.info(f"Using device {device} to save model to {MODEL_DIR}")

    # 4-bit quantization with float16 as the compute dtype.
    # NOTE(review): bitsandbytes quantization may require a GPU-enabled build;
    # confirm this works on the target device.
    four_bit_config = BitsAndBytesConfig(
        bnb_4bit_compute_dtype=torch.float16,
        load_in_4bit=True,
    )

    logger.info("Downloading model")
    pretrained_model = AutoModelForCausalLM.from_pretrained(
        model_name,
        device_map=device,
        quantization_config=four_bit_config,
    )
    pretrained_tokenizer = AutoTokenizer.from_pretrained(model_name)

    logger.info(f"Saving model to {MODEL_DIR}")
    # Write the model first, then the tokenizer, into the same directory.
    for artifact in (pretrained_model, pretrained_tokenizer):
        artifact.save_pretrained(MODEL_DIR)
    logger.success("Model and tokenizer saved")

In [None]:
# The presence of config.json in MODEL_DIR is used as the marker that the
# model has already been saved locally; download and save it only when absent.
if (MODEL_DIR / "config.json").exists():
    logger.info("Using existing local model")
else:
    logger.info("Downloading model")
    save_model(DEVICE)

In [None]:
# Load the model and tokenizer from MODEL_DIR. The log line also reports
# whether that directory exists.
logger.info(f"Loading model from {MODEL_DIR}, {MODEL_DIR.exists()}")
model = AutoModelForCausalLM.from_pretrained(
    MODEL_DIR, device_map=DEVICE, local_files_only=True
)
# local_files_only=True keeps the tokenizer load consistent with the model
# load: both are read from disk only.
tokenizer = AutoTokenizer.from_pretrained(MODEL_DIR, local_files_only=True)