In [7]:
#!pip install -U flash-attn --no-build-isolation

!pip install accelerate
!pip install -i https://pypi.org/simple/ bitsandbytes

Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable
Looking in indexes: https://pypi.org/simple/


In [4]:
!pip uninstall -y torch

Found existing installation: torch 2.2.2
Uninstalling torch-2.2.2:
  Successfully uninstalled torch-2.2.2


In [5]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
import warnings
import pandas as pd
import torch.nn.functional as F
#from google.colab import drive

#drive.mount('/content/drive')

warnings.filterwarnings("ignore")

In [None]:
!pip install transformers

In [6]:
def load_model_and_tokenizer(model_name):
    """
    Load pre-trained language model and tokenizer from Hugging Face.

    Args:
        model_name (str): Name or path of the pre-trained model.

    Returns:
        model (AutoModelForCausalLM): Loaded pre-trained language model.
        tokenizer (AutoTokenizer): Loaded tokenizer for the pre-trained model.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    tokenizer.pad_token = tokenizer.eos_token

    model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto", load_in_4bit=True) #  attn_implementation="flash_attention_2",

    return model, tokenizer

def mean_pooling(model_output, attention_mask):
    """
    Perform mean pooling on the token embeddings from the language model output.

    Args:
        model_output (dict): Output from the pre-trained language model.
        attention_mask (torch.Tensor): Attention mask for the input sequences.

    Returns:
        torch.Tensor: Mean-pooled sentence embeddings.
    """
    token_embeddings = model_output['hidden_states'][-1]  # First element contains all token embeddings
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)

def generate_embeddings(model, tokenizer, data, batch_size):
    """
    Generate sentence embeddings for a given dataset using the pre-trained language model.

    Args:
        model (AutoModelForCausalLM): Loaded pre-trained language model.
        tokenizer (AutoTokenizer): Loaded tokenizer for the pre-trained model.
        data (pd.DataFrame): Dataset containing input sequences and labels.
        batch_size (int): Batch size for processing the data.

    Returns:
        pd.DataFrame: DataFrame containing sentence embeddings and corresponding labels.
    """
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    X, y = data.drop(['HOME_W'], axis=1), data[["HOME_W"]]

    all_embeddings = []
    all_labels = []

    for i in range(int(len(X) / batch_size) + 1):
        if i % 50 == 0:
            print(f"Processing batch {i}")

        # Sentences we want sentence embeddings for
        sentences = X.values[i * batch_size:(i + 1) * batch_size].reshape((-1,)).tolist()
        y_true = y.values[i * batch_size: (i + 1) * batch_size].tolist()

        # Tokenize sentences
        try:
            encoded_input = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt').to(device)
        except Exception as e:
            print(e)
            break

        # Compute token embeddings
        with torch.no_grad():
            model_output = model(**encoded_input, output_hidden_states=True)

        # Perform pooling
        sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])

        # Normalize embeddings
        sentence_embeddings = F.normalize(sentence_embeddings, p=2, dim=1)

        all_embeddings.extend(sentence_embeddings.cpu().numpy())
        all_labels.extend(y_true)

    result = pd.DataFrame(all_embeddings)
    result["HOME_W"] = all_labels

    return result

def generate_text(model, tokenizer, prompt, max_length=100):
    """
    Generate text using the pre-trained language model.

    Args:
        model (AutoModelForCausalLM): Loaded pre-trained language model.
        tokenizer (AutoTokenizer): Loaded tokenizer for the pre-trained model.
        prompt (str): Input prompt for the language model.
        max_length (int): Maximum length of the generated text.

    Returns:
        str: Generated text output from the language model.
    """
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    inputs = tokenizer(prompt, padding=True, truncation=True, return_tensors='pt').to(device)

    with torch.no_grad():
        output = model.generate(**inputs, do_sample=False, max_new_tokens=max_length)

    generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
    return generated_text

In [None]:
!pip install -U flash-attn --no-build-isolation

In [None]:
!nvcc -V

In [7]:
# Set model and access token
#model_name = "meta-llama/Llama-2-7b-hf"
model_name = "mistralai/Mistral-7B-Instruct-v0.2"
#access_token = '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'
model, tokenizer = load_model_and_tokenizer(model_name)


The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [None]:
# Generate embeddings
batch_size = 5
for j in range(1, 6):
    path = f'betOnMeLLMDataset_L{j}.csv'
    print(f"Processing file: {path}")
    df = pd.read_csv(path, index_col=0)
    # Use INST tag as used in training by Mistral
    df[f'DESCRIPTION_L{j}'] = "[INST]" + df[f'DESCRIPTION_L{j}'] + "[\INST]"

    embeddings = generate_embeddings(model, tokenizer, df, batch_size)

    output_path = f'/home/heminway.r/embeddingsl{j}_llmembeddings.csv'
    embeddings.to_csv(output_path, index=False)
    print(f"Embeddings saved to: {output_path}")

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Processing file: betOnMeLLMDataset_L1.csv
Processing batch 0
Processing batch 50
Processing batch 100
Processing batch 150
Processing batch 200
Processing batch 250
Processing batch 300
Processing batch 350
Processing batch 400
Processing batch 450
Processing batch 500
Processing batch 550
Processing batch 600
Processing batch 650
Embeddings saved to: /home/heminway.r/embeddingsl1_llmembeddings.csv
Processing file: betOnMeLLMDataset_L2.csv
Processing batch 0
Processing batch 50
Processing batch 100
Processing batch 150
Processing batch 200
Processing batch 250
Processing batch 300
Processing batch 350
Processing batch 400
Processing batch 450
Processing batch 500
Processing batch 550
Processing batch 600
Processing batch 650


In [None]:
# generating text
path = f'/content/drive/MyDrive/betOnMeLLMDataset_L0.csv'
df = pd.read_csv(path, index_col=0)

prompt = df.iloc[130]["DESCRIPTION_L0"] + " Who do you think won? Why? Think step-by-step."
generated_text = generate_text(model, tokenizer, prompt)
print(f"Prompt: {prompt}")
print(f"Generated Text: {generated_text}")