In [1]:
%%capture
# Install pinned wheels from an attached Kaggle dataset. `--no-index` stops pip
# from querying a package index, and `--find-links` points it at the same
# directory of wheels, so the cell works without internet access.
# `%%capture` hides pip's output.
# bitsandbytes is needed for the 4-bit loading used below; to *save* 4-bit
# models as well, `bitsandbytes>=0.41.3` is required.
!pip install --no-index /kaggle/input/making-wheels-of-necessary-packages-for-hf-llms/bitsandbytes-0.42.0-py3-none-any.whl --find-links=/kaggle/input/making-wheels-of-necessary-packages-for-hf-llms
!pip install --no-index /kaggle/input/making-wheels-of-necessary-packages-for-hf-llms/accelerate-0.27.2-py3-none-any.whl --find-links=/kaggle/input/making-wheels-of-necessary-packages-for-hf-llms
!pip install --no-index /kaggle/input/making-wheels-of-necessary-packages-for-hf-llms/transformers-4.38.1-py3-none-any.whl --find-links=/kaggle/input/making-wheels-of-necessary-packages-for-hf-llms
!pip install --no-index /kaggle/input/making-wheels-of-necessary-packages-for-hf-llms/optimum-1.17.1-py3-none-any.whl --find-links=/kaggle/input/making-wheels-of-necessary-packages-for-hf-llms

In [2]:
import pandas as pd
import numpy as np
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, AutoConfig
import matplotlib.image as mpimg
import matplotlib.pyplot as plt
import os
from memotion_utility import load_data
from tqdm import tqdm

In [3]:
# Notebook-wide configuration.
# Labels file handed to `load_data`.
CSV_FILE = '/kaggle/input/memotion-dataset-7k/memotion_dataset_7k/labels.csv'
# CSV of image captions, passed to `load_data` via its `captions` argument.
captions = '/kaggle/input/memotion-with-captions/caption_BLIP.csv'
# Directory that `display` joins with each row's `image_name` to find the image.
ROOT_DIR = '/kaggle/input/memotion-dataset-7k/memotion_dataset_7k/images'
# Forwarded to `load_data`; presumably toggles class re-balancing — confirm in memotion_utility.
downsample = False
# Upper bound on generated tokens per rationale (`max_new_tokens` in `model.generate`).
max_new_token = 512

In [4]:
# Checkpoint to load. Alternatives that were tried are kept commented out;
# exactly one MODEL_PATH assignment should be active.
# NOTE(review): `build_prompt` emits a Llama-2 style [INST]/<<SYS>> prompt, so
# switching to another model family probably needs a matching prompt change.
# MODEL_PATH = "/kaggle/input/gemma/transformers/7b-it/2"
# MODEL_PATH = "/kaggle/input/gemma/transformers/2b-it/2"
# MODEL_PATH = "/kaggle/input/mistral/pytorch/7b-instruct-v0.1-hf/1"
# MODEL_PATH = "/kaggle/input/mixtral/pytorch/8x7b-instruct-v0.1-hf/1"
MODEL_PATH = "/kaggle/input/llama-2/pytorch/13b-chat-hf/1"

# 4-bit NF4 quantization with double quantization; compute runs in bfloat16.
quantization_config = BitsAndBytesConfig(
    load_in_4bit = True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,
)

tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)

# `device_map="auto"` leaves weight placement across available devices to accelerate.
# NOTE(review): `trust_remote_code=True` allows code shipped with the checkpoint
# to run — confirm it is actually needed for a local Llama-2 checkpoint.
# NOTE(review): `pretraining_tp=1` presumably overrides the tensor-parallel
# setting stored in the Llama config — confirm it is accepted by other model families.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_PATH,
    device_map = "auto",
    trust_remote_code = True,
    quantization_config=quantization_config,
    pretraining_tp=1
)

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]



In [5]:
# df = pd.read_csv('/kaggle/input/memotion-with-captions/memotion_dataset_with_captions_BLIP.csv')
# def preprocess(df):
#     df = df.drop('Unnamed: 0', axis=1)
#     df = df.sample(frac=1).reset_index(drop=True)
#     df['offensive'] = np.where(df['offensive'] == 'not_offensive', 'not_offensive', 'offensive')

#     df['offensive'] = df['offensive'].map({
#         'not_offensive': 0, 
#         'offensive': 1
#     })
#     return df
# df = preprocess(df)
# df.head()

In [6]:
# Build the train / validation / test frames. `load_data` comes from
# `memotion_utility` (not shown here); later cells read the columns
# `image_name`, `text`, `caption` and `label` from these frames.
train_df,val_df,test_df = load_data(CSV_FILE,downsample = downsample,captions = captions)

train : 
 label
1    3079
0    1951
Name: count, dtype: int64
val : 
 label
1    342
0    217
Name: count, dtype: int64
test : 
 label
1    856
0    542
Name: count, dtype: int64


In [7]:
def display(df,index):
    """Show the meme image referenced by one row of `df`.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with an `image_name` column holding file names relative to
        `ROOT_DIR`.
    index : int
        Positional row index (used with `df.iloc`).

    Returns
    -------
    None
        The image is rendered with matplotlib; the figure title is the file name.

    Notes
    -----
    Defining this function shadows IPython's built-in `display` for the rest
    of the notebook.
    """
    # Look the row up once and reuse it.
    row = df.iloc[index]
    img_name = row['image_name']

    # Load the image from the dataset's image directory.
    img_path = os.path.join(ROOT_DIR, img_name)
    img = mpimg.imread(img_path)

    # Plot the image without axes.
    plt.figure(figsize=(10, 10))
    plt.imshow(img)
    plt.axis('off')

    # Only the file name is shown in the title.
    plt.title(f"name:{img_name}", fontsize=12)
    plt.show()

In [8]:
def build_prompt(df,index):
    """Compose the Llama-2 chat prompt asking for a rationale for one meme.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame providing the columns `text` (OCR text), `caption` (image
        caption) and `label` (1 means offensive).
    index : int
        Positional row index (used with `df.iloc`).

    Returns
    -------
    str
        A prompt with a system section wrapped in <<SYS>> tags and a user
        message, enclosed in [INST] ... [/INST]. The string starts and ends
        with a newline.
    """
    record = df.iloc[index]
    ocr_text = record['text']
    image_caption = record['caption']

    # Any label other than 1 is verbalised as "Not Offensive".
    if record['label'] == 1:
        y = "Offensive"
    else:
        y = "Not Offensive"

    system_prompt = (
        "You have been specially designed to perform abductive reasoning for the harmful meme detection task. "
        "Your primary function is that, according to a Harmfulness label about an Image with a text embedded, "
        "please provide me a streamlined rationale, without explicitly indicating the label, "
        "why it is classified as the given Harmfulness label.\n"
        "The image and the textual content in the meme are often uncorrelated, "
        "but its overall semantics is presented holistically. "
        "Thus it is important to note that you are prohibited from relying on your own imagination, "
        "as your goal is to provide the most accurate and reliable rationale possible\n"
        "so that people can infer the harmfulness according to your reasoning about the background context "
        "and relationship between the given text and image caption\n"
    )
    user_message = (
        f"Given a Text: {ocr_text}, which is embedded in an Image: {image_caption}; "
        f"and a harmfulness label {y}, "
        "please give me a streamlined rationale associated with the meme, "
        "without explicitly indicating the label, "
        f"for how it is reasoned as {y}."
    )

    # NOTE(review): the literal "<s>" is sent in addition to whatever BOS token
    # the tokenizer adds by default — confirm this duplication is intended.
    pieces = [
        "\n<s>[INST] <<SYS>>\n",
        system_prompt,
        "\n<</SYS>>\n\n",
        user_message,
        " [/INST]\n",
    ]
    return "".join(pieces)

In [9]:
#display(train_df,19)

In [10]:
#print(build_prompt(train_df,19))

In [11]:
# Example
# n = 19
# input_text = build_prompt(train_df,n)
# tokenizer.pad_token = tokenizer.eos_token
# input_ids = tokenizer(input_text, return_tensors="pt").input_ids.to('cuda')
# outputs = model.generate(input_ids,max_new_tokens=max_new_token)
# predicted_answer = tokenizer.decode(outputs[0],skip_special_tokens=True)
# predicted_answer = predicted_answer[len(input_text)-1:].strip()
# print(predicted_answer)

## Split the data into batches so generation can be spread over multiple runs (it cannot all be done in a single run)

In [12]:
def generate(df,subset_indices):
    """
    Generate rationales for a subset of rows of a data frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to generate rationales for. Needs the columns read by
        `build_prompt` plus `image_name`.
    subset_indices : sequence of int
        Positional row indices into `df` (used with `df.iloc`).

    Returns
    -------
    pandas.DataFrame
        One row per requested index with the columns `image_name` and
        `rationale`, in the order given by `subset_indices`.
    """
    rationales = []
    for index in tqdm(subset_indices, desc="Generating rationales"):
        # Build the prompt from the frame that was passed in. Using the global
        # training frame here would pair validation/test image names with
        # rationales for unrelated training rows.
        input_text = build_prompt(df, index)

        # Send the inputs to the model's own device, and pass the attention
        # mask explicitly rather than leaving generate() to infer it.
        encoded = tokenizer(input_text, return_tensors="pt").to(model.device)
        outputs = model.generate(
            input_ids=encoded["input_ids"],
            attention_mask=encoded["attention_mask"],
            max_new_tokens=max_new_token,
        )

        # The returned sequence starts with the prompt tokens. Slice them off
        # by token count: a character offset into the decoded text is
        # unreliable because special tokens are dropped during decoding.
        prompt_length = encoded["input_ids"].shape[-1]
        new_tokens = outputs[0][prompt_length:]
        rationale = tokenizer.decode(new_tokens, skip_special_tokens=True).strip()
        rationales.append(rationale)

    new_df = df.iloc[subset_indices][['image_name']].copy()
    new_df['rationale'] = rationales

    return new_df

In [13]:
# Partition the positional row indices of the training frame into
# `num_subsets` chunks so rationales can be generated one chunk per run.
total_indices = train_df.shape[0]

num_subsets = 10
subset_size, remainder = divmod(total_indices, num_subsets)

indices = np.arange(total_indices)

# Equal-sized contiguous chunks first.
subsets = []
for chunk_no in range(num_subsets):
    chunk_start = chunk_no * subset_size
    subsets.append(indices[chunk_start:chunk_start + subset_size])

# Whatever is left after the equal chunks is handed out one index at a time
# to the leading subsets.
leftover = indices[num_subsets * subset_size:]
for chunk_no, extra_index in enumerate(leftover):
    subsets[chunk_no] = np.append(subsets[chunk_no], extra_index)

# Report the size of every subset (numbered from 1).
for subset_number, chunk in enumerate(subsets, start=1):
    print(f"Subset {subset_number}: {len(chunk)}")

# Example: the rows belonging to the first subset.
first_subset_indices = subsets[0]
first_subset = train_df.iloc[first_subset_indices]

#first_subset

Subset 1: 503
Subset 2: 503
Subset 3: 503
Subset 4: 503
Subset 5: 503
Subset 6: 503
Subset 7: 503
Subset 8: 503
Subset 9: 503
Subset 10: 503


In [14]:
def step(n):
    """Generate rationales for training subset `n` and save them as a CSV.

    `n` is a zero-based position in `subsets`; the output file is
    `/kaggle/working/rationales_train_{n}.csv`, written without the index.
    """
    subset_rationales = generate(train_df, subsets[n])
    output_path = f'/kaggle/working/rationales_train_{n}.csv'
    subset_rationales.to_csv(output_path, index=False)
    
#step(9)
    
# df_2 = generate(train_df, subsets[1])
# df_3 = generate(train_df, subsets[2])
# df_4 = generate(train_df, subsets[3])
# df_5 = generate(train_df, subsets[4])
# df_6 = generate(train_df, subsets[5])
# df_7 = generate(train_df, subsets[6])
# df_8 = generate(train_df, subsets[7])
# df_9 = generate(train_df, subsets[8])
# df_10 = generate(train_df, subsets[9])

# Generate a rationale for every validation row and save the result.
val_row_positions = range(len(val_df))
df_val = generate(val_df, val_row_positions)
df_val.to_csv('/kaggle/working/rationales_val.csv', index=False)
#df_test = generate(test_df,range(len(test_df)))
#df_test.to_csv(f'/kaggle/working/rationales_test.csv', index=False)

2024-07-21 09:34:36.593246: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-07-21 09:34:36.593349: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-07-21 09:34:36.713586: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
Generating rationales: 100%|██████████| 559/559 [4:33:44<00:00, 29.38s/it]
