# Installs & Imports

In [None]:
!pip install -q accelerate

In [None]:
!pip install --upgrade transformers torch torchvision

Collecting transformers
  Downloading transformers-4.44.1-py3-none-any.whl.metadata (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.7/43.7 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
Collecting torch
  Downloading torch-2.4.0-cp310-cp310-manylinux1_x86_64.whl.metadata (26 kB)
Collecting torchvision
  Downloading torchvision-0.19.0-cp310-cp310-manylinux1_x86_64.whl.metadata (6.0 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting triton==3.0.0 (from torch)
  Downloading triton-3.0.0-1-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (1.3 kB)
Downloading transformers-4.44.1-py3-none-any.whl (9.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.5/9.5 MB[0m [31m88.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading torch-2.4.0-cp310-cp310-manylinux1_x86_64.whl (797.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
import pandas as pd

from transformers import AutoTokenizer, pipeline
import torch

from pprint import pprint
from tqdm.auto import tqdm
from sklearn import metrics

import time

import json

In [None]:
import os
from getpass import getpass

from huggingface_hub import login

# Never store the access token in the notebook itself: read it from the
# HF_TOKEN environment variable, or ask for it interactively (input is not echoed).
hf_token = os.environ.get('HF_TOKEN') or getpass('Hugging Face access token: ')
login(token=hf_token)

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful


# Loading the dataset

In [None]:
from google.colab import files
# Opens the Colab upload dialog. Presumably the CSV files read by the loading
# cell are provided here -- TODO confirm.
uploaded = files.upload()

In [None]:
import pandas as pd

# Source of the few-shot examples: rows 101..199 (by position) of the subset file.
aita_subset = pd.read_csv('AITA_minorities_subset.csv')
train_data = aita_subset.iloc[101:200]

# Rows for which posts will be generated.
test_data = pd.read_csv('aita_subset_titles.csv')


# Loading Llama 3.1 Instruct

In [None]:
# Model identifier handed to `pipeline(...)` when the generator is built.
model_id = "meta-llama/Meta-Llama-3.1-8B-Instruct"



In [None]:
# Text-generation pipeline used for every generation call in this notebook.
# Weights are requested in bfloat16; device placement is left to the library.
post_generator_llama = pipeline(
    task="text-generation",
    model=model_id,
    device_map="auto",
    model_kwargs={"torch_dtype": torch.bfloat16},
)

# STEP 3: Post Generation

In [None]:
# Cap on newly generated tokens per post; this length is selected to
# accommodate the average-length AITA post.
post_max_tokens = 768

In [None]:
# System prompt shared by every generation request. It is a plain string
# (no placeholders are interpolated).
# NOTE(review): the judgements named below (NTA, NAH, ESH) should match the
# values found in the `label` column of the data -- confirm.
post_gen_system_message = """
You are a highly intelligent language model designed to generate posts for the "Am I the Asshole?" (AITA) subreddit. Your task is to read the title of an AITA post and generate an engaging post that aligns with the provided judgement.

Instructions:

1. Read the Title:
Carefully read the title of the AITA post.

2. Acknowledge the Required Judgement:
Consider the judgement indicated (NTA - Not the Asshole, NAH - No Assholes Here, ESH - Everyone Sucks Here). Think about the key points you need to make to ensure the story aligns with the required judgement.

3. Generate a Story by Following those Steps:

Create an AITA post without using section titles. Include the following elements seamlessly in the narrative:

Brief Background: Provide context for the story. Introduce the main characters and their relationships, and set the scene for the events that will unfold.

The Incident: Describe the specific incident or series of events that led to the conflict. Be detailed and clear about what happened, who was involved, and what actions were taken.

Friends' or Family's Opinion: Describe the opinions of friends or family members about the situation. Include differing viewpoints to provide a balanced perspective on the conflict.

The Current Outcome: Detail the consequences of the incident. Explain how the people involved reacted, any changes in relationships, and any ongoing impact the conflict has had.

Conclusion:
Summarize the key points of the story and pose the question to the readers: "Am I the Asshole for [OP ACTIONS]?"

4. Maintain Authenticity:
Ensure that the story feels realistic and relatable. Use natural language and tone as if a real person is sharing their experience.

5. Adhere to the Judgement:
Ensure that the generated story logically leads to the required judgment (e.g., if the judgment is NTA, the story should clearly indicate why the poster might be considered not the asshole).

"""

In [None]:
# Keep only few-shot candidates of middling length: lengthy posts would put
# too many tokens into the prompt, while very short examples would nudge the
# model towards generating short stories.
percentile_25 = int(train_data['word_count'].quantile(0.25))
percentile_75 = int(train_data['word_count'].quantile(0.75))
# Both bounds are exclusive and were computed before any row is dropped.
train_data = (
    train_data
    .loc[lambda df: (df['word_count'] > percentile_25) & (df['word_count'] < percentile_75)]
    .reset_index(drop=True)
)

In [None]:
# User turn of a few-shot example: the judgement and title of a real post.
train_data['few_shot_input'] = (
    'Judgement: '
    + train_data['label']
    + ', Title: '
    + train_data['title']
)
# Assistant turn of a few-shot example: the body of that same post.
train_data['few_shot_output'] = train_data['body']

In [None]:
def create_example(row):
  """Turn few-shot rows into alternating user/assistant chat messages.

  :param row: DataFrame (despite the name) whose rows provide the
      'few_shot_input' and 'few_shot_output' columns
  :return: list of message dicts, one user/assistant pair per row, in row order
  """
  return [
      message
      for _, example in row.iterrows()
      for message in (
          {"role": "user", "content": example['few_shot_input']},
          {"role": "assistant", "content": example['few_shot_output']},
      )
  ]

In [None]:
# Request text for each test row: the required judgement plus the generated title.
test_data['gen_post_prompt'] = (
    "Judgement: "
    + test_data['label']
    + ", Title: "
    + test_data['generated_titles']
)

In [None]:
# This function samples a random training story carrying the requested judgement
# and uses it as the one-shot example, to avoid skewing all the stories in the
# same direction.
def format_post_gen_input(row, random_state=None):
  """Build the chat prompt (system, one-shot example, request) for one test row.

  The one-shot example is drawn from the rows of `train_data` whose 'label'
  equals the row's own label, so every judgement present in the training data
  is supported rather than a fixed list of labels.

  :param row: mapping/Series with the keys 'label' and 'gen_post_prompt'
  :param random_state: forwarded to `DataFrame.sample`. Leave as None for a
      fresh random pick; pass a shared numpy RandomState/Generator for a
      reproducible yet varied sequence. An int seed selects the same example
      for every row with a given label.
  :return: list of chat message dicts
  :raises ValueError: if `train_data` holds no example with the row's label
  """
  label = row['label']
  candidates = train_data[train_data['label'] == label]
  if candidates.empty:
    raise ValueError(f"No few-shot example with judgement {label!r} in train_data")
  one_shot = create_example(candidates.sample(1, random_state=random_state))

  system_message = [{"role": "system", "content": post_gen_system_message}]
  user_message = [{"role": "user", "content": row['gen_post_prompt']}]
  return system_message + one_shot + user_message


# Build the full chat prompt for every test row (one call of
# `format_post_gen_input` per row) and store it alongside the row.
test_data.loc[:, 'get_post_input'] = test_data.apply(format_post_gen_input, axis=1)
# Uncomment to inspect the first prompts:
#pprint(test_data.loc[:1, 'get_post_input'].tolist(), sort_dicts=False)

## Clearing GPU memory

In [None]:
import gc

import torch

# First let Python collect unreachable objects, then release the unoccupied
# memory that PyTorch's CUDA caching allocator is still holding.
gc.collect()
torch.cuda.empty_cache()


## Generating posts with Llama 3.1 Instruct


In [None]:
def generate_post(pipe, inputs, max_new_tokens=None, temperature=0.8):
  """
  Generate one post per chat prompt.

  :param pipe: text-generation pipeline (callable on a chat, exposes `tokenizer`)
  :param inputs: list of chats, each chat being a list of message dicts
  :param max_new_tokens: cap on generated tokens per post; defaults to `post_max_tokens`
  :param temperature: sampling temperature
  :return: list of generated posts (stripped strings), in the order of `inputs`
  """
  if max_new_tokens is None:
    max_new_tokens = post_max_tokens

  # Generation stops at the tokenizer's EOS token or at the "<|eot_id|>" token.
  terminators = [
      pipe.tokenizer.eos_token_id,
      pipe.tokenizer.convert_tokens_to_ids("<|eot_id|>")
  ]

  assistant_outputs = []

  # The pipeline is called once per chat: calling it on the whole list only
  # returns after every prompt has been processed, which would leave the
  # progress bar without anything to report. An empty `inputs` yields [].
  for messages in tqdm(inputs, desc="Generating posts"):
    result = pipe(
        messages,
        max_new_tokens=max_new_tokens,
        eos_token_id=terminators,
        do_sample=True,
        temperature=temperature
    )
    # "generated_text" holds the chat; its last message is the assistant reply.
    assistant_outputs.append(result[0]["generated_text"][-1]['content'].strip())

  return assistant_outputs

In [None]:


# Generate a post for every prepared prompt and report the wall-clock duration.
start_time = time.time()
llama_gen_post = generate_post(
    post_generator_llama,
    test_data['get_post_input'].tolist(),
)
elapsed_seconds = int(time.time() - start_time)
print(f'Time: {elapsed_seconds} seconds')

# Show the first two generated posts, separated by a blank line.
print(*llama_gen_post[:2], sep="\n\n")


In [None]:
# Display only the first generated post.
print(*llama_gen_post[:1], sep = "\n\n")

In [None]:
# Attach the generated posts to the test rows (one list element per row).
test_data['llama_gen_post'] = llama_gen_post

In [None]:
# Write the test set, including the generated posts, to CSV without the index column.
test_data.to_csv('aita_subset_llama_posts.csv', index=False)

In [None]:
from google.colab import files
# Download the CSV written by the previous cell from the Colab runtime.
files.download("aita_subset_llama_posts.csv")