# Installs & Imports

In [None]:
!pip install -q accelerate

In [None]:
!pip install --upgrade transformers torch torchvision

In [None]:
import pandas as pd

from transformers import AutoTokenizer, pipeline
import torch

from pprint import pprint
from tqdm.auto import tqdm
from sklearn import metrics

import time

import json

In [None]:
# Authenticate with the Hugging Face Hub.
import os
from getpass import getpass

from huggingface_hub import login

# Never store the access token in the notebook: take it from the HF_TOKEN
# environment variable when set, otherwise prompt for it without echoing.
my_token = os.environ.get('HF_TOKEN') or getpass('Hugging Face access token: ')
login(token=my_token)


The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful


# Loading & Preparing the dataset

In [None]:
# Upload the dataset CSV from the local machine into the Colab runtime.
# NOTE(review): the recorded output shows a repeat upload being stored as
# 'AITA_minorities_subset (1).csv'; a later read of 'AITA_minorities_subset.csv'
# would then presumably load the earlier copy — confirm which file is intended.
from google.colab import files
uploaded = files.upload()

Saving AITA_minorities_subset.csv to AITA_minorities_subset (1).csv


In [None]:
# Load the AITA subset and build the prompt text that is sent to the summariser.
aita_subset = pd.read_csv('AITA_minorities_subset.csv')
# Flatten each post onto a single line.
aita_subset['body'] = aita_subset['body'].str.replace('\n', ' ', regex=False)
aita_subset = aita_subset[['title', 'body', 'label']]
# The prompt carries the known judgement followed by the post text.
aita_subset['summary_prompt'] = "Judgement: " + aita_subset['label'] + ", Post: " + aita_subset['body']

# Take an explicit copy of the first 100 rows: pandas tracks a bare slice as a
# possible view of aita_subset, and adding columns to it later raises
# SettingWithCopyWarning.
test_data = aita_subset[:100].copy()

print(f'test: {test_data.shape}')

train: (2, 4)
test: (2, 4)


# Step 1: Summarisation and key-point extraction using Llama 3.1 Instruct

In [None]:
# Hugging Face Hub id of the Llama 3.1 Instruct checkpoint used for summarisation.
model_id = "meta-llama/Meta-Llama-3.1-8B-Instruct"



In [None]:
# Text-generation pipeline for model_id: weights are loaded in bfloat16 and
# device placement is delegated to device_map="auto".
summariser = pipeline(
    task="text-generation",
    model=model_id,
    device_map="auto",
    model_kwargs=dict(torch_dtype=torch.bfloat16),
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/855 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/184 [00:00<?, ?B/s]



tokenizer_config.json:   0%|          | 0.00/55.4k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

In [None]:
def query(pipe, inputs, max_new_tokens=256, temperature=0.3):
  """
  Generate one assistant reply per conversation.

  :param pipe: text-generation pipeline (must expose ``pipe.tokenizer``)
  :param inputs: list of conversations, each a list of ``{"role", "content"}`` messages
  :param max_new_tokens: cap on generated tokens per reply (kept small for short summaries)
  :param temperature: sampling temperature (kept low, little variation is wanted in a summary)
  :return: list of stripped assistant replies, in the same order as ``inputs``
  """
  # Stop on either the tokenizer's EOS token or the "<|eot_id|>" end-of-turn token.
  terminators = [
      pipe.tokenizer.eos_token_id,
      pipe.tokenizer.convert_tokens_to_ids("<|eot_id|>")
  ]

  assistant_outputs = []

  # Run the conversations one at a time so the progress bar advances while the
  # model is generating. Calling the pipeline on the whole list only returns
  # once every conversation is finished, which leaves the bar nothing to report.
  for messages in tqdm(inputs):
    out = pipe(
        messages,
        max_new_tokens=max_new_tokens,
        eos_token_id=terminators,
        # Explicit pad token: avoids the per-call "Setting `pad_token_id`..." warning.
        pad_token_id=terminators[0],
        do_sample=True,
        temperature=temperature,
    )
    # "generated_text" holds the conversation including the new reply as its last message.
    assistant_outputs.append(out[0]["generated_text"][-1]['content'].strip())

  return assistant_outputs

## Creating the system message with one-shot prompting

In [None]:
# System prompt for the summariser: states the task and shows one example of the
# JSON reply layout ("Response" -> "Judgement" / "Key point for judgement" / "Summary").
# NOTE(review): the prompt text contains typos ("Retrun", "Formated"); they are left
# untouched here because editing the prompt changes what the model is given.
summary_system_message = """
You are a highly intelligent language model designed to analyze posts from the "Am I the Asshole?" (AITA) subreddit. Your task is to read a given AITA post, extract key points relevant to the provided judgment, and use these points to create a concise summary.

Instructions:

Read the Post:
Carefully read the provided AITA post, paying attention to the details and context provided by the poster.

Extract Key Points for Judgment:
Identify and list the key points that are relevant to the given judgment (e.g., actions, intentions, responses, and context).

Summarize the Post:
Using the extracted key points, create a concise summary of the post, ensuring that all important details are included.

Retrun in a json format

Example of Formated for Response:

{
  "Response": {

    "Judgement": "YTA",

    "Key point for judgement" : "Because OP is expecting Anthony to move his picture of him and his wife in his own home, while OP is just a guest there",

    "Summary":"<concise summary of the post based on the acknowledged key point>"
    }
}
"""




In [None]:
def format_chat(row):
  """Build the two-message conversation (system prompt, then the row's user prompt)."""
  system_turn = {"role": "system", "content": summary_system_message}
  user_turn = {"role": "user", "content": row['summary_prompt']}
  return [system_turn, user_turn]

# Attach the chat-formatted prompt for every row. assign() returns a new frame
# instead of writing into test_data in place, which avoids pandas'
# SettingWithCopyWarning when test_data is a slice of another frame.
test_data = test_data.assign(get_summary_input=test_data.apply(format_chat, axis=1))
pprint(test_data.loc[:1, 'get_summary_input'].tolist(), sort_dicts=False)

[[{'role': 'system',
   'content': '\n'
              'You are a highly intelligent language model designed to analyze '
              'posts from the "Am I the Asshole?" (AITA) subreddit. Your task '
              'is to read a given AITA post, extract key points relevant to '
              'the provided judgment, and use these points to create a concise '
              'summary.\n'
              '\n'
              'Instructions:\n'
              '\n'
              'Read the Post:\n'
              'Carefully read the provided AITA post, paying attention to the '
              'details and context provided by the poster.\n'
              '\n'
              'Extract Key Points for Judgment:\n'
              'Identify and list the key points that are relevant to the given '
              'judgment (e.g., actions, intentions, responses, and context).\n'
              '\n'
              'Summarize the Post:\n'
              'Using the extracted key points, create a concise summary of the '

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_data.loc[:, 'get_summary_input'] = test_data.apply(format_chat, axis=1)


In [None]:
# Summarise every test post and report how long generation took.
start_time = time.time()
predictions = query(summariser, test_data['get_summary_input'].tolist())
elapsed_seconds = int(time.time() - start_time)
print(f'Time: {elapsed_seconds} seconds')

# Show the first two raw model replies, separated by a blank line.
print(*predictions[:2], sep = "\n\n")

Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


  0%|          | 0/2 [00:00<?, ?it/s]

Time: 324 seconds
{
  "Response": {
    "Judgement": "YTA",

    "Key point for judgement": "Because OP is expecting Anthony to compromise on the framed picture of him and his wife, despite it being a personal item with sentimental value, and Anthony is refusing to move it, citing it's his house and OP is being too pushy",

    "Summary": "OP and his wife, Dahlia, are temporarily living with their friend Anthony due to losing their apartment. Dahlia is uncomfortable with a framed picture of Anthony and his wife kissing in the hallway, which she feels is intrusive. OP asks Anthony to move the picture, but Anthony refuses, citing it's his house and OP is being too pushy. The situation escalates, with Dahlia refusing to come out of her room until the picture is moved, and Anthony refusing to compromise. OP feels stuck between his friend and his wife, and wonders if he's being unreasonable in expecting Anthony to accommodate Dahlia's discomfort."
  }
}

{
  "Response": {

    "Judgement": 

In [None]:
def _extract_summary(raw_reply):
  """
  Pull the "Summary" field out of one model reply.

  :param raw_reply: text produced by the model, expected to contain the JSON object requested in the system prompt
  :return: the summary string, or None when no usable JSON object is found
  """
  # The model may surround the JSON with extra text, so parse only the outermost {...} span.
  start, end = raw_reply.find('{'), raw_reply.rfind('}')
  if start == -1 or end < start:
    return None
  try:
    # strict=False accepts control characters (e.g. raw newlines) inside string values.
    return json.loads(raw_reply[start:end + 1], strict=False)['Response']['Summary']
  except (json.JSONDecodeError, KeyError, TypeError):
    return None


# One malformed reply must not discard the whole (slow) generation run:
# unparseable replies become None and are reported instead of raising.
summaries = [_extract_summary(pred) for pred in predictions]

unparsed = [i for i, summary in enumerate(summaries) if summary is None]
if unparsed:
  print(f'Could not parse {len(unparsed)} of {len(summaries)} replies (positions: {unparsed})')

# assign() builds a new frame rather than writing into test_data in place.
test_data = test_data.assign(generated_summary=summaries)


In [None]:
# Preview only the first few summaries instead of dumping the whole list into the output.
summaries[:5]

["OP and his wife, Dahlia, are temporarily living with their friend Anthony due to losing their apartment. Dahlia is uncomfortable with a framed picture of Anthony and his wife kissing in the hallway, which she feels is intrusive. OP asks Anthony to move the picture, but Anthony refuses, citing it's his house and OP is being too pushy. The situation escalates, with Dahlia refusing to come out of her room until the picture is moved, and Anthony refusing to compromise. OP feels stuck between his friend and his wife, and wonders if he's being unreasonable in expecting Anthony to accommodate Dahlia's discomfort.",
 "A 29-year-old pregnant woman is unable to attend her husband's step-brother's funeral due to her aversion to traveling, particularly flying. She suggests that her husband video calls with his siblings instead, but he feels this is unsupportive and wants to attend the funeral in person. The OP is prioritizing her own comfort and health over her husband's need to be present at th

In [None]:
# Write test_data (all of its columns) to CSV, omitting the row index.
test_data.to_csv('aita_subset_summaries.csv', index=False)