In [None]:
import os
from tqdm import tqdm
import pandas as pd
import csv
import json
import re

In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive so that files
# stored on Drive can be opened through ordinary file paths.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Location of the task TSV inside the mounted Drive, expressed relative to the
# current working directory and then anchored to it.
dataset_relpath = os.path.join("drive", "MyDrive", "LLM project", "DATA", "task-a-en.tsv")
data_path = os.path.join(os.getcwd(), dataset_relpath)
print(data_path)

/content/drive/MyDrive/LLM project/DATA/task-a-en.tsv


In [None]:
# 1. Install necessary libraries
# NOTE(review): none of these installs pins an exact version, so a fresh runtime
# may resolve different packages than the ones this notebook was written against.
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
!pip install --no-deps xformers --index-url https://download.pytorch.org/whl/cu121
# NOTE(review): the recorded install log shows unsloth's dependencies collecting
# trl 0.24.0, which does not satisfy the "trl<0.9.0" constraint requested here --
# confirm which trl version is actually intended.
!pip install --no-deps "trl<0.9.0" peft accelerate bitsandbytes


# LOADING THE MODEL
# The imports live here (after the installs) because unsloth is only available
# once the pip commands above have run.
from unsloth import FastLanguageModel
import torch
import os

# Loader settings passed straight through to FastLanguageModel.from_pretrained.
max_seq_length = 2048
dtype = None  # NOTE(review): presumably lets the loader pick the dtype -- confirm in the unsloth docs
load_in_4bit = True

# Step 1: Load the BASE model first
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/Meta-Llama-3.1-8B-Instruct",  # original base model
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)

# Step 2: Load your trained LoRA adapter on top
from peft import PeftModel
model = PeftModel.from_pretrained(
    model,
    # NOTE(review): this is a placeholder path -- it must be replaced with the
    # real adapter checkpoint directory before this cell can run.
    "./drive/MyDrive/LLM project/DATA/...checkpoint here"
)

# Step 3: Enable inference mode
# `model` and `tokenizer` defined in this cell are the globals that
# generate_jokes() reads later in the notebook.
FastLanguageModel.for_inference(model)
print("full model is set to generate jokes")


Collecting unsloth@ git+https://github.com/unslothai/unsloth.git (from unsloth[colab-new]@ git+https://github.com/unslothai/unsloth.git)
  Cloning https://github.com/unslothai/unsloth.git to /tmp/pip-install-r6xwuwc6/unsloth_a33a139ff7da42d19f0a0c0b48bed54a
  Running command git clone --filter=blob:none --quiet https://github.com/unslothai/unsloth.git /tmp/pip-install-r6xwuwc6/unsloth_a33a139ff7da42d19f0a0c0b48bed54a
  Resolved https://github.com/unslothai/unsloth.git to commit 4cb7229ac1c346e143524b6f9a6ad544259364d6
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting trl!=0.19.0,<=0.24.0,>=0.18.2 (from unsloth_zoo>=2026.1.4->unsloth@ git+https://github.com/unslothai/unsloth.git->unsloth[colab-new]@ git+https://github.com/unslothai/unsloth.git)
  Using cached trl-0.24.0-py3-none-any.whl.metadata (11 kB)
Using cached trl-0.24.0-py3-none-any.whl (423 kB)
Inst

In [None]:
def create_headline_prompt(headline_text):
    """Build the few-shot prompt asking for one joke about a news headline.

    Args:
        headline_text: Headline to joke about. It is converted to ``str`` and
            surrounding whitespace is removed, so a stray newline from the
            input file cannot split the quoted headline across lines.

    Returns:
        The complete prompt string. It starts with the ``### Instruction``
        header and ends with the ``### Response`` header followed by a newline.
    """
    headline = str(headline_text).strip()

    # The sentence-count range uses a real en dash; the previous text carried
    # mis-decoded bytes in its place, which the model saw as garbage characters.
    prompt_text = f"""### Instruction
You are a witty, cynical stand-up comedian.
Your task is to write EXACTLY ONE punchy joke (1–2 sentences total) based on the provided headline.

### Examples
Here is how to turn a headline into a standalone joke (weaving the context into the setup):

Headline: "Study finds 90% of office meetings could be emails."
Joke: "A new study found that 90% of office meetings could be emails, which implies the other 10% could have just been silence."

Headline: "Billionaire builds giant clock inside a mountain."
Joke: "A billionaire is building a giant clock inside a mountain, finally providing a way to tell time for the five people who actually survive the apocalypse."

Headline: "Scientists discover new species of deep-sea jelly."
Joke: "Scientists have discovered a new species of jelly at the bottom of the ocean, mostly because they were tired of looking for the ones in their donuts."

### Task
Target Headline: "{headline}"

### Constraints
1. The joke must be **STANDALONE**. Do not assume the audience has read the headline; include the premise in the joke itself.
2. Be clever, cynical, or ironic (not moralizing).
3. **NO** explanations or conversational filler (e.g., do not write "Here is the joke").
4. Output **ONLY** the joke.

### Response
"""

    return prompt_text

In [None]:
def create_words_prompt(word1, word2):
    """Build the few-shot prompt asking for one joke that links two words.

    Args:
        word1: First concept to include in the joke.
        word2: Second concept to include in the joke.
            Both values are converted to ``str`` and stripped of surrounding
            whitespace before being quoted in the prompt.

    Returns:
        The complete prompt string. The two words appear twice (in the task
        statement and in the final "Words to connect" line) and the prompt
        ends with the ``Joke:`` label so the model continues with the joke.
    """
    first = str(word1).strip()
    second = str(word2).strip()

    # The sentence-count range uses a real en dash; the previous text carried
    # mis-decoded bytes in its place, which the model saw as garbage characters.
    prompt_text = f"""You are a witty, cynical stand-up comedian.

Task: Write EXACTLY ONE punchy joke (1–2 short sentences) that connects the following two concepts: "{first}" and "{second}".

Here are examples of how to connect random words creatively:

Example 1 (Metaphor/Analogy):
Words: "unplug" + "fridge"
Joke: "My current relationship is exactly like an unplugged fridge: it's cold, dark, and I'm terrified to open it and see what's rotting inside."

Example 2 (Ironic Failure):
Words: "hammer" + "banana"
Joke: "I tried to fix my diet with the same tool I use to fix my furniture, but it turns out taking a hammer to a banana just makes a smoothie with too much crunch."

Example 3 (Cynical Observation):
Words: "measure" + "pizza"
Joke: "Trying to measure happiness with money is like trying to measure a pizza with a thermometer: you're using the wrong tool and you're just going to burn your hand."

MANDATORY Rules:
- You can use the words literally OR metaphorically.
- The logic must hold up (e.g., do not say a laptop cooks food).
- The joke should not reuse the structure or wording of the examples.
- Do NOT explain the joke.
- Do NOT use filler like "Here is a joke."

Words to connect: "{first}" and "{second}"
Joke:"""

    return prompt_text

In [None]:
def append_entry_jsonl(
    jsonl_path: str,
    entry_id: str,
    entry_type: str,
    input_original: str,
    generated_joke: str
):
    """Append one generated-joke record to a JSON Lines file.

    The record is serialized as a single JSON object with the keys ``id``,
    ``type``, ``input_original`` and ``generated_joke`` (in that order),
    followed by a newline. Non-ASCII characters are written as-is rather
    than escaped. The file is opened in append mode with UTF-8 encoding, so
    it is created when missing and existing lines are left untouched.

    Args:
        jsonl_path: Path of the JSON Lines file to append to.
        entry_id: Identifier stored under ``"id"``.
        entry_type: Value stored under ``"type"``.
        input_original: Value stored under ``"input_original"``.
        generated_joke: Value stored under ``"generated_joke"``.
    """
    payload = dict(
        id=entry_id,
        type=entry_type,
        input_original=input_original,
        generated_joke=generated_joke,
    )

    with open(jsonl_path, mode="a", encoding="utf-8") as sink:
        serialized = json.dumps(payload, ensure_ascii=False)
        sink.write(f"{serialized}\n")

In [None]:
def format_output(text: str):
    """Split a raw model response into ``(reasoning, joke)``.

    Three shapes of response are handled:

    * ``"Reasoning: ... Joke: ..."`` (case-insensitive, may span lines):
      both parts are extracted and stripped.
    * ``"Joke: ..."`` with no reasoning: the leading label is removed so only
      the joke text is kept (the prompts show jokes behind a ``Joke:`` label,
      so the model may echo it).
    * Anything else: the whole stripped text is the joke.

    Args:
        text: Decoded model output. ``None`` is accepted and treated as an
            empty response.

    Returns:
        A ``(reasoning, joke)`` tuple. ``reasoning`` is ``None`` when the
        response has no "Reasoning:" section. ``joke`` is ``None`` when there
        is no joke text at all (``None``, empty, or whitespace-only input),
        so callers can detect a failed generation with a ``None`` check.
    """
    if text is None:
        return (None, None)

    # "Reasoning:" followed by content, then "Joke:" followed by content.
    pattern = r"(?i)reasoning:\s*(.*?)\s*joke:\s*(.*)"
    match = re.search(pattern, text, re.DOTALL)

    if match:
        reasoning = match.group(1).strip()
        joke = match.group(2).strip()
    else:
        reasoning = None
        # Drop a bare leading "Joke:" label; otherwise the text is kept whole.
        joke = re.sub(r"(?i)\A\s*joke:\s*", "", text).strip()

    # An empty joke is reported as None rather than "" so it is not mistaken
    # for a successful generation.
    return (reasoning, joke or None)

In [None]:
def _load_processed_ids(output_file):
    """Return the ids (as strings) already stored in an existing output file.

    Files ending in ``jsonl`` are read line by line: each non-blank line must
    be a JSON object with an ``"id"`` key. Any other file is parsed as one
    JSON document whose ids are the keys of its ``"ids"`` mapping when that
    key is present, otherwise its top-level keys.

    Ids are always converted to ``str`` so that they compare equal to
    ``str(row["id"])`` no matter how they were serialized.
    """
    if output_file.endswith("jsonl"):
        with open(output_file, "r", encoding="utf-8") as f:
            return {str(json.loads(line)["id"]) for line in f if line.strip()}

    with open(output_file, "r", encoding="utf-8") as f:
        data = json.load(f)
    records = data["ids"] if "ids" in data else data
    return {str(key) for key in records.keys()}


def generate_jokes(input_path, output_file="./drive/MyDrive/LLM project/DATA/generated_jokes_llamafinetune_GEMINI2(225).jsonl", max_index=300):
    """Generate one joke per input row and append each result to a JSONL file.

    The input is a tab-separated file with the columns ``id``, ``headline``,
    ``word1`` and ``word2``. Rows with a usable headline (not missing and not
    just ``-``) get the headline prompt; all other rows get the two-word
    prompt. Every joke is appended to ``output_file`` immediately, so an
    interrupted run can be resumed: ids already present in ``output_file`` are
    skipped.

    Relies on the notebook-level globals ``model`` and ``tokenizer``.

    Args:
        input_path: Path to the tab-separated input file.
        output_file: File the results are appended to. If it exists but cannot
            be read, results go to ``results_incremental_v2.jsonl`` instead.
        max_index: Highest DataFrame index to process; generation stops at the
            first row whose index exceeds it. ``None`` processes every row.

    Returns:
        None. Results are written to disk as a side effect.
    """
    df = pd.read_csv(input_path, delimiter="\t")

    processed_ids = set()
    if os.path.isfile(output_file):
        print(f"Found existing file: {output_file}. Checking progress...")
        try:
            processed_ids = _load_processed_ids(output_file)
            print(f"Resuming! {len(processed_ids)} records already finished.")
        except Exception as e:
            # Never append to a file we could not parse; write elsewhere instead.
            print(f"Warning: Could not read existing file ({e}). Starting separate backup.")
            output_file = 'results_incremental_v2.jsonl'
    else:
        print("No existing file found. Starting from scratch.")

    # Stop tokens do not depend on the row, so resolve them once.
    terminators = [
        tokenizer.eos_token_id,
        tokenizer.convert_tokens_to_ids("<|eot_id|>"),
    ]

    pbar = tqdm(df.iterrows(), total=len(df), desc="Generating jokes")

    for index, row in pbar:

        if max_index is not None and index > max_index:
            break

        if str(row["id"]) in processed_ids:
            continue

        # A. LOGIC: decide once which kind of row this is, so the prompt and
        # the logged entry type can never disagree.
        has_headline = pd.notna(row["headline"]) and str(row["headline"]).strip() != "-"
        if has_headline:
            entry_type = "headline"
            input_original = row["headline"]
            prompt = create_headline_prompt(row["headline"])
        else:
            entry_type = "words"
            input_original = f"{row['word1']}, {row['word2']}"
            prompt = create_words_prompt(row["word1"], row["word2"])

        # B. TOKENIZATION: render the chat template and move it to the model device.
        input_ids = tokenizer.apply_chat_template(
            [{"role": "user", "content": prompt}],
            add_generation_prompt=True,
            return_tensors="pt"
        ).to(model.device)

        # A single, unpadded prompt: every position is a real token. Deriving
        # the mask from pad_token_id would fail when it is None and would hide
        # real tokens whenever the pad id also occurs in the chat template.
        attention_mask = input_ids.new_ones(input_ids.shape)

        # C. GENERATION
        outputs = model.generate(
            input_ids,
            attention_mask=attention_mask,
            pad_token_id=tokenizer.eos_token_id,
            max_new_tokens=512,       # upper bound on the response length
            eos_token_id=terminators,
            do_sample=True,           # sample instead of greedy decoding
            temperature=0.85,         # sampling randomness
            top_p=0.88,
            repetition_penalty=1.2,
        )

        # D. DECODING: slice off the prompt tokens so only the new text remains.
        response = outputs[0][input_ids.shape[-1]:]
        model_response = tokenizer.decode(response, skip_special_tokens=True)

        _, joke = format_output(model_response)

        # Treat both None and an empty string as a failed generation.
        if not joke:
            pbar.write(f"⚠️[{index}] Failed to generate due to None presence in response")
            continue

        append_entry_jsonl(output_file, row["id"], entry_type, input_original, joke)


In [None]:
# Run generation over the task file at data_path; no output_file is passed, so
# results are appended to generate_jokes' default output file.
generate_jokes(input_path = data_path)

No existing file found. Starting from scratch.


Generating jokes:  25%|██▌       | 301/1200 [12:09<36:19,  2.42s/it]
