In [None]:
!pip install transformers torch datasets tqdm sacrebleu trl peft bitsandbytes fuzzywuzzy

In [1]:
!pwd # shows current directory
!ls  # shows all files in this directory
!nvidia-smi # shows the specs and the current status of the allocated GPU

/content
drive  sample_data
Thu Nov 21 22:46:41 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA L4                      Off | 00000000:00:03.0 Off |                    0 |
| N/A   71C    P0              35W /  72W |      1MiB / 23034MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                        

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
%cd /content/drive/MyDrive/frefopt/answer/

/content/drive/MyDrive/frefopt/answer


In [4]:
from default import *
from pathlib import Path
import os, sys
import torch
import json
import gzip
import wandb
from tqdm import tqdm
import random
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
import re
from collections import defaultdict
import logging
from dataclasses import dataclass, field
from typing import Optional
from datasets import Dataset
import argparse
from transformers import AutoModelForCausalLM, AutoTokenizer, HfArgumentParser, BitsAndBytesConfig, TrainingArguments, pipeline
from trl import ORPOTrainer, ORPOConfig, ScriptArguments, ModelConfig, get_peft_config, setup_chat_format
from peft import PeftModel, LoraConfig, get_peft_model, prepare_model_for_kbit_training
from datasets import load_dataset



In [5]:
# Make sure that GPU is available for your notebook.
# Otherwise, you need to update the settungs in Runtime -> Change runtime type -> Hardware accelerator
torch.cuda.is_available()

True

# prefopt: default program

## Run the default solution on small

In [None]:
basemodel = 'Qwen/Qwen2.5-0.5B-Instruct'
device = 'cuda'
inputfile = os.path.join('..','data', 'input', 'dev.txt')
logging.basicConfig(filename='log.txt', filemode='w', level=logging.DEBUG)

Decode the inputfile. **Warning**: This will take some time to run.

In [None]:
%%capture output
decode_all(basemodel, device, inputfile)

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


In [None]:
print(output)

{"output": "-2"}
{"output": "am"}
{"output": "Yes"}
{"output": "Based on the provided ingredient lists and their corresponding dishes, I would classify them as follows:\n\n1. Pepperoni - This could potentially fit in either the Pizza or Sandwich category depending on how it's prepared. However, since it doesn't specify a type of pizza or sandwich, we can't definitively say it belongs to one class.\n\n2. Tomato sauce - This typically goes with a sandwich, not a pizza. It's more commonly used in sandwiches than pizzas.\n\n3. Cheese - This is a staple in both sandwiches and pizzas, so it fits well in both categories.\n\n4. Bread dough - This is often used in both sandwiches and"}
{"output": "False"}
{"output": "True"}
{"output": "True"}
{"output": "Based on the content of the review, which highlights positive aspects like timely service and enjoyment of the food while also expressing a desire for more vegan options, the overall sentiment leans towards **POSITIVE**. The reviewer seems sati

In [None]:
Path("output.txt").write_text(output.stdout)

14752

## Evaluate the default output

In [None]:
!python ../output_check.py -t ../data/reference/dev.out -o output.txt

Score=28.0000


Ignore the huggingface warnings.

## Documentation

Write some beautiful documentation of your program here.

## Analysis

Do some analysis of the results. What ideas did you try? What worked and what did not?

#### merge our train dataset as one for handiness

In [None]:
def is_gzipped(file_path):
    return file_path.endswith('.gz')

def open_file(file_path, mode='rt'):
    if is_gzipped(file_path):
        return gzip.open(file_path, mode, encoding='utf-8')
    else:
        return open(file_path, mode, encoding='utf-8')

def merge_train_files(train_txt_path, train_out_path, output_path):
    with open_file(train_txt_path, 'rt') as txt_file, \
         open_file(train_out_path, 'rt') as out_file, \
         open(output_path, 'w', encoding='utf-8') as merged_file:

        txt_line_num = 0
        out_line_num = 0

        for txt_line, out_line in zip(txt_file, out_file):
            txt_line_num += 1
            out_line_num += 1

            try:
                txt_json = json.loads(txt_line.strip())
                out_json = json.loads(out_line.strip())

                merged_json = {
                    "prompt": txt_json.get("prompt", ""),
                    "constraints": txt_json.get("constraints", ""),
                    "output": out_json.get("output", "")
                }

                merged_file.write(json.dumps(merged_json) + '\n')

            except json.JSONDecodeError as e:
                print(f"JSON decode error at line {txt_line_num} or {out_line_num}: {e}")
            except Exception as e:
                print(f"Unexpected error at line {txt_line_num} or {out_line_num}: {e}")

    print(f"Merging completed. Merged file saved to {output_path}")


data_dir = '../data'
train_txt = os.path.join(data_dir, 'train.txt')  # or 'train.txt' if not compressed
train_out = os.path.join(data_dir, 'train.out')  # or 'train.out' if not compressed
merged_output = os.path.join(data_dir, 'train_merged.json')

# Call the merge function
merge_train_files(train_txt, train_out, merged_output)


Merging completed. Merged file saved to ../data/train_merged.json


In [None]:
def open_file(file_path, mode='rt'):
    return open(file_path, mode, encoding='utf-8')

def extract_unique_constraints(merged_file_path):
    constraints_set = set()
    constraints_count = defaultdict(int)
    total_lines = 0
    skipped_lines = 0

    with open_file(merged_file_path, 'rt') as file:
        for line_num, line in enumerate(file, 1):
            total_lines += 1
            line = line.strip()
            if not line:
                print(f"Skipping empty line at {line_num}.")
                skipped_lines += 1
                continue
            try:
                data = json.loads(line)
                constraint = data.get("constraints", "").strip()
                if constraint:
                    constraints_set.add(constraint)
                    constraints_count[constraint] += 1
                else:
                    print(f"No constraints found at line {line_num}.")
                    skipped_lines += 1
            except json.JSONDecodeError as e:
                print(f"JSON decode error at line {line_num}: {e}")
                skipped_lines += 1
            except Exception as e:
                print(f"Unexpected error at line {line_num}: {e}")
                skipped_lines += 1

    return constraints_set, constraints_count, total_lines, skipped_lines

# data_dir = '../data'
# merged_file = 'train_merged.json'

# # Construct the full file path
# merged_file_path = Path(data_dir) / merged_file

### Check how many unique constraints and counts in train set

In [None]:
constraints_set, constraints_count, total_lines, skipped_lines = extract_unique_constraints(merged_file_path)

print(f"Total lines processed: {total_lines}")
print(f"Total lines skipped due to errors or missing constraints: {skipped_lines}")
print(f"Total unique constraints found: {len(constraints_set)}\n")


No constraints found at line 829.
No constraints found at line 976.
No constraints found at line 1507.
No constraints found at line 2296.
No constraints found at line 3477.
No constraints found at line 3496.
No constraints found at line 3704.
No constraints found at line 4900.
No constraints found at line 5033.
No constraints found at line 7499.
No constraints found at line 9213.
No constraints found at line 13714.
No constraints found at line 15246.
No constraints found at line 18577.
No constraints found at line 22345.
No constraints found at line 25073.
No constraints found at line 26996.
No constraints found at line 27550.
No constraints found at line 29867.
No constraints found at line 34328.
No constraints found at line 38539.
No constraints found at line 43049.
No constraints found at line 43412.
No constraints found at line 54880.
No constraints found at line 55870.
No constraints found at line 56195.
No constraints found at line 57125.
No constraints found at line 61048.
Total

In [None]:
# Sort constraints by occurrence count in descending order
sorted_constraints = sorted(constraints_count.items(), key=lambda item: item[1], reverse=True)

print("Unique Constraints (Sorted by Frequency)")
for i, (constraint, count) in enumerate(sorted_constraints):
    if i >= 50:
        break  # Stop after printing 500 values
    print(f"{constraint} (Count: {count})")

Unique Constraints (Sorted by Frequency)
None (Count: 14457)
None. (Count: 4678)
The output should be 'True' or 'False'. (Count: 2712)
The output should be 'Yes' or 'No'. (Count: 2102)
The output should be one of the two: 'Yes' or 'No'. (Count: 1121)
The output should be one of the two: 'True' or 'False'. (Count: 909)
The output should be an integer. (Count: 726)
The output should be 'POSITIVE' or 'NEGATIVE'. (Count: 674)
The output should be 0 or 1. (Count: 599)
The output should be 'YES' or 'NO'. (Count: 394)
The output should be one of the three: 'stereotype', 'anti-stereotype' or 'unrelated'. (Count: 387)
The output should be one of the following characters: 'A', 'B, 'C', 'D'. (Count: 318)
The output should be either 'True' or 'False'. (Count: 177)
The output should be 'NEGATIVE'. (Count: 177)
The output should be 'POSITIVE', 'NEGATIVE' or 'NEUTRAL'. (Count: 174)
The output should be one of the following characters: 'A', 'B, 'C', or 'D'. (Count: 158)
The output should be 'A' or 'B'

### check how many unique values of constraints in our dev and test sets

In [None]:
constraints_set, constraints_count, total_lines, skipped_lines = extract_unique_constraints('../data/input/dev.txt')

print(f"Total lines processed: {total_lines}")
print(f"Total lines skipped due to errors or missing constraints: {skipped_lines}")
print(f"Total unique constraints found: {len(constraints_set)}\n")
# Sort constraints by occurrence count in descending order
sorted_constraints = sorted(constraints_count.items(), key=lambda item: item[1], reverse=True)

print("Unique Constraints (Sorted by Frequency)")
for i, (constraint, count) in enumerate(sorted_constraints):
    if i >= 50:
        break  # Stop after printing 500 values
    print(f"{constraint} (Count: {count})")

Total lines processed: 100
Total lines skipped due to errors or missing constraints: 0
Total unique constraints found: 64

Unique Constraints (Sorted by Frequency)
The output should be 'True' or 'False'. (Count: 11)
The output should be 'Yes' or 'No'. (Count: 8)
None (Count: 8)
The output should be one of the two: 'Yes' or 'No'. (Count: 7)
None. (Count: 3)
The output should be 'POSITIVE' or 'NEGATIVE'. (Count: 3)
The output should be one of the two: 'True' or 'False'. (Count: 2)
The output should be one of the following characters: 'A', 'B', 'C'. (Count: 2)
The output should be an integer between -5 and 5. (Count: 1)
The output should be a single word. (Count: 1)
The output should be 'Pizza', 'Hamburger', or 'Sandwich'. (Count: 1)
The output should be one word: either True or False (Count: 1)
The output should be 'POSITIVE', 'NEGATIVE', or 'NEUTRAL'. (Count: 1)
The output should be one of the following labels: 'complaint', 'inconvenience', 'incident'. (Count: 1)
The output should be on

### First, we use train_default.out to compare with the output in train.out with fussywussy, if they are too different, then we add it into our ORPO dataset.

In [None]:
# asking another LM to be our judge to create ORPO dataset is too slow,
# so we want to replace that judge with simpler string comparison methods like fuzzywuzzy
# from https://www.geeksforgeeks.org/fuzzywuzzy-python-library/

# def preprocess_text(text):
#     # Convert to lowercase
#     # text = text.lower()
#     # Remove punctuation
#     text = re.sub(r'[^\w\s]', '', text)
#     # Remove extra whitespace
#     text = ' '.join(text.split())
#     return text

In [None]:
# s1 = "Yes"
# s2 = """To determine if the given list of words forms a palindrome, we can compare each word with its reverse:

# 1. 'b' (reverse of 'b') = 'b'
# 2. 'a' (reverse of 'a') = 'a'
# 3. 'n' (reverse of 'n') = 'n'

# Since all pairs of words have the same reversed version, the entire list reads the same forwards and backwards.

# Output: Yes"""
# s1 = preprocess_text(s1)
# s2 = preprocess_text(s2)
# print(s1, "\n", s2)
# print("FuzzyWuzzy Ratio: ", fuzz.ratio(s1, s2))
# print("FuzzyWuzzy PartialRatio: ", fuzz.partial_ratio(s1, s2))
# print("FuzzyWuzzy TokenSortRatio: ", fuzz.token_sort_ratio(s1, s2))
# print("FuzzyWuzzy TokenSetRatio: ", fuzz.token_set_ratio(s1, s2))
# print("FuzzyWuzzy WRatio: ", fuzz.WRatio(s1, s2),'\n\n')

# query = 'geeks for geeks'
# choices = ['geek for geek', 'geek geek', 'g. for geeks']
# print ("List of ratios: ")
# print (process.extract(query, choices), '\n')
# print ("Best among the above list: ",process.extractOne(query, choices))

In [None]:
# # Paths to your datasets
# data_dir = '../data'  # Update this path if necessary
# merged_file = 'train_merged.json'  # Your original dataset
# default_file = 'train_default.out'  # The new dataset from your professor

# # Construct the full file paths
# merged_file_path = Path(data_dir) / merged_file
# default_file_path = Path(data_dir) / default_file

# # Check if the files exist
# print(merged_file_path)
# print(default_file_path)

../data/train_merged.json
../data/train_default.out


In [None]:
# orpo_dataset_dict = {
#     "prompt": [],
#     "chosen": [],
#     "rejected": []
# }

In [None]:
# similarity_threshold = 50

# extreme_values = ["None", "None.", ": None", "N/A", "Yes", "none", "-", ":None", "No", "yes", "no"]

# # Open both datasets
# with open(merged_file_path, 'rt', encoding='utf-8') as merged_f, \
#      open(default_file_path, 'rt', encoding='utf-8') as default_f:

#     merged_lines = merged_f.readlines()
#     default_lines = default_f.readlines()

#     # Iterate over the datasets
#     for line_num, (merged_line, default_line) in enumerate(tqdm(zip(merged_lines, default_lines), desc="Processing samples", total=len(merged_lines)), 1):
#         try:
#             # Read data from merged dataset
#             merged_data = json.loads(merged_line.strip())
#             prompt = merged_data.get('prompt', '')
#             constraints = merged_data.get('constraints', '')
#             output_merged = merged_data.get('output', '').strip()

#             if constraints.strip() in extreme_values:
#                 continue

#             # Read data from default dataset
#             default_data = json.loads(default_line.strip())
#             output_default = default_data.get('output', '').strip()

#             # Preprocess the outputs
#             output_merged_clean = preprocess_text(output_merged)
#             output_default_clean = preprocess_text(output_default)

#             # Compute similarity
#             similarity_score_WRatio = fuzz.WRatio(output_merged_clean, output_default_clean)
#             similarity_score_TokenSetRatio = fuzz.token_set_ratio(output_merged_clean, output_default_clean)

#             if similarity_score_WRatio < similarity_threshold and similarity_score_TokenSetRatio < similarity_threshold:
#                 # Outputs are significantly different; add to ORPO dataset

#                 prompt_text = prompt + '\n' + constraints
#                 messages = [
#                     {
#                         "role": "system",
#                         "content": "You are a helpful assistant that provides useful answers without too much extra output.",
#                     },
#                     {
#                         "role": "user",
#                         "content": f"{prompt_text}"
#                     },
#                 ]

#                 orpo_dataset_dict['prompt'].append(messages)
#                 orpo_dataset_dict['chosen'].append(output_merged)
#                 orpo_dataset_dict['rejected'].append(output_default)

#         except Exception as e:
#             print(f"Error processing line {line_num}: {e}")
#             continue


Processing samples: 100%|██████████| 68478/68478 [01:30<00:00, 759.59it/s]


In [None]:
# # Print the total number of entries
# total_entries = len(orpo_dataset_dict['prompt'])
# print(f"Total entries in orpo_dataset_dict: {total_entries}")

# # Define the number of random samples you want
# num_samples = 5

# # Adjust the number of samples if there are fewer than 5 entries
# if total_entries < num_samples:
#     num_samples = total_entries
#     print(f"Only {num_samples} entries available. Displaying all available entries.")

# # Select random unique indices without replacement
# random_indices = random.sample(range(total_entries), num_samples)

# # Print the selected random entries
# for i, idx in enumerate(random_indices, 1):
#     print(f"\nEntry {i}:")
#     print(f"Prompt: {orpo_dataset_dict['prompt'][idx]}")
#     print(f"Chosen (Output from merged dataset): {orpo_dataset_dict['chosen'][idx]}")
#     print(f"Rejected (Output from default dataset): {orpo_dataset_dict['rejected'][idx]}")
#     print('-' * 50)


Total entries in orpo_dataset_dict: 15312

Entry 1:
Prompt: [{'role': 'system', 'content': 'You are a helpful assistant that provides useful answers without too much extra output.'}, {'role': 'user', 'content': "In this task, you are given a passage of text with tags separating each sentence. Your job is to return the number of the sentence that contains an error in grammar or spelling. If there are no errors, return 'No Error'.\nNo<sep>error<sep>.\nThe output should be a number or 'No Error'."}]
Chosen (Output from merged dataset): No Error
Rejected (Output from default dataset): 1
--------------------------------------------------

Entry 2:
Prompt: [{'role': 'system', 'content': 'You are a helpful assistant that provides useful answers without too much extra output.'}, {'role': 'user', 'content': "A palindrome is a word that reads the same forward and backward, e.g., madam or level.\nnoon.\nThe output should be 'True' or 'False'."}]
Chosen (Output from merged dataset): True
Rejected 

In [None]:
# # Save to a JSON file
# orpo_output_path = Path(data_dir) / 'orpo_original.json'

# with open(orpo_output_path, 'w', encoding='utf-8') as out_file:
#     json.dump(orpo_dataset_dict, out_file, indent=4)

# print(f"\nORPO dataset saved to {orpo_output_path}")



ORPO dataset saved to ../data/orpo.json


### Final ORPO dataset

In [None]:
def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()
    # Remove punctuation
    text = re.sub(r'[^\w\s]', '', text)
    # Remove extra whitespace
    text = ' '.join(text.split())
    return text

In [33]:
# Paths to your datasets
data_dir = '../data'  # Update this path if necessary
merged_file = 'train_merged.json'  # Your original dataset
default_file = 'train_default.out'  # The new dataset from your professor

# Construct the full file paths
merged_file_path = Path(data_dir) / merged_file
default_file_path = Path(data_dir) / default_file

# Check if the files exist
print(merged_file_path)
print(default_file_path)

../data/train_merged.json
../data/train_default.out


In [32]:
"""
orpo dataset:

add a assistant role in the orpo_original dataset with content = [chosen] or [rejected] answers

"""

orpo_dataset_dict = {
    "chosen": [],
    "rejected": []
}

In [34]:
# similarity_threshold = 50

# extreme_values = ["None", "None.", ": None", "N/A", "Yes", "none", "-", ":None", "No", "yes", "no"]

# Open both datasets
with open(merged_file_path, 'rt', encoding='utf-8') as merged_f, \
     open(default_file_path, 'rt', encoding='utf-8') as default_f:

    merged_lines = merged_f.readlines()
    default_lines = default_f.readlines()

    # Iterate over the datasets
    for line_num, (merged_line, default_line) in enumerate(tqdm(zip(merged_lines, default_lines), desc="Processing samples", total=len(merged_lines)), 1):
        try:
            # Read data from merged dataset
            merged_data = json.loads(merged_line.strip())
            prompt = merged_data.get('prompt', '')
            constraints = merged_data.get('constraints', '')
            output_merged = merged_data.get('output', '').strip()

            # if constraints.strip() in extreme_values:
            #     continue

            # Read data from default dataset
            default_data = json.loads(default_line.strip())
            output_default = default_data.get('output', '').strip()

            # Preprocess the outputs
            # not sure if we should pre-process them in the cleanest way
            # chosen = preprocess_text(output_merged)
            # rejected = preprocess_text(output_default)
            chosen = output_merged
            rejected = output_default

            # Compute similarity
            # similarity_score_WRatio = fuzz.WRatio(chosen, rejected)
            # similarity_score_TokenSetRatio = fuzz.token_set_ratio(chosen, rejected)

            # if similarity_score_WRatio < similarity_threshold and similarity_score_TokenSetRatio < similarity_threshold:
                # outputs (chosen and rejected) are significantly different; add to ORPO dataset

            prompt_text = prompt + '\n' + constraints
            chosen_messages = [
                {
                    "role": "system",
                    "content": "You are a helpful assistant that provides useful answers without too much extra output.",
                },
                {
                    "role": "user",
                    "content": f"{prompt_text}"
                },
                {
                    "role": "assistant",
                    "content": f"{chosen}"
                },
            ]

            rejected_messages = [
                {
                    "role": "system",
                    "content": "You are a helpful assistant that provides useful answers without too much extra output.",
                },
                {
                    "role": "user",
                    "content": f"{prompt_text}"
                },
                {
                    "role": "assistant",
                    "content": f"{rejected}"
                },
            ]

            # orpo_dataset_dict['prompt'].append(messages)
            orpo_dataset_dict['chosen'].append(chosen_messages)
            orpo_dataset_dict['rejected'].append(rejected_messages)

        except Exception as e:
            print(f"Error processing line {line_num}: {e}")
            continue


Processing samples: 100%|██████████| 68478/68478 [00:00<00:00, 96025.18it/s]


In [35]:
# Print the total number of entries
total_entries = len(orpo_dataset_dict['chosen'])
print(f"Total entries in orpo_dataset_dict: {total_entries}")

# Define the number of random samples you want
num_samples = 5

# Adjust the number of samples if there are fewer than 5 entries
if total_entries < num_samples:
    num_samples = total_entries
    print(f"Only {num_samples} entries available. Displaying all available entries.")

# Select random unique indices without replacement
random_indices = random.sample(range(total_entries), num_samples)

# Print the selected random entries
for i, idx in enumerate(random_indices, 1):
    print(f"\nEntry {i}:")
    print(f"Chosen (Output from merged dataset): {orpo_dataset_dict['chosen'][idx]}")
    print(f"Rejected (Output from default dataset): {orpo_dataset_dict['rejected'][idx]}")
    print('-' * 50)

Total entries in orpo_dataset_dict: 68478

Entry 1:
Chosen (Output from merged dataset): [{'role': 'system', 'content': 'You are a helpful assistant that provides useful answers without too much extra output.'}, {'role': 'user', 'content': 'In this task, you need to find the length of a string.\n"The quick brown fox jumps over the lazy dog".\nNone'}, {'role': 'assistant', 'content': 'The length of the string is 43.'}]
Rejected (Output from default dataset): [{'role': 'system', 'content': 'You are a helpful assistant that provides useful answers without too much extra output.'}, {'role': 'user', 'content': 'In this task, you need to find the length of a string.\n"The quick brown fox jumps over the lazy dog".\nNone'}, {'role': 'assistant', 'content': '10'}]
--------------------------------------------------

Entry 2:
Chosen (Output from merged dataset): [{'role': 'system', 'content': 'You are a helpful assistant that provides useful answers without too much extra output.'}, {'role': 'use

In [36]:
# Save to a JSON file
orpo_output_path = Path(data_dir) / 'orpo.json'

with open(orpo_output_path, 'w', encoding='utf-8') as out_file:
    json.dump(orpo_dataset_dict, out_file, indent=4)

print(f"\nORPO dataset saved to {orpo_output_path}")



ORPO dataset saved to ../data/orpo.json


### ORPO Dataset contruction idea with the data which doesn't covered in previous step

Generate text with Qwen/Qwen2.5-0.5B-Instruct model for each sample in train_merged.jsonl file,

and then ask model again with the prompt:
'You are a judge that will answer only 'yes' or 'no', by comparing A and B to judge whether A and B are the same or very similar about their meanings
A: [output for the sample]
B: [the generated new answer]
'

to tell whether the output has same meaning as the generated text and then do following pipeline:

1. if the model answered 'yes', then skip this generated text and keep generate the next answer.
2. if the model answered 'no', then add it into our orpo_dataset_dict as rejected answer. And the format for each line should be like this example:
    if the first sample in our data/train_merged.jsonl file is {"prompt": "You will be given a series of words. Output these words in reverse order, with each word on its own line.\nWords: ['Hello', 'world'].", "constraints": "None.", "output": "world\nHello"}.
    For the first sample, in the orpo_dataset_dict, we should have:
        orpo_dataset_dict = {
            "prompt": [
                "You will be given a series of words. Output these words in reverse order, with each word on its own line.\nWords: ['Hello', 'world'].
                None",
                "You will be given a series of words. Output these words in reverse order, with each word on its own line.\nWords: ['Hello', 'world'].
                None",
                "You will be given a series of words. Output these words in reverse order, with each word on its own line.\nWords: ['Hello', 'world'].
                None",
                "You will be given a series of words. Output these words in reverse order, with each word on its own line.\nWords: ['Hello', 'world'].
                None",
                "You will be given a series of words. Output these words in reverse order, with each word on its own line.\nWords: ['Hello', 'world'].
                None",
            ],
            "chosen": [
                "world\nHello",
                "world\nHello",
                "world\nHello",
                "world\nHello",
                "world\nHello"
            ],
            "rejected": [
                "rejected answer 1",
                "rejected answer 2",
                "rejected answer 3",
                "rejected answer 4",
                "rejected answer 5"
            ],
        }
3. if the model answered something else other than 'yes' and 'no', then skip this generated text and keep generate the next answer.
4. check if we already have 5 rejected answers, if we already have 5 rejected answers, then go to next sample in our data/train_merged.jsonl file



### But unfortunately, this is way too time-consuming, so we just omit this dataset generation technique for the rest of data and will use what we have from train_default

In [None]:
# model_name = 'Qwen/Qwen2.5-0.5B-Instruct'

# # Load the tokenizer and model
# tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
# model = AutoModelForCausalLM.from_pretrained(
#     model_name,
#     device_map='auto',
#     torch_dtype=torch.bfloat16,
#     trust_remote_code=True
# )

In [None]:
# device = torch.device('cuda' if torch.cuda.is_available() else 'mps' if torch.backends.mps.is_available() else 'cpu')
# model.to(device)

In [None]:
# from transformers import StoppingCriteria, StoppingCriteriaList

# class StopOnTokens(StoppingCriteria):
#     def __init__(self, stop_tokens, tokenizer):
#         self.stop_tokens = stop_tokens
#         self.tokenizer = tokenizer

#     def __call__(self, input_ids, scores, **kwargs):
#         last_token_id = input_ids[0, -1].item()
#         if last_token_id in self.stop_tokens:
#             return True
#         return False

# def generate_answer(prompt, max_new_tokens=128, temperature=0.7, stop_tokens=None):
#     input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(device)

#     # Generate the output with stopping criteria
#     output_ids = model.generate(
#         input_ids,
#         max_new_tokens=max_new_tokens,
#         do_sample=True,
#         temperature=temperature,
#         top_p=0.95,
#         eos_token_id=tokenizer.eos_token_id,
#         pad_token_id=tokenizer.eos_token_id,
#         stopping_criteria=StoppingCriteriaList([StopOnTokens(stop_tokens, tokenizer)]) if stop_tokens else None
#     )
#     # Remove the prompt from the output to get only the generated text
#     generated_tokens = output_ids[0][input_ids.shape[-1]:]
#     new_answer = tokenizer.decode(generated_tokens, skip_special_tokens=True)
#     return new_answer.strip()


# def clean_generated_text(generated_text):
#     unwanted_phrases = ["```", "Human:", "Assistant:", "### Response:", "### Instruction:"]
#     for phrase in unwanted_phrases:
#         generated_text = generated_text.replace(phrase, '')
#     return generated_text.strip()

# # Define stop tokens
# stop_tokens = [tokenizer.encode(token, add_special_tokens=False)[0] for token in ['\n', '<|endoftext|>']]

In [None]:
# orpo_dataset_dict = {
#     "prompt": [],
#     "chosen": [],
#     "rejected": []
# }

In [None]:
# # Define the directory and file names
# data_dir = '../data'  # Update this path if necessary
# orpo_file = 'orpo_dataset_from_train_default.json'

# # Construct the full file path
# orpo_file_path = Path(data_dir) / orpo_file

# # Initialize the ORPO dataset dictionary
# if orpo_file_path.exists():
#     with open(orpo_file_path, 'r', encoding='utf-8') as orpo_f:
#         orpo_dataset_dict = json.load(orpo_f)
#     print(f"Loaded existing ORPO dataset with {len(orpo_dataset_dict['prompt'])} entries.")
# else:
#     orpo_dataset_dict = {
#         "prompt": [],
#         "chosen": [],
#         "rejected": []
#     }
#     print("Initialized an empty ORPO dataset.")


Loaded existing ORPO dataset with 24324 entries.


In [None]:
# # Create a set of existing prompts to avoid duplicates
# existing_prompts = set(orpo_dataset_dict['prompt'])
# print(f"Number of unique prompts in ORPO dataset: {len(existing_prompts)}")


Number of unique prompts in ORPO dataset: 24324


In [None]:
# # Paths to your datasets
# merged_file = 'train_merged.jsonl'    # Your original dataset
# default_file = 'train_default.out'    # The new dataset from your professor

# # Construct the full file paths
# merged_file_path = Path(data_dir) / merged_file
# default_file_path = Path(data_dir) / default_file

# # Check if the files exist
# if not merged_file_path.exists():
#     raise FileNotFoundError(f"The file {merged_file_path} does not exist. Please check the path.")
# if not default_file_path.exists():
#     raise FileNotFoundError(f"The file {default_file_path} does not exist. Please check the path.")

# print(f"Train Merged File: {merged_file_path}")
# print(f"Train Default File: {default_file_path}")


Train Merged File: ../data/train_merged.jsonl
Train Default File: ../data/train_default.out


In [None]:
# similarity_threshold = 70

# with open(merged_file_path, 'rt', encoding='utf-8') as f:
#     for line_num, line in enumerate(tqdm(f, desc="Processing samples"), 1):
#         line = line.strip()
#         if not line:
#             continue
#         try:
#             data = json.loads(line)
#             prompt = data.get('prompt', '')

#             if prompt in existing_prompts:
#                 continue

#             constraints = data.get('constraints', '')
#             output = data.get('output', '').strip()

#             combined_prompt = (
#                 f"{prompt}\n"
#                 f"Constraints: {constraints}\n"
#                 "Provide the output as per the instructions above without any additional text."
#             )

#             # Initialize rejected answers list
#             rejected = []

#             max_attempts = 3  # Prevent infinite loops
#             attempts = 0

#             while len(rejected) < 1 and attempts < max_attempts:
#                 attempts += 1

#                 # Generate a new answer
#                 new_answer = generate_answer(combined_prompt)
#                 new_answer = clean_generated_text(new_answer)

#                 # Avoid duplicate answers
#                 if new_answer == output or new_answer in rejected or not new_answer:
#                     continue

#                 # Preprocess the output and the new_answer
#                 output_clean = preprocess_text(output)
#                 new_answer_clean = preprocess_text(new_answer)

#                 # Calculate similarity score using preprocessed texts
#                 similarity_score_WRatio = fuzz.token_set_ratio(output_clean, new_answer_clean)
#                 similarity_score_TokenSetRatio = fuzz.token_set_ratio(output_clean, new_answer_clean)

#                 if similarity_score_WRatio >= similarity_threshold or similarity_score_TokenSetRatio >= similarity_threshold:
#                     # Outputs are similar; skip this candidate
#                     continue
#                 else:
#                     # Outputs are different; add to rejected answers
#                     rejected.append(new_answer)

#             # For each rejected answer, append to orpo_dataset_dict
#             for rej in rejected:
#                 orpo_dataset_dict['prompt'].append(prompt)
#                 orpo_dataset_dict['chosen'].append(output)
#                 orpo_dataset_dict['rejected'].append(rej)

#         except Exception as e:
#             print(f"Error processing line {line_num}: {e}")
#             continue

In [None]:
# # Print the total number of entries
# total_entries = len(orpo_dataset_dict['prompt'])
# print(f"Total entries in orpo_dataset_dict: {total_entries}")

# # Define the number of random samples you want
# num_samples = 5

# # Adjust the number of samples if there are fewer than 5 entries
# if total_entries < num_samples:
#     num_samples = total_entries
#     print(f"Only {num_samples} entries available. Displaying all available entries.")

# # Select random unique indices without replacement
# random_indices = random.sample(range(total_entries), num_samples)

# # Print the selected random entries
# for i, idx in enumerate(random_indices, 1):
#     print(f"\nEntry {i}:")
#     print(f"Prompt: {orpo_dataset_dict['prompt'][idx]}")
#     print(f"Chosen (Output from merged dataset): {orpo_dataset_dict['chosen'][idx]}")
#     print(f"Rejected (Output from default dataset): {orpo_dataset_dict['rejected'][idx]}")
#     print('-' * 50)


In [None]:
# # Save to a JSON file
# orpo_output_path = Path(data_dir) / 'orpo_dataset.json'

# with open(orpo_output_path, 'w', encoding='utf-8') as out_file:
#     json.dump(orpo_dataset_dict, out_file, indent=4)

# print(f"\nORPO dataset saved to {orpo_output_path}")



ORPO dataset saved to ../data/orpo_dataset.json


### Model training

In [None]:
# # Path to your JSON dataset
# json_file_path = '../data/orpo.json'

# # Path to the output TXT file
# txt_file_path = '../data/orpo.txt'

# # Load the JSON data
# with open(json_file_path, 'r', encoding='utf-8') as f:
#     data = json.load(f)

# # Convert the JSON data to the desired TXT format
# txt_content = json.dumps(data, indent=4)  # Indent for better readability

# # Write the TXT content to the output file
# with open(txt_file_path, 'w', encoding='utf-8') as f:
#     f.write(txt_content)

# print(f"Dataset converted and saved to: {txt_file_path}")

Dataset converted and saved to: ../data/orpo_dataset_from_train_default.txt


In [None]:
# def prepare_dataset(dataset_path):
#     # Load the dataset from the provided JSON file
#     with open(dataset_path, 'r', encoding='utf-8') as f:
#         orpo_dataset_dict = json.load(f)
#     dataset = Dataset.from_dict(orpo_dataset_dict)
#     dataset = dataset.shuffle(seed=42)
#     dataset = dataset.train_test_split(test_size=0.05)
#     return dataset

In [6]:
# if torch.cuda.get_device_capability()[0] >= 8:
#     !pip install -qqq flash-attn
#     attn_implementation = "flash_attention_2"
#     torch_dtype = torch.bfloat16
# else:
#     attn_implementation = "eager"
#     torch_dtype = torch.float16

In [7]:
# torch_dtype

torch.bfloat16

In [10]:
# Model
base_model = "Qwen/Qwen2.5-0.5B-Instruct"
# new_model = "qwen2.5-0.5B-Instruct-lora-orpo"
new_model = "qwen2.5-0.5B-Instruct-orpo"

#### LoRA doesn't work well in this hw

In [None]:
# # QLoRA config
# bnb_config = BitsAndBytesConfig(
#     load_in_4bit=True,
#     bnb_4bit_quant_type="nf4",
#     bnb_4bit_compute_dtype=torch_dtype,
#     bnb_4bit_use_double_quant=True,
# )

# # LoRA config
# peft_config = LoraConfig(
#     task_type="CAUSAL_LM",
#     r=16,
#     lora_alpha=24,
#     lora_dropout=0.05,
#     bias="none",
#     target_modules=['up_proj', 'down_proj', 'gate_proj', 'k_proj', 'q_proj', 'v_proj', 'o_proj']
# )

In [11]:
# Load tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(base_model)

model = AutoModelForCausalLM.from_pretrained(base_model,
                                            #  quantization_config=bnb_config,
                                            #  attn_implementation=attn_implementation,
                                             device_map="auto")

if tokenizer.chat_template is None:
    model, tokenizer = setup_chat_format(model, tokenizer) # Only call setup_chat_format if no template exists
model = prepare_model_for_kbit_training(model)

In [12]:
# Load ORPO dataset from ../data/orpo.json
json_file_path = '../data/orpo.json'

with open(json_file_path, 'r', encoding='utf-8') as f:
    orpo_dataset_dict = json.load(f)

print(orpo_dataset_dict.keys())  # Output: dict_keys(['chosen', 'rejected'])

dataset = Dataset.from_dict(orpo_dataset_dict)
dataset = dataset.shuffle(seed=42)
dataset = dataset.train_test_split(test_size=0.05)

dict_keys(['chosen', 'rejected'])


In [13]:
orpo_args = ORPOConfig(
    per_device_train_batch_size=4,
    max_steps=1000,
    learning_rate=8e-5,
    gradient_accumulation_steps=1,
    logging_steps=10,
    eval_steps=500,
    output_dir="./results/",
    optim="rmsprop",
    warmup_steps=150,
    bf16=True,
    logging_first_step=True,
    remove_unused_columns=False,
    report_to="wandb"
)

trainer = ORPOTrainer(
    model=model,
    args=orpo_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    tokenizer=tokenizer,
    # peft_config=peft_config,
)

for param in model.parameters():
    param.requires_grad = True

trainer.train()
trainer.save_model(new_model)



Map:   0%|          | 0/65054 [00:00<?, ? examples/s]

Map:   0%|          | 0/65054 [00:00<?, ? examples/s]

Map:   0%|          | 0/65054 [00:00<?, ? examples/s]

Map:   0%|          | 0/3424 [00:00<?, ? examples/s]

Map:   0%|          | 0/3424 [00:00<?, ? examples/s]

Map:   0%|          | 0/3424 [00:00<?, ? examples/s]

max_steps is given, it will override any value given in num_train_epochs
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mwenhewangcrane[0m ([33mcmpt732[0m). Use [1m`wandb login --relogin`[0m to force relogin


Could not estimate the number of tokens of the input, floating-point operations will not be computed


Step,Training Loss
1,2.8176
10,2.0948
20,1.3323
30,1.7884
40,1.3625
50,1.4051
60,1.7111
70,2.3842
80,1.8676
90,1.631


### Inference and check our model performance on dev set

In [14]:
# new_model = 'qwen2.5-0.5B-Instruct-lora-orpo'
device = 'cuda'
inputfile = os.path.join('..','data', 'input', 'dev.txt')
logging.basicConfig(filename='log.txt', filemode='w', level=logging.DEBUG)

In [15]:
%%capture output
decode_all(new_model, device, inputfile)

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


In [16]:
print(output)

{"output": "-1"}
{"output": "am, am, am, am"}
{"output": "Yes"}
{"output": "Pizza\nHamburger\nSandwich"}
{"output": "True"}
{"output": "True"}
{"output": "True"}
{"output": "NEGATIVE"}
{"output": "complaint"}
{"output": "Yes"}
{"output": "C"}
{"output": "POSITIVE\nNEGATIVE"}
{"output": "1 2 4 3 2 4 1"}
{"output": "not_argumentative"}
{"output": "Yes, the target string can be constructed by concatenating some (or all) of the strings from the list."}
{"output": "Yes"}
{"output": "Yes"}
{"output": "Incorrect"}
{"output": "police"}
{"output": "Yes"}
{"output": "1"}
{"output": "Yes"}
{"output": "Yes, string A is a substring of string B."}
{"output": "False"}
{"output": "Yes"}
{"output": "The opposite of \"clamor\" is \"silent.\""}
{"output": "The word 'cat' is a noun. The word'sky' is an adjective. The word 'blue' is an adjective. The word 'book' is an adjective. The word 'happy' is an adjective."}
{"output": "fact"}
{"output": "true"}
{"output": "Yes"}
{"output": "True"}
{"output": "True"}

In [17]:
Path("output.txt").write_text(output.stdout)

3631

In [18]:
!python ../output_check.py -t ../data/reference/dev.out -o output.txt

Score=55.0000
