## Dataset Loading

In this step, we load and get an overview of the dataset.

In [16]:
from datasets import load_dataset

# Load the preference dataset by its identifier. The cells below read its
# 'train' split, whose 'chosen' and 'rejected' columns hold lists of
# {'content', 'role'} messages.
ds = load_dataset("hendrydong/preference_700K")

In [18]:
# Show one example conversation from the dataset.
# Index the row first and the column second: this fetches a single record,
# whereas ds['train']['rejected'][0] reads the entire 'rejected' column just to
# display its first element.
ds['train'][0]['rejected']

[{'content': 'Is having your medical records online safe?', 'role': 'user'},
 {'content': 'You mean being able to share them with your doctor, or making them public to the internet in general?',
  'role': 'assistant'}]

## Pattern Setup

In this step, we set up the patterns and define experiment metrics.

In [19]:
import re

# Maps a pattern name to the regex used to detect it in a response.
# Values are either raw pattern strings or a pre-compiled pattern (emoji);
# both forms are accepted by re.search. Insertion order is the check order.
format_compile_list = dict([
    # Text wrapped in double asterisks (markdown bold).
    ("bold", r'\*\*(.*?)\*\*'),
    # A line starting with "<digits>. " or with one of "*", "+", "-" and a space.
    ("list", r'(?m)^\d+\.\s|^[*+-]\s'),
    # Any exclamation mark.
    ("exclamation", r'!'),
    # "http" followed by any run of characters other than ")".
    ("link", r'http[^\)]*'),
    # A single character from any of the emoji code-point ranges below.
    ("emoji", re.compile(
        "(" + "|".join([
            r"[\U0001F600-\U0001F64F]",  # Emoticons
            r"[\U0001F300-\U0001F5FF]",  # Miscellaneous Symbols and Pictographs
            r"[\U0001F680-\U0001F6FF]",  # Transport and Map Symbols
            r"[\U0001F1E0-\U0001F1FF]",  # Flags (iOS)
            r"[\U00002700-\U000027BF]",  # Dingbats
            r"[\U0001F900-\U0001F9FF]",  # Supplemental Symbols and Pictographs
            r"[\U0001FA70-\U0001FAFF]",  # Symbols and Pictographs Extended-A
            r"[\U00002600-\U000026FF]",  # Miscellaneous Symbols
        ]) + ")",
        re.UNICODE,
    )),
    # A word made only of uppercase ASCII letters. The "+" quantifier means
    # one-letter words such as "I" or "A" match as well.
    ("capitalization", r'\b[A-Z]+\b'),
    # A double-quote character.
    ("quotes", r'[\"]'),
])

def has_pattern(response, augment_type=None):
    """Report whether `response` exhibits a formatting pattern.

    Args:
        response: Text to inspect. A value that is not a `str` is reported as
            not containing any pattern (the function returns False).
        augment_type: Name of the pattern to test. Either a key of
            `format_compile_list`, or "affirmative" (the response starts with
            "Sure", "Certainly" or "Of course"). When None, the response is
            tested against every regex in `format_compile_list` and against
            the affirmative openers.

    Returns:
        bool: True if the requested pattern (or, for None, any pattern) is
        found in `response`, otherwise False.

    Raises:
        ValueError: If `augment_type` is not None, not "affirmative" and not a
            key of `format_compile_list`. Such a name used to fall through to
            `return True`, so a misspelled pattern was silently reported as
            present in every response.
    """
    # The `or` chain short-circuits, so the regex table is only consulted for
    # names that are neither None nor "affirmative".
    is_known = (
        augment_type is None
        or augment_type == "affirmative"
        or augment_type in format_compile_list
    )
    if not is_known:
        raise ValueError(f"Unknown pattern name: {augment_type!r}")

    # Malformed samples (e.g. None content) count as "pattern absent" rather
    # than aborting a long scan over the dataset.
    if not isinstance(response, str):
        return False

    starts_affirmative = response.startswith(("Sure", "Certainly", "Of course"))

    if augment_type == "affirmative":
        return starts_affirmative

    if augment_type is None:
        return starts_affirmative or any(
            re.search(pattern, response) is not None
            for pattern in format_compile_list.values()
        )

    return re.search(format_compile_list[augment_type], response) is not None

In [20]:
# For each prompt, check whether each pattern is contained in the chosen and rejected responses
from tqdm import tqdm
def count_pairwise_patterns(patterns):
    """Check every pattern against the chosen and rejected response of each pair.

    Iterates over `ds['train']` and, for each sample, tests the final message of
    the 'chosen' and of the 'rejected' conversation with `has_pattern`.

    Args:
        patterns: Iterable of pattern names accepted by `has_pattern`.

    Returns:
        dict: Maps each pattern name to a list with one
        `(in_chosen, in_rejected)` tuple of booleans per sample, in dataset
        order.
    """
    train_split = ds['train']
    print(f"Total number of samples: {len(train_split)}")
    counts = {pattern: [] for pattern in patterns}
    for sample in tqdm(train_split):
        # Use the last message rather than index 1: for a two-message
        # conversation they are the same element, and for a longer
        # conversation the last message is the response being compared.
        # NOTE(review): assumes the final message is the assistant reply.
        preferred_response = sample['chosen'][-1]['content']
        rejected_response = sample['rejected'][-1]['content']
        for pattern in patterns:
            counts[pattern].append((
                has_pattern(preferred_response, pattern),
                has_pattern(rejected_response, pattern),
            ))
    return counts
    

In [21]:
# Run the existence check for every pattern over all chosen/rejected pairs.
# pattern_cnts maps each pattern name to a list holding one
# (in_chosen, in_rejected) tuple per sample.
pattern_cnts = count_pairwise_patterns(["bold", "list", "exclamation", "link", "emoji", "affirmative", "capitalization", "quotes"])

Total number of samples: 700000


100%|████████████████████████████████████████████████████████████████████████████████████████████████| 700000/700000 [01:45<00:00, 6619.83it/s]


In [22]:
# based on the results, calculate the statistics
import numpy as np
def get_calculated_stats(pattern, counts=None):
    """Summarise how often a pattern occurs in preferred vs. unpreferred responses.

    Only pairs in which at least one of the two responses contains the pattern
    are considered. Among those pairs, the function computes the fraction whose
    preferred response contains the pattern and the fraction whose unpreferred
    response contains it, and prints both as percentages.

    Args:
        pattern: Pattern name to look up.
        counts: Optional mapping from pattern name to a list of
            `(in_preferred, in_unpreferred)` boolean pairs. Defaults to the
            notebook-level `pattern_cnts`.

    Returns:
        tuple: `(preferred_fraction, unpreferred_fraction)`, each in [0, 1].
        Both are NaN when no pair contains the pattern, including when the
        list for `pattern` is empty.
    """
    if counts is None:
        counts = pattern_cnts
    print(pattern)
    # reshape keeps the array two-dimensional even for an empty list, so the
    # column indexing below cannot raise IndexError.
    flags = np.asarray(counts[pattern], dtype=bool).reshape(-1, 2)
    # Drop the pairs in which neither response contains the pattern.
    relevant = flags[np.logical_or(flags[:, 0], flags[:, 1])]
    if len(relevant) == 0:
        # Nothing to normalise by: report NaN explicitly instead of dividing
        # by zero.
        preferred_percentage = unpreferrable_percentage = np.float64("nan")
    else:
        preferred_percentage = np.sum(relevant[:, 0]) / len(relevant)
        unpreferrable_percentage = np.sum(relevant[:, 1]) / len(relevant)
    print(f"Preferred percentage: {preferred_percentage*100:.2f}")
    print(f"Unpreferrable percentage: {unpreferrable_percentage*100:.2f}")
    return (preferred_percentage, unpreferrable_percentage)


## Pattern Verification

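In this step, we compute, for each pattern, how often it appears in the preferred versus the unpreferred response, considering only the pairs in which at least one of the two responses contains the pattern.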

### Existence Patterns

In [23]:
# One (preferred, unpreferred) fraction pair per existence pattern, in the
# order the patterns are listed here.
results = list(map(
    get_calculated_stats,
    ["bold", "list", "exclamation", "link", "emoji", "affirmative", "capitalization", "quotes"],
))
print(results)
# Comma-separated percentages: first the preferred row, then the unpreferred row.
print(','.join(f"{r[0]*100:.2f}" for r in results))
print(','.join(f"{r[1]*100:.2f}" for r in results))

bold
Preferred percentage: 65.18
Unpreferrable percentage: 46.91
list
Preferred percentage: 80.87
Unpreferrable percentage: 57.98
exclamation
Preferred percentage: 61.70
Unpreferrable percentage: 64.04
link
Preferred percentage: 65.29
Unpreferrable percentage: 52.75
emoji
Preferred percentage: 47.99
Unpreferrable percentage: 59.33
affirmative
Preferred percentage: 64.56
Unpreferrable percentage: 60.24
capitalization
Preferred percentage: 79.95
Unpreferrable percentage: 78.21
quotes
Preferred percentage: 71.22
Unpreferrable percentage: 61.27
[(np.float64(0.6517821061989817), np.float64(0.46908591287480805)), (np.float64(0.8087160327663425), np.float64(0.5797616008787196)), (np.float64(0.6170355117978532), np.float64(0.640365903598475)), (np.float64(0.6528883832638099), np.float64(0.5275354990940884)), (np.float64(0.4799282659677197), np.float64(0.5933232169954477)), (np.float64(0.6455536155096634), np.float64(0.6023541453428863)), (np.float64(0.7995271754704365), np.float64(0.7820795595

### Numerical Patterns

In [27]:
# punctuation
def punctuation_percentage(text):
    """Return the number of punctuation characters per word in `text`.

    Punctuation characters are those matched by the character class below.
    Words are the whitespace-separated chunks produced by `str.split()`.
    The value is a ratio, not a percentage, and can exceed 1.

    Args:
        text: String to analyse.

    Returns:
        The punctuation count divided by the word count, or 0 when `text`
        contains no words.
    """
    marks = re.findall(r'[!"#$%&\'()*+,-./:;<=>?@\[\\\]^_`{|}~]', text)
    n_words = len(text.split())
    if n_words > 0:
        return len(marks) / n_words
    return 0

In [None]:
# Score every rejected and chosen response with punctuation_percentage.
# punctuation_scores maps 'rejected' / 'chosen' to a list with one score per
# sample, in dataset order.
print(f"Total number of samples: {len(ds['train'])}")
print("punctuation_percentage")
import numpy as np
punctuation_scores = {}
for preference in ['rejected', 'chosen']:
    print(preference)
    scores = []
    for d in tqdm(ds['train'][preference]):
        # Score the final message rather than index 1: for a two-message
        # conversation they are the same element, and for a longer
        # conversation the last message is the response being compared.
        final_message = d[-1]
        # Validate the role before the content is used.
        assert final_message['role'] == 'assistant', "last message is not an assistant reply"
        scores.append(punctuation_percentage(final_message['content']))
    punctuation_scores[preference] = scores

Total number of samples: 700000
punctuation_percentage
rejected


100%|███████████████████████████████████████████████████████████████████████████████████████████████| 700000/700000 [00:10<00:00, 66229.59it/s]


chosen


100%|███████████████████████████████████████████████████████████████████████████████████████████████| 700000/700000 [00:12<00:00, 55080.64it/s]


In [35]:
# Scale the per-sample punctuation ratios by 100 and report mean (std) for the
# chosen and the rejected responses.
preferred_scores = 100 * np.asarray(punctuation_scores['chosen'])
unpreferred_scores = 100 * np.asarray(punctuation_scores['rejected'])

for _label, _values in (("Preferred", preferred_scores), ("Unpreferred", unpreferred_scores)):
    print(f"{_label}: {_values.mean()} ({_values.std():.2f})")

Preferred: 28.10221699982017 (49.24)
Unpreferred: 28.414209369519234 (123.44)


In [14]:
# Repetition

from collections import Counter
import nltk

# Fetch NLTK's 'punkt' resource, used for tokenization below; when it is
# already up to date NLTK only reports that and leaves it in place.
nltk.download('punkt')

def calculate_repetition_rate(text, max_n=5):
    """Compute an n-gram repetition rate of `text` for n = 1..max_n.

    The text is lowercased and tokenized with `nltk.word_tokenize`. For each
    n-gram size, the rate is the number of distinct n-grams that occur more
    than once, divided by the total number of n-grams in the text.

    Args:
        text: String to analyse.
        max_n: Largest n-gram size to evaluate. The default of 5 yields the
            sizes 1 through 5.

    Returns:
        dict: Maps each n in 1..max_n to its repetition rate. The rate is 0
        when the text yields no n-grams of that size.
    """
    # Lowercase first so that repetitions are matched case-insensitively.
    words = nltk.word_tokenize(text.lower())
    repetition_rate_ngrams = {}
    for n in range(1, max_n + 1):
        ngram_counts = Counter(nltk.ngrams(words, n))
        # The counts sum to the total number of n-grams produced.
        total_ngrams = sum(ngram_counts.values())
        # Each distinct n-gram seen more than once contributes 1.
        repeated_ngrams = sum(1 for count in ngram_counts.values() if count > 1)
        repetition_rate_ngrams[n] = repeated_ngrams / total_ngrams if total_ngrams else 0
    return repetition_rate_ngrams


[nltk_data] Downloading package punkt to /home/qiusiz2/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [15]:
# Score every rejected and chosen response with calculate_repetition_rate.
# repetition_scores maps 'rejected' / 'chosen' to a list with one
# {n: rate} dict per sample, in dataset order.
print(f"Total number of samples: {len(ds['train'])}")
print("repetition")
import numpy as np
repetition_scores = {}
for preference in ['rejected', 'chosen']:
    print(preference)
    scores = []
    for d in tqdm(ds['train'][preference]):
        # Score the final message rather than index 1: for a two-message
        # conversation they are the same element, and for a longer
        # conversation the last message is the response being compared.
        final_message = d[-1]
        # Validate the role before the content is used.
        assert final_message['role'] == 'assistant', "last message is not an assistant reply"
        scores.append(calculate_repetition_rate(final_message['content']))
    repetition_scores[preference] = scores

Total number of samples: 700000
repetition
rejected


100%|████████████████████████████████████████████████████████████████████████████████████████████████| 700000/700000 [10:27<00:00, 1115.44it/s]


chosen


100%|█████████████████████████████████████████████████████████████████████████████████████████████████| 700000/700000 [12:30<00:00, 933.14it/s]


In [13]:
import numpy as np
# For each n-gram size, report mean (std) of the repetition rate in percent
# for chosen and rejected responses, followed by the difference of the means.
for n in range(1, 6):
    preferred_scores = np.array([rates[n] * 100 for rates in repetition_scores['chosen']])
    unpreferred_scores = np.array([rates[n] * 100 for rates in repetition_scores['rejected']])
    print(f"n={n}")
    print(f"Preferred: {preferred_scores.mean()} ({preferred_scores.std():.2f})")
    print(f"Unpreferred: {unpreferred_scores.mean()} ({unpreferred_scores.std():.2f})")
    print(f"difference: {preferred_scores.mean() - unpreferred_scores.mean()}")

n=1
Preferred: 15.190755553980834 (5.75)
Unpreferred: 14.7077576515241 (6.27)
difference: 0.48299790245673435
n=2
Preferred: 8.830318939537486 (7.05)
Unpreferred: 8.203221866504123 (7.31)
difference: 0.6270970730333634
n=3
Preferred: 5.1840335682052965 (5.99)
Unpreferred: 4.9410111919508415 (6.18)
difference: 0.243022376254455
n=4
Preferred: 3.467355996497633 (5.10)
Unpreferred: 3.377669896455763 (5.26)
difference: 0.08968610004186983
n=5
Preferred: 2.502923171625915 (4.40)
Unpreferred: 2.4815712000947743 (4.56)
difference: 0.021351971531140546
