# Familiarity Rating Generation

##### **【⚠️Note】** APIs and models evolve rapidly. This code reflects the API methods available at the time of the study. 
##### Please consult official documentation for the latest API protocols and parameter settings before extending this work.

### GPT-4o

In [None]:
# Imports & Configuration
import random
import pickle
import sys
from openai import OpenAI

In [None]:
# API Configuration
# Replace 'sk-xxxxxx' with your actual OpenAI API key
# NOTE(review): the key is kept in plain text in this cell; avoid sharing the
# notebook with a real key filled in.
API_KEY = "sk-xxxxxx" 
# Model identifier passed as `model` to the chat-completions request.
MODEL_NAME = "gpt-4o-2024-08-06"

In [None]:
# File Paths
INPUT_FILE = "./target_words.txt"      # Path to the input text file containing cues (read line by line)
OUTPUT_PKL = "GPT_familiar_results.pkl"  # Path to save results as a pickle file (rewritten in full at every save)
OUTPUT_LOG = "GPT_familiar_results.txt"  # Path to save results as a text log (opened in append mode, so re-runs add to it)

In [None]:
# GPT Generation Parameters
# get_gpt_rating() forwards each entry as the same-named keyword argument of
# client.chat.completions.create().
GPT_PARAMS = {
    "temperature": 0,       # Sampling temperature (accepted range 0-2); fixed at 0 here
    "max_tokens": 10,       # Max tokens for response (the prompt asks for a single number)
    "top_logprobs": 7,      # Return top 7 log probabilities for research analysis (presumably one per scale point 1-7)
    "logprobs": True        # Explicitly enable logprobs return
}

In [None]:
# Experiment Settings
REPEAT_TIMES = 1        # Number of times to repeat the entire cue list (each pass is shuffled independently)
MAX_REQUESTS = 780000   # Safety limit for total API requests; the request loop stops once this many trials were attempted

In [None]:
# Initialize OpenAI Client
# Only the API key is supplied; no base_url is given, so the SDK's default
# endpoint is used.
client = OpenAI(api_key=API_KEY)

In [None]:
# Input the cue words
def load_cues(filepath):
    """Read the cue list from a UTF-8 text file, one cue per line.

    Surrounding whitespace is removed from every line and lines that are
    empty after stripping are skipped. Duplicate lines are kept as-is; no
    de-duplication is performed.

    Args:
        filepath: Path of the text file to read.

    Returns:
        List of cue strings in file order.
    """
    cues = []
    with open(filepath, "r", encoding="utf-8") as handle:
        for raw_line in handle:
            cleaned = raw_line.strip()
            if cleaned:
                cues.append(cleaned)
    print(f"Successfully loaded {len(cues)} unique cues.")
    return cues

# Load the cue list from INPUT_FILE and print the first five entries as a
# quick sanity check.
CUES = load_cues(INPUT_FILE)
print(f"Preview: {CUES[:5]}")

In [None]:
# Randomize items in the CUES
def randomize_cues(cues, repeat=1):
    """Build the trial order: `repeat` independently shuffled passes over `cues`.

    The input list is never modified; each pass shuffles a fresh copy with
    the module-level `random` generator and appends it to the output.

    Args:
        cues: List of cue strings.
        repeat: Number of shuffled passes to concatenate.

    Returns:
        A new list of length ``len(cues) * repeat``.
    """
    trial_order = []
    for _ in range(repeat):
        shuffled = cues.copy()
        random.shuffle(shuffled)
        trial_order += shuffled
    print(f"Randomization complete. Total trials generated: {len(trial_order)}")
    return trial_order

# Build the full shuffled trial sequence (REPEAT_TIMES passes over CUES) and
# print its first five entries.
CUES_ALL = randomize_cues(CUES, repeat=REPEAT_TIMES)
print(f"Final Sequence Preview: {CUES_ALL[:5]}")

In [None]:
# GPT Interaction Function

def get_gpt_rating(cue, client, model, params):
    """Request a 1-7 familiarity rating for one Chinese word.

    A single user message containing the English instructions and the cue is
    sent through ``client.chat.completions.create``. The first choice of the
    response supplies the rating text and the per-token log-probabilities.

    Args:
        cue: Word to be rated; interpolated into the prompt.
        client: Object exposing ``chat.completions.create`` (the OpenAI client
            in this file).
        model: Model identifier forwarded to the request.
        params: Mapping providing "temperature", "max_tokens",
            "top_logprobs" and "logprobs".

    Returns:
        A dict with keys "cue", "rating" (reply text with surrounding
        whitespace stripped), "raw_logprobs" (the per-token log-probability
        entries exactly as returned) and "formatted_logprobs" (one text line
        per generated token listing its top alternatives); or None if any
        exception is raised while requesting or parsing, in which case the
        error is printed.
    """
    instruction = (
        "Complete the following tasks as a native speaker of Simplified Chinese: "
        "Familiarity is a measure of how familiar something is. "
        "A Chinese word is very FAMILIAR if you see/hear it often and it is easily recognisable. "
        "In contrast, a Chinese word is very UNFAMILIAR if you rarely see/hear it and it is relatively unrecognisable. "
        "Please indicate how familiar you think each Chinese word is on a scale from 1 (VERY UNFAMILIAR) to 7 (VERY FAMILIAR), "
        "with the midpoint representing moderate familiarity. "
        f"The Chinese word is: {cue} "
        " Only answer a number from 1 to 7. Please limit your answer to numbers."
    )
    conversation = [{"role": "user", "content": instruction}]

    # Everything, including the params lookups and the response parsing, stays
    # inside the try so that any failure yields None instead of propagating.
    try:
        reply = client.chat.completions.create(
            model=model,
            messages=conversation,
            temperature=params["temperature"],
            max_tokens=params["max_tokens"],
            top_logprobs=params["top_logprobs"],
            logprobs=params["logprobs"],
        )

        first_choice = reply.choices[0]
        rating_text = first_choice.message.content.strip()
        token_entries = first_choice.logprobs.content

        # One output line per generated token: "'tok': logprob, 'tok': logprob, ..."
        rendered_lines = []
        if token_entries:
            for entry in token_entries:
                alternatives = [
                    f"'{alt.token}': {alt.logprob}" for alt in entry.top_logprobs
                ]
                rendered_lines.append(", ".join(alternatives) + "\n")

        return {
            "cue": cue,
            "rating": rating_text,
            "raw_logprobs": token_entries,
            "formatted_logprobs": "".join(rendered_lines),
        }

    except Exception as err:
        print(f"⚠️ Error processing cue '{cue}': {err}")
        return None

In [None]:
# Announce the run: number of queued trials and the model that will be queried.
print(f"Starting experiment on {len(CUES_ALL)} items... ")
print(f"Model: {MODEL_NAME}")

In [None]:
# One result dict per successful request; pickled at checkpoints and at the end.
results_buffer = []
total_trials = len(CUES_ALL)

# Walk through the shuffled trial sequence, counting trials from 1.
for trial_no, cue in enumerate(CUES_ALL, start=1):

    # Safety break: stop once MAX_REQUESTS trials have already been attempted.
    if trial_no > MAX_REQUESTS:
        print("Reached maximum request limit. ")
        break

    result = get_gpt_rating(
        cue=cue,
        client=client,
        model=MODEL_NAME,
        params=GPT_PARAMS,
    )

    # A failed request returns None and is simply not recorded.
    if result:
        results_buffer.append(result)

        # Text log entry: cue line, rating line, log-probability lines, blank line.
        log_entry = (
            f"{result['cue']}，，，\n"
            f"{result['rating']}。。。\n"
            f"{result['formatted_logprobs']}"
            "\n"
        )
        with open(OUTPUT_LOG, "a", encoding="utf-8") as log_file:
            log_file.write(log_entry)

        print(f"[{trial_no}/{total_trials}] {cue} -> {result['rating']}")

    # Checkpoint on every 10th trial position, whether or not it succeeded.
    if trial_no % 10 == 0:
        with open(OUTPUT_PKL, "wb") as pkl_file:
            pickle.dump(results_buffer, pkl_file)
        print(" Checkpoint saved.")

# Final save of everything collected.
with open(OUTPUT_PKL, "wb") as pkl_file:
    pickle.dump(results_buffer, pkl_file)

print("\n Experiment finished! All data saved.")

### Qwen-max

##### **【⚠️Note】**  At the time of this study, the official Qwen model API did not provide log probability parameters. 
##### Therefore, we queried the model 30 times for each word and calculated the average to obtain the rating for each word. 
##### Currently, the Qwen model supports log probability parameters; please refer to the official documentation for details.

In [None]:
# Imports & Configuration
import random
import pickle
import time
import sys
from openai import OpenAI

In [None]:
# API Configuration
# NOTE: these assignments rebind API_KEY and MODEL_NAME that were set in the
# GPT-4o section of this notebook.
API_KEY = "sk-xxxxxx" # Replace with your actual Qwen API Key
# OpenAI-compatible endpoint; passed to the OpenAI client as base_url.
BASE_URL = "https://dashscope.aliyuncs.com/compatible-mode/v1"
MODEL_NAME = "qwen-max"

In [None]:
# File Paths
INPUT_FILE = "./target_words.txt"        # Path to the input text file containing cues (read line by line)
OUTPUT_PKL = "Qwen_familiar_results.pkl" # Path to save results as a pickle file (rewritten in full at every save)
OUTPUT_LOG = "Qwen_familiar_results.txt" # Path to save results as a text log (opened in append mode, so re-runs add to it)

In [None]:
# Generation Parameters 
# get_qwen_rating() forwards both entries to client.chat.completions.create().
GEN_PARAMS = {
    "temperature": 0.7,     # Sampling temperature; presumably non-zero so the 30 repeated queries per cue can vary (confirm)
    "max_tokens": 100,      # Max tokens
}

In [None]:
# Experiment Control
REPEAT_TIMES = 30       # Number of shuffled passes over the cue list, i.e. queries issued per cue
MAX_REQUESTS = 1000000  # Safety limit: the request loop stops once this many trials were attempted

In [None]:
# Initialize Client
# The OpenAI SDK client is reused here, pointed at BASE_URL instead of its
# default endpoint.
client = OpenAI(api_key=API_KEY, base_url=BASE_URL)
print(f"✅ Configuration loaded. Plan: Repeat {REPEAT_TIMES} times per cue.")

In [None]:
# Input the cue words
def load_cues(filepath):
    """Read the cue list from a UTF-8 text file, one cue per line.

    Surrounding whitespace is stripped from every line and lines that are
    empty after stripping are dropped. Duplicate lines are kept; no
    de-duplication is performed.

    Args:
        filepath: Path of the text file to read.

    Returns:
        List of cue strings in file order.
    """
    with open(filepath, "r", encoding="utf-8") as file:
        # Read lines, strip whitespace, and keep only non-empty lines
        cues = [line.strip() for line in file if line.strip()]

    print(f"Successfully loaded {len(cues)} unique cues.")
    return cues

# Load the cue list from INPUT_FILE and print the first five entries as a
# quick sanity check.
CUES = load_cues(INPUT_FILE)
print(f"Preview: {CUES[:5]}")

In [None]:
## Randomize items in the CUES
def randomize_cues(cues, repeat=1):
    """Build the trial order: `repeat` independently shuffled passes over `cues`.

    The input list is never modified; each pass shuffles a fresh copy with
    the module-level `random` generator and appends it to the output. A
    progress line is printed after every fifth pass.

    Args:
        cues: List of cue strings.
        repeat: Number of shuffled passes to concatenate.

    Returns:
        A new list of length ``len(cues) * repeat``.
    """
    trial_order = []
    for pass_no in range(1, repeat + 1):
        shuffled = cues.copy()
        random.shuffle(shuffled)
        trial_order += shuffled

        # Progress report after every fifth pass.
        if pass_no % 5 == 0:
            print(f"  -> {pass_no} / {repeat} batches added.")

    print(f"Randomization complete. Total trials generated: {len(trial_order)}")
    return trial_order


# Build the full shuffled trial sequence (REPEAT_TIMES passes over CUES) and
# print its first five entries.
CUES_ALL = randomize_cues(CUES, repeat=REPEAT_TIMES)
print(f"Final Sequence Preview: {CUES_ALL[:5]}")

In [None]:
# Qwen Interaction Function

def get_qwen_rating(cue, client, model, params):
    """Request a 1-7 familiarity rating for one Chinese word or character.

    A single user message containing the Chinese instructions and the cue is
    sent through ``client.chat.completions.create``; the text of the first
    choice is returned as the rating.

    Args:
        cue: Word or character to be rated; interpolated into the prompt.
        client: Object exposing ``chat.completions.create`` (the OpenAI-SDK
            client in this file).
        model: Model identifier forwarded to the request.
        params: Mapping providing "temperature" and "max_tokens".

    Returns:
        A dict with keys "cue", "rating" (reply text with surrounding
        whitespace stripped) and "response_obj" (the full response object);
        or None if any exception is raised while requesting or parsing, in
        which case the error is printed.
    """
    instruction = (
        "作为一个简体中文母语者完成以下任务：熟悉度是衡量某个东西对你来说有多熟悉的测量标准。"
        "如果一个中文词或者汉字是你经常看到或听到的，并且很容易认出来，那么它就是非常熟悉的。"
        "相反，如果一个中文词或者汉字是你很少看到或听到的，并且不太容易认出来，那么它就是非常不熟悉的。"
        "请在1（非常不熟悉）到7（非常熟悉）的范围内，评估每个中文词或者汉字在你看来有多熟悉，其中的中点代表适中的熟悉度。"
        f"这一个中文词或者汉字是：{cue}"
        "请只回答一个从1到7的数字，并确保答案仅限于数字。"
    )
    conversation = [{"role": "user", "content": instruction}]

    # The params lookups and the response parsing stay inside the try so that
    # any failure yields None instead of propagating.
    try:
        reply = client.chat.completions.create(
            model=model,
            messages=conversation,
            temperature=params["temperature"],
            max_tokens=params["max_tokens"],
        )
        rating_text = reply.choices[0].message.content.strip()
    except Exception as err:
        print(f"⚠️ Error processing cue '{cue}': {err}")
        return None

    return {
        "cue": cue,
        "rating": rating_text,
        "response_obj": reply,
    }

In [None]:
# Announce the run: number of queued trials, the model, and the repeat count.
print(f"Starting experiment on {len(CUES_ALL)} items...")
print(f"Model: {MODEL_NAME} | Repeats: {REPEAT_TIMES}")

In [None]:
# One result dict per successful request; pickled at checkpoints and at the end.
results_buffer = []
total_trials = len(CUES_ALL)

# Walk through the shuffled trial sequence, counting trials from 1.
for trial_no, cue in enumerate(CUES_ALL, start=1):

    # Safety break: stop once MAX_REQUESTS trials have already been attempted.
    if trial_no > MAX_REQUESTS:
        print("Reached maximum request limit.")
        break

    result = get_qwen_rating(
        cue=cue,
        client=client,
        model=MODEL_NAME,
        params=GEN_PARAMS,
    )

    # A failed request returns None and is simply not recorded.
    if result:
        results_buffer.append(result)

        # Text log entry: cue line, rating line, blank line.
        log_entry = (
            f"{result['cue']}，，，\n"
            f"{result['rating']}。。。\n"
            "\n"
        )
        with open(OUTPUT_LOG, "a", encoding="utf-8") as log_file:
            log_file.write(log_entry)

        print(f"[{trial_no}/{total_trials}] {cue} -> {result['rating']}")

    # Checkpoint on every 20th trial position, whether or not it succeeded.
    if trial_no % 20 == 0:
        with open(OUTPUT_PKL, "wb") as pkl_file:
            pickle.dump(results_buffer, pkl_file)
        print(" Checkpoint saved.")

    # Sleep 0.3s to respect Qwen API limits
    time.sleep(0.3)

# Final save of everything collected.
with open(OUTPUT_PKL, "wb") as pkl_file:
    pickle.dump(results_buffer, pkl_file)

print("\n Qwen Experiment finished! All data saved.")