# Familiarity Rating Generation

### GPT-4o

In [None]:
## Packages
import numpy as np
import random
from openai import OpenAI

In [None]:
## Settings
# Experiment Settings
RepeatTime = 1 # number of shuffled passes over the cue list, i.e. how many times each cue is rated
AnswerTime = 780000 # upper bound on collected responses; the request loop stops once this many have been saved
WordOnce = 1 # nominal cues per request; in the cells shown it only sets how many cues the preview print displays

# OpenAI Parameters
MyAPI = "sk-xxxxxx" # OpenAI API key (placeholder; replace with a real key before running)
MyModel = "gpt-4o-2024-08-06" # model to use
MyTemperature = 0 # sampling temperature (0-2); lower -> less diverse outputs
MyMaxTokens = 100 # passed as max_tokens: caps the tokens generated in the reply (prompt tokens are not counted)
MyFreqPenalty = 0 # frequency penalty (API range -2.0 to 2.0); positive values reduce repetition

# Module-level client; the Gpta class defined further down builds its own client from MyAPI.
client = OpenAI(api_key = MyAPI)

In [None]:
## Load the cue words
# Read the whole stimulus file in one go; the handle is only needed for that.
with open("./Stimulus_27624.txt", mode="r", encoding="utf-8") as stimulus_file:
    data = stimulus_file.read()

# One cue per line. filter(None, ...) discards the empty strings left behind
# by blank lines and by a trailing newline at the end of the file.
CUES = list(filter(None, data.split("\n")))

# Sanity check: number of cues, then a preview of the first WordOnce cues.
print(len(CUES))
print(CUES[:WordOnce])

In [None]:
## Randomize the order of the cues
# Build RepeatTime independently shuffled copies of the cue list and chain
# them, so every cue appears exactly RepeatTime times in the final sequence.
shuffled_passes = []
for _ in range(RepeatTime):
    # random.sample with k == len(CUES) returns a shuffled copy and leaves
    # CUES itself untouched, so every pass starts from the same source list.
    shuffled_passes.extend(random.sample(CUES, len(CUES)))
CUES = shuffled_passes

# CUES_all is the name the request loop iterates over.
CUES_all = CUES

# Sanity check: total length and the first three cues.
print(len(CUES_all))
print(CUES_all[:3])

In [None]:
import time, warnings, pickle, csv
from openai import OpenAI

class Gpta:
    """Collect familiarity ratings (1-7) for Chinese words from an OpenAI chat model.

    Each call to :meth:`gpt_associations` sends one cue, appends the reply and
    the per-token top log-probabilities to a text log, appends a record to
    ``results`` and rewrites the pickle file with the full ``results`` list.

    Sampling parameters and the API key are read from the notebook-level
    settings (``MyTemperature``, ``MyMaxTokens``, ``MyFreqPenalty``, ``MyAPI``)
    when the instance is created.
    """

    def __init__(self, data, log, results):
        """Store output locations and create the API client.

        Args:
            data: Path of the pickle file that receives the ``results`` list.
            log: Path of the text log; it is opened in append mode on every call.
            results: List that accumulates one record dict per answered cue.
        """
        self.log = log
        self.results = results
        self.data = data
        self.cue = []  # cues that have been answered so far, in request order
        self.temp = MyTemperature
        self.maxtokens = MyMaxTokens
        self.freqpenalty = MyFreqPenalty
        self.client = OpenAI(api_key=MyAPI)

    def gpt_associations(self, cue, model=MyModel):
        """Request a familiarity rating for one cue and persist the outcome.

        Args:
            cue: The Chinese word to be rated.
            model: Name of the chat model to query.

        Side effects:
            Appends to ``self.cue`` and ``self.results``, appends to the log
            file, overwrites the pickle file and prints a confirmation.

        Raises:
            Whatever the API client or the file operations raise; nothing is
            caught here, so the caller decides how to handle failures.
        """
        # Build the message list for the API request.
        # NOTE: the prompt text is kept verbatim (including the missing space
        # after "recognisable.") so that every request uses an identical prompt.
        message = [
            {
                "role": "user",
                "content": (
                    "Complete the following tasks as a native speaker of Simplified Chinese: Familiarity is a measure of how familiar something is. "
                    + "A Chinese word is very FAMILIAR if you see/hear it often and it is easily recognisable."
                    + "In contrast, a Chinese word is very UNFAMILIAR if you rarely see/hear it and it is relatively unrecognisable. "
                    + "Please indicate how familiar you think each Chinese word is on a scale from 1 (VERY UNFAMILIAR) to 7 (VERY FAMILIAR), with the midpoint representing moderate familiarity. "
                    + "The Chinese word is: "
                    + cue
                    + " Only answer a number from 1 to 7. Please limit your answer to numbers."
                ),
            }
        ]

        # Send the chat completion request to the OpenAI model.
        response = self.client.chat.completions.create(
            model=model,
            messages=message,
            temperature=self.temp,
            max_tokens=self.maxtokens,
            frequency_penalty=self.freqpenalty,
            logprobs=True,  # return log probabilities for output tokens
            top_logprobs=7,  # return the 7 most likely alternatives per token
            stream=False,
        )

        self.cue.append(cue)

        choice = response.choices[0]
        # The API types message.content as optional (e.g. on a refusal); fall
        # back to an empty string so logging cannot fail on string concatenation.
        answer = choice.message.content or ""
        # Guard against a reply that carries no logprobs block.
        logprobs_content = choice.logprobs.content if choice.logprobs is not None else None

        # One line per generated token: "'token': logprob" pairs, comma-separated.
        top_logprobs_str = "".join(
            ", ".join(f"'{top_prob.token}': {top_prob.logprob}" for top_prob in token_info.top_logprobs)
            + "\n"
            for token_info in (logprobs_content or [])
        )

        # Append cue, reply and top-logprob lines to the log file.
        with open(self.log, "a", encoding="utf-8") as file:
            file.write(cue + "，，，\n")
            file.write(answer + "。。。\n")
            file.write(top_logprobs_str)

        # Append the result dict to the results list.
        self.results.append(
            {
                "cue": cue,
                "model": model,
                "message": message,
                "response": response,
                "temperature": self.temp,
                "max_tokens": self.maxtokens,
                "frequency_penalty": self.freqpenalty,
                "top_logprobs_details": logprobs_content,
            }
        )

        # Rewrite the pickle with the full list after every call so the file
        # always reflects everything collected so far.
        with open(self.data, "wb") as file:
            pickle.dump(self.results, file)

        print("获取并保存线索成功：" + cue)
        print("chatGPT的回答是：" + answer + "\n")

In [None]:
gpta = Gpta(
    data="GPT_familiar_results_27624_expression_7.pkl",
    log="GPT_familiar_results_27624_expression_7.txt",
    results=[],
)

# Request a rating for every entry of CUES_all (the list is used as-is, it is
# not de-duplicated) and stop early once AnswerTime responses have been saved.
failed_cues = []  # cues whose request raised; kept so they can be retried later
k = 0  # number of successfully saved responses
for cue in CUES_all:
    try:
        gpta.gpt_associations(cue)  # call the API, then log and pickle the reply
    except Exception as e:
        # A single failed request must not abort the whole run: report it,
        # remember the cue and move on, as the Qwen loop in this file does.
        print(f"发生错误：{e}")
        failed_cues.append(cue)
        # Short pause so a transient outage or rate limit is not followed by
        # an immediate burst of further failing requests.
        time.sleep(0.3)
        continue

    k += 1
    if k >= AnswerTime:  # stop after AnswerTime responses
        break

### Qwen

In [None]:
import os
import numpy as np
import random
import time, warnings, pickle, csv
from openai import OpenAI

In [None]:
## Settings
# Experiment Settings
RepeatTime = 1 # number of shuffled passes over the cue list, i.e. how many times each cue is rated by Qwen
AnswerTime = 1000000 # upper bound on collected responses; the request loop stops once this many have been saved
WordOnce = 1 # nominal cues per request; in the cells shown it only sets how many cues the preview print displays

# API parameters (Qwen is reached through the OpenAI-compatible client)

MyAPI = "sk-xxxxx" # API key sent to the DashScope endpoint configured below (placeholder; replace before running)
MyModel = "qwen-max" # model to use
MyTemperature = 0.7 # sampling temperature (0-2)
MyMaxTokens = 100 # passed as max_tokens: caps the tokens generated in the reply (prompt tokens are not counted)

client = OpenAI(api_key = MyAPI, base_url="https://dashscope.aliyuncs.com/compatible-mode/v1")  # client pointed at DashScope's OpenAI-compatible endpoint

In [None]:
## Load the cue words
# Read the whole stimulus file; the handle is closed as soon as the read is done.
with open("./Stimulus_27624.txt", mode="r", encoding="utf-8") as stimulus_file:
    data = stimulus_file.read()

# Split on newlines and keep only non-empty entries (blank lines and the
# trailing newline would otherwise produce empty cues).
CUES = list(filter(None, data.split("\n")))

# Sanity check: number of cues, then a preview of the first WordOnce cues.
print(len(CUES))
print(CUES[:WordOnce])

In [None]:
## Randomize the order of the cues
# Chain RepeatTime independently shuffled copies of the cue list, so every
# cue occurs exactly RepeatTime times in the final request sequence.
shuffled_passes = []
for _ in range(RepeatTime):
    # random.sample with k == len(CUES) yields a shuffled copy; CUES itself
    # is not modified until all passes have been drawn.
    shuffled_passes.extend(random.sample(CUES, len(CUES)))
CUES = shuffled_passes
CUES_all = CUES  # name used by the request loop

# Sanity check: total length and the first three cues.
print(len(CUES_all))
print(CUES_all[:3])

In [None]:
## Define the class that sends each cue to Qwen and saves the reply

class Gpta:
    """Collect familiarity ratings (1-7) for Chinese words or characters from Qwen.

    Each call to :meth:`gpt_associations` sends one cue, appends the reply to
    a text log, appends a record to ``results`` and rewrites the pickle file
    with the full ``results`` list.

    Sampling parameters and the API key are read from the notebook-level
    settings (``MyTemperature``, ``MyMaxTokens``, ``MyAPI``) when the instance
    is created.
    """

    # OpenAI-compatible endpoint through which the Qwen models are served.
    base_url = "https://dashscope.aliyuncs.com/compatible-mode/v1"

    def __init__(self, data, log, results):
        """Store output locations and create the API client.

        Args:
            data: Path of the pickle file that receives the ``results`` list.
            log: Path of the text log; it is opened in append mode on every call.
            results: List that accumulates one record dict per answered cue.
        """
        self.log = log
        self.results = results
        self.data = data
        self.cue = []  # cues that have been answered so far, in request order
        self.temp = MyTemperature
        self.maxtokens = MyMaxTokens
        # The client has to target the compatible-mode endpoint; without
        # base_url it would send the key to the default OpenAI endpoint.
        self.client = OpenAI(api_key=MyAPI, base_url=self.base_url)

    def gpt_associations(self, cue, model=MyModel):
        """Request a familiarity rating for one cue and persist the outcome.

        Args:
            cue: The Chinese word or character to be rated.
            model: Name of the chat model to query.

        Side effects:
            Appends to ``self.cue`` and ``self.results``, appends to the log
            file, overwrites the pickle file and prints a confirmation.

        Raises:
            Whatever the API client or the file operations raise; nothing is
            caught here, so the caller decides how to handle failures.
        """
        # User prompt (in Chinese) asking for a single familiarity rating from
        # 1 to 7. The text is kept verbatim so every request is identical.
        message = [
            {
                "role": "user",
                "content": "作为一个简体中文母语者完成以下任务：熟悉度是衡量某个东西对你来说有多熟悉的测量标准。"
                + "如果一个中文词或者汉字是你经常看到或听到的，并且很容易认出来，那么它就是非常熟悉的。"
                + "相反，如果一个中文词或者汉字是你很少看到或听到的，并且不太容易认出来，那么它就是非常不熟悉的。"
                + "请在1（非常不熟悉）到7（非常熟悉）的范围内，评估每个中文词或者汉字在你看来有多熟悉，其中的中点代表适中的熟悉度。"
                + "这一个中文词或者汉字是："
                + cue
                + "请只回答一个从1到7的数字，并确保答案仅限于数字。",
            }
        ]

        # Use the instance's own client rather than a module-level global, so
        # the class works wherever it is instantiated.
        response = self.client.chat.completions.create(
            model=model,
            messages=message,
            temperature=self.temp,
            max_tokens=self.maxtokens,
            stream=False,
        )

        self.cue.append(cue)

        # message.content is optional in the OpenAI-style response schema;
        # fall back to an empty string so logging cannot fail on concatenation.
        answer = response.choices[0].message.content or ""

        # Append cue and model reply to the log file.
        with open(self.log, "a", encoding="utf-8") as file:
            file.write(cue + "，，，\n" + answer + "。。。" + "\n")

        # Append a result dictionary to the in-memory results list.
        self.results.append(
            {
                "cue": cue,
                "model": model,
                "message": message,
                "response": response,
                "temperature": self.temp,
                "max_tokens": self.maxtokens,
            }
        )

        # Rewrite the pickle with the full list after every call so the file
        # always reflects everything collected so far.
        with open(self.data, "wb") as file:
            pickle.dump(self.results, file)

        print("获取并保存线索成功：" + cue)
        print("qwen的回答是：" + answer + "\n")

In [None]:
gpta = Gpta(
    data="qwen_familiar_results.pkl",
    log="qwen_familiar_results.txt",
    results=[],
)

# Request a rating for every entry of CUES_all and stop early once AnswerTime
# responses have been saved.
failed_cues = []  # cues whose request raised; kept so they can be retried later
k = 0  # number of successfully saved responses
for cue in CUES_all:
    try:
        gpta.gpt_associations(cue)
    except Exception as e:
        # Report the failure, remember the cue and carry on with the next one.
        print(f"发生错误：{e}")
        failed_cues.append(cue)
    else:
        # Only successful requests count towards AnswerTime.
        k += 1
        if k >= AnswerTime:
            break

    # Brief pause between requests to avoid rate limits (0.3s). It runs after
    # failures too, so a rate-limit error is not followed by an immediate retry
    # burst on the remaining cues.
    time.sleep(0.3)