## 라이브러리 & API Key 정의

In [2]:
from openai import OpenAI
from dotenv import load_dotenv
import os
import re

# Load variables from the local .env file into the process environment.
load_dotenv()

# API key is read from the API_KEY environment variable (None when unset).
my_api_key = os.getenv("API_KEY")

# Shared OpenAI client used by the inference code in this notebook.
client = OpenAI(api_key=my_api_key)

## Instruction 정의

In [2]:
# For the with-negative setting.
# NOTE(review): instruction_1 is not referenced anywhere else in the visible part of this file.
instruction_1 = """
You are a recommendation bot responsible for selecting the news article that the target user is most likely to prefer from a list of five candidate articles. The only information available for each candidate article is its title, which is written in Norwegian.

Your goal is to predict the index number of the news article that best fits in the position labeled [MASK].
"""

# Title-only system prompt built around [News of Interest to the user]; asks for a single index.
# Within this cell this value is replaced by the later assignments to instruction_negative.
instruction_negative = """
You are a bot that identifies users' news interests from [News of Interest to the user], then based on this, predicts the index number of news in [Questions] that best fits in the position labeled [MASK].

News is provided by title only.
News is Norwegian news in Norwegian.

There can be multiple lists in [News of Interest to the user], each with five news items.
Among the five news in each list, there is one news that the user is most interested in.

[Questions] can have multiple questions, each of which must be answered.
The answer should return only one news that the user is most likely to read.
"""

# For the category variant: the prompt states that each article has a title and a category.
# Within this cell this value is replaced by the later assignments to instruction_negative.
instruction_negative = """
You are a bot that identifies users' news interests from [News of Interest to the user], then based on this, predicts the index number of news in [Questions] that best fits in the position labeled [MASK].

News only provides a title and category.
News is Norwegian news in Norwegian.

There can be multiple lists in [News of Interest to the user], each with five news items.
Among the five news in each list, there is one news that the user is most interested in.

[Questions] can have multiple questions, each of which must be answered.
The answer should return only one news that the user is most likely to read.
"""

# For the category variant + nDCG: the final sentence asks for a ranked list of index
# numbers (top 10) instead of a single index.
# Within this cell this value is replaced by the next assignment to instruction_negative.
instruction_negative = """
You are a bot that identifies users' news interests from [News of Interest to the user], then based on this, predicts the index number of news in [Questions] that best fits in the position labeled [MASK].

News only provides a title and category.
News is Norwegian news in Norwegian.

There can be multiple lists in [News of Interest to the user], each with five news items.
Among the five news in each list, there is one news that the user is most interested in.

[Questions] can have multiple questions, each of which must be answered.
The answers should sort the index numbers of the news articles in the order that you think best fits the [MASK] based on the user's preferences (only the top 10 will be returned).
"""

# For the category variant + nDCG, version 2: same text as the previous variant but without
# the output-format sentence, and the string has no leading newline.
# This is the last assignment to instruction_negative in this cell.
instruction_negative = """You are a bot that identifies users' news interests from [News of Interest to the user], then based on this, predicts the index number of news in [Questions] that best fits in the position labeled [MASK].

News only provides a title and category.
News is Norwegian news in Norwegian.

There can be multiple lists in [News of Interest to the user], each with five news items.
Among the five news in each list, there is one news that the user is most interested in.

[Questions] can have multiple questions, each of which must be answered. """



# Title-only system prompt built around [Click History]; asks for a single index.
# Within this cell this value is replaced by the later assignments to instruction_positive.
instruction_positive = """
You are a bot that identifies users' news interests from [Click History], then based on this, predicts the index number of news in [Questions] that best fits in the position labeled [MASK].

News is provided by title only.
News is Norwegian news in Norwegian.

[Questions] can have multiple questions, each of which must be answered.
The answer should return only one news that the user is most likely to read.
"""

# For the category variant + nDCG: articles have a title and a category, and the final
# sentence asks for a ranked list of index numbers (top 10).
# Within this cell this value is replaced by the next assignment to instruction_positive.
instruction_positive = """
You are a bot that identifies users' news interests from [Click History], then based on this, predicts the index number of news in [Questions] that best fits in the position labeled [MASK].

News only provides a title and category.
News is Norwegian news in Norwegian.

[Questions] can have multiple questions, each of which must be answered.
The answers should sort the index numbers of the news articles in the order that you think best fits the [MASK] based on the user's preferences (only the top 10 will be returned).
"""

# For the category variant + nDCG, version 2: same text as the previous variant but without
# the output-format sentence, and the string has no leading newline.
# This is the last assignment to instruction_positive in this cell.
instruction_positive = """You are a bot that identifies users' news interests from [Click History], then based on this, predicts the index number of news in [Questions] that best fits in the position labeled [MASK].

News only provides a title and category.
News is Norwegian news in Norwegian.

[Questions] can have multiple questions, each of which must be answered. """

In [3]:
# 0217 ndcg
# NOTE(review): "0217" is presumably a date tag for the nDCG experiment — confirm.
# System prompt for the [News of Interest to the user] setting; no output format or examples given.
instruction_negative = """
You are a bot designed to identify users' news interests based on their [News of Interest to the user] and predict the index number of news items in [Questions] that best fit the position labeled [MASK].

Each news article contains only a title and category written in Norwegian.

There can be multiple lists in [News of Interest to the user], each with five news items.
Among the five news in each list, there is one news that the user is most interested in.

[Questions] can have multiple questions, each of which must be answered.
"""


# System prompt for the [Click History] setting with three few-shot examples.
# Each example pairs a click history and one five-candidate question with an expected
# answer of the form "Question 1: <five comma-separated indices>".
instruction_positive = """
You are a bot designed to identify users' news interests based on their [Click History] and predict the index number of news items in [Questions] that best fit the position labeled [MASK].

Each news article contains only a title and category written in Norwegian.

[Questions] can have multiple questions, each of which must be answered.

<Input example 1>
[Click History]
The news articles that User #15001 clicked before are as follows:
1. Gravid i femte måned - vant NM-finalen [category : ballsport]
2. Trafikkuhell i Vemundvik [category : nordtrondelag]
3. Norgesgruppen kjøper seg inn i grensehandelkjede [category : okonomi]
...

[Questions]
Based on User #15001's preferences, arrange the index numbers of the top five news articles in the sequence that is deemed most suitable for [MASK]

Question 1) User #15001 prefers [MASK] the most among the following five articles: 
1: Silver ber om midlertidig forbud mot offentlig administrasjon [category : okonomi]
2: Perfekt gripefølelse [category : digital]
3: Svenssons debut i Nederland utsettes: Ikke i kveldens Europa League-tropp [category : fotball]
4: Nye tall: Inntil 91 ulver i Norge [category : innenriks]
5: Trygg Trafikk vil ha alkolås i alle biler [category : innenriks]

<Output example 1> 
Question 1: 3, 1, 4, 5, 2

<Input example 2>
[Click History]
The news articles that User #15002 clicked before are as follows:
1. Nå åpner byens nye turløype [category : trondheim]
2. - Jeg har litt erfaring med steinhugging fra før [category : trondheim]
3. «Det mest ettertraktede området i vårt nærområde står i fare for å bli avstengt for allmennheten» [category : ordetfritt]
...

[Questions]
Based on User #15002's preferences, arrange the index numbers of the top five news articles in the sequence that is deemed most suitable for [MASK]

Question 1) User #15002 prefers [MASK] the most among the following five articles: 
1: eAdressa er oppdatert [category : okonomi]
2: Arkitekter reagerer på nytt boligforslag [category : innenriks]
3: Han er en av 13 som får si sitt om fremtidens hær og heimevern [category : nyheter]
4: Hun får halve verden til Trondheim [category : magasin]
5: Kjenner jeg ekstra godt etter så tror jeg at jeg føler meg litt sånn «hin» [category : ordetfritt]

<Output example 2> 
Question 1: 5, 2, 3, 1, 4

<Input example 3>
[Click History]
The news articles that User #15003 clicked before are as follows:
1. Nå åpner byens nye turløype [category : trondheim]
2. Orkangers keiserinne [category : sortrondelag]
3. Mathilde (3) måtte returnere til St. Olavs Hospital etter en dag på Værnes [category : nordtrondelag]
...

[Questions]
Based on User #15003's preferences, arrange the index numbers of the top five news articles in the sequence that is deemed most suitable for [MASK]

Question 1) User #15003 prefers [MASK] the most among the following five articles: 
1: 19-åring tiltalt for gjengvoldtekt i Tromsø [category : innenriks]
2: Innbrudd hos bedrift i Osloveien [category : trondheim]
3: Som vekter i tolv år har jeg fått et ubehagelig innsyn i hva unge spiser i langfriminuttet [category : ordetfritt]
4: Kvinne døde i påkjørsel på E18 ved Horten [category : innenriks]
5: Treneren etter den norske fiaskoen: - Det er nitrist [category : vintersport]

<Output example 3> 
Question 1: 3, 2, 1, 5, 4
"""

In [3]:
# 0204 acc
# NOTE(review): "0204" is presumably a date tag for the accuracy experiment — confirm.
# Zero-shot system prompt for the [News of Interest to the user] setting.
# Within this cell this value is replaced by the later one-shot assignment to instruction_negative.
instruction_negative = """
You are a bot designed to identify users' news interests based on their [News of Interest to the user] and predict the index number of news items in [Questions] that best fit the position labeled [MASK].

Each news article contains only a title and category written in Norwegian.

There can be multiple lists in [News of Interest to the user], each with 5 news items.
Among the 5 news in each list, there is one news that the user is most interested in.

[Questions] can have multiple questions, each of which must be answered.
"""



# Zero-shot system prompt for the [Click History] setting (no examples, no output format).
instruction_positive = """
You are a bot designed to identify users' news interests based on their [Click History] and predict the index number of news items in [Questions] that best fit the position labeled [MASK].

Each news article contains only a title and category written in Norwegian.

[Questions] can have multiple questions, each of which must be answered.
"""


# One-shot system prompt for the [News of Interest to the user] setting.
# After the instructions it embeds one worked example: a <Q> section with seven
# five-candidate preference lists and one question, followed by an <A> section
# holding the expected answer ("Question 1 : 2").
# This is the last assignment to instruction_negative in this cell.
instruction_negative = """
You are a bot designed to identify users' news interests based on their [News of Interest to the user] and predict the index number of news items in [Questions] that best fit the position labeled [MASK].

Each news article contains only a title and category written in Norwegian.

There can be multiple lists in [News of Interest to the user], each with 5 news items.
Among the 5 news in each list, there is one news that the user is most interested in.

[Questions] can have multiple questions, each of which must be answered.

<Q>
[News of Interest to the user]

1) User #10094 prefers most To biler involvert i trafikkulykke ved Støren [category : sortrondelag] among the following 5 articles:
1: To biler involvert i trafikkulykke ved Støren [category : sortrondelag]
2: Sunday Times: Trump vil møte Putin på Island [category : utenriks]
3: Norge vurderer å endre straffestrategi etter bom-bonanza [category : ballsport]
4: Katrine snakker mest med unger og fulle folk [category : magasin]
5: Iraner fikk ett års fengsel etter handletur til Sverige [category : innenriks]

2) User #10094 prefers most «Northug gjør narr av skiledelsen» [category : vintersport] among the following 5 articles:
1: Jensen fikk en halv million av politiet – etter pågripelsen [category : innenriks]
2: Sverresborg er best i klassen [category : nyheter]
3: Jan Guldahl på plass for å hylle Donald Trump [category : utenriks]
4: Dersom du er syk og trenger behandling bør du gå til legen, ikke Snåsamannen [category : ordetfritt]
5: «Northug gjør narr av skiledelsen» [category : vintersport]

3) User #10094 prefers most Toget kjørte på hund [category : nordtrondelag] among the following 5 articles:
1: Jensen fikk en halv million av politiet – etter pågripelsen [category : innenriks]
2: Sverresborg er best i klassen [category : nyheter]
3: Jan Guldahl på plass for å hylle Donald Trump [category : utenriks]
4: Dersom du er syk og trenger behandling bør du gå til legen, ikke Snåsamannen [category : ordetfritt]
5: Toget kjørte på hund [category : nordtrondelag]

4) User #10094 prefers most Jørn kom i stor fart ned skiløypa - så gikk sporet ut på strødd vei [category : trondheim] among the following 5 articles:
1: Tidligere topputøver hevder idrettsstjerner vil boikotte Trump [category : idrettspolitikk]
2: Norge til kvartfinale etter fantastisk omgang: - Ingenting er umulig for denne gjengen [category : ballsport]
3: Jørn kom i stor fart ned skiløypa - så gikk sporet ut på strødd vei [category : trondheim]
4: Over 90 personer pågrepet i Washington [category : utenriks]
5: Politiet fant store mengder partydop nedgravd i Trondheim [category : trondheim]

5) User #10094 prefers most Northug langt bak i sitt første renn på nesten to måneder [category : vintersport] among the following 5 articles:
1: Mener nivået på ishockey i Bergen må heves betraktelig for å fortjene ny arena [category : idrettspolitikk]
2: Northug langt bak i sitt første renn på nesten to måneder [category : vintersport]
3: Nå må vi slutte å bable om trege trøndere [category : meninger]
4: Frivillig overvåking kan gi billigere forsikring [category : innenriks]
5: Knallsterkt comeback av Byåsen i Europacupen [category : ballsport]

6) User #10094 prefers most Mann i 60-årene omkom i ulykke i Overhalla [category : nordtrondelag] among the following 5 articles:
1: Mann i 60-årene omkom i ulykke i Overhalla [category : nordtrondelag]
2: Dette kan gjøres for å unngå overvekt [category : sprek]
3: TV 2 relanserer tippekampen: Sender én Premier League-kamp gratis hver runde [category : fotball]
4: - USA vil oppleve at proteksjonisme har en pris [category : okonomi]
5: Alle de døde funnet etter snøras i Italia [category : utenriks]

7) User #10094 prefers most Boblebad med denne jenta snudde Northugs humør [category : vintersport] among the following 5 articles:
1: Fredriksen og Røkke skaper ny offshore-gigant [category : innenriks]
2: Gro våknet en morgen og var supersvimmel - da ble hun livredd [category : nyheter]
3: Brann-spillerens Facebook-innlegg varmer fansen [category : fotball]
4: Boblebad med denne jenta snudde Northugs humør [category : vintersport]
5: Heltent Jurisic stengte buret til FyllingenBergen [category : ballsport]

[Questions]
Based on User #10094's preferences, predict the index number of the news article that best fits the position labeled [MASK] for each question.

Question 1) User #10094 prefers most [MASK] among the following 5 articles:
1: Flere breiflabber skylles i land i Trøndelag [category : sortrondelag]
2: Veltet vogntog sperret E6 i Grong [category : nordtrondelag]
3: Stol aldri på en trønder - i alle fall ikke når det kommer til mat [category : meninger]
4: Mann skadd i fallulykke i Trondheim sentrum [category : trondheim]
5: Beslagla tyvegods i kjellerbod etter innbrudd på Munkvoll [category : trondheim]

Please provide just the answers to each of User #10094's question without any explanations.

<A>
Question 1 : 2

"""

In [5]:
# Notebook display of the current instruction_negative value.
# NOTE(review): the recorded output below mentions "2 news items", which matches none of the
# instruction_negative definitions visible in this file — presumably a stale output; confirm.
instruction_negative

"\nYou are a bot designed to identify users' news interests based on their [News of Interest to the user] and predict the index number of news items in [Questions] that best fit the position labeled [MASK].\n\nEach news article contains only a title and category written in Norwegian.\n\nThere can be multiple lists in [News of Interest to the user], each with 2 news items.\nAmong the 2 news in each list, there is one news that the user is most interested in.\n\n[Questions] can have multiple questions, each of which must be answered.\n"

## inference 함수 정의

In [4]:
def inference(purpose, target_folder, result_file_name, gpt_model, user_list, max_attempts):
    """Send each user's prompt file to the chat model and save the raw replies.

    For every user ID in ``user_list`` the file
    ``../../prompts/<target_folder>/<purpose>/U<id>.txt`` is sent as the user
    message, together with the system instruction selected by ``purpose``.
    Each reply is written to ``../../results/gpt_result/<result_file_name>``
    under a ``[U<id>]`` header. Users whose prompt file is missing, or who do
    not appear in the metadata file, are reported and skipped.

    Args:
        purpose: 'with_negative' (uses ``instruction_negative``) or
            'only_positive' (uses ``instruction_positive``). Also the name of
            the sub-folder that holds the prompt files.
        target_folder: Prompt folder name under ``../../prompts``.
        result_file_name: Output file name under ``../../results/gpt_result``.
        gpt_model: Model name passed to the chat completions API.
        user_list: Sequence of user IDs. They are looked up against integer
            keys parsed from the metadata file, so they should be ints.
        max_attempts: Maximum number of API calls per user. A failed call is
            retried until this many attempts have been made.

    Raises:
        ValueError: If ``purpose`` is not one of the two supported values.
    """
    # Select the system instruction; fail early on an unknown purpose instead
    # of hitting an unbound local variable later on.
    if purpose == 'with_negative':
        instruction = instruction_negative
    elif purpose == 'only_positive':
        instruction = instruction_positive
    else:
        raise ValueError(
            f"Unsupported purpose {purpose!r}: expected 'with_negative' or 'only_positive'"
        )

    # Locations of the user prompts and of the metadata file.
    target_folder = f'../../prompts/{target_folder}'
    directory = f'{target_folder}/{purpose}'
    meta_file_path = f'{directory}/metadata/output_metadata.txt'

    # Read the metadata file and store the number of questions per user.
    user_question_counts = {}
    with open(meta_file_path, 'r', encoding='utf-8') as meta_file:
        for line in meta_file:
            match = re.match(r'User ID:\s*U(\d+).*Question 수:\s*(\d+)', line)
            if match:
                user_question_counts[int(match.group(1))] = int(match.group(2))

    # Run the experiment.
    with open(f'../../results/gpt_result/{result_file_name}', 'w', encoding='utf-8') as result_file:
        for cnt, i in enumerate(user_list):
            filepath = os.path.join(directory, f'U{i}.txt')

            # Skip users without a prompt file.
            if not os.path.isfile(filepath):
                print(f'파일 {filepath} 이 존재하지 않습니다.')
                continue

            with open(filepath, 'r', encoding='utf-8') as f:
                contents = f.read()

            # Skip users that have no question count in the metadata.
            if i not in user_question_counts:
                print(f"사용자 U{i}의 질문 수를 찾을 수 없습니다.")
                continue

            messages = [
                {"role": "system", "content": instruction},
                {"role": "user", "content": contents},
            ]

            attempt = 0
            while attempt < max_attempts:
                attempt += 1
                try:
                    response = client.chat.completions.create(
                        model=gpt_model,
                        messages=messages
                    )
                except Exception as e:
                    # API boundary: report the failure and retry while
                    # attempts remain; afterwards move on to the next user.
                    print(f"API 호출 중 오류 발생 (사용자 {i}): {e}")
                    continue

                # The message content can be None; treat that as an empty reply.
                response_text = (response.choices[0].message.content or '').strip()

                result_file.write(f'[U{i}]\n')
                result_file.write(response_text + '\n\n')
                # Progress report after every 20th user in the list.
                if (cnt+1) % 20 == 0:
                    print(f'☆ {purpose} U{i} 까지 완료 [{cnt+1}/{len(user_list)}] ☆')
                break  # success: stop retrying for this user

        print(f'{purpose} 완료 : {result_file_name}\n')


In [6]:
# Full candidate pool: user IDs 1..user_range inclusive.
user_range = 1000
users = list(range(1, user_range + 1))

# users =  [15168, 15473]

# Hand-picked subset used for this run; it replaces the full range above.
users = [111, 132, 235, 363, 564, 736, 836, 876, 894, 951]

# Alternative only_positive run, currently disabled:
# inference(purpose='only_positive', 
#           target_folder='[top1] test_ns4',
#           result_file_name='[250310] positive_ns4_fine(40,15)_across(negative_model)2.txt',
#           gpt_model='ft:gpt-4o-mini-2024-07-18:personal:across-users-ns4-40-15-negative-random:B9ROFM5c:ckpt-step-80', 
#           user_list=users, 
#           max_attempts=1
#           )

# Run
inference(
    purpose='with_negative',
    target_folder='[top1] test_ns4 (train ns4)',
    result_file_name='[250313] negative_ns4_(one_shot).txt',
    gpt_model='gpt-4o-mini',
    user_list=users,
    max_attempts=1,
)

with_negative 완료 : [250313] negative_ns4_(one_shot).txt

