In [1]:
import os
import json
import pandas as pd

In [2]:
# Load the raw answers dump; the file is semicolon-delimited.
answers_csv = 'reddit_answers_big.csv'
df_answers = pd.read_csv(answers_csv, sep=';')

# Preview the first three rows.
df_answers.head(n=3)

Unnamed: 0.1,Unnamed: 0,q_id,text,votes
0,0,hvbvpz,Two pet ducks. You may be tempted to go for on...,2359.0
1,1,hvbvpz,Nice try Jeff Bezos,764.0
2,2,hvbvpz,A curved shower rod. Seriously. $10 for a tens...,1525.0


In [3]:
# Grabbing the top answers: for every question (q_id) keep the single answer
# with the highest vote count.
# Rows whose vote count is missing are excluded before looking up the
# per-group maximum, so a question whose answers all lack a vote count is left
# out instead of producing a NaN label that .loc cannot resolve.
df_top_votes = (
    df_answers.dropna(subset=['votes'])
    .groupby('q_id')['votes']
    .idxmax()
)

# Rename in one non-mutating call: 'q_id' becomes the join key 'id', and the
# generic 'text'/'votes' columns get answer-specific names.
df_top_answers = df_answers.loc[df_top_votes].rename(
    columns={'text': 'answer', 'q_id': 'id', 'votes': 'answer_votes'}
)

df_top_answers.head(3)

Unnamed: 0.1,Unnamed: 0,id,answer,answer_votes
1817014,1875645,1001ag,Tell him to go to a hospital. I can't stress t...,30.0
1591462,1643710,10029x,NOTE: Detail may not sum to totals because of ...,3.0
96052,99426,1004g5,Blow Me Away by Breaking Benjamin http://www....,7.0


In [4]:
# Load the questions table (semicolon-delimited) and give its generic
# columns question-specific names in a single rename.
df_questions = pd.read_csv('reddit_questions.csv', sep=';')
df_questions = df_questions.rename(
    columns={'text': 'question', 'votes': 'question_votes'}
)

df_questions.head(3)

Unnamed: 0,id,question,question_votes,timestamp,datetime
0,izucgz,What's the purpose of life?,8,1601076000.0,Fri Sep 25 23:13:31 2020 UTC
1,9c784/,"I've tried to quit smoking, this is my seventh...",11,1250712000.0,Wed Aug 19 19:58:54 2009 UTC
2,iylxwl,"For those who have a slave master last name, w...",0,1600904000.0,Wed Sep 23 23:35:15 2020 UTC


In [5]:
# Inner-join each question with its top answer on the shared 'id' column
merged_df = pd.merge(df_questions, df_top_answers, how='inner', on='id')

merged_df.head(3)

Unnamed: 0.1,id,question,question_votes,timestamp,datetime,Unnamed: 0,answer,answer_votes
0,izucgz,What's the purpose of life?,8,1601076000.0,Fri Sep 25 23:13:31 2020 UTC,1254710,Breed and die.,5.0
1,9c784/,"I've tried to quit smoking, this is my seventh...",11,1250712000.0,Wed Aug 19 19:58:54 2009 UTC,4217572,The secret to quitting smoking is to tell your...,4.0
2,iylxwl,"For those who have a slave master last name, w...",0,1600904000.0,Wed Sep 23 23:35:15 2020 UTC,5464942,No. My last name sounds badass.,4.0


In [6]:
# Clean this data up a bit: keep only the columns needed downstream, in
# display order. Selecting by name (rather than dropping columns in place)
# keeps this cell re-runnable, and a missing column raises a KeyError instead
# of being silently created as an all-NaN column.
merged_df = merged_df.loc[
    :, ['id', 'question', 'answer', 'question_votes', 'answer_votes']
]

merged_df.head(3)

Unnamed: 0,id,question,answer,question_votes,answer_votes
0,izucgz,What's the purpose of life?,Breed and die.,8,5.0
1,9c784/,"I've tried to quit smoking, this is my seventh...",The secret to quitting smoking is to tell your...,11,4.0
2,iylxwl,"For those who have a slave master last name, w...",No. My last name sounds badass.,0,4.0


In [7]:
# Summarise the merged frame: column dtypes, non-null counts and memory usage.
merged_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 181311 entries, 0 to 181310
Data columns (total 5 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   id              181311 non-null  object 
 1   question        181311 non-null  object 
 2   answer          181311 non-null  object 
 3   question_votes  181311 non-null  int64  
 4   answer_votes    181311 non-null  float64
dtypes: float64(1), int64(1), object(3)
memory usage: 8.3+ MB


In [8]:
# Downsizing the data: rank rows by answer votes (highest first) and keep
# the first 1000.
merged_df = merged_df.sort_values('answer_votes', ascending=False)
merged_df_1k = merged_df.head(1000)
merged_df_1k.head(3)

Unnamed: 0,id,question,answer,question_votes,answer_votes
75274,fkzaca,What is something that has aged well?,The word cool,66093,99398.0
167081,a0a4cd,What's the most amazing thing about the universe?,"It must be true that either It didn't exist, ...",81862,86042.0
140939,d0jjc2,The 2010's decade will be over in 4 months. Wh...,The social media explosion,113254,85936.0


In [9]:
# Fine-tuning format: pull out the question and answer text columns
questions = merged_df_1k['question']
answers = merged_df_1k['answer']

In [10]:
# Build one chat-format training example per question/answer pair: a fixed
# system message, the question as the user turn, the answer as the assistant
# turn.
qa_openai_format = []
for q, a in zip(questions, answers):
    conversation = [
        {"role": "system", "content": "Marv is a factual chatbot and reddit expert who likes to answer with bullets."},
        {"role": "user", "content": q},
        {"role": "assistant", "content": a},
    ]
    qa_openai_format.append({"messages": conversation})

# Spot-check a single formatted example.
qa_openai_format[5]

{'messages': [{'role': 'system',
   'content': 'Marv is a factual chatbot and reddit expert who likes to answer with bullets.'},
  {'role': 'user',
   'content': "What is the greatest comeback to a insult you've ever heard?"},
  {'role': 'assistant',
   'content': '"Joe Pyne interviewing Frank Zappa Joe: "I guess your long hair makes you a woman." FZ: "I guess your wooden leg makes you a table." "'}]}

In [11]:
# Write the examples as JSON Lines: one JSON object per line.
# The encoding is stated explicitly so the file is written as UTF-8, the same
# encoding used when it is read back for validation, rather than depending on
# the platform default.
with open("training_data.jsonl", "w", encoding="utf-8") as f:
    for entry in qa_openai_format:
        f.write(json.dumps(entry) + "\n")

In [12]:
from collections import defaultdict

data_path = "training_data.jsonl"

# Load the dataset: one JSON object per line. Blank lines (for example a
# trailing newline at the end of the file) are skipped so json.loads is never
# handed an empty string.
with open(data_path, 'r', encoding='utf-8') as f:
    dataset = [json.loads(line) for line in f if line.strip()]

# Initial dataset stats
print("Num examples:", len(dataset))

# Format error checks: each key names a kind of problem, each value counts how
# many times it was seen across the dataset.
format_errors = defaultdict(int)

for ex in dataset:
    # Every example must be a JSON object.
    if not isinstance(ex, dict):
        format_errors["data_type"] += 1
        continue

    # Every example needs a non-empty "messages" entry.
    messages = ex.get("messages", None)
    if not messages:
        format_errors["missing_messages_list"] += 1
        continue

    for message in messages:
        # Both "role" and "content" are required on every message.
        if "role" not in message or "content" not in message:
            format_errors["message_missing_key"] += 1

        # Only these four keys are accepted on a message.
        if any(k not in ("role", "content", "name", "function_call") for k in message):
            format_errors["message_unrecognized_key"] += 1

        # The role must be one of the four known values.
        if message.get("role", None) not in ("system", "user", "assistant", "function"):
            format_errors["unrecognized_role"] += 1

        content = message.get("content", None)
        function_call = message.get("function_call", None)

        # Flagged when both content and function_call are empty, or when
        # content is not a string at all.
        if (not content and not function_call) or not isinstance(content, str):
            format_errors["missing_content"] += 1

    # Each example must contain at least one assistant message.
    if not any(message.get("role", None) == "assistant" for message in messages):
        format_errors["example_missing_assistant_message"] += 1

# Report the per-category counts, or confirm that nothing was flagged.
if format_errors:
    print("Found errors:")
    for k, v in format_errors.items():
        print(f"{k}: {v}")
else:
    print("No errors found")

Num examples: 1000
No errors found


In [None]:
# Importing OpenAI
import openai
from openai import OpenAI

# Explicit API-key setup, left disabled. Do not commit a real key here.
# os.environ["OPENAI_API_KEY"] = "API KEY HERE"
# openai.api_key=os.getenv('OPENAI_API_KEY')

# The client is constructed with no arguments — presumably it picks up
# credentials from the environment (e.g. OPENAI_API_KEY); confirm against the
# openai library documentation.
client = OpenAI()

In [None]:
# Launch a fine-tuning job on the base chat model.
# NOTE(review): the training file ID is hard-coded; no upload step for it
# appears in the cells shown here.
fine_tune_request = {
    "training_file": 'file-UVutpxsPXJXlBkGu6xXDZoVe',
    "model": "gpt-3.5-turbo",
}
client.fine_tuning.jobs.create(**fine_tune_request)

In [None]:
# Display the fine-tuning jobs returned for this client.
client.fine_tuning.jobs.list()

In [None]:
# Prompts used to query the fine-tuned model.
# The system prompt is kept character-for-character identical to the system
# message written into every training example (including the trailing
# period), so inference uses the same system message the model was tuned on.
system_prompt = "Marv is a factual chatbot and reddit expert who likes to answer with bullets."
user_question = "Give me the dumbest thing you've ever done."

In [None]:
# Query the fine-tuned model with the system prompt and the user question,
# then print the text of the first returned choice.
chat_messages = [
    {'role': 'system', "content": system_prompt},
    {'role': 'user', "content": user_question},
]
response = client.chat.completions.create(
    model="ft:gpt-3.5-turbo-0613:personal::8WsMPAem",
    messages=chat_messages,
)

first_choice = response.choices[0]
print(first_choice.message.content)