In [23]:
%%capture
!pip install openai groq

In [29]:
import time
import tqdm
import pandas as pd
from groq import Groq
import json
import os
import random

In [1]:
from google.colab import drive

drive.mount("/content/drive")

Mounted at /content/drive


In [4]:
from google.colab import userdata

In [5]:
with open("/content/drive/MyDrive/data/chunked_data.json", "rt") as f:
    data = json.load(f)

In [6]:
len(data)

16397

In [7]:
sample_index_tuple = tuple(random.randint(0, 16396) for _ in range(750))
sample_index_tuple[:5]

(5956, 1674, 9225, 6534, 11217)

In [8]:
data[sample_index_tuple[-1]]

{'id': 698,
 'title': '#698: Dr. Mark Plotkin on Coffee, the World’s Favorite Stimulant — Chemistry, History, and More',
 'url': 'https://tim.blog/2023/10/16/story-of-coffee-transcript/',
 'chunk_id': '698_6',
 'chunk': 'in the year 1804. With the loss of Santo Domingo as a major coffee exporter, other Central and South American countries began planting coffee. Ever since, coffee has played a major role in tropical American countries, contributing to economic development, employment, export revenues, and other benefits. However, the flip side of the story is a negative one. Establishment and expansion of coffee plantations has usually been at the expense of tropical rainforest, and the economic yields have typically been concentrated at the very top of the economic pyramid, as is often the case in the capitalist system, the rich got richer and the poor got a lot poorer. A highly recommended account of this is detailed in the book Coffeeland by Augustine Sedgewick. The author details ho

In [9]:
prompt_template = """
You emulate a user of our podcast transcript app.
Formulate 5 questions this user might ask based on a provided transcript excerpt.
Make the questions specific to the excerpt, especially the excerpt text.
The excerpt text should contain the answer to the questions, and the questions should be complete and not too short. Use as fewer words as possible from the excerpt text.

The excerpt:

id: {id}
title: {title}
part: {chunk_id}
text: {chunk}

Provide the output in parsable JSON without using code blocks. Remember to put each question between "" and separate with `,`:

{{"questions": ["question1", "question2", ..., "question5"]}}
""".strip()

In [10]:
prompt = prompt_template.format(**data[0])
prompt

'You emulate a user of our podcast transcript app.\nFormulate 5 questions this user might ask based on a provided transcript excerpt.\nMake the questions specific to the excerpt, especially the excerpt text.\nThe excerpt text should contain the answer to the questions, and the questions should be complete and not too short. Use as fewer words as possible from the excerpt text.\n\nThe excerpt:\n\nid: 1\ntitle: #1: Kevin Rose\npart: 1_0\ntext: The Tim Ferriss Show Transcripts Episode 1: Kevin Rose Show notes and links at tim.blog/ podcast Tim Ferriss: This is Episode 1 of the Tim Ferriss podcast. For those of you who don’t have any context on me, I’m the author of The 4-Hour Workweek, The 4-Hour Body, and The 4-Hour Chef, which have been translated into more than 35 languages, and all three books are comprised of self-experiments. I travel the world and find people who are world-class in different skills, and then I try to dissect that skill and test it on myself. That can range from per

In [25]:
client = Groq(
    api_key=userdata.get("GROQ_API_TOKEN"),
)

In [27]:
def llm(prompt):
    return (
        client.chat.completions.create(
            model="llama-3.1-8b-instant",
            messages=[{"role": "user", "content": prompt}],
        )
        .choices[0]
        .message.content
    )

In [17]:
questions = llm(prompt)

In [18]:
json.loads(questions)

{'questions': ['What percentage of results are achieved through 20% of tactics, philosophies, or principles used by world-class individuals?',
  'What are the critical few things versus the trivial many that Tim Ferriss aims to suss out through his podcast?',
  "How will the podcast differ from Tim Ferriss' previous project, The Random Show?",
  'What is the advantage of the high bit rate audio quality in this podcast?',
  'What inspired Tim Ferriss to create this podcast and invite Kevin Rose as his first guest?']}

In [19]:
def generate_questions(doc):
    prompt = prompt_template.format(**doc)

    json_response = llm(prompt)
    return json_response

In [None]:
results = {}

In [32]:
for doc_id in tqdm.tqdm(sample_index_tuple):
    if doc_id in results:
        continue

    questions_raw = generate_questions(data[doc_id])
    try:
        questions = json.loads(questions_raw)
        results[doc_id] = questions["questions"]
    except Exception as e:
        pass

100%|██████████| 750/750 [30:31<00:00,  2.44s/it]


In [33]:
final_results = []

for doc_id, questions in tqdm.tqdm(results.items()):
    for q in questions:
        final_results.append((doc_id, data[doc_id]["chunk_id"], q))

100%|██████████| 673/673 [00:00<00:00, 109664.59it/s]


In [34]:
df_results = pd.DataFrame(final_results, columns=["id", "chunk_id", "question"])

In [35]:
df_results.to_csv("/content/drive/MyDrive/data/ground-truth-retrieval.csv", index=False)

In [36]:
!head /content/drive/MyDrive/data/ground-truth-retrieval.csv

id,chunk_id,question
5956,290_20,What are the exact resolutions that Gretchen Rubin is referring to in relation to her daily habits?
5956,290_20,What is the name of Gretchen Rubin's book that 'did not find its audience'?
5956,290_20,What feeling of helplessness did Gretchen Rubin experience as a result of her book being a commercial failure?
5956,290_20,What specific tools was Gretchen Rubin only able to access after her book failed?
5956,290_20,What is the name of the experiment that Gretchen Rubin was testing in her book The Happiness Project?
1674,86_18,W
1674,86_18,h
1674,86_18,a
1674,86_18,t


In [42]:
wrong_format_df = df_results.loc[df_results["question"].str.len() < 5]
wrong_format_df

Unnamed: 0,id,chunk_id,question
5,1674,86_18,W
6,1674,86_18,h
7,1674,86_18,a
8,1674,86_18,t
9,1674,86_18,
...,...,...,...
2373,12661,591_10,h
2374,12661,591_10,i
2375,12661,591_10,n
2376,12661,591_10,e


In [51]:
correct_format_df = df_results.loc[~(df_results["question"].str.len() < 5)]
correct_format_df

Unnamed: 0,id,chunk_id,question
0,5956,290_20,What are the exact resolutions that Gretchen R...
1,5956,290_20,What is the name of Gretchen Rubin's book that...
2,5956,290_20,What feeling of helplessness did Gretchen Rubi...
3,5956,290_20,What specific tools was Gretchen Rubin only ab...
4,5956,290_20,What is the name of the experiment that Gretch...
...,...,...,...
4425,15357,698_6,Which Central and South American country repla...
4426,15357,698_6,"According to Pendergrast's book, what percenta..."
4427,15357,698_6,What group of people experienced financial har...
4428,15357,698_6,"Which country, despite favorable geography for..."


In [44]:
df_concat = (
    wrong_format_df.groupby(["id", "chunk_id"])["question"]
    .apply(lambda x: "".join(x))
    .reset_index()
)

Unnamed: 0,id,chunk_id,question
0,1674,86_18,What was unique about the tunnels Stanley McCh...
1,12661,591_10,What specific elements contributed to Tom More...


In [46]:
df_concat["question_split"] = df_concat["question"].apply(
    lambda x: [q.strip() for q in x.split("?") if q]
)
df_exploded = df_concat.explode("question_split").reset_index(drop=True)
df_exploded["question_split"] = df_exploded["question_split"] + "?"
df_final = df_exploded[["id", "chunk_id", "question_split"]].rename(
    columns={"question_split": "question"}
)
df_final["question"] = df_final["question"].apply(
    lambda x: x[2:] if x.startswith(", ") else x
)

In [50]:
df_final

Unnamed: 0,id,chunk_id,question
0,1674,86_18,What was unique about the tunnels Stanley McCh...
1,1674,86_18,How deep were Stanley McChrystal and his team ...
2,1674,86_18,What triggered a stand down by the coalition p...
3,1674,86_18,In what way did General McChrystal's participa...
4,1674,86_18,How much time did the coalition partner's heli...
5,12661,591_10,What specific elements contributed to Tom More...
6,12661,591_10,What was unique about The Nightwatchman perfor...
7,12661,591_10,What was the cause of Tom Morello's anxiety th...
8,12661,591_10,How does Tom Morello think the audience would ...
9,12661,591_10,What was the initial reason that led to Zack q...


In [52]:
final_df = pd.concat([correct_format_df, df_final], ignore_index=True)
final_df

Unnamed: 0,id,chunk_id,question
0,5956,290_20,What are the exact resolutions that Gretchen R...
1,5956,290_20,What is the name of Gretchen Rubin's book that...
2,5956,290_20,What feeling of helplessness did Gretchen Rubi...
3,5956,290_20,What specific tools was Gretchen Rubin only ab...
4,5956,290_20,What is the name of the experiment that Gretch...
...,...,...,...
3357,12661,591_10,What specific elements contributed to Tom More...
3358,12661,591_10,What was unique about The Nightwatchman perfor...
3359,12661,591_10,What was the cause of Tom Morello's anxiety th...
3360,12661,591_10,How does Tom Morello think the audience would ...


In [77]:
count_df = final_df.groupby(["id", "chunk_id"]).count()
count_df[count_df["question"] != 5]

Unnamed: 0_level_0,Unnamed: 1_level_0,question
id,chunk_id,Unnamed: 2_level_1
9730,466_25,1
15999,720_9,6


In [78]:
final_df.loc[final_df["id"].isin([9730, 15999])]

Unnamed: 0,id,chunk_id,question
1830,15999,720_9,What specific tasks does Soman Chainani's assi...
1831,15999,720_9,How does Soman Chainani feel when speaking to ...
1832,15999,720_9,What unusual art project does Christopher Marl...
1833,15999,720_9,How did Soman Chainani discover and become int...
1834,15999,720_9,What unique skill does Christopher Marley have...
1835,15999,720_9,How did Soman Chainani learn about Christopher...
2601,9730,466_25,What were the key findings of the book that st...


In [79]:
new_rows = (
    final_df[final_df["id"].isin([9730])]["question"]
    .str.split("?")
    .explode()
    .apply(lambda x: x[3:] if x.startswith('","') else x)
    .iloc[:-1]
)

# Replace the old row with the new rows
final_df = final_df[~final_df["id"].isin([9730])]  # Remove the old row
new_df = pd.DataFrame(
    {
        "id": [9730] * len(new_rows),
        "chunk_id": [466_25] * len(new_rows),
        "question": new_rows,
    }
)

# Append the new rows to the final DataFrame
final_df = pd.concat([final_df, new_df], ignore_index=True)

In [82]:
final_df

Unnamed: 0,id,chunk_id,question
0,5956,290_20,What are the exact resolutions that Gretchen R...
1,5956,290_20,What is the name of Gretchen Rubin's book that...
2,5956,290_20,What feeling of helplessness did Gretchen Rubi...
3,5956,290_20,What specific tools was Gretchen Rubin only ab...
4,5956,290_20,What is the name of the experiment that Gretch...
...,...,...,...
3361,9730,46625,What were the key findings of the book that st...
3362,9730,46625,Could Richard Koch have universally explained ...
3363,9730,46625,"Why was Malcolm Gladwell's thesis about 10,000..."
3364,9730,46625,"How many people out of 20, which Richard Koch ..."


In [83]:
final_df.to_csv("/content/drive/MyDrive/data/ground-truth-retrieval.csv", index=False)

In [84]:
!head /content/drive/MyDrive/data/ground-truth-retrieval.csv

id,chunk_id,question
5956,290_20,What are the exact resolutions that Gretchen Rubin is referring to in relation to her daily habits?
5956,290_20,What is the name of Gretchen Rubin's book that 'did not find its audience'?
5956,290_20,What feeling of helplessness did Gretchen Rubin experience as a result of her book being a commercial failure?
5956,290_20,What specific tools was Gretchen Rubin only able to access after her book failed?
5956,290_20,What is the name of the experiment that Gretchen Rubin was testing in her book The Happiness Project?
9225,434_17,What are the blocks to closeness that Jim Dethmer and Debbie identified in their relationship as per Jim Dethmer?
9225,434_17,Does Jim Dethmer recommend that all couples want to be close in their relationship?
9225,434_17,"Before living from clear yeses and clear nos from freedom and personal responsibility, what needs to be addressed first according to Jim Dethmer?"
9225,434_17,How did Jim Dethmer and Debbie describe their ideal re