In [1]:
from ChatPodcastGPT import *
import nltk
import re
import collections
from PyPDF2 import PdfReader
from bs4 import BeautifulSoup
import concurrent.futures

In [2]:
class ArxivEpisode(Episode):

    ArxivPart = collections.namedtuple('ArxivPart', 'title text')

    def __init__(self, arxiv_id, model='gpt-3.5-turbo', **kwargs):
        self.arxiv_id = arxiv_id
        self.model = model
        self.arxiv_data = self.process_pdf(self.arxiv_id)
        self.arxiv_title = self.get_title(self.arxiv_id)
        self._kwargs = kwargs
        super().__init__(self, topic=self.arxiv_title, **kwargs)

    def parse_pdf(self, file):
        with open(file, "rb") as f:
            pdf = PdfReader(f)
            text = ""
            for page in range(len(pdf.pages)):
                text += pdf.pages[page].extract_text()
        return text

    def split_into_parts(self, text, max_tokens=4096//2):
        # Split the text into lines
        lines = text.split("\n")

        # Group lines into parts based on whether they look like a title
        parts = []
        current_part = []
        current_title = 'Abstract'
        for line in lines:
            # If the line starts with a number followed by a space and then text, start a new part
            if re.match(r'\d+\s[A-Za-z]', line):
                # Save the current part if it's not empty
                if current_part:
                    parts.append(self.ArxivPart(current_title, "\n".join(current_part)))
                current_title = line
                current_part = []
            else:
                current_part.append(line)
            while Chat.num_tokens_from_text('\n'.join(current_part)) > max_tokens:
                text = '\n'.join(current_part)
                shortened_part, current_part = text[:max_tokens*2], [text[max_tokens*2:]]
                parts.append(self.ArxivPart(current_title, shortened_part))

        # Save the last part if it's not empty
        if current_part:
            parts.append(self.ArxivPart(current_title, "\n".join(current_part)))
        return parts

    def process_pdf(self, arxiv_id):
        with tempfile.TemporaryDirectory() as tmpdir:
            file = os.path.join(tmpdir, "file.pdf")
            self.arxiv_download(arxiv_id, file)
            text = self.parse_pdf(file)
        parts = self.split_into_parts(text)
        return parts
    
    def arxiv_download(self, arxiv_id, out_file):
        url = f"https://arxiv.org/pdf/{arxiv_id}.pdf"
        response = requests.get(url)
        with open(out_file, "wb") as f:
            f.write(response.content)
    
    def get_title(self, arxiv_id):
        url = f"https://arxiv.org/abs/{arxiv_id}"
        response = requests.get(url)
        soup = BeautifulSoup(response.text, 'html.parser')
        title = soup.find('h1', {'class': 'title mathjax'}).text.strip().split('\n')[-1].strip()
        return title

    def write_one_part(self, chat_msg):
        chat = PodcastChat(topic=self.arxiv_title, **self._kwargs)
        msg, aud = chat.step(msg=chat_msg, model=self.model)
        return msg, aud
    
    def step(self):
        include = f" Remember to respond with the hosts names like {self.chat._hosts[0]}: and {self.chat._hosts[1]}:"
        outline = self.arxiv_data[0].text
        logger.info(f"Outline: {outline}")
        intro_msg = f"Write the intro for a podcast about a paper: {self.arxiv_title}. The abstract for the paper is {outline}. Only write the introduction.{include}"

        # Get parts
        with concurrent.futures.ThreadPoolExecutor(max_workers=16) as tpe:
            jobs = [tpe.submit(self.write_one_part, intro_msg)]
            jobs.extend([
                tpe.submit(self.write_one_part, f"Rewrite the text from the paper {self.arxiv_title} part {part.title} into a podcast section. Explain everything other than the title as if the listener has no idea. Do not include any intro such as saying welcome back, just get right to it. The text in the paper is: {part.text}.{include}")
                for part in self.arxiv_data
            ])
            job2idx = {j:i for i, j in enumerate(jobs)}
            self.sounds = [None] * len(jobs)
            self.texts  = [None] * len(jobs)
            for i, job in enumerate(concurrent.futures.as_completed(jobs)):
                logger.info(f"Part: {i} / {len(jobs)} = {100.0*i/len(jobs):,.5f}%")
                jobid = job2idx[job]
                text, sound = job.result()
                self.sounds[jobid], self.texts[jobid] = sound, text
        return outline, '\n'.join(self.texts)

In [3]:
%%time
model='gpt-3.5-turbo'
arxiv_id = '2305.11873'
host_voices=[GttsTTS(GttsTTS.MAN), GttsTTS(GttsTTS.WOMAN)]
a = ArxivEpisode(arxiv_id, model=model, host_voices=host_voices)
outline, txt = a.step()
a.upload('Arxiv Test ' + a.arxiv_title, f'ChatGPT generated podcast using {model=} for https://arxiv.org/abs/{arxiv_id}')

[2m2023-05-26 10:20:22[0m [[32m[1minfo     [0m] [1mOutline: Paweł Niszczota1,*, Paul Conway2 [0m
[2m2023-05-26 10:20:22[0m [[32m[1minfo     [0m] [1mrequesting openai...[0m
[2m2023-05-26 10:20:22[0m [[32m[1minfo     [0m] [1mrequesting openai...[0m
[2m2023-05-26 10:20:22[0m [[32m[1minfo     [0m] [1mrequesting openai...[0m
[2m2023-05-26 10:20:22[0m [[32m[1minfo     [0m] [1mrequesting openai...[0m
[2m2023-05-26 10:20:22[0m [[32m[1minfo     [0m] [1mrequesting openai...[0m
[2m2023-05-26 10:20:22[0m [[32m[1minfo     [0m] [1mrequesting openai...[0m
[2m2023-05-26 10:20:22[0m [[32m[1minfo     [0m] [1mrequesting openai...[0m
[2m2023-05-26 10:20:22[0m [[32m[1minfo     [0m] [1mrequesting openai...[0m
[2m2023-05-26 10:20:22[0m [[32m[1minfo     [0m] [1mrequesting openai...[0m
[2m2023-05-26 10:20:22[0m [[32m[1minfo     [0m] [1mrequesting openai...[0m
[2m2023-05-26 10:21:01[0m [[32m[1minfo     [0m] [1mreceived openai...

[2m2023-05-26 10:21:02[0m [[32m[1minfo     [0m] [1mPart: 0 / 10 = 0.00000%[0m
[2m2023-05-26 10:21:02[0m [[32m[1minfo     [0m] [1mreceived tts i=0[0m
[2m2023-05-26 10:21:03[0m [[32m[1minfo     [0m] [1mreceived tts i=4[0m
[2m2023-05-26 10:21:03[0m [[32m[1minfo     [0m] [1mreceived tts i=1[0m
[2m2023-05-26 10:21:03[0m [[32m[1minfo     [0m] [1mreceived tts i=3[0m
[2m2023-05-26 10:21:03[0m [[32m[1minfo     [0m] [1mreceived tts i=5[0m
[2m2023-05-26 10:21:03[0m [[32m[1minfo     [0m] [1mreceived tts i=6[0m
[2m2023-05-26 10:21:03[0m [[32m[1minfo     [0m] [1mconcatting audio[0m
[2m2023-05-26 10:21:03[0m [[32m[1minfo     [0m] [1mdone with audio![0m


[2m2023-05-26 10:21:03[0m [[32m[1minfo     [0m] [1mPart: 1 / 10 = 10.00000%[0m
[2m2023-05-26 10:21:05[0m [[32m[1minfo     [0m] [1mreceived openai...[0m
[2m2023-05-26 10:21:05[0m [[32m[1minfo     [0m] [1mrequesting tts i=0[0m
[2m2023-05-26 10:21:05[0m [[32m[1minfo     [0m] [1mrequesting tts i=1[0m
[2m2023-05-26 10:21:05[0m [[32m[1minfo     [0m] [1mrequesting tts i=2[0m
[2m2023-05-26 10:21:05[0m [[32m[1minfo     [0m] [1mrequesting tts i=3[0m
[2m2023-05-26 10:21:05[0m [[32m[1minfo     [0m] [1mrequesting tts i=4[0m
[2m2023-05-26 10:21:05[0m [[32m[1minfo     [0m] [1mrequesting tts i=5[0m
[2m2023-05-26 10:21:05[0m [[32m[1minfo     [0m] [1mrequesting tts i=6[0m
[2m2023-05-26 10:21:05[0m [[32m[1minfo     [0m] [1mrequesting tts i=7[0m
[2m2023-05-26 10:21:05[0m [[32m[1minfo     [0m] [1mrequesting tts i=8[0m
[2m2023-05-26 10:21:05[0m [[32m[1minfo     [0m] [1mrequesting tts i=9[0m
[2m2023-05-26 10:21:05[0m [[32m

[2m2023-05-26 10:21:07[0m [[32m[1minfo     [0m] [1mPart: 2 / 10 = 20.00000%[0m
[2m2023-05-26 10:21:07[0m [[32m[1minfo     [0m] [1mreceived tts i=6[0m
[2m2023-05-26 10:21:08[0m [[32m[1minfo     [0m] [1mreceived tts i=0[0m
[2m2023-05-26 10:21:08[0m [[32m[1minfo     [0m] [1mreceived tts i=5[0m
[2m2023-05-26 10:21:08[0m [[32m[1minfo     [0m] [1mreceived tts i=2[0m
[2m2023-05-26 10:21:08[0m [[32m[1minfo     [0m] [1mreceived tts i=3[0m
[2m2023-05-26 10:21:08[0m [[32m[1minfo     [0m] [1mreceived tts i=1[0m
[2m2023-05-26 10:21:08[0m [[32m[1minfo     [0m] [1mreceived tts i=4[0m
[2m2023-05-26 10:21:08[0m [[32m[1minfo     [0m] [1mconcatting audio[0m
[2m2023-05-26 10:21:08[0m [[32m[1minfo     [0m] [1mdone with audio![0m


[2m2023-05-26 10:21:08[0m [[32m[1minfo     [0m] [1mPart: 3 / 10 = 30.00000%[0m
[2m2023-05-26 10:21:09[0m [[32m[1minfo     [0m] [1mreceived openai...[0m
[2m2023-05-26 10:21:09[0m [[32m[1minfo     [0m] [1mrequesting tts i=0[0m
[2m2023-05-26 10:21:09[0m [[32m[1minfo     [0m] [1mrequesting tts i=1[0m
[2m2023-05-26 10:21:09[0m [[32m[1minfo     [0m] [1mrequesting tts i=2[0m
[2m2023-05-26 10:21:09[0m [[32m[1minfo     [0m] [1mrequesting tts i=3[0m
[2m2023-05-26 10:21:09[0m [[32m[1minfo     [0m] [1mrequesting tts i=4[0m
[2m2023-05-26 10:21:09[0m [[32m[1minfo     [0m] [1mrequesting tts i=5[0m
[2m2023-05-26 10:21:09[0m [[32m[1minfo     [0m] [1mrequesting tts i=6[0m
[2m2023-05-26 10:21:09[0m [[32m[1minfo     [0m] [1mrequesting tts i=7[0m
[2m2023-05-26 10:21:09[0m [[32m[1minfo     [0m] [1mrequesting tts i=8[0m
[2m2023-05-26 10:21:09[0m [[32m[1minfo     [0m] [1mrequesting tts i=9[0m
[2m2023-05-26 10:21:09[0m [[32m

[2m2023-05-26 10:21:10[0m [[32m[1minfo     [0m] [1mPart: 4 / 10 = 40.00000%[0m
[2m2023-05-26 10:21:11[0m [[32m[1minfo     [0m] [1mreceived openai...[0m
[2m2023-05-26 10:21:11[0m [[32m[1minfo     [0m] [1mrequesting tts i=0[0m
[2m2023-05-26 10:21:11[0m [[32m[1minfo     [0m] [1mrequesting tts i=1[0m
[2m2023-05-26 10:21:11[0m [[32m[1minfo     [0m] [1mrequesting tts i=2[0m
[2m2023-05-26 10:21:11[0m [[32m[1minfo     [0m] [1mrequesting tts i=3[0m
[2m2023-05-26 10:21:11[0m [[32m[1minfo     [0m] [1mrequesting tts i=4[0m
[2m2023-05-26 10:21:11[0m [[32m[1minfo     [0m] [1mrequesting tts i=5[0m
[2m2023-05-26 10:21:11[0m [[32m[1minfo     [0m] [1mrequesting tts i=6[0m
[2m2023-05-26 10:21:11[0m [[32m[1minfo     [0m] [1mrequesting tts i=7[0m
[2m2023-05-26 10:21:11[0m [[32m[1minfo     [0m] [1mrequesting tts i=8[0m
[2m2023-05-26 10:21:11[0m [[32m[1minfo     [0m] [1mrequesting tts i=9[0m
[2m2023-05-26 10:21:12[0m [[32m

[2m2023-05-26 10:21:13[0m [[32m[1minfo     [0m] [1mPart: 5 / 10 = 50.00000%[0m
[2m2023-05-26 10:21:13[0m [[32m[1minfo     [0m] [1mreceived tts i=7[0m
[2m2023-05-26 10:21:13[0m [[32m[1minfo     [0m] [1mreceived tts i=9[0m
[2m2023-05-26 10:21:13[0m [[32m[1minfo     [0m] [1mreceived tts i=11[0m
[2m2023-05-26 10:21:13[0m [[32m[1minfo     [0m] [1mreceived tts i=2[0m
[2m2023-05-26 10:21:13[0m [[32m[1minfo     [0m] [1mreceived tts i=6[0m
[2m2023-05-26 10:21:13[0m [[32m[1minfo     [0m] [1mreceived tts i=8[0m
[2m2023-05-26 10:21:14[0m [[32m[1minfo     [0m] [1mreceived tts i=4[0m
[2m2023-05-26 10:21:14[0m [[32m[1minfo     [0m] [1mreceived openai...[0m
[2m2023-05-26 10:21:14[0m [[32m[1minfo     [0m] [1mrequesting tts i=0[0m
[2m2023-05-26 10:21:14[0m [[32m[1minfo     [0m] [1mrequesting tts i=1[0m
[2m2023-05-26 10:21:14[0m [[32m[1minfo     [0m] [1mrequesting tts i=2[0m
[2m2023-05-26 10:21:14[0m [[32m[1minfo     

[2m2023-05-26 10:21:16[0m [[32m[1minfo     [0m] [1mPart: 6 / 10 = 60.00000%[0m
[2m2023-05-26 10:21:16[0m [[32m[1minfo     [0m] [1mreceived tts i=10[0m
[2m2023-05-26 10:21:16[0m [[32m[1minfo     [0m] [1mconcatting audio[0m
[2m2023-05-26 10:21:16[0m [[32m[1minfo     [0m] [1mdone with audio![0m


[2m2023-05-26 10:21:16[0m [[32m[1minfo     [0m] [1mPart: 7 / 10 = 70.00000%[0m
[2m2023-05-26 10:21:23[0m [[32m[1minfo     [0m] [1mreceived openai...[0m
[2m2023-05-26 10:21:23[0m [[32m[1minfo     [0m] [1mrequesting tts i=0[0m
[2m2023-05-26 10:21:23[0m [[32m[1minfo     [0m] [1mrequesting tts i=1[0m
[2m2023-05-26 10:21:23[0m [[32m[1minfo     [0m] [1mrequesting tts i=2[0m
[2m2023-05-26 10:21:23[0m [[32m[1minfo     [0m] [1mrequesting tts i=3[0m
[2m2023-05-26 10:21:23[0m [[32m[1minfo     [0m] [1mrequesting tts i=4[0m
[2m2023-05-26 10:21:23[0m [[32m[1minfo     [0m] [1mrequesting tts i=5[0m
[2m2023-05-26 10:21:23[0m [[32m[1minfo     [0m] [1mrequesting tts i=6[0m
[2m2023-05-26 10:21:24[0m [[32m[1minfo     [0m] [1mreceived tts i=6[0m
[2m2023-05-26 10:21:25[0m [[32m[1minfo     [0m] [1mreceived tts i=1[0m
[2m2023-05-26 10:21:25[0m [[32m[1minfo     [0m] [1mreceived tts i=3[0m
[2m2023-05-26 10:21:25[0m [[32m[1min

[2m2023-05-26 10:21:25[0m [[32m[1minfo     [0m] [1mPart: 8 / 10 = 80.00000%[0m
[2m2023-05-26 10:21:48[0m [[32m[1minfo     [0m] [1mreceived openai...[0m
[2m2023-05-26 10:21:48[0m [[32m[1minfo     [0m] [1mrequesting tts i=0[0m
[2m2023-05-26 10:21:48[0m [[32m[1minfo     [0m] [1mrequesting tts i=1[0m
[2m2023-05-26 10:21:48[0m [[32m[1minfo     [0m] [1mrequesting tts i=2[0m
[2m2023-05-26 10:21:48[0m [[32m[1minfo     [0m] [1mrequesting tts i=3[0m
[2m2023-05-26 10:21:48[0m [[32m[1minfo     [0m] [1mrequesting tts i=4[0m
[2m2023-05-26 10:21:48[0m [[32m[1minfo     [0m] [1mrequesting tts i=5[0m
[2m2023-05-26 10:21:48[0m [[32m[1minfo     [0m] [1mrequesting tts i=6[0m
[2m2023-05-26 10:21:48[0m [[32m[1minfo     [0m] [1mrequesting tts i=7[0m
[2m2023-05-26 10:21:48[0m [[32m[1minfo     [0m] [1mrequesting tts i=8[0m
[2m2023-05-26 10:21:48[0m [[32m[1minfo     [0m] [1mreceived tts i=8[0m
[2m2023-05-26 10:21:49[0m [[32m[

[2m2023-05-26 10:21:50[0m [[32m[1minfo     [0m] [1mPart: 9 / 10 = 90.00000%[0m
CPU times: user 5.71 s, sys: 1.45 s, total: 7.16 s
Wall time: 1min 41s


In [4]:
"""TODO:
Fix out of order bugs
Repeat less
"""

'TODO:\nFix out of order bugs\nRepeat less\n'