# The idea is to take a paper (PDF) and convert it into a lecture that explains it intuitively.

### Steps:
1. Download PDF and get text
2. Ask ChatGPT which concepts (in order) must be understood before reading the paper
3. Create video Script
4. Create video images
5. Add audio on top of images
6. Join and done

### What to display in video?
* Ask ChatGPT for the slide text (e.g. in Markdown) for a given paragraph

### Video Outline:
1. Concepts / Building blocks
2. Paper summary
3. Each part of the paper (ask Chat for parts)
4. Conclusion and Implications

In [101]:
from ChatPodcastGPT import Chat, PodcastChat, GttsTTS
import collections
import concurrent.futures
import os
import feedparser
import structlog
import itertools
import enum
import io
import re
import json
import tempfile
import PyPDF2
from bs4 import BeautifulSoup
import requests
import retrying
import openai
import random
import IPython.display
import datetime
import PIL
import PIL.Image
import PIL.ImageDraw
import PIL.ImageFont
import base64
from pydub import AudioSegment
import tempfile
from moviepy.editor import ImageClip, concatenate_videoclips, AudioFileClip
import numpy as np
import io
import subprocess
import os
import functools
import logging
import librosa
import soundfile as sf


# Chat model name passed to `Chat.message` when extracting concepts.
MODEL = 'gpt-3.5-turbo-16k'
# Token budget: passed as `max_length` to `Chat`, and halved for the default chunk size.
MAX_TOKENS = 12_000
# NOTE(review): not referenced anywhere in the visible part of this notebook.
JOIN_NUM_DEFAULT = 300
# Speaker names used in the script prompts and to attribute each script line.
SPEAKER_NAMES = ['Alfred', 'Alice']
# One TTS voice per speaker, index-aligned with SPEAKER_NAMES.
SPEAKER_VOICES = [GttsTTS(GttsTTS.MAN), GttsTTS(GttsTTS.WOMAN)]
# Size of every ThreadPoolExecutor created below.
MAX_WORKERS = 4
# Only let CRITICAL records through from the 'moviepy' logger.
logging.getLogger('moviepy').setLevel(logging.CRITICAL)
logger = structlog.get_logger()
def flatten_list(a):
    """Flatten one level of nesting: an iterable of iterables becomes one list."""
    flat = []
    for inner in a:
        flat.extend(inner)
    return flat

# 1. PDF to Text

In [2]:
def extract_text_from_pdf(pdf_path):
    """Return the text of every page of the PDF at `pdf_path`, joined in page order."""
    with open(pdf_path, 'rb') as pdf_file:
        reader = PyPDF2.PdfReader(pdf_file)
        # Extract while the file is still open; the reader pulls pages from it on demand.
        page_texts = [page.extract_text() for page in reader.pages]
    return ''.join(page_texts)

In [40]:
# paper_path = '/Users/jong/Downloads/Cell_20230725/Decoding-semantic-representations-in-mind-and-brai.pdf'
# paper_text = extract_text_from_pdf(paper_path)
# len(paper_text), paper_text[:1000]

In [4]:
def text_into_token_chunks(text, max_tokens=MAX_TOKENS // 2, smoothing=0):
    """Split `text` into consecutive chunks of whole sentences bounded by a token budget.

    Sentences are found by splitting on '.'. Sentences are accumulated until
    `Chat.num_tokens_from_text` reports more than `max_tokens` for the
    accumulated text; the chunk is then emitted without the sentence that
    caused the overflow.

    Args:
        text: The full text to split.
        max_tokens: Token budget per chunk.
        smoothing: Number of trailing sentences of the previous chunk that are
            repeated at the start of the next chunk (overlap between chunks).

    Returns:
        A list of non-empty chunk strings. Empty input yields an empty list.
    """
    # Line breaks become spaces so words on adjacent lines are not glued together.
    # Empty fragments (e.g. the one after a trailing '.') are dropped up front,
    # so no real sentence has to be discarded at the end.
    sentences = [
        fragment.strip() + '.'
        for fragment in text.replace('\n', ' ').split('.')
        if fragment.strip()
    ]
    all_parts = []
    current_part = []
    for sentence in sentences:
        current_part.append(sentence)
        # A lone sentence is never split off: doing so would emit an empty chunk.
        if len(current_part) > 1 and Chat.num_tokens_from_text(' '.join(current_part)) > max_tokens:
            all_parts.append(' '.join(current_part[:-1]))
            # Keep the overflowing sentence plus `smoothing` sentences of overlap.
            current_part = current_part[-(smoothing + 1):]

    if current_part:
        all_parts.append(' '.join(current_part))
    return all_parts

In [41]:
# paper_sections = text_into_token_chunks(paper_text, smoothing=3)
# len(paper_sections)

# 2. Concepts needed for understanding

In [107]:
@retrying.retry(stop_max_attempt_number=5, wait_fixed=2000)
def get_concepts(paper_section):
    """Ask the chat model for prerequisite concepts of one paper section.

    The reply must parse as a JSON list; a parse failure or a non-list reply
    raises, which makes the `retrying` decorator try again (5 attempts max).

    Returns:
        The parsed JSON list.
    """
    system_prompt = '''Given some text from a scientific journal, return a JSON formatted list containing a few prerequisite concepts needed for understanding the paper.
Only respond as a valid JSON list, and nothing else.'''.replace('\n', ' ')
    chat = Chat(system_prompt, max_length=MAX_TOKENS)
    concepts = json.loads(chat.message(paper_section, model=MODEL))
    assert isinstance(concepts, list)
    return concepts

@retrying.retry(stop_max_attempt_number=5, wait_fixed=2000)
def merge_concepts(concepts):
    """Ask the chat model to reduce `concepts` to at most five, ordered by complexity.

    The reply must parse as a JSON list; a parse failure or a non-list reply
    raises, which makes the `retrying` decorator try again (5 attempts max).

    Returns:
        The parsed JSON list.
    """
    system_prompt = '''Given a list of concepts needed to understand a paper, reduce them to just 5 or fewer prerequisite concepts.
Only respond as a valid JSON list, and nothing else. Order the list from least to most complex.'''.replace('\n', ' ')
    chat = Chat(system_prompt)
    reduced = json.loads(chat.message(str(concepts), model='gpt-3.5-turbo'))
    assert isinstance(reduced, list)
    return reduced

def get_all_concepts(paper_sections):
    """Collect concepts for every section in parallel, then merge them into one short list."""
    with concurrent.futures.ThreadPoolExecutor(max_workers=MAX_WORKERS) as pool:
        per_section = list(pool.map(get_concepts, paper_sections))

    # Flatten the per-section lists, preserving section order.
    gathered = []
    for section_concepts in per_section:
        gathered.extend(section_concepts)
    return merge_concepts(gathered)

In [42]:
# paper_prereqs = get_all_concepts(paper_sections)
# paper_prereqs

# 3. Create Video Script

In [109]:
# Create video script
def get_script_for_concepts(concepts):
    """Return a two-speaker video script that explains the prerequisite `concepts`.

    The system prompt asks for lines prefixed with the speaker names from
    SPEAKER_NAMES and for no conclusion.
    """
    instructions = f'''Given the following prerequisite concepts needed to understand a scientific paper, write a script for a video that explains them in an intuitive way.
Assume there's two speakers, {' and '.join(SPEAKER_NAMES)}.
Prefix each character's lines with their name and a :, like the following.
{SPEAKER_NAMES[0]}: Hello everyone.
{SPEAKER_NAMES[1]}: Indeed, hello!
Do not include any other script syntax.
Do not include a conclusion.'''.replace('\n', ' ')
    return Chat(instructions).message(str(concepts))

def get_script_for_paper_section(paper_section):
    """Return a two-speaker educational video script for one section of the paper.

    The system prompt asks for lines prefixed with the speaker names from
    SPEAKER_NAMES.
    """
    instructions = f'''Given the following section of a scientific paper, write an educational script for a video that explains this in an intuitive way.
Assume there's two speakers, {' and '.join(SPEAKER_NAMES)}.
Prefix each character's lines with their name and a :, like the following.
{SPEAKER_NAMES[0]}: Hello everyone.
{SPEAKER_NAMES[1]}: Indeed, hello!
Do not include any other script syntax.'''.replace('\n', ' ')
    chat = Chat(instructions, max_length=MAX_TOKENS)
    return chat.message(str(paper_section), model='gpt-3.5-turbo-16k')

def get_entire_script(paper_prereqs, paper_sections, consolidate=False):
    """Generate the scripts for the whole video, in presentation order.

    One script is generated for the prerequisite concepts and one per paper
    section; all requests run concurrently in a thread pool.

    Args:
        paper_prereqs: Prerequisite concepts, passed to `get_script_for_concepts`.
        paper_sections: Text chunks of the paper, one script is written per chunk.
        consolidate: When true, the individual scripts are sent to the chat
            model once more to be merged into a single script.

    Returns:
        A list of script strings: `[prereq_script, section_script, ...]`, or a
        single-element list holding the merged script when `consolidate` is true.
    """
    with concurrent.futures.ThreadPoolExecutor(max_workers=MAX_WORKERS) as tpe:
        runs = [tpe.submit(get_script_for_concepts, paper_prereqs)]
        runs.extend(tpe.submit(get_script_for_paper_section, section) for section in paper_sections)
        # Futures finish in arbitrary order; remember where each result belongs.
        slot_of = {run: slot for slot, run in enumerate(runs)}
        all_scripts = [None] * len(runs)
        for done, run in enumerate(concurrent.futures.as_completed(runs), start=1):
            all_scripts[slot_of[run]] = run.result()
            logger.info(f'Done with {done} / {len(runs)}')
    if consolidate:
        chat = Chat(f'''Consolidate the following scripts that go over a scientific paper in an intuitive way.
Make it less redundant, more fun, and only include one intro and outro.
Assume there's two speakers, {' and '.join(SPEAKER_NAMES)}.
Prefix each character's lines with their name and a :, like the following.
{SPEAKER_NAMES[0]}: Hello everyone.
{SPEAKER_NAMES[1]}: Indeed, hello!
Do not include any other script syntax.'''.replace('\n', ' '), max_length=MAX_TOKENS)
        # The scripts are strings: join them as text. Flattening them would
        # split every script into single characters.
        text = chat.message('\n\n'.join(all_scripts), model='gpt-3.5-turbo-16k')
        all_scripts = [text]
    return all_scripts

In [43]:
# paper_script = get_entire_script(paper_prereqs, paper_sections)
# len(paper_script)

# 4. Video images

In [84]:
import time
import threading

class RateLimited:
    """Decorator that caps how many calls may start within one clock minute.

    Calls are counted per minute value as reported by `time.strftime('%M')`;
    the counter resets whenever that value changes. A call that finds the
    budget exhausted sleeps `retry_delay` seconds and tries again until a slot
    is free. The counter is guarded by a lock, so one instance can be shared
    by several threads.

    Args:
        max_per_minute: Maximum number of calls allowed to start per minute.
        retry_delay: Seconds to sleep between attempts while the budget is
            exhausted.
    """

    def __init__(self, max_per_minute, retry_delay=15):
        self.max_per_minute = max_per_minute
        self.retry_delay = retry_delay
        self.current_minute = time.strftime('%M')
        self.lock = threading.Lock()
        self.calls = 0

    def _try_acquire(self):
        """Claim one call slot for the current minute; return whether one was free."""
        with self.lock:
            current_minute = time.strftime('%M')
            if current_minute != self.current_minute:
                self.current_minute = current_minute
                self.calls = 0
            if self.calls < self.max_per_minute:
                self.calls += 1
                return True
            return False

    def __call__(self, fn):
        @functools.wraps(fn)
        def wrapper(*args, **kwargs):
            # Loop rather than recurse, so a long wait does not grow the call stack.
            while not self._try_acquire():
                time.sleep(self.retry_delay)
            return fn(*args, **kwargs)

        return wrapper

In [85]:
class AIImage:
    """Rate-limited, retried access to `openai.Image.create`."""

    class Size(enum.Enum):
        # Values are the size strings sent to the image API.
        SMALL = "256x256"
        MEDIUM = "512x512"
        LARGE = "1024x1024"

    @classmethod
    @RateLimited(45)
    @retrying.retry(stop_max_attempt_number=5, wait_fixed=2000)
    def create(cls, prompt, n=1, size=Size.SMALL):
        """Request `n` images for `prompt`, returned base64-encoded ('b64_json').

        Returns:
            The single entry of the response's "data" list when `n` is 1,
            otherwise the whole "data" list.
        """
        logger.info(f'requesting openai.Image {prompt}...')
        response = openai.Image.create(prompt=prompt, n=n, size=size.value, response_format='b64_json')
        logger.info('received openai.Image...')
        results = response["data"]
        return results[0] if n == 1 else results

In [89]:
@retrying.retry(stop_max_attempt_number=2, wait_fixed=5000)
def get_image_from_text(sentence):
    """Generate an illustration for one script line.

    The chat model is asked for a short image description of `sentence`; that
    description is combined with a randomly chosen art style (read from
    'art_styles.txt' plus two built-in styles) and a randomly chosen
    high-definition modifier to form the image prompt.

    Args:
        sentence: One line of the script.

    Returns:
        A tuple `(sentence, img, prompt)` where `img` is what
        `AIImage.create` returned and `prompt` is the image prompt used.
    """
    chat = Chat('''Given
the following sentence in a script, write a plaintext and concise description of an
image to display while this script is read.
Only write the short description and nothing else.
Do not include specific numbers or the character names.'''.replace('\n', ' '))
    hd_modifiers = [
        '3840x2160',
        '8k 3D / 16k 3D',
        '8k resolution / 16k resolution',
        'Detailed',
        'Ultra HD',
        'Ultrafine detail',
    ]
    # Close the file deterministically and skip blank lines, which would
    # otherwise become empty style names in the prompt.
    with open('art_styles.txt') as styles_file:
        art_styles = [line.strip() for line in styles_file if line.strip()]
    art_styles += ['Psychedelic painting', 'Educational 3d illustration']
    description = chat.message(sentence)
    prompt = f'{random.choice(art_styles)} of {description} {random.choice(hd_modifiers)}'
    img = AIImage.create(prompt)
    return sentence, img, prompt

def get_images_from_text(text):
    """Create one image per non-empty line of `text`, concurrently.

    Returns:
        The `get_image_from_text` results, in the order of the lines in `text`.
    """
    lines = [line for line in text.split('\n') if line]

    with concurrent.futures.ThreadPoolExecutor(max_workers=MAX_WORKERS) as pool:
        # Map each future to the position of its line so results can be put
        # back in script order as they complete.
        position = {
            pool.submit(get_image_from_text, line): idx
            for idx, line in enumerate(lines)
        }
        images = [None] * len(position)
        for finished in concurrent.futures.as_completed(position):
            images[position[finished]] = finished.result()
    return images

In [44]:
# images = get_images_from_text(paper_prereqs_script)
# len(images)

In [68]:
# for txt, img, prompt in images:
#     img = PIL.Image.open(io.BytesIO(base64.b64decode(img["b64_json"])))
#     IPython.display.display(txt)
#     IPython.display.display(prompt)
#     IPython.display.display(img)

# 5. Audio: Script to Speech

In [69]:
def speaker_sentence(sentence, names):
    """Split a script line into its speaker and the text that speaker says.

    Args:
        sentence: A script line, expected to look like "<name>: <text>".
        names: Known speaker names.

    Returns:
        `(name, text)` for the first name whose "<name>:" prefix starts the
        line, with the prefix and any whitespace after the colon removed.
        Lines without a known prefix are attributed to `names[0]` and
        returned unchanged.
    """
    for name in names:
        prefix = f'{name}:'
        if sentence.startswith(prefix):
            # Strip whatever whitespace follows the colon instead of assuming
            # exactly one space, which would eat a letter of "Name:Text".
            return name, sentence[len(prefix):].lstrip()
    return names[0], sentence

def script2speech(sentences, names, voices):
    """Synthesize speech for each script line with the voice of its speaker.

    `voices` is index-aligned with `names`: the speaker found for a line
    selects the voice at the same index, and that voice's `tts` is called on
    the line's text in a worker thread.

    Returns:
        The `tts` results, in the order of `sentences`.
    """
    with concurrent.futures.ThreadPoolExecutor(max_workers=MAX_WORKERS) as pool:
        slot_of = {}
        for slot, line in enumerate(sentences):
            speaker, spoken = speaker_sentence(line, names)
            voice = voices[names.index(speaker)]
            slot_of[pool.submit(voice.tts, spoken)] = slot

        audios = [b''] * len(slot_of)
        for done in concurrent.futures.as_completed(slot_of):
            audios[slot_of[done]] = done.result()
    return audios

In [46]:
# audios = script2speech([x[0] for x in images], SPEAKER_NAMES, SPEAKER_VOICES)

In [47]:
# IPython.display.Audio(audios[2])

# 6. Join audio and video

In [70]:
def concatenate_videos(directory, output_file):
    """Concatenate the numbered .mp4 clips in `directory` into `output_file` with ffmpeg.

    Clips are ordered by the integer between the first '_' and '.mp4' in their
    file name (e.g. 'clip_007.mp4'). A list file '_files.txt' is written into
    `directory` and handed to ffmpeg's concat demuxer; streams are copied
    without re-encoding. An existing `output_file` is removed first.

    Raises:
        subprocess.CalledProcessError: If ffmpeg exits with a non-zero status.
    """
    # get list of video files in directory
    files = sorted(
        (f for f in os.listdir(directory) if f.endswith(".mp4")),
        key=lambda x: int(x.split('.mp4')[0].split('_')[1]),
    )

    # create a file that contains the list of all video files
    filenames_f = f'{directory}/_files.txt'
    with open(filenames_f, 'w') as f:
        for video_file in files:
            f.write(f"file '{directory}/{video_file}'\n")

    # concatenate all videos using FFmpeg; an argument list (no shell) keeps
    # paths containing spaces or shell metacharacters intact
    command = ['ffmpeg', '-f', 'concat', '-safe', '0', '-i', filenames_f, '-c', 'copy', output_file]
    print(' '.join(command))
    try:
        os.remove(output_file)
    except OSError:
        # Best effort: usually the output simply does not exist yet.
        pass
    subprocess.check_call(command, stderr=subprocess.DEVNULL)


def process_one_clip(tmpdir, i, img, audio):
    """Render clip number `i`: a still image shown for the duration of its audio.

    Writes '{tmpdir}/audio_{i}.mp3' and '{tmpdir}/clip_{i:0>3}.mp4'.

    Args:
        tmpdir: Directory that receives the audio file, the clip and the
            temporary audio file used while encoding.
        i: Clip index, used in the output file names.
        img: Image to show; converted with `np.array(img)`.
        audio: Encoded audio as bytes, readable by `AudioSegment.from_file`.
    """
    # Convert audio bytes to pydub's AudioSegment and store it on disk
    audio_path = f'{tmpdir}/audio_{i}.mp3'
    AudioSegment.from_file(io.BytesIO(audio)).export(audio_path)
    # Re-read the exported file so the duration matches what the clip will play
    audio_segment = AudioSegment.from_file(audio_path)

    # Create an ImageClip for this image and audio, with duration matching the audio
    duration = len(audio_segment) / 1000.0  # len(AudioSegment) is in milliseconds
    audio_clip = AudioFileClip(audio_path)
    try:
        # Convert PIL Image to numpy array
        video_clip = ImageClip(np.array(img), duration=duration)
        video_clip.fps = 30
        video_clip = video_clip.set_audio(audio_clip)
        video_clip.write_videofile(
            f"{tmpdir}/clip_{i:0>3}.mp4", codec='libx264', audio_codec='aac',
            # Keep the intermediate audio next to the clip instead of in the
            # current working directory.
            temp_audiofile=f'{tmpdir}/temp-audio-{i}.m4a', remove_temp=True,
            verbose=False, logger=None,
        )
    finally:
        # Release the audio reader even when encoding fails.
        audio_clip.close()

def create_video(images, audios, outpath):
    """Render one clip per (image, audio) pair in parallel and join them into `outpath`.

    Clips are rendered into a temporary directory that is deleted once the
    final video has been written.
    """
    total = len(images)
    with tempfile.TemporaryDirectory() as workdir:
        with concurrent.futures.ThreadPoolExecutor(max_workers=MAX_WORKERS) as pool:
            rendered = pool.map(
                lambda idx, img, audio: process_one_clip(workdir, idx, img, audio),
                range(total), images, audios,
            )
            for finished, _ in enumerate(rendered):
                logger.info(f'Done with {finished} / {total}')
        # Concatenate all video clips
        concatenate_videos(workdir, outpath)

In [48]:
# outpath = '/Users/jong/Downloads/Cell_20230725/final_video.mp4'
# create_video([PIL.Image.open(io.BytesIO(base64.b64decode(img[1]["b64_json"]))) for img in images], audios, outpath)

In [100]:
class Runner:
    """End-to-end pipeline: paper PDF at `paper_path` -> narrated video at `outpath`.

    `run` keeps every intermediate result on the instance (`paper_text`,
    `paper_sections`, `paper_prereqs`, `paper_script`, `images`, `audios`).
    """

    def __init__(self, paper_path, outpath):
        self.paper_path = paper_path
        self.outpath = outpath

    def run(self):
        """Execute all pipeline steps and write the final video."""
        self.paper_text = extract_text_from_pdf(self.paper_path)
        self.paper_sections = text_into_token_chunks(self.paper_text, smoothing=3)
        self.paper_prereqs = get_all_concepts(self.paper_sections)
        self.paper_script = get_entire_script(self.paper_prereqs, self.paper_sections)

        def render_part(script):
            # Audio is generated from the sentences the images were made for,
            # so both lists stay aligned line by line.
            part_images = get_images_from_text(script)
            part_audios = script2speech(
                [entry[0] for entry in part_images], SPEAKER_NAMES, SPEAKER_VOICES,
            )
            return part_images, part_audios

        with concurrent.futures.ThreadPoolExecutor(max_workers=MAX_WORKERS) as pool:
            order = {
                pool.submit(render_part, script): idx
                for idx, script in enumerate(self.paper_script)
            }
            images = [None] * len(order)
            audios = [None] * len(order)
            for i, finished in enumerate(concurrent.futures.as_completed(order)):
                slot = order[finished]
                images[slot], audios[slot] = finished.result()
                logger.info(f'Got images and audio for {i} / {len(order)}')

        self.images = flatten_list(images)
        self.audios = flatten_list(audios)
        # Each image entry is (sentence, image response, prompt); decode the
        # base64 payload into a PIL image for rendering.
        frames = [
            PIL.Image.open(io.BytesIO(base64.b64decode(entry[1]["b64_json"])))
            for entry in self.images
        ]
        create_video(frames, self.audios, self.outpath)

In [112]:
# Scratch cell: title-case the paper's all-caps title.
'MULTI DIMENSIONAL SKILLS AND GENDER DIFFERENCES IN STEM MAJORS'.title()

'Multi Dimensional Skills And Gender Differences In Stem Majors'

In [110]:
# Driver cell: run the whole pipeline on one paper.
paper_path = '/Users/jong/Downloads/papers/MULTI-DIMENSIONAL SKILLS AND GENDER DIFFERENCES IN STEM MAJORS.pdf'
# Swap the last four characters (the '.pdf' suffix here) for '.mp4' and
# replace spaces with underscores in the output path.
outpath = (paper_path[:-4] + '.mp4').replace(' ', '_')
# Rebinds the module-level MAX_WORKERS; thread pools created after this point use 8 workers.
MAX_WORKERS = 8
runner = Runner(paper_path, outpath)
runner.run()

[2m2023-07-26 09:39:29[0m [[32m[1minfo     [0m] [1mrequesting openai...[0m
[2m2023-07-26 09:39:30[0m [[32m[1minfo     [0m] [1mrequesting openai...[0m
[2m2023-07-26 09:39:30[0m [[32m[1minfo     [0m] [1mrequesting openai...[0m
[2m2023-07-26 09:39:30[0m [[32m[1minfo     [0m] [1mrequesting openai...[0m
[2m2023-07-26 09:39:30[0m [[32m[1minfo     [0m] [1mrequesting openai...[0m
[2m2023-07-26 09:39:30[0m [[32m[1minfo     [0m] [1mrequesting openai...[0m
[2m2023-07-26 09:39:31[0m [[32m[1minfo     [0m] [1mreceived openai...[0m
[2m2023-07-26 09:39:32[0m [[32m[1minfo     [0m] [1mreceived openai...[0m
[2m2023-07-26 09:39:32[0m [[32m[1minfo     [0m] [1mreceived openai...[0m
[2m2023-07-26 09:39:32[0m [[32m[1minfo     [0m] [1mreceived openai...[0m
[2m2023-07-26 09:39:32[0m [[32m[1minfo     [0m] [1mreceived openai...[0m
[2m2023-07-26 09:39:33[0m [[32m[1minfo     [0m] [1mreceived openai...[0m
[2m2023-07-26 09:39:33[0m 

In [111]:
# Print the text of every script line from the finished run, without the speaker prefix.
print('\n'.join([speaker_sentence(x[0], SPEAKER_NAMES)[1] for x in runner.images]))

Hello everyone!
Indeed, hello! Today, we are going to talk about some prerequisite concepts needed to understand a scientific paper. These concepts are related to college major choices, STEM majors, latent math ability, non-cognitive skills, and gender differences.
Let's start with college major choices. When students enter college, they have to decide what area of study they want to specialize in. This decision is known as choosing a college major. It is an important decision because it determines the subjects they will focus on during their studies and their future career opportunities.
That's right, Alfred. And one popular type of college major is STEM majors. STEM stands for Science, Technology, Engineering, and Mathematics. Students who choose a STEM major are interested in these fields and want to pursue careers in areas like computer science, engineering, or biology.
Another crucial concept to understand is latent math ability. Latent math ability refers to someone's unexpressed