In [None]:
!pip install pypdf

Collecting pypdf
  Downloading pypdf-4.3.1-py3-none-any.whl.metadata (7.4 kB)
Downloading pypdf-4.3.1-py3-none-any.whl (295 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/295.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━[0m [32m286.7/295.8 kB[0m [31m13.5 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m295.8/295.8 kB[0m [31m7.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pypdf
Successfully installed pypdf-4.3.1


In [None]:
import requests
import re
from bs4 import BeautifulSoup
from pypdf import PdfReader
from tqdm import tqdm
from pprint import pprint
import os

# Transcript ids (the number after "#" in the link text) that are never fetched;
# get_all_transcript stores "not available due to copyright reasons" for them.
COPYRIGHT_SET = {68, 70, 73, 80, 165, 179, 319, 328}

def extract_text_from_pdf(pdf_path):
    """Return the text of every page of a PDF, in page order.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The extracted text of all pages joined with newlines. An empty string
        is returned when the document has no pages.
    """
    with open(pdf_path, 'rb') as file:
        reader = PdfReader(file)
        # Join pages with a newline so the last word of one page is not fused
        # with the first word of the next. `or ""` keeps the join safe if a
        # page yields no text at all.
        return "\n".join(page.extract_text() or "" for page in reader.pages)

def process_transcript(text):
    """Tidy raw transcript text so each "Speaker:" label starts its own line.

    The substitutions below are applied strictly in order; each one works on
    the output of the previous step.

    Args:
        text: Raw transcript text (for example, text extracted from a PDF).

    Returns:
        The cleaned text with surrounding whitespace stripped.
    """
    # (pattern, replacement, flags)
    cleanup_steps = (
        # Drop copyright notices, which may span several lines.
        (r'Copyright.*?All Rights Reserved\.', '', re.DOTALL),
        # A lone line break is just wrapping: turn it into a space.
        (r'(?<!\n)\n(?!\n)', ' ', 0),
        # Start a new line before every run of letters/whitespace ending in ':'.
        (r'([A-Za-z\s]+):', r'\n\1:', 0),
        # Collapse runs of blank lines into a single line break.
        (r'\n{2,}', '\n', 0),
        # Remove whitespace left between a line break and a speaker label.
        (r'\n\s+([A-Za-z\s]+:)', r'\n\1', 0),
        # Collapse repeated spaces or tabs.
        (r'[ \t]{2,}', ' ', 0),
    )

    cleaned = text
    for pattern, replacement, flags in cleanup_steps:
        cleaned = re.sub(pattern, replacement, cleaned, flags=flags)
    return cleaned.strip()

def extract_text_from_web_transcript(content):
    """Extract the transcript text from a post's parsed content element.

    Args:
        content: Parsed HTML element holding the post body. The caller passes
            the result of ``soup.find("div", class_="entry-content")``, which
            is ``None`` when the page has no such element.

    Returns:
        The transcript text with runs of blank lines collapsed, or the
        placeholder "transcript cannot be automatically extracted." when the
        content cannot be processed (including ``content`` being ``None``).
    """
    try:
        disclaimer = content.find(string=re.compile(r"Transcripts may contain a few typos"))
        separator = content.find('hr', class_='wp-block-separator')
        if disclaimer is not None and separator is not None:
            # Remove the disclaimer's parent element and every sibling after
            # it, up to (but not including) the separator.
            disclaimer_block = disclaimer.parent
            for sibling in disclaimer_block.find_next_siblings():
                if sibling == separator:
                    break
                sibling.extract()
            disclaimer_block.extract()

        # Keep only the text before the first occurrence of "Related".
        # NOTE(review): this also truncates a transcript whose body contains
        # the word "Related" - confirm this is acceptable.
        text_content = content.get_text().split("Related")[0]
        return re.sub(r"\n{2,}", "\n\n", text_content).strip()
    except Exception:
        # Deliberately best-effort: any parsing problem yields a placeholder.
        # `Exception` (not a bare except) lets KeyboardInterrupt/SystemExit
        # propagate so a long crawl can still be stopped.
        return "transcript cannot be automatically extracted."

def _fetch_transcript(href, pdf_folder, timeout):
    """Download one transcript link and return its text, or a placeholder on failure."""
    unavailable = "transcript cannot be automatically extracted."
    if not href:
        return unavailable

    try:
        resp = requests.get(href, timeout=timeout)
        # Do not parse an error page (or save it as a PDF) as if it were a transcript.
        resp.raise_for_status()
    except requests.RequestException:
        # One unreachable link must not abort the whole crawl.
        return unavailable

    if not href.endswith("pdf"):
        page = BeautifulSoup(resp.text, "html.parser")
        return extract_text_from_web_transcript(page.find("div", class_="entry-content"))

    # PDF transcripts are saved locally first, then parsed from disk.
    filename = os.path.join(pdf_folder, href.split("/")[-1])
    with open(filename, "wb") as f:
        f.write(resp.content)
    return process_transcript(extract_text_from_pdf(filename))


def get_all_transcript(url, pdf_folder="./content", timeout=30):
    """Scrape every transcript linked from an index page.

    Links are the ``<a>`` elements whose text matches ``#<number>:``. Ids in
    COPYRIGHT_SET are not fetched. Every other link is downloaded and parsed
    either as a web page or, when the URL ends in "pdf", as a PDF saved under
    ``pdf_folder``.

    Args:
        url: URL of the index page listing the transcripts.
        pdf_folder: Directory where downloaded PDFs are stored; created if
            it does not exist.
        timeout: Seconds to wait for each HTTP request before giving up, so a
            stalled connection cannot hang the crawl.

    Returns:
        A list of dicts with keys "id", "title", "url" and "transcript", one
        per matching link. "transcript" holds a placeholder message when the
        transcript is skipped or could not be fetched.

    Raises:
        requests.RequestException: If the index page itself cannot be fetched.
    """
    os.makedirs(pdf_folder, exist_ok=True)
    pattern = re.compile(r"#(\d+):")

    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    index_soup = BeautifulSoup(response.text, "html.parser")
    links = index_soup.find_all("a", string=pattern)

    link_data = []
    for link in tqdm(links):
        text = link.get_text()
        href = link.get("href")
        link_id = int(pattern.search(text).group(1))

        if link_id in COPYRIGHT_SET:
            transcript = "not available due to copyright reasons"
        else:
            transcript = _fetch_transcript(href, pdf_folder, timeout)

        link_data.append({
            "id": link_id,
            "title": text,
            "url": href,
            "transcript": transcript
        })

    return link_data

In [None]:
# Index page whose links are scraped for transcripts.
transcripts_url = "https://tim.blog/2018/09/20/all-transcripts-from-the-tim-ferriss-show/"
# Fetches each linked transcript over HTTP (ids in COPYRIGHT_SET are skipped); later cells read `link_data`.
link_data = get_all_transcript(transcripts_url)

100%|██████████| 687/687 [12:42<00:00,  1.11s/it]


In [None]:
import json

# Persist the scraped transcripts to disk as a single JSON array.
raw_dump_path = "data.json"
with open(raw_dump_path, "w") as out_file:
    json.dump(link_data, out_file)

In [None]:
import json

# Reload the scraped transcripts from the same relative path the dump cell
# writes to, so the round trip does not depend on the working directory.
with open("data.json", "rt") as f:
    link_data = json.load(f)

In [None]:
def chunk_text(text, chunk_size=700, overlap_size=20):
    """Split text into overlapping chunks of whitespace-separated words.

    Consecutive chunks share ``overlap_size`` words. Chunking stops as soon
    as a chunk reaches the end of the text, so no trailing chunk is emitted
    that is already fully contained in the previous one.

    Args:
        text: The text to split.
        chunk_size: Maximum number of words per chunk.
        overlap_size: Number of words shared between consecutive chunks.

    Returns:
        A list of chunk strings (words re-joined with single spaces). The
        list is empty when ``text`` contains no words.

    Raises:
        ValueError: If ``overlap_size`` is not smaller than ``chunk_size``,
            because the window would never advance.
    """
    step = chunk_size - overlap_size
    if step <= 0:
        raise ValueError("overlap_size must be smaller than chunk_size")

    # Tokenize on whitespace; the chunk window is measured in words.
    words = text.split()
    chunks = []
    for start in range(0, len(words), step):
        end = min(start + chunk_size, len(words))
        chunks.append(' '.join(words[start:end]))
        if end == len(words):
            # This chunk already covers the tail; a further window would only
            # repeat words from the overlap region.
            break

    return chunks

In [None]:
from tqdm import tqdm
# Flatten every transcript into chunk records, copying the transcript's
# id/title/url onto each chunk so a chunk can be traced back to its source.
chunked_data = []
for data in tqdm(link_data):
    chunks = chunk_text(data["transcript"])
    # `chunk_index` rather than `id`: the loop variable stays defined after the
    # cell runs, and `id` would shadow the builtin for the rest of the notebook.
    for chunk_index, chunk in enumerate(chunks):
        chunked_data.append({
            "id": data["id"],
            "title": data["title"],
            "url": data["url"],
            "chunk_id": f"{data['id']}_{chunk_index}",
            "chunk": chunk
        })
len(chunked_data)

100%|██████████| 687/687 [00:01<00:00, 568.48it/s]


16397

In [None]:
# Write the chunk records to the Drive data folder as a single JSON array.
chunked_output_path = "/content/drive/MyDrive/data/chunked_data.json"
with open(chunked_output_path, "w") as out_file:
    json.dump(chunked_data, out_file)

In [None]:
%%capture
# Install or upgrade sentence-transformers; %%capture hides pip's output.
!pip install -U sentence-transformers

In [None]:
import json
from tqdm import tqdm

# Load the chunks from the same Drive path the chunking step writes to, so the
# notebook runs top to bottom without manually copying the file.
with open("/content/drive/MyDrive/data/chunked_data.json", "rt") as f:
    docs = json.load(f)

In [None]:
# Number of chunk records loaded from the JSON file.
len(docs)

16397

In [None]:
from sentence_transformers import SentenceTransformer

# Leave `device` unset so sentence-transformers uses the GPU when one is
# available and falls back to CPU otherwise, instead of failing without CUDA.
model = SentenceTransformer("multi-qa-mpnet-base-cos-v1")

  from tqdm.autonotebook import tqdm, trange


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/9.25k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]



1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [None]:
# Encode all chunks in one batched call instead of one call per chunk, so the
# model processes several texts per forward pass.
chunk_texts = [doc["chunk"] for doc in docs]
embeddings = model.encode(chunk_texts, batch_size=32, show_progress_bar=True)

# Store each vector as a plain list so the records stay JSON-serializable.
for doc, embedding in zip(docs, embeddings):
    doc["embedding"] = embedding.tolist()

100%|██████████| 16397/16397 [12:51<00:00, 21.24it/s]


In [None]:
# Write the chunk records, now carrying their embeddings, to the Drive data folder.
embedded_output_path = "/content/drive/MyDrive/data/chunked_embedded_data.json"
with open(embedded_output_path, "w") as out_file:
    json.dump(docs, out_file)

In [None]:
# Length of the first record's embedding vector.
len(docs[0]["embedding"])

768