In [1]:
import os
from typing import Any, Iterable
from urllib.parse import parse_qs, urlparse

from dotenv import load_dotenv
from elasticsearch import Elasticsearch
from tqdm.auto import tqdm
from youtube_transcript_api import YouTubeTranscriptApi
from youtube_transcript_api.proxies import GenericProxyConfig

from data import data

In [2]:
# Load variables from a .env file into the process environment so that the
# os.getenv() lookups for the proxy settings further down can see them.
load_dotenv()

True

In [3]:
def format_timestamp(seconds: float) -> str:
    """Render an offset in seconds as a clock-style label.

    Fractional seconds are truncated. Offsets under one hour are rendered as
    ``M:SS``; longer offsets as ``H:MM:SS``.

    Args:
        seconds: Offset from the start, in seconds.

    Returns:
        The formatted timestamp string.
    """
    whole_seconds = int(seconds)
    total_minutes, ss = divmod(whole_seconds, 60)
    hh, mm = divmod(total_minutes, 60)
    if hh:
        return f"{hh}:{mm:02}:{ss:02}"
    return f"{mm}:{ss:02}"

def make_subtitles(transcript: Iterable[Any]) -> str:
    """Flatten transcript entries into timestamped subtitle text.

    Args:
        transcript: Iterable of entries, each exposing ``.start`` (offset in
            seconds, passed to ``format_timestamp``) and ``.text``.

    Returns:
        One ``"<timestamp> <text>"`` line per entry, joined with newlines.
        Newlines inside an entry's text are replaced by spaces so that every
        entry occupies exactly one line. An empty transcript yields ``""``.
    """
    lines = [
        format_timestamp(entry.start) + " " + entry.text.replace("\n", " ")
        for entry in transcript
    ]
    return "\n".join(lines)

In [4]:
# Client for the Elasticsearch node used for indexing and searching.
es = Elasticsearch(hosts="http://localhost:9200")

# Proxy settings are taken from the environment.
proxy_user = os.getenv("proxy_user")
proxy_pass = os.getenv("proxy_pass")
proxy_base_url = os.getenv("proxy_base_url")

# Fail fast: os.getenv() returns None for an unset variable, and a None would
# otherwise be formatted into the proxy URL as the literal text "None".
missing_proxy_vars = [
    name
    for name, value in (
        ("proxy_user", proxy_user),
        ("proxy_pass", proxy_pass),
        ("proxy_base_url", proxy_base_url),
    )
    if not value
]
if missing_proxy_vars:
    raise RuntimeError(
        "Missing proxy settings in the environment: " + ", ".join(missing_proxy_vars)
    )

In [5]:
# A single authenticated proxy endpoint is used for both HTTP and HTTPS traffic.
# NOTE(review): the credentials are interpolated verbatim, so values containing
# URL-reserved characters (e.g. "@" or ":") would presumably need to be
# percent-encoded in the environment -- confirm.
proxy_url = f"http://{proxy_user}:{proxy_pass}@{proxy_base_url}"

proxy_config = GenericProxyConfig(http_url=proxy_url, https_url=proxy_url)

# Transcript client whose requests are routed through the proxy above.
ytt_api = YouTubeTranscriptApi(proxy_config=proxy_config)

In [6]:
# Index definition: a custom English analyzer applied (at index and search
# time) to the two text fields that search_videos queries.
index_settings = {
    "settings": {
        "analysis": {
            "filter": {
                "english_stop": {
                    "type": "stop",
                    "stopwords": "_english_"
                },
                "english_stemmer": {
                    "type": "stemmer",
                    "language": "light_english"
                },
                "english_possessive_stemmer": {
                    "type": "stemmer",
                    "language": "possessive_english"
                }
            },
            "analyzer": {
                "my_english_analyzer": {
                    "tokenizer": "standard",
                    # Order matters: filters run in sequence. The possessive
                    # stemmer goes first so that stop-word removal and stemming
                    # operate on the bare word instead of a token that still
                    # carries a trailing "'s" (same order as Elasticsearch's
                    # documented re-implementation of the `english` analyzer).
                    "filter": [
                        "english_possessive_stemmer",
                        "lowercase",
                        "english_stop",
                        "english_stemmer"
                    ]
                }
            }
        }
    },
    "mappings": {
        "properties": {
            "title": {
                "type": "text",
                "analyzer": "my_english_analyzer",
                "search_analyzer": "my_english_analyzer"
            },
            "subtitles": {
                "type": "text",
                "analyzer": "my_english_analyzer",
                "search_analyzer": "my_english_analyzer"
            }
        }
    }
}

In [7]:
def search_videos(query: str, index_name: str = "podcasts", size: int = 5):
    """Search for videos by title or subtitle content.

    Runs a ``multi_match`` query over ``title`` (boosted x3) and ``subtitles``
    using the module-level ``es`` client, and asks Elasticsearch for
    highlighted fragments wrapped in ``*``.

    Args:
        query: Free-text search string.
        index_name: Index to search.
        size: Maximum number of hits to return.

    Returns:
        A list with one dict per hit. Each dict contains the highlighted
        fragments keyed by field name (when the hit has any) plus
        ``video_id``, taken from the hit's ``_id``.
    """
    body = {
        "size": size,
        "query": {
            "multi_match": {
                "query": query,
                "fields": ["title^3", "subtitles"],
                "type": "best_fields",
                "analyzer": "my_english_analyzer"
            }
        },
        "highlight": {
            "pre_tags": ["*"],
            "post_tags": ["*"],
            "fields": {
                "title": {
                    "fragment_size": 150,
                    "number_of_fragments": 1
                },
                "subtitles": {
                    "fragment_size": 150,
                    "number_of_fragments": 3,
                    "order": "score"
                }
            }
        }
    }

    response = es.search(index=index_name, body=body)

    results = []
    for hit in response.body['hits']['hits']:
        # A hit is not guaranteed to carry a "highlight" section; fall back to
        # an empty mapping instead of raising KeyError. Copy so the response
        # payload itself is left untouched.
        entry = dict(hit.get('highlight', {}))
        entry['video_id'] = hit['_id']
        results.append(entry)

    return results

In [8]:
index_name = "podcasts"

# Rebuild the index from scratch: drop a previous copy if one is present, then
# create it again with the analyzer settings and field mappings defined above.
index_is_present = es.indices.exists(index=index_name)
if index_is_present:
    es.indices.delete(index=index_name)

es.indices.create(index=index_name, body=index_settings)
print(f"Index '{index_name}' created successfully")

Index 'podcasts' created successfully


In [9]:
# Attach a `video_id` to every entry, taken from the `v` query parameter of its
# watch URL.
for dt in data:
    # Parse the query string instead of splitting on "watch?v=" so that extra
    # parameters (e.g. "&t=42s", "&list=...") never end up inside the id and
    # the position of `v` within the query string does not matter.
    query_params = parse_qs(urlparse(dt['url']).query)
    if "v" not in query_params:
        raise ValueError(f"No 'v' query parameter found in URL: {dt['url']}")
    video_id = query_params["v"][0]
    dt['video_id'] = video_id

# Getting the data after configuring the proxy

In [10]:
# Fetch each video's transcript and index it as one document keyed by video id.
for dt in tqdm(data):
    video_id = dt['video_id']

    # Check for an existing document first: the transcript fetch below is a
    # network call through the proxy and is pointless for videos already indexed.
    if es.exists(index=index_name, id=video_id):
        print(f"Index already exists for video {video_id}.")
        continue

    transcript = ytt_api.fetch(video_id)
    subtitles = make_subtitles(transcript)

    doc = {
        "video_id": video_id,
        "title": dt['title'],
        "subtitles": subtitles
    }

    # Use the same index the creation cell set up rather than a repeated literal.
    es.index(index=index_name, id=video_id, document=doc)
    print(f"Indexed video: {video_id}")

  0%|          | 0/5 [00:00<?, ?it/s]

Indexed video: L7zWT3l3DV0
Indexed video: tool-R8VJ2Y
Indexed video: C-1Ukfaf7co
Indexed video: Yh1-y3TzSO4
Indexed video: hhhTWYDPAXI


In [11]:
# Sample query; relies on search_videos' defaults for the index name and hit count.
results = search_videos(query="How to avoid trauma?")

In [12]:
# Each result dict holds the highlighted fragments per matched field plus the
# hit's video_id.
print(results)

[{'subtitles': ['*How* do we all become better at\n27:39 managing our *trauma*? What does that look\n27:41 like in our day-to-day?', "So this is *how*\n12:37 *trauma* is passed on from one generation\n12:40 to the next not intentionally by the\n12:43 most loving parents cuz we can't help it", 'The word *trauma*\n5:50 comes from a Greek word for wound or\n5:51 wounding. So *trauma* is a wound.'], 'title': ['*How* to understand & heal your *trauma*: Gabor Maté, M.D. | mbg Podcast'], 'video_id': 'C-1Ukfaf7co'}, {'subtitles': ['Mate has completely\n2:24 transformed *how* the world sees, talks\n2:28 about, and understands *trauma*.', '*How* does unresolved *trauma* impact\n44:42 the way that you deal with stress as an\n44:44 adult?', 'So\n44:18 *how* do you like *how* does this sort of\n44:21 unresolved *trauma* from childhood that I\n44:25 would imagine you know a lot of us learn\n44:27 about'], 'title': ['Gabor Maté on *Trauma* & *How* to Heal'], 'video_id': 'tool-R8VJ2Y'}, {'subtitles': 