Skip to content

Commit

Permalink
feat: add timestamps to transcripts
Browse files (browse the repository at this point in the history)
  • Loading branch information
Morriz committed May 20, 2024
1 parent a1f5aaf commit 1712fe4
Show file tree
Hide file tree
Showing 4 changed files with 60 additions and 57 deletions.
40 changes: 0 additions & 40 deletions api/extract-openapi.py

This file was deleted.

13 changes: 9 additions & 4 deletions api/main.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
from typing import Dict, List

from fastapi import Depends, FastAPI
from fastapi import Depends, FastAPI, HTTPException

from api.store import (
Media,
Expand Down Expand Up @@ -50,7 +50,7 @@ async def search_media(

@app.get("/youtube", response_model=List[Video])
async def search_youtube(
query: str,
query: str = "",
period_days: int = 3,
max_channels: int = 8,
max_videos_per_channel: int = 3,
Expand All @@ -61,13 +61,18 @@ async def search_youtube(
) -> List[Video]:
tmp: Dict[str, List[Video]] = {}
if channels:
channels_arr = channels.split(",")
channels_arr = channels.lower().split(",")
media = [
item
for item in get_data()
if item["Youtube"].replace("https://www.youtube.com/", "") in channels_arr
if item["Youtube"].lower().replace("https://www.youtube.com/", "")
in channels_arr
]
else:
if query == "":
raise HTTPException(
status_code=400, detail="No query given when no channels are provided!"
)
media = await query_media(query, top_k=max_channels * 2)
for item in media:
if item["Youtube"] == "n/a":
Expand Down
24 changes: 16 additions & 8 deletions api/tools/youtube.py
Original file line number Diff line number Diff line change
Expand Up @@ -121,7 +121,7 @@ def _parse_html_video(html: str) -> Dict[str, str]:
@cache(ttl=3600)
def search_youtube_channel(
channel_url: str,
search_terms: str,
query: str,
period_days: int,
max_results: int,
get_descriptions: bool = False,
Expand All @@ -134,17 +134,15 @@ def search_youtube_channel(
day = time.strftime("%d", time.localtime(start)).zfill(2)
month = time.strftime("%m", time.localtime(start)).zfill(2)
year = time.strftime("%Y", time.localtime(start))
encoded_search = urllib.parse.quote_plus(
f"{search_terms} after:{year}-{month}-{day}"
)
encoded_search = urllib.parse.quote_plus(f"{query} after:{year}-{month}-{day}")
url = f"{channel_url}/search?hl=en&query={encoded_search}"

html = ""
nothing = False
while "ytInitialData" not in html:
response = requests.get(url)
if response.status_code != 200:
print(f"Failed to get search results for {search_terms} from {channel_url}")
print(f"Failed to get search results for {query} from {channel_url}")
nothing = True
break
html = response.text
Expand Down Expand Up @@ -181,6 +179,16 @@ def _get_video_info(video_id: str) -> Dict[str, str]:


def _get_video_transcript(video_id: str) -> str:
    """Return the transcript of a video as one timestamped string.

    The transcript is fetched with ``YouTubeTranscriptApi.get_transcript``
    (``preserve_formatting=True``). Each entry is rendered as
    ``"<start>s: <text>"``, where ``<start>`` is the entry's ``"start"``
    value with everything after the decimal point dropped, and the entries
    are joined with ``", "``.

    Args:
        video_id: Identifier of the video whose transcript is requested.

    Returns:
        The joined transcript, or an empty string when the transcript could
        not be fetched or formatted (best effort: failures never propagate).
    """
    try:
        transcripts = YouTubeTranscriptApi.get_transcript(
            video_id, preserve_formatting=True
        )
        return ", ".join(
            str(t["start"]).split(".")[0] + "s" + ": " + t["text"]
            for t in transcripts
        )
    except Exception as err:
        # Catch ``Exception`` rather than using a bare ``except`` so that
        # KeyboardInterrupt / SystemExit still propagate. A missing or
        # malformed transcript stays non-fatal for the caller, but the
        # failure is reported instead of being swallowed silently.
        print(f"Failed to get transcript for {video_id}: {err}")
        return ""
40 changes: 35 additions & 5 deletions openapi.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -153,16 +153,17 @@ paths:
parameters:
- name: query
in: query
required: true
required: false
schema:
type: string
default: ''
title: Query
- name: period_days
in: query
required: false
schema:
type: integer
default: 1
default: 3
title: Period Days
- name: max_channels
in: query
Expand All @@ -178,6 +179,26 @@ paths:
type: integer
default: 3
title: Max Videos Per Channel
- name: channels
in: query
required: false
schema:
type: string
title: Channels
- name: get_descriptions
in: query
required: false
schema:
type: boolean
default: false
title: Get Descriptions
- name: get_transcripts
in: query
required: false
schema:
type: boolean
default: false
title: Get Transcripts
responses:
'200':
description: Successful Response
Expand Down Expand Up @@ -313,11 +334,11 @@ components:
title:
type: string
title: Title
long_desc:
short_desc:
anyOf:
- type: string
- type: 'null'
title: Long Desc
title: Short Desc
channel:
type: string
title: Channel
Expand All @@ -339,12 +360,21 @@ components:
url_suffix:
type: string
title: Url Suffix
long_desc:
anyOf:
- type: string
- type: 'null'
title: Long Desc
transcript:
anyOf:
- type: string
- type: 'null'
title: Transcript
type: object
required:
- id
- thumbnails
- title
- long_desc
- channel
- duration
- views
Expand Down

0 comments on commit 1712fe4

Please sign in to comment.