Commit 8a937b2

feat: remove videos with biggest transcripts until char_cap met

Morriz committed May 24, 2024
1 parent 38fd964 commit 8a937b2
Showing 6 changed files with 232 additions and 208 deletions.
api/main.py: 10 changes (7 additions, 3 deletions)
@@ -16,6 +16,7 @@ def search_allsides(
     offset: int = 0,
     _: None = Depends(verify_apikey),
 ) -> List[Dict[str, str]]:
+    """Search the AllSides database for a partial name"""
     results = query_allsides(name, limit, offset)
     return results[offset:]

@@ -27,6 +28,7 @@ def search_mediabiasfactcheck(
     offset: int = 0,
     _: None = Depends(verify_apikey),
 ) -> List[Dict[str, str]]:
+    """Search the MediaBiasFactCheck database for a partial name"""
     results = query_mediabiasfactcheck(name, limit, offset)
     return results[offset:]

@@ -38,6 +40,7 @@ async def search_media(
     offset: int = 0,
     _: None = Depends(verify_apikey),
 ) -> List[Media]:
+    """Search the curated independent media database for a partial name"""
     results = await query_media(query, top_k=limit + offset)
     return results[offset:]

@@ -74,14 +77,14 @@ async def get_youtube_search(
             title="Max channels",
             description="Maximum number of channels that we want to match. Needed when no channels were provided.",
         ),
-    ] = 12,
+    ] = 5,
     max_videos_per_channel: Annotated[
         int,
         Query(
             title="Max videos per channel",
             description="The maximum number of videos per channel that we want from each channel search.",
         ),
-    ] = 3,
+    ] = 2,
     get_descriptions: Annotated[
         bool,
         Query(
@@ -119,7 +122,7 @@ async def get_youtube_search(
             status_code=400,
             detail='"max_channels" must be provided when no "channels" are set!',
         )
-    return await youtube_search(
+    results = await youtube_search(
         channels=channels,
         query=query,
         period_days=period_days,
@@ -129,6 +132,7 @@
         get_transcripts=get_transcripts,
         char_cap=char_cap,
     )
+    return results


 @app.get("/youtube-transcripts", response_model=List[VideoTranscript])
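With the tightened defaults (max_channels now 5, max_videos_per_channel now 2) and the char_cap guard wired through, a client call could look like the sketch below. The /youtube-search route path and the X-API-Key header name are assumptions for illustration; only the handler, not its route or auth scheme, is visible in this diff. The query, period_days, and char_cap parameter names do come from the diff.

# Hypothetical client call; path and header name are assumptions.
import requests

resp = requests.get(
    "http://localhost:8000/youtube-search",
    params={"query": "election news", "period_days": 3, "char_cap": 20000},
    headers={"X-API-Key": "changeme"},
)
videos = resp.json()  # serialized video list should stay under char_cap characters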
api/youtube.py: 29 changes (20 additions, 9 deletions)
@@ -1,10 +1,12 @@
 import asyncio
+import datetime
 import json
+import time
 import urllib.parse
 from collections import namedtuple
-from typing import Dict, List, Optional, Union
+from typing import Any, Dict, List, Optional, Union
 
 import dateparser
 import requests
 from aiohttp import ClientSession
 from fastapi import HTTPException
@@ -140,7 +142,7 @@ async def youtube_search(
     query: str = None,
     channels: str = None,
     period_days: int = 3,
-    max_channels: int = None,
+    max_channels: int = 5,
     max_videos_per_channel: int = 3,
     get_descriptions: bool = False,
     get_transcripts: bool = False,
@@ -184,16 +186,25 @@
         res.extend(videos)
     print("Number of videos found: " + str(len(res)))
     if char_cap:
-        capped = []
-        for idx, video in enumerate(videos):
-            capped.append(video)
-            if len(json.dumps(capped)) > char_cap:
-                break
-        return videos[0 : idx - 1]
-
+        res = filter_by_char_cap(res, char_cap)
+    # res.sort(key=sort_by_publish_time)
     return res
+
+
+def filter_by_char_cap(videos: List[Video], char_cap: int) -> List[Video]:
+    while len(json.dumps([vid.model_dump_json() for vid in videos])) > char_cap:
+        transcript_lengths = [len(video.transcript) for video in videos]
+        max_index = transcript_lengths.index(max(transcript_lengths))
+        videos.pop(max_index)
+    return videos
+
+
+def sort_by_publish_time(video: Video) -> float:
+    now = datetime.datetime.now()
+    d = dateparser.parse(video.publish_time, settings={"RELATIVE_BASE": now})
+    return time.mktime(d.timetuple())
 
 
 @cache(ttl=3600)
 def youtube_transcripts(
     ids: str,
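The new filter_by_char_cap is a greedy eviction loop: while the JSON-serialized video list exceeds char_cap, drop the video with the longest transcript. Note that the length check runs json.dumps over strings that model_dump_json() has already serialized, so the cap is measured against a doubly-encoded payload, escaping overhead included. A minimal self-contained sketch of the same loop; this Video stub only mimics the two members the loop touches and is not the repo's actual pydantic model:

import json
from typing import List


class Video:
    # Stand-in for the project's pydantic model (assumed shape).
    def __init__(self, title: str, transcript: str):
        self.title = title
        self.transcript = transcript

    def model_dump_json(self) -> str:
        # pydantic v2 models render themselves to a JSON string like this
        return json.dumps({"title": self.title, "transcript": self.transcript})


def filter_by_char_cap(videos: List[Video], char_cap: int) -> List[Video]:
    # Evict the longest transcript until the serialized list fits the cap.
    while len(json.dumps([vid.model_dump_json() for vid in videos])) > char_cap:
        transcript_lengths = [len(video.transcript) for video in videos]
        max_index = transcript_lengths.index(max(transcript_lengths))
        videos.pop(max_index)
    return videos


videos = [Video("short", "a" * 10), Video("medium", "b" * 100), Video("long", "c" * 1000)]
print([v.title for v in filter_by_char_cap(videos, char_cap=400)])
# -> ['short', 'medium']  (the 1000-char transcript is evicted first)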
bin/extract-openapi.py: 8 changes (8 additions, 0 deletions)
@@ -28,6 +28,14 @@
 print(f"importing app from {args.app}")
 app = import_from_string(args.app)
 openapi = app.openapi()
+# clean up unwanted props that custom GPTs don't like:
+for path in openapi["paths"]:
+    for method in openapi["paths"][path]:
+        if "parameters" in openapi["paths"][path][method]:
+            for param in openapi["paths"][path][method]["parameters"]:
+                if "description" in param["schema"]:
+                    del param["schema"]["description"]
+
 version = openapi.get("openapi", "unknown version")
 
 print(f"writing openapi spec v{version}")
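The new loop walks every operation in the exported spec and deletes the per-parameter schema descriptions that, per the commit's own comment, custom GPTs don't like. A minimal sketch with an assumed miniature spec (the /youtube-search path and char_cap entry are illustrative; only the nesting mirrors what the loop expects):

# Assumed miniature spec for demonstration.
openapi = {
    "paths": {
        "/youtube-search": {
            "get": {
                "parameters": [
                    {
                        "name": "char_cap",
                        "in": "query",
                        "schema": {"type": "integer", "description": "dupe"},
                    }
                ]
            }
        }
    }
}

# Same cleanup as the committed code: strip "description" from each schema.
for path in openapi["paths"]:
    for method in openapi["paths"][path]:
        if "parameters" in openapi["paths"][path][method]:
            for param in openapi["paths"][path][method]["parameters"]:
                if "description" in param["schema"]:
                    del param["schema"]["description"]

print(openapi["paths"]["/youtube-search"]["get"]["parameters"][0]["schema"])
# -> {'type': 'integer'}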