Commit 8a937b2

feat: remove videos with biggest transcripts until char_cap met

Morriz committed May 24, 2024
1 parent 38fd964 commit 8a937b2
Showing 6 changed files with 232 additions and 208 deletions.
api/main.py: 10 changes (7 additions, 3 deletions)
@@ -16,6 +16,7 @@ def search_allsides(
     offset: int = 0,
     _: None = Depends(verify_apikey),
 ) -> List[Dict[str, str]]:
+    """Search the AllSides database for a partial name"""
     results = query_allsides(name, limit, offset)
     return results[offset:]

@@ -27,6 +28,7 @@ def search_mediabiasfactcheck(
     offset: int = 0,
     _: None = Depends(verify_apikey),
 ) -> List[Dict[str, str]]:
+    """Search the MediaBiasFactCheck database for a partial name"""
     results = query_mediabiasfactcheck(name, limit, offset)
     return results[offset:]

@@ -38,6 +40,7 @@ async def search_media(
     offset: int = 0,
     _: None = Depends(verify_apikey),
 ) -> List[Media]:
+    """Search the curated independent media database for a partial name"""
     results = await query_media(query, top_k=limit + offset)
     return results[offset:]

@@ -74,14 +77,14 @@ async def get_youtube_search(
             title="Max channels",
             description="Maximum number of channels that we want to match. Needed when no channels were provided.",
         ),
-    ] = 12,
+    ] = 5,
     max_videos_per_channel: Annotated[
         int,
         Query(
             title="Max videos per channel",
             description="The maximum number of videos per channel that we want from each channel search.",
         ),
-    ] = 3,
+    ] = 2,
     get_descriptions: Annotated[
         bool,
         Query(
@@ -119,7 +122,7 @@ async def get_youtube_search(
             status_code=400,
             detail='"max_channels" must be provided when no "channels" are set!',
         )
-    return await youtube_search(
+    results = await youtube_search(
         channels=channels,
         query=query,
         period_days=period_days,
@@ -129,6 +132,7 @@
         get_transcripts=get_transcripts,
         char_cap=char_cap,
     )
+    return results


 @app.get("/youtube-transcripts", response_model=List[VideoTranscript])
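With the tightened defaults (max_channels now 5, max_videos_per_channel now 2) and the char_cap guard wired through, a client call could look like the sketch below. The /youtube-search route path and the X-API-Key header name are assumptions for illustration; only the handler, not its route or auth scheme, is visible in this diff. The query, period_days, and char_cap parameter names do come from the diff.

# Hypothetical client call; path and header name are assumptions.
import requests

resp = requests.get(
    "http://localhost:8000/youtube-search",
    params={"query": "election news", "period_days": 3, "char_cap": 20000},
    headers={"X-API-Key": "changeme"},
)
videos = resp.json()  # serialized video list should stay under char_cap characters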
api/youtube.py: 29 changes (20 additions, 9 deletions)
@@ -1,10 +1,12 @@
 import asyncio
+import datetime
 import json
+import time
 import urllib.parse
 from collections import namedtuple
-from typing import Dict, List, Optional, Union
+from typing import Any, Dict, List, Optional, Union
 
 import dateparser
 import requests
 from aiohttp import ClientSession
 from fastapi import HTTPException
@@ -140,7 +142,7 @@ async def youtube_search(
     query: str = None,
     channels: str = None,
     period_days: int = 3,
-    max_channels: int = None,
+    max_channels: int = 5,
     max_videos_per_channel: int = 3,
     get_descriptions: bool = False,
     get_transcripts: bool = False,
@@ -184,16 +186,25 @@
         res.extend(videos)
     print("Number of videos found: " + str(len(res)))
     if char_cap:
-        capped = []
-        for idx, video in enumerate(videos):
-            capped.append(video)
-            if len(json.dumps(capped)) > char_cap:
-                break
-        return videos[0 : idx - 1]
-
+        res = filter_by_char_cap(res, char_cap)
+    # res.sort(key=sort_by_publish_time)
     return res
+
+
+def filter_by_char_cap(videos: List[Video], char_cap: int) -> List[Video]:
+    while len(json.dumps([vid.model_dump_json() for vid in videos])) > char_cap:
+        transcript_lengths = [len(video.transcript) for video in videos]
+        max_index = transcript_lengths.index(max(transcript_lengths))
+        videos.pop(max_index)
+    return videos
+
+
+def sort_by_publish_time(video: Video) -> float:
+    now = datetime.datetime.now()
+    d = dateparser.parse(video.publish_time, settings={"RELATIVE_BASE": now})
+    return time.mktime(d.timetuple())
 
 
 @cache(ttl=3600)
 def youtube_transcripts(
     ids: str,
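The new filter_by_char_cap is a greedy eviction loop: while the JSON-serialized video list exceeds char_cap, drop the video with the longest transcript. Note that the length check runs json.dumps over strings that model_dump_json() has already serialized, so the cap is measured against a doubly-encoded payload, escaping overhead included. A minimal self-contained sketch of the same loop; this Video stub only mimics the two members the loop touches and is not the repo's actual pydantic model:

import json
from typing import List


class Video:
    # Stand-in for the project's pydantic model (assumed shape).
    def __init__(self, title: str, transcript: str):
        self.title = title
        self.transcript = transcript

    def model_dump_json(self) -> str:
        # pydantic v2 models render themselves to a JSON string like this
        return json.dumps({"title": self.title, "transcript": self.transcript})


def filter_by_char_cap(videos: List[Video], char_cap: int) -> List[Video]:
    # Evict the longest transcript until the serialized list fits the cap.
    while len(json.dumps([vid.model_dump_json() for vid in videos])) > char_cap:
        transcript_lengths = [len(video.transcript) for video in videos]
        max_index = transcript_lengths.index(max(transcript_lengths))
        videos.pop(max_index)
    return videos


videos = [Video("short", "a" * 10), Video("medium", "b" * 100), Video("long", "c" * 1000)]
print([v.title for v in filter_by_char_cap(videos, char_cap=400)])
# -> ['short', 'medium']  (the 1000-char transcript is evicted first)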
bin/extract-openapi.py: 8 changes (8 additions, 0 deletions)
@@ -28,6 +28,14 @@
 print(f"importing app from {args.app}")
 app = import_from_string(args.app)
 openapi = app.openapi()
+# clean up unwanted props that custom GPTs don't like:
+for path in openapi["paths"]:
+    for method in openapi["paths"][path]:
+        if "parameters" in openapi["paths"][path][method]:
+            for param in openapi["paths"][path][method]["parameters"]:
+                if "description" in param["schema"]:
+                    del param["schema"]["description"]
+
 version = openapi.get("openapi", "unknown version")
 
 print(f"writing openapi spec v{version}")
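The new loop walks every operation in the exported spec and deletes the per-parameter schema descriptions that, per the commit's own comment, custom GPTs don't like. A minimal sketch with an assumed miniature spec (the /youtube-search path and char_cap entry are illustrative; only the nesting mirrors what the loop expects):

# Assumed miniature spec for demonstration.
openapi = {
    "paths": {
        "/youtube-search": {
            "get": {
                "parameters": [
                    {
                        "name": "char_cap",
                        "in": "query",
                        "schema": {"type": "integer", "description": "dupe"},
                    }
                ]
            }
        }
    }
}

# Same cleanup as the committed code: strip "description" from each schema.
for path in openapi["paths"]:
    for method in openapi["paths"][path]:
        if "parameters" in openapi["paths"][path][method]:
            for param in openapi["paths"][path][method]["parameters"]:
                if "description" in param["schema"]:
                    del param["schema"]["description"]

print(openapi["paths"]["/youtube-search"]["get"]["parameters"][0]["schema"])
# -> {'type': 'integer'}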