diff --git a/api/main.py b/api/main.py index 70eb808..1eae604 100644 --- a/api/main.py +++ b/api/main.py @@ -16,6 +16,7 @@ def search_allsides( offset: int = 0, _: None = Depends(verify_apikey), ) -> List[Dict[str, str]]: + """Search the AllSides database for a partial name""" results = query_allsides(name, limit, offset) return results[offset:] @@ -27,6 +28,7 @@ def search_mediabiasfactcheck( offset: int = 0, _: None = Depends(verify_apikey), ) -> List[Dict[str, str]]: + """Search the MediaBiasFactCheck database for a partial name""" results = query_mediabiasfactcheck(name, limit, offset) return results[offset:] @@ -38,6 +40,7 @@ async def search_media( offset: int = 0, _: None = Depends(verify_apikey), ) -> List[Media]: + """Search the curated independent media database for a partial name""" results = await query_media(query, top_k=limit + offset) return results[offset:] @@ -74,14 +77,14 @@ async def get_youtube_search( title="Max channels", description="Maximum number of channels that we want to match. Needed when no channels were provided.", ), - ] = 12, + ] = 5, max_videos_per_channel: Annotated[ int, Query( title="Max videos per channel", description="The maximum number of videos per channel that we want from each channel search.", ), - ] = 3, + ] = 2, get_descriptions: Annotated[ bool, Query( @@ -119,7 +122,7 @@ async def get_youtube_search( status_code=400, detail='"max_channels" must be provided when no "channels" are set!', ) - return await youtube_search( + results = await youtube_search( channels=channels, query=query, period_days=period_days, @@ -129,6 +132,7 @@ async def get_youtube_search( get_transcripts=get_transcripts, char_cap=char_cap, ) + return results @app.get("/youtube-transcripts", response_model=List[VideoTranscript]) diff --git a/api/youtube.py b/api/youtube.py index aed4a0e..3a6c403 100644 --- a/api/youtube.py +++ b/api/youtube.py @@ -1,10 +1,12 @@ import asyncio +import datetime import json import time import urllib.parse from collections import namedtuple -from typing import Dict, List, Optional, Union +from typing import Any, Dict, List, Optional, Union +import dateparser import requests from aiohttp import ClientSession from fastapi import HTTPException @@ -140,7 +142,7 @@ async def youtube_search( query: str = None, channels: str = None, period_days: int = 3, - max_channels: int = None, + max_channels: int = 5, max_videos_per_channel: int = 3, get_descriptions: bool = False, get_transcripts: bool = False, @@ -184,16 +186,25 @@ async def youtube_search( res.extend(videos) print("Number of videos found: " + str(len(res))) if char_cap: - capped = [] - for idx, video in enumerate(videos): - capped.append(video) - if len(json.dumps(capped)) > char_cap: - break - return videos[0 : idx - 1] - + res = filter_by_char_cap(res, char_cap) + # res.sort(key=sort_by_publish_time) return res +def filter_by_char_cap(videos: List[Video], char_cap: int) -> List[Video]: + while len(json.dumps([vid.model_dump_json() for vid in videos])) > char_cap: + transcript_lengths = [len(video.transcript) for video in videos] + max_index = transcript_lengths.index(max(transcript_lengths)) + videos.pop(max_index) + return videos + + +def sort_by_publish_time(video: Video) -> float: + now = datetime.datetime.now() + d = dateparser.parse(video.publish_time, settings={"RELATIVE_BASE": now}) + return time.mktime(d.timetuple()) + + @cache(ttl=3600) def youtube_transcripts( ids: str, diff --git a/bin/extract-openapi.py b/bin/extract-openapi.py index 9a194d6..c502381 100755 --- a/bin/extract-openapi.py +++ b/bin/extract-openapi.py @@ -28,6 +28,14 @@ print(f"importing app from {args.app}") app = import_from_string(args.app) openapi = app.openapi() + # clean up unwanted props that custom GPTs don't like: + for path in openapi["paths"]: + for method in openapi["paths"][path]: + if "parameters" in openapi["paths"][path][method]: + for param in openapi["paths"][path][method]["parameters"]: + if "description" in param["schema"]: + del param["schema"]["description"] + version = openapi.get("openapi", "unknown version") print(f"writing openapi spec v{version}") diff --git a/openapi.yaml b/openapi.yaml index 35d7596..b2960ea 100644 --- a/openapi.yaml +++ b/openapi.yaml @@ -6,32 +6,33 @@ paths: /allsides: get: summary: Search Allsides + description: Search the AllSides database for a partial name operationId: search_allsides_allsides_get security: - - APIKeyQuery: [] - - APIKeyHeader: [] - - HTTPBearer: [] + - APIKeyQuery: [] + - APIKeyHeader: [] + - HTTPBearer: [] parameters: - - name: name - in: query - required: true - schema: - type: string - title: Name - - name: limit - in: query - required: false - schema: - type: integer - default: 5 - title: Limit - - name: offset - in: query - required: false - schema: - type: integer - default: 0 - title: Offset + - name: name + in: query + required: true + schema: + type: string + title: Name + - name: limit + in: query + required: false + schema: + type: integer + default: 5 + title: Limit + - name: offset + in: query + required: false + schema: + type: integer + default: 0 + title: Offset responses: '200': description: Successful Response @@ -53,32 +54,33 @@ paths: /mediabiasfactcheck: get: summary: Search Mediabiasfactcheck + description: Search the MediaBiasFactCheck database for a partial name operationId: search_mediabiasfactcheck_mediabiasfactcheck_get security: - - APIKeyQuery: [] - - APIKeyHeader: [] - - HTTPBearer: [] + - APIKeyQuery: [] + - APIKeyHeader: [] + - HTTPBearer: [] parameters: - - name: name - in: query - required: true - schema: - type: string - title: Name - - name: limit - in: query - required: false - schema: - type: integer - default: 5 - title: Limit - - name: offset - in: query - required: false - schema: - type: integer - default: 0 - title: Offset + - name: name + in: query + required: true + schema: + type: string + title: Name + - name: limit + in: query + required: false + schema: + type: integer + default: 5 + title: Limit + - name: offset + in: query + required: false + schema: + type: integer + default: 0 + title: Offset responses: '200': description: Successful Response @@ -100,32 +102,33 @@ paths: /media: get: summary: Search Media + description: Search the curated independent media database for a partial name operationId: search_media_media_get security: - - APIKeyQuery: [] - - APIKeyHeader: [] - - HTTPBearer: [] + - APIKeyQuery: [] + - APIKeyHeader: [] + - HTTPBearer: [] parameters: - - name: query - in: query - required: true - schema: - type: string - title: Query - - name: limit - in: query - required: false - schema: - type: integer - default: 5 - title: Limit - - name: offset - in: query - required: false - schema: - type: integer - default: 0 - title: Offset + - name: query + in: query + required: true + schema: + type: string + title: Query + - name: limit + in: query + required: false + schema: + type: integer + default: 5 + title: Limit + - name: offset + in: query + required: false + schema: + type: integer + default: 0 + title: Offset responses: '200': description: Successful Response @@ -145,85 +148,81 @@ paths: /media-videos: get: summary: Get Youtube Search - description: - Get the details of matching videos by either providing Youtube + description: Get the details of matching videos by either providing Youtube channels, a query, or both operationId: get_youtube_search_media_videos_get security: - - APIKeyQuery: [] - - APIKeyHeader: [] - - HTTPBearer: [] + - APIKeyQuery: [] + - APIKeyHeader: [] + - HTTPBearer: [] parameters: - - name: query - in: query - required: false - schema: - type: string - minLength: 3 - title: Query string - description: - Query string used to match independent news channels and do a - youtube search with in those channels. - example: israel - - name: channels - in: query - required: false - schema: - type: string - title: Channels to search in - description: A string of comma-separated Youtube channels to search in. - example: '@aljazeeraenglish,@DemocracyNow' - - name: period_days - in: query - required: false - schema: - type: integer - title: Period in days - default: 3 - description: The period in days since now that we want to search videos for. - - name: max_channels - in: query - required: false - schema: - type: integer - title: Max channels - default: 12 - description: - Maximum number of channels that we want to match. Needed when - no channels were provided. - - name: max_videos_per_channel - in: query - required: false - schema: - type: integer - title: Max videos per channel - default: 3 - description: - The maximum number of videos per channel that we want from each - channel search. - - name: get_descriptions - in: query - required: false - schema: - type: boolean - title: Get descriptions - default: false - description: Get the long descriptions for the videos. - - name: get_transcripts - in: query - required: false - schema: - type: boolean - title: Get transcripts - default: true - description: Get the transcripts for the videos. - - name: char_cap - in: query - required: false - schema: - type: integer - title: Max chars in the response - description: The maximum number of characters for the response. + - name: query + in: query + required: false + schema: + type: string + minLength: 3 + title: Query string + description: Query string used to match independent news channels and do a + youtube search with in those channels. + example: israel + - name: channels + in: query + required: false + schema: + type: string + title: Channels to search in + description: A string of comma-separated Youtube channels to search in. + example: '@aljazeeraenglish,@DemocracyNow' + - name: period_days + in: query + required: false + schema: + type: integer + title: Period in days + default: 3 + description: The period in days since now that we want to search videos for. + - name: max_channels + in: query + required: false + schema: + type: integer + title: Max channels + default: 12 + description: Maximum number of channels that we want to match. Needed when + no channels were provided. + - name: max_videos_per_channel + in: query + required: false + schema: + type: integer + title: Max videos per channel + default: 3 + description: The maximum number of videos per channel that we want from each + channel search. + - name: get_descriptions + in: query + required: false + schema: + type: boolean + title: Get descriptions + default: false + description: Get the long descriptions for the videos. + - name: get_transcripts + in: query + required: false + schema: + type: boolean + title: Get transcripts + default: true + description: Get the transcripts for the videos. + - name: char_cap + in: query + required: false + schema: + type: integer + title: Max chars in the response + description: The maximum number of characters for the response. responses: '200': description: Successful Response @@ -246,16 +245,16 @@ paths: description: Extract transcripts from a list of Youtube video ids operationId: get_youtube_transcripts_youtube_transcripts_get security: - - APIKeyQuery: [] - - APIKeyHeader: [] - - HTTPBearer: [] + - APIKeyQuery: [] + - APIKeyHeader: [] + - HTTPBearer: [] parameters: - - name: ids - in: query - required: true - schema: - type: string - title: Ids + - name: ids + in: query + required: true + schema: + type: string + title: Ids responses: '200': description: Successful Response @@ -323,38 +322,38 @@ components: title: X Bias: anyOf: - - type: string - - type: 'null' + - type: string + - type: 'null' title: Bias Profile: anyOf: - - type: string - - type: 'null' + - type: string + - type: 'null' title: Profile Factual: anyOf: - - type: string - - type: 'null' + - type: string + - type: 'null' title: Factual Credibility: anyOf: - - type: string - - type: 'null' + - type: string + - type: 'null' title: Credibility type: object required: - - Name - - Website - - Youtube - - About - - TrustFactors - - Topics - - Wikipedia - - X - - Bias - - Profile - - Factual - - Credibility + - Name + - Website + - Youtube + - About + - TrustFactors + - Topics + - Wikipedia + - X + - Bias + - Profile + - Factual + - Credibility title: Media description: Media model ValidationError: @@ -362,8 +361,8 @@ components: loc: items: anyOf: - - type: string - - type: integer + - type: string + - type: integer type: array title: Location msg: @@ -374,9 +373,9 @@ components: title: Error Type type: object required: - - loc - - msg - - type + - loc + - msg + - type title: ValidationError Video: properties: @@ -406,24 +405,24 @@ components: title: Url Suffix long_desc: anyOf: - - type: string - - type: 'null' + - type: string + - type: 'null' title: Long Desc transcript: anyOf: - - type: string - - type: 'null' + - type: string + - type: 'null' title: Transcript type: object required: - - id - - title - - short_desc - - channel - - duration - - views - - publish_time - - url_suffix + - id + - title + - short_desc + - channel + - duration + - views + - publish_time + - url_suffix title: Video description: Video model VideoTranscript: @@ -436,8 +435,8 @@ components: title: Text type: object required: - - id - - text + - id + - text title: VideoTranscript description: Video transcript model securitySchemes: diff --git a/requirements-prod.txt b/requirements-prod.txt index 47d5e1a..cbbff70 100644 --- a/requirements-prod.txt +++ b/requirements-prod.txt @@ -1,15 +1,16 @@ -munch -webdriver-manager +dateparser faiss-cpu fastapi llama_index llama-index-llms-openai llama-index-retrievers-bm25 llama-index-vector-stores-faiss +munch openai pyyaml rank-bm25 selenium streamlit uvicorn +webdriver-manager youtube-transcript-api \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index 977ba9c..a223333 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,15 +1,16 @@ -munch==4.0.0 -webdriver-manager==4.0.1 +dateparser==1.2.0 faiss-cpu==1.7.4 fastapi==0.109.2 llama-index==0.10.11 llama-index-llms-openai==0.1.6 llama-index-retrievers-bm25==0.1.2 llama-index-vector-stores-faiss==0.1.1 +munch==4.0.0 openai==1.12.0 PyYAML==6.0.1 rank-bm25==0.2.2 selenium==4.21.0 streamlit==1.31.1 uvicorn==0.27.1 +webdriver-manager==4.0.1 youtube-transcript-api==0.6.2