# Keep Up With The Trends

**Installation steps with conda**

1. conda create -n kuwtt python=3.10.14, then conda activate kuwtt
2. pip install -r requirements.txt 
3. pip install torch==2.3.1 --index-url https://download.pytorch.org/whl/cu118
4. [Download](https://ollama.com/download), Install & Run Ollama

In [30]:
# verify that CUDA is available
# (torch.cuda.is_available() is queried once and the result is kept in
# `cuda_available`, then printed)

import torch

cuda_available = torch.cuda.is_available()
print(f"CUDA available: {cuda_available}")

CUDA available: True


In [31]:
# Import necessary libraries

import html
import json
import re
import time
from pprint import pprint

import pandas as pd
import requests
import xmltodict
from duckduckgo_search import DDGS
from tqdm.notebook import tqdm

## Scraping Trends Data

In [73]:
def fetch_and_parse_xml(url, timeout=30):
    """Download an XML document and convert it to a nested dictionary.

    Args:
        url: Address of the XML (e.g. RSS) resource to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout `requests.get` can block indefinitely on a stalled
            connection.

    Returns:
        The dictionary produced by `xmltodict.parse`, or None when the
        download or the parsing fails (the error is printed, not raised).
    """
    try:
        # Fetch the XML data; timeouts surface as RequestException subclasses
        # and are therefore reported by the handler below.
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()  # Raise an error for bad status codes

        # Parse the XML data and convert it to a dictionary
        return xmltodict.parse(response.content)

    except requests.exceptions.RequestException as e:
        print(f"Error fetching the XML data: {e}")
    except xmltodict.expat.ExpatError as e:
        print(f"Error parsing the XML data: {e}")

    # Make the failure result explicit for callers.
    return None

In [74]:
# URL for the XML data
# NOTE(review): only the realtime URL is used in this cell; rss_xml_url_daily
# is defined but not fetched here.
rss_xml_url_daily = 'https://trends.google.fr/trends/trendingsearches/daily/rss?geo=US'
rss_xml_url_realtime = 'https://trends.google.com/trending/rss?geo=US'

# Fetch, parse, and print the XML data
# fetch_and_parse_xml prints the error and yields None when the request or the
# parsing fails, so a None here means the feed could not be loaded.
trends_dict = fetch_and_parse_xml(rss_xml_url_realtime)

pprint(trends_dict)

{'rss': {'@version': '2.0',
         '@xmlns:atom': 'http://www.w3.org/2005/Atom',
         '@xmlns:ht': 'https://trends.google.com/trending/rss',
         'channel': {'atom:link': {'@href': 'https://trends.google.com/trending/rss?geo=US',
                                   '@rel': 'self',
                                   '@type': 'application/rss+xml'},
                     'description': 'Recent searches',
                     'item': [{'description': None,
                               'ht:approx_traffic': '500+',
                               'ht:news_item': [{'ht:news_item_picture': 'https://encrypted-tbn3.gstatic.com/images?q=tbn:ANd9GcTql8i2IYcESG8Y4vPdeoM3rlxw-Lkq9l9d0o7YudZ0-vo5KjfLni_wxA2O08I',
                                                 'ht:news_item_snippet': None,
                                                 'ht:news_item_source': 'NBC '
                                                                        'Sports',
                                          

In [75]:
# create google trends dataframe
#
# Each feed item becomes one row. The "url" and "title" columns hold parallel
# lists with one entry per news article attached to the trend.

google_trends_dict = {"trend_kws":[], "traffic":[], "pubDate":[], "url":[], "title":[]}

# xmltodict returns a single dict instead of a list when an element occurs
# only once, so normalize the items to a list before iterating.
trend_items = trends_dict['rss']['channel']['item']
if not isinstance(trend_items, list):
    trend_items = [trend_items]

for trend in trend_items:
    google_trends_dict['trend_kws'].append(trend['title'])
    google_trends_dict['traffic'].append(trend['ht:approx_traffic'])
    google_trends_dict['pubDate'].append(trend['pubDate'])

    # A trend may carry several news items (list), exactly one (dict) or none
    # at all (missing key / None); the last case yields empty lists.
    news_items = trend.get('ht:news_item') or []
    if not isinstance(news_items, list):
        news_items = [news_items]

    google_trends_dict['url'].append([news_item['ht:news_item_url'] for news_item in news_items])
    google_trends_dict['title'].append([news_item['ht:news_item_title'] for news_item in news_items])

google_trends_df = pd.DataFrame(google_trends_dict)

In [76]:
# clean data
# Titles arrive with HTML character references still encoded (e.g. "&#39;" for
# an apostrophe). html.unescape decodes every such reference, not only "&#39;".

google_trends_df["title"] = google_trends_df["title"].map(lambda titles: [html.unescape(title) for title in titles])

In [77]:
# Preview the first rows of the trends frame built above.
google_trends_df.head()

Unnamed: 0,trend_kws,traffic,pubDate,url,title
0,fantasy pros,500+,"Sun, 1 Sep 2024 09:40:00 -0700",[https://www.nbcsports.com/fantasy/football/ne...,[Fantasy Football 2024: Rotoworld staff most d...
1,nick kyrgios,1000+,"Sun, 1 Sep 2024 09:30:00 -0700",[https://www.express.co.uk/sport/tennis/194233...,[Carlos Alcaraz and Nick Kyrgios' DMs to each ...
2,zendaya,1000+,"Sun, 1 Sep 2024 09:30:00 -0700",[https://ew.com/marisa-tomei-talks-watching-to...,[Marisa Tomei says watching Tom Holland and Ze...
3,texas volleyball,1000+,"Sun, 1 Sep 2024 09:30:00 -0700",[https://www.jsonline.com/story/sports/college...,[What channel is Wisconsin volleyball vs. Texa...
4,presidential debates,1000+,"Sun, 1 Sep 2024 09:20:00 -0700",[https://www.msnbc.com/weekends-with-alex-witt...,[Middle class ‘wants to see the contrast’ betw...


In [78]:
# define the list of domains to skip
# Entries are matched as plain substrings of article URLs (`domain in url`)
# by the filtering steps further down.

domains_skip_list = ["msn.com", "washingtonpost.com"]

In [79]:
# remove urls that contain domains in domains_skip_list
#
# The lists are filtered into fresh sequences and written back with slice
# assignment. Popping from a list while enumerating it skips the element that
# follows every removal, so two consecutive blocked URLs would leave one behind.

for i in range(google_trends_df.shape[0]):
    row_urls = google_trends_df.loc[i, "url"]
    row_titles = google_trends_df.loc[i, "title"]

    # Keep (url, title) pairs together so both lists stay index-aligned.
    kept_pairs = [
        (url, title)
        for url, title in zip(row_urls, row_titles)
        if not any(domain in url for domain in domains_skip_list)
    ]

    # Mutate the existing list objects so the DataFrame cells keep referring
    # to them, as the original in-place pops did.
    row_urls[:] = [url for url, _ in kept_pairs]
    row_titles[:] = [title for _, title in kept_pairs]

In [80]:
# Inspect the frame after the domain filtering (up to 20 rows).
google_trends_df.head(20)

Unnamed: 0,trend_kws,traffic,pubDate,url,title
0,fantasy pros,500+,"Sun, 1 Sep 2024 09:40:00 -0700",[https://www.nbcsports.com/fantasy/football/ne...,[Fantasy Football 2024: Rotoworld staff most d...
1,nick kyrgios,1000+,"Sun, 1 Sep 2024 09:30:00 -0700",[https://www.express.co.uk/sport/tennis/194233...,[Carlos Alcaraz and Nick Kyrgios' DMs to each ...
2,zendaya,1000+,"Sun, 1 Sep 2024 09:30:00 -0700",[https://ew.com/marisa-tomei-talks-watching-to...,[Marisa Tomei says watching Tom Holland and Ze...
3,texas volleyball,1000+,"Sun, 1 Sep 2024 09:30:00 -0700",[https://www.jsonline.com/story/sports/college...,[What channel is Wisconsin volleyball vs. Texa...
4,presidential debates,1000+,"Sun, 1 Sep 2024 09:20:00 -0700",[https://www.msnbc.com/weekends-with-alex-witt...,[Middle class ‘wants to see the contrast’ betw...
5,bayern,500+,"Sun, 1 Sep 2024 09:20:00 -0700",[https://onefootball.com/de/news/pre-match-pro...,[Pre-match programme for first home game again...
6,cabo san lucas,500+,"Sun, 1 Sep 2024 09:20:00 -0700",[https://www.areacucuta.com/los-cabos-el-parai...,[Los Cabos: El Paraíso Que Eligen Las Celebrid...
7,rublev,10000+,"Sun, 1 Sep 2024 09:10:00 -0700",[https://www.usopen.org/en_US/news/articles/20...,[Andrey Rublev and Grigor Dimitrov's bromance ...
8,lou holtz,200000+,"Sun, 1 Sep 2024 09:10:00 -0700",[https://www.espn.com/college-football/story/_...,"['It wasn't pretty,' but No. 7 Irish proud of ..."
9,delta airlines news,2000+,"Sun, 1 Sep 2024 09:10:00 -0700",[https://nypost.com/2024/08/31/us-news/man-kic...,[Man kicked off Delta flight for wearing Donal...


In [81]:
# get trends related news articles from ddg web search
#
# For every trend keyword: query DuckDuckGo news, drop results whose URL
# contains a skipped domain, tag each remaining result with its keyword and
# keep at most the first five.

trends_news = []
for trend_kw in tqdm(google_trends_df.trend_kws.to_list()):
    results = DDGS().news(keywords=trend_kw, region="wt-wt", safesearch="moderate", max_results=15)
    filtered_results = [
        {'trend_kws': trend_kw, **res}
        for res in results
        if all(domain not in res['url'] for domain in domains_skip_list)
    ][:5]
    trends_news.extend(filtered_results)

  0%|          | 0/10 [00:00<?, ?it/s]

In [82]:
# create ddg news articles dataframe
# One row per collected news result; every record was tagged with its
# 'trend_kws' when it was added to trends_news.

trends_ddg_news_df = pd.DataFrame(trends_news)
trends_ddg_news_df.head(10)

Unnamed: 0,trend_kws,date,title,body,url,image,source
0,nick kyrgios,2024-09-01T00:37:00+00:00,Alexei Popyrin focused on next step at US Open...,Nick Kyrgios believes Alexei Popyrin has what ...,https://www.abc.net.au/news/2024-09-01/alexei-...,https://live-production.wcms.abc-cdn.net.au/b6...,Australian Broadcasting Corporation
1,nick kyrgios,2024-08-31T08:00:00+00:00,Nick Kyrgios rips Jannik Sinner with new bruta...,The Australian has once again harshly attacked...,https://www.tennisworldusa.org/tennis/news/Ten...,https://www.tennisworldusa.org/world/media/ima...,Tennis World
2,nick kyrgios,2024-08-30T14:14:00+00:00,Nick Kyrgios reveals DMs and close bond with C...,Nick Kyrgios has revealed the extent of his re...,https://www.mirror.co.uk/sport/tennis/nick-kyr...,https://i2-prod.mirror.co.uk/sport/article3356...,The Mirror
3,nick kyrgios,2024-08-29T22:06:00+00:00,Jannik Sinner responds to whether he will allo...,"Jannik Sinner admitted ""It's going to be diffe...",https://www.express.co.uk/sport/tennis/1942106...,https://cdn.images.express.co.uk/img/dynamic/7...,Daily Express
4,nick kyrgios,2024-08-30T12:45:00+00:00,Carlos Alcaraz and Nick Kyrgios' DMs to each o...,Carlos Alcaraz shocked the world as he was kno...,https://www.express.co.uk/sport/tennis/1942333...,https://cdn.images.express.co.uk/img/dynamic/7...,Daily Express
5,zendaya,2024-09-01T16:00:00+00:00,Jenna Ortega Once Auditioned for a Major Film ...,"Several years back, it seems Jenna Ortega and ...",https://www.yahoo.com/entertainment/jenna-orte...,,Yahoo
6,zendaya,2024-08-29T23:06:00+00:00,Marisa Tomei Recalls Watching Tom Holland & Ze...,The Academy Award winner had fond memories of ...,https://deadline.com/2024/08/marisa-tomei-watc...,https://deadline.com/wp-content/uploads/2024/0...,Deadline.com
7,zendaya,2024-08-28T16:02:00+00:00,Marisa Tomei says watching Tom Holland and Zen...,Marisa Tomei's spidey senses told her the roma...,https://nypost.com/2024/08/28/entertainment/ma...,https://nypost.com/wp-content/uploads/sites/2/...,New York Post
8,zendaya,2024-08-30T07:39:00+00:00,Marisa Tomei reflects on Tom Holland and Zenda...,Tomei recalled how she watched Holland and Zen...,https://timesofindia.indiatimes.com/entertainm...,https://static.toiimg.com/thumb/msid-112918747...,Indiatimes
9,zendaya,2024-08-31T19:21:00+00:00,Marisa Tomei recounts Tom Holland-Zendaya's bl...,"""I think some of the favorite things were real...",https://www.hindustantimes.com/htcity/cinema/m...,https://www.hindustantimes.com/ht-img/img/2024...,Hindustan Times


In [85]:
# update google trends dataframe with ddg news articles urls and titles
#
# For each trend, the matching DuckDuckGo rows are selected once and their
# urls/titles are appended to the lists already stored in that trend's row.

for i, trend in enumerate(google_trends_df.trend_kws):
    trend_rows = trends_ddg_news_df[trends_ddg_news_df["trend_kws"] == trend]
    google_trends_df.loc[i, "url"].extend(trend_rows['url'].to_list())
    google_trends_df.loc[i, "title"].extend(trend_rows['title'].to_list())

In [86]:
# Titles gathered for the first trend (row label 0).
google_trends_df.title[0]

['Fantasy Football 2024: Rotoworld staff most drafted players',
 '2024 Fantasy football draft strategy: Tiers, cheat sheets, best breakout \nplayers, deep sleepers by NFL expert',
 "Don't be surprised if ... : Fantasy predictions for 2024"]

In [87]:
# URLs gathered for the first trend (row label 0).
google_trends_df.url[0]

['https://www.nbcsports.com/fantasy/football/news/fantasy-football-2024-rotoworld-staff-most-drafted-players',
 'https://www.cbssports.com/fantasy/football/news/2024-fantasy-football-draft-strategy-tiers-cheat-sheets-best-breakout-players-deep-sleepers-by-nfl-expert/',
 'https://www.espn.com/fantasy/football/story/_/id/41032720/2024-fantasy-football-rankings-draft-picks-predictions-surprises']

## Performing RAG (Retrieval-Augmented Generation)

In [88]:
# import libraries for rag with langchain

from langchain_community.document_loaders import SeleniumURLLoader, UnstructuredURLLoader, WebBaseLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_ollama import ChatOllama
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.pydantic_v1 import BaseModel
from langchain_core.utils.function_calling import convert_to_openai_tool
from langchain_core.runnables import RunnableLambda

In [89]:
# define embedding model
# Referenced by model name; passed to the RAG functions below, which use it
# when building the FAISS vector store.
embeddings_model = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

# define llm
# Chat model served through Ollama (installation step 4 must be running).
# NOTE(review): keep_alive="2m" presumably controls how long Ollama keeps the
# model loaded between calls — confirm against the Ollama docs.
llm = ChatOllama(
    model="llama3.1",
    temperature=0.2,
    keep_alive="2m",
    repeat_penalty=1.03,
    # other params...
)


# define prompt template
# Two-message chat prompt: a fixed system instruction and a human message
# whose {article} placeholder receives the text to summarize.
# The system string is sent as written, including its leading indentation,
# so edits to it change what the model sees.
prompt_template = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            """
            You are a helpful assistant that specializes in article summarization.
            Your task is to summarize a given text article and generate a title for it.
            If the provided article doesn't contain coherent and meaningful content, just return an empty response.
            """,
        ),
        ("human", "Article: {article}"),
    ]
)


# define output schema for llm
# Two required string fields: the generated title and the summary.
# NOTE(review): the class docstring presumably ends up as the tool description
# in the converted schema, so treat it as runtime text — confirm before editing.
class SummmaryWithTitle(BaseModel):
    '''Article summary and title.'''
    title: str
    summary: str

# Convert the model into a tool schema and bind it to the LLM so that replies
# are returned in that structure.
dict_schema = convert_to_openai_tool(SummmaryWithTitle)
structured_output_llm = llm.with_structured_output(dict_schema)

In [90]:
# define helper functions

def url_loader(url_list:list[str], loader_type:str="Unstructured"):
    """Load web pages into documents with the selected URL loader.

    Args:
        url_list: URLs to fetch.
        loader_type: One of "Unstructured", "Selenium" or "Base".

    Returns:
        Whatever the chosen loader's ``load()`` returns for ``url_list``.

    Raises:
        ValueError: If ``loader_type`` is not one of the supported names.
    """
    if loader_type == "Unstructured":
        return UnstructuredURLLoader(url_list).load()

    if loader_type == "Selenium":
        selenium_loader = SeleniumURLLoader(url_list)
        # NOTE(review): this driver is obtained separately from the load()
        # call and is closed afterwards regardless of the outcome — confirm
        # whether load() actually uses this instance.
        browser = selenium_loader._get_driver()
        try:
            return selenium_loader.load()
        finally:
            browser.close()
            browser.quit()

    if loader_type == "Base":
        # Browser-like User-Agent sent with the plain HTTP requests.
        request_headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
        }
        return WebBaseLoader(url_list, header_template=request_headers).load()

    raise ValueError(f"Loader type {loader_type} not supported.")


def rec_splitter(url_doc_list:list):
    """Split documents into overlapping character chunks.

    Args:
        url_doc_list: Documents to split.

    Returns:
        The chunks produced by a RecursiveCharacterTextSplitter configured
        with chunk_size=400, chunk_overlap=50 and add_start_index=True.
    """
    splitter_options = {
        "chunk_size": 400,
        "chunk_overlap": 50,
        "add_start_index": True,
    }
    chunker = RecursiveCharacterTextSplitter(**splitter_options)
    return chunker.split_documents(url_doc_list)


def reteived_docs_parser(ret_doc_list:list):
    """Flatten retrieved documents into newline-separated, non-trivial sentences.

    The page contents are joined with newlines, blank-line gaps are turned
    into sentence breaks, the text is split after '.', '?', '!' or a newline
    (lookbehinds suppress the split after abbreviation-like patterns such as
    "U.S." or "Mr."), and only pieces with more than five whitespace-separated
    words are kept.

    Args:
        ret_doc_list: Objects exposing a ``page_content`` string.

    Returns:
        The kept sentences joined by newlines ('' when nothing qualifies).
    """
    sentence_boundary = re.compile(
        r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<![A-Z][a-z][A-Z]\.)(?<=\.|\?|!|\n)\s*'
    )

    joined_text = "\n".join(doc.page_content for doc in ret_doc_list)
    joined_text = joined_text.replace('\n\n', '. ')

    kept_sentences = []
    for sentence in sentence_boundary.split(joined_text):
        # Drop fragments of five words or fewer (menus, captions, empties).
        if len(sentence.split()) > 5:
            kept_sentences.append(sentence)

    return '\n'.join(kept_sentences)

# Wrap the parser in a RunnableLambda so it can be composed with `|` in the
# RAG chain.
reteived_docs_parser_runnable = RunnableLambda(reteived_docs_parser)

In [91]:
# define functions to run the rag pipeline

def run_rag_chain_once(trend, google_df, ddg_df, embeddings_model, reteived_docs_parser_runnable, prompt_template, structured_output_llm):
    """Scrape, index and summarize the articles collected for one trend.

    Steps: load the trend's URLs (plain HTTP loader first, falling back to the
    Unstructured loader when little text comes back or the first loader
    raises), fill empty pages with the DuckDuckGo snippet for the same URL,
    build a FAISS index over the chunks, retrieve the chunks closest to the
    trend's titles and ask the LLM for a title and summary.

    Args:
        trend: Keyword identifying the row of ``google_df`` to process.
        google_df: Frame with 'trend_kws', 'url' (list) and 'title' (list).
        ddg_df: Frame of DuckDuckGo results with 'url' and 'body' columns;
            may be empty.
        embeddings_model: Embedding model handed to FAISS.
        reteived_docs_parser_runnable: Runnable turning retrieved documents
            into the article text.
        prompt_template: Chat prompt expecting an ``article`` variable.
        structured_output_llm: LLM bound to the title/summary schema.

    Returns:
        The value produced by the chain (expected to be a mapping with
        'title' and 'summary'), or a placeholder dict with those keys when no
        text could be scraped at all.
    """
    print("# Starting RAG pipeline")
    start = time.time()

    print("## Scraping Articles")
    df_trend = google_df[google_df["trend_kws"]==trend]
    url_list = df_trend['url'].iloc[0]
    print(url_list)
    try:
        url_docs = url_loader(url_list, "Base")
        # Very little text usually means the pages were blocked or rendered
        # client-side, so retry with the other loader.
        if len(" ".join([doc.page_content for doc in url_docs])) < 5000:
            print("Using UnstructuredLoader")
            url_docs = url_loader(url_list, "Unstructured")
    except Exception as e:
        print("Exception BaseURLLoader:", e)
        print("Using UnstructuredLoader")
        url_docs = url_loader(url_list, "Unstructured")

    # Map each DuckDuckGo URL to its snippet once (first occurrence wins).
    # Pairing values positionally avoids treating a list position as a
    # DataFrame index label, which breaks on a non-default index.
    ddg_body_by_url = {}
    if 'url' in ddg_df and 'body' in ddg_df:
        for ddg_url, ddg_body in zip(ddg_df['url'].to_list(), ddg_df['body'].to_list()):
            ddg_body_by_url.setdefault(ddg_url, ddg_body)

    for doc in url_docs:
        if doc.page_content == "":
            fallback_body = ddg_body_by_url.get(doc.metadata.get("source"))
            # Snippets can be missing (None/NaN); only append real text.
            if isinstance(fallback_body, str):
                doc.page_content += fallback_body
    scraping_checkpoint = time.time()
    scraping_dur = scraping_checkpoint - start

    print("## Creating FAISS vectorstore")
    splits_docs = rec_splitter(url_docs)
    if not splits_docs:
        # Nothing to index: an empty document list cannot be embedded, so
        # return the same placeholder run_rag_chain_loop uses for trends
        # without URLs.
        print("No text could be scraped for this trend")
        return {'title': trend, 'summary': 'No enough information yet!'}
    faiss_db = FAISS.from_documents(splits_docs, embeddings_model)
    faiss_retriever = faiss_db.as_retriever(search_type="similarity",
                                search_kwargs={'k': 5})
    faiss_checkpoint = time.time()
    faiss_dur = faiss_checkpoint - scraping_checkpoint

    print("## Performing RAG")
    # The trend's article titles act as the retrieval query.
    ret_query = '\n'.join(df_trend['title'].iloc[0])
    rag_chain = (faiss_retriever
                    | { "article": reteived_docs_parser_runnable }
                    | prompt_template
                    | structured_output_llm)
    rag_results = rag_chain.invoke(ret_query)
    end = time.time()
    chain_dur = end - faiss_checkpoint
    print(f"Scrape: {scraping_dur}, Faiss: {faiss_dur}, Chain: {chain_dur}")
    return rag_results


def run_rag_chain_loop(google_df, ddg_df, embeddings_model, reteived_docs_parser_runnable, prompt_template, structured_output_llm):
    """Run the RAG pipeline for every trend and collect titles and summaries.

    Trends whose URL list is empty are not sent through the pipeline; they get
    a placeholder entry instead. The same placeholder values are used when the
    pipeline returns nothing usable (the system prompt allows the model to
    answer with an empty response, in which case no structured result comes
    back).

    Args:
        google_df: Frame with 'trend_kws', 'url' (list) and 'title' (list).
        ddg_df: Frame of DuckDuckGo results, forwarded to run_rag_chain_once.
        embeddings_model: Forwarded to run_rag_chain_once.
        reteived_docs_parser_runnable: Forwarded to run_rag_chain_once.
        prompt_template: Forwarded to run_rag_chain_once.
        structured_output_llm: Forwarded to run_rag_chain_once.

    Returns:
        dict with parallel lists under 'Trend_kws', 'Title' and 'Summary'.
    """
    results={"Trend_kws":[], "Title":[], "Summary":[]}
    trend_kws = google_df.trend_kws.to_list()

    for i, trend_kw in tqdm(enumerate(trend_kws), total=len(trend_kws)):
        print(f"Trend number {i+1} of {len(trend_kws)}")
        placeholder = {'title':trend_kw, 'summary':'No enough information yet!'}

        if google_df[google_df["trend_kws"]==trend_kw]['url'].iloc[0]:
            rag_results = run_rag_chain_once(trend_kw, google_df, ddg_df, embeddings_model, reteived_docs_parser_runnable, prompt_template, structured_output_llm)
            # The structured LLM may return None (or something that is not a
            # mapping) when the model declines to produce a result.
            if not isinstance(rag_results, dict):
                rag_results = placeholder
        else:
            rag_results = placeholder

        # Missing or empty fields fall back to the placeholder values rather
        # than raising KeyError.
        title = rag_results.get('title') or placeholder['title']
        summary = rag_results.get('summary') or placeholder['summary']

        print(f"""### Trend Keyword: {trend_kw}\n ### Title: {title}\n ### Summary: {summary}\n\n""")
        results['Trend_kws'].append(trend_kw)
        results['Title'].append(title)
        results['Summary'].append(summary)

    return results

In [92]:
# test the rag pipeline
# Runs the loop over every trend in google_trends_df; the returned dict holds
# parallel lists under 'Trend_kws', 'Title' and 'Summary'.

results = run_rag_chain_loop(google_trends_df, trends_ddg_news_df, embeddings_model, reteived_docs_parser_runnable, prompt_template, structured_output_llm)

  0%|          | 0/10 [00:00<?, ?it/s]

Trend number 1 of 10
# Starting RAG pipeline
## Scraping Articles
['https://www.nbcsports.com/fantasy/football/news/fantasy-football-2024-rotoworld-staff-most-drafted-players', 'https://www.cbssports.com/fantasy/football/news/2024-fantasy-football-draft-strategy-tiers-cheat-sheets-best-breakout-players-deep-sleepers-by-nfl-expert/', 'https://www.espn.com/fantasy/football/story/_/id/41032720/2024-fantasy-football-rankings-draft-picks-predictions-surprises']
## Creating FAISS vectorstore
## Performing RAG
Scrape: 6.709356307983398, Faiss: 0.35988926887512207, Chain: 18.945935249328613
### Trend Keyword: fantasy pros
 ### Title: 2024 Fantasy Football Draft Strategy and Rankings
 ### Summary: The article discusses the 2024 fantasy football draft strategy, including tiers, cheat sheets, best breakout players, and deep sleepers. It also provides fantasy football rankings for the 2024 season, advice, and picks. Additionally, it highlights bust risks at current ADP and provides offensive depth