# Keep Up With The Trends
![diagram](./diagram_bg.png)

**Environment setup with conda**

1. conda create -n kuwtt python=3.10.14
2. pip install -r requirements.txt 
3. pip install torch==2.3.1 --index-url https://download.pytorch.org/whl/cu118
4. [Download](https://ollama.com/download), Install & Run Ollama

In [1]:
# verify that CUDA is available

import torch

# Ask PyTorch whether a CUDA device is usable, then report the result.
cuda_available = torch.cuda.is_available()
print(f"CUDA available: {cuda_available}")

CUDA available: True


In [2]:
# Import necessary libraries

import time
import json
import re
import requests
import pandas as pd
import xmltodict
from tqdm.notebook import tqdm
from duckduckgo_search import DDGS
from pprint import pprint

## Scraping Trends Data

In [3]:
def fetch_and_parse_xml(url, timeout=30):
    """Download an XML document and convert it to a nested dictionary.

    Args:
        url: Address of the XML/RSS resource to fetch.
        timeout: Seconds to wait for the server before giving up, so that a
            stalled connection cannot hang the notebook indefinitely.

    Returns:
        The dictionary produced by ``xmltodict.parse``, or ``None`` when the
        request fails or the payload is not well-formed XML (the error is
        printed in both cases).
    """
    try:
        # Fetch the XML data
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()  # Raise an error for bad status codes

        # Parse the XML data and convert it to a dictionary
        return xmltodict.parse(response.content)

    except requests.exceptions.RequestException as e:
        print(f"Error fetching the XML data: {e}")
    except xmltodict.expat.ExpatError as e:
        print(f"Error parsing the XML data: {e}")

    # Make the failure result explicit instead of silently falling off the end.
    return None

In [4]:
# URL for the XML data
rss_xml_url_daily = 'https://trends.google.fr/trends/trendingsearches/daily/rss?geo=US'
rss_xml_url_realtime = 'https://trends.google.com/trending/rss?geo=US'

# Fetch, parse, and print the XML data
trends_dict = fetch_and_parse_xml(rss_xml_url_realtime)

# fetch_and_parse_xml only prints on failure and yields None; stop here with a
# clear message instead of failing later with an opaque TypeError on indexing.
if trends_dict is None:
    raise RuntimeError(f"Could not load the Google Trends feed from {rss_xml_url_realtime}")

pprint(trends_dict)

{'rss': {'@version': '2.0',
         '@xmlns:atom': 'http://www.w3.org/2005/Atom',
         '@xmlns:ht': 'https://trends.google.com/trending/rss',
         'channel': {'atom:link': {'@href': 'https://trends.google.com/trending/rss?geo=US',
                                   '@rel': 'self',
                                   '@type': 'application/rss+xml'},
                     'description': 'Recent searches',
                     'item': [{'description': None,
                               'ht:approx_traffic': '5000+',
                               'ht:news_item': [{'ht:news_item_picture': 'https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcQ1h_ieEvqcJ70OpU_9I_RnXZMad1eilv8JDPv7wLRnVNVhYuW5lmd5Wh0xdao',
                                                 'ht:news_item_snippet': None,
                                                 'ht:news_item_source': 'Marca.com',
                                                 'ht:news_item_title': 'Real '
                                      

In [5]:
# create google trends dataframe

def _as_list(value):
    """Return an xmltodict value as a list.

    xmltodict yields a dict for an element that occurs once, a list when it
    repeats, and nothing at all when it is absent; this folds the three cases
    into a list so callers can always iterate.
    """
    if value is None:
        return []
    return value if isinstance(value, list) else [value]


google_trends_dict = {"trend_kws":[], "traffic":[], "pubDate":[], "url":[], "title":[]}

for trend in _as_list(trends_dict['rss']['channel'].get('item')):
    # Keep only news items that actually carry a URL so the url/title lists
    # stay aligned and contain nothing the later URL filters cannot handle.
    news_items = [
        news_item
        for news_item in _as_list(trend.get('ht:news_item'))
        if news_item.get('ht:news_item_url')
    ]

    google_trends_dict['trend_kws'].append(trend['title'])
    google_trends_dict['traffic'].append(trend.get('ht:approx_traffic'))
    google_trends_dict['pubDate'].append(trend.get('pubDate'))
    google_trends_dict['url'].append([news_item['ht:news_item_url'] for news_item in news_items])
    # A missing headline becomes an empty string so string clean-up still works.
    google_trends_dict['title'].append([news_item.get('ht:news_item_title') or '' for news_item in news_items])

google_trends_df = pd.DataFrame(google_trends_dict)

In [6]:
# clean data: turn the HTML-escaped apostrophe back into a plain one in every headline

def _decode_apostrophes(headlines):
    """Return the headlines with each "&#39;" entity replaced by an apostrophe."""
    return [headline.replace("&#39;", "'") for headline in headlines]


google_trends_df["title"] = google_trends_df["title"].map(_decode_apostrophes)

In [7]:
google_trends_df.head()

Unnamed: 0,trend_kws,traffic,pubDate,url,title
0,real madrid - betis,5000+,"Sun, 1 Sep 2024 11:30:00 -0700",[https://www.marca.com/futbol/laliga-ea-sports...,[Real Madrid - Betis en directo | LaLiga EA Sp...
1,giancarlo stanton,500+,"Sun, 1 Sep 2024 11:30:00 -0700",[https://www.espn.com/mlb/story/_/id/41078372/...,[Cardinals claim first win in Bronx since '64 ...
2,kyle manzardo,1000+,"Sun, 1 Sep 2024 11:30:00 -0700",[https://www.cleveland.com/guardians/2024/09/g...,[Guardians activate veteran right-hander; reca...
3,dune,500+,"Sun, 1 Sep 2024 11:30:00 -0700",[https://deadline.com/feature/venice-2024-movi...,[Venice Film Festival 2024: All Of Deadline’s ...
4,corinthians vs flamengo,500+,"Sun, 1 Sep 2024 11:30:00 -0700",[https://colunadofla.com/2024/09/saiu-com-pedr...,"[SAIU! Com Pedro, Flamengo divulga escalação p..."


In [8]:
# define the list of domains to skip
# (each entry is checked as a plain substring of the article URL by the
# filtering cells below, for both the Google Trends and the DDG results)

domains_skip_list = ["msn.com", "nytimes.com"]

In [9]:
# remove urls that contain domains in domains_skip_list

for i in range(google_trends_df.shape[0]):
    urls = google_trends_df.loc[i, "url"]
    titles = google_trends_df.loc[i, "title"]

    # Build the surviving (url, title) pairs first: popping from a list while
    # enumerating it skips the element that follows every removal, so two
    # blocked URLs in a row would leave the second one in place.
    kept_pairs = [
        (url, title)
        for url, title in zip(urls, titles)
        if not any(domain in url for domain in domains_skip_list)
    ]

    # Slice-assign so the list objects stored in the dataframe are updated in place.
    urls[:] = [url for url, _ in kept_pairs]
    titles[:] = [title for _, title in kept_pairs]

In [10]:
google_trends_df.head(20)

Unnamed: 0,trend_kws,traffic,pubDate,url,title
0,real madrid - betis,5000+,"Sun, 1 Sep 2024 11:30:00 -0700",[https://www.marca.com/futbol/laliga-ea-sports...,[Real Madrid - Betis en directo | LaLiga EA Sp...
1,giancarlo stanton,500+,"Sun, 1 Sep 2024 11:30:00 -0700",[https://www.espn.com/mlb/story/_/id/41078372/...,[Cardinals claim first win in Bronx since '64 ...
2,kyle manzardo,1000+,"Sun, 1 Sep 2024 11:30:00 -0700",[https://www.cleveland.com/guardians/2024/09/g...,[Guardians activate veteran right-hander; reca...
3,dune,500+,"Sun, 1 Sep 2024 11:30:00 -0700",[https://deadline.com/feature/venice-2024-movi...,[Venice Film Festival 2024: All Of Deadline’s ...
4,corinthians vs flamengo,500+,"Sun, 1 Sep 2024 11:30:00 -0700",[https://colunadofla.com/2024/09/saiu-com-pedr...,"[SAIU! Com Pedro, Flamengo divulga escalação p..."
5,losc vs psg,5000+,"Sun, 1 Sep 2024 11:20:00 -0700",[https://www.footmercato.net/a7758297516585232...,"[Le PSG frappe fort face au LOSC, Match : Les ..."
6,theegala,5000+,"Sun, 1 Sep 2024 11:20:00 -0700",[https://golf.com/news/the-rules-penalty-cost-...,[The rules penalty that may cost Sahith Theega...
7,haason reddick,1000+,"Sun, 1 Sep 2024 11:20:00 -0700",[https://www.si.com/nfl/jets/news/defensive-pl...,[Defensive Playmaker Is New York Jets Biggest ...
8,padres vs rays,2000+,"Sun, 1 Sep 2024 11:20:00 -0700",[https://nypost.com/2024/09/01/betting/padres-...,"[Padres vs. Rays prediction, odds: MLB picks, ..."
9,garrett crochet,500+,"Sun, 1 Sep 2024 11:20:00 -0700",[https://blogs.fangraphs.com/garrett-crochet-i...,[Garrett Crochet Is Considering Becoming a Cra...


In [11]:
# get trends related news articles from ddg web search
# (per trend: query DDG news, drop hits from skipped domains, keep the first
# five survivors and tag each with the trend keyword it was found for)

trends_news = []
for trend_kw in tqdm(google_trends_df.trend_kws.to_list()):
    results = DDGS().news(keywords=trend_kw, region="wt-wt", safesearch="moderate", max_results=15)
    allowed_hits = [
        hit for hit in results
        if not any(domain in hit['url'] for domain in domains_skip_list)
    ]
    filtered_results = [{'trend_kws': trend_kw, **hit} for hit in allowed_hits[:5]]
    trends_news += filtered_results

  0%|          | 0/10 [00:00<?, ?it/s]

In [12]:
# create ddg news articles dataframe

trends_ddg_news_df = pd.DataFrame(trends_news)
trends_ddg_news_df.head(10)

Unnamed: 0,trend_kws,date,title,body,url,image,source
0,real madrid - betis,2024-09-01T16:30:00+00:00,La Liga Soccer Livestream: How to Watch Real M...,Kylian Mbappé goes in search of his first La L...,https://www.cnet.com/tech/services-and-softwar...,https://www.cnet.com/a/img/resize/01cb8ed0ac9d...,CNET
1,real madrid - betis,2024-09-01T16:38:00+00:00,Real Madrid vs Real Betis LIVE: Possible lineu...,Real Madrid vs Real Betis - Possible lineups R...,https://www.marca.com/en/soccer/laliga/r-madri...,https://e00-marca.uecdn.es/assets/multimedia/i...,MARCA
2,real madrid - betis,2024-09-01T12:43:00+00:00,Real Madrid vs Real Betis: Match preview,"After a disappointing draw way from home, Real...",https://sports.yahoo.com/real-madrid-vs-real-b...,https://s.yimg.com/ny/api/res/1.2/YrDgbIK6SHcV...,Yahoo! Sports
3,real madrid - betis,2024-09-01T06:47:00+00:00,Real Madrid Vs. Real Betis Line Up: Two Change...,Real Madrid boss Carlo Ancelotti will make two...,https://www.forbes.com/sites/tomsanderson/2024...,https://imageio.forbes.com/specials-images/ima...,Forbes
4,real madrid - betis,2024-09-01T17:53:00+00:00,Real Madrid Vs Real Betis Live Streaming: When...,Real Madrid will eye an improved showing again...,https://www.outlookindia.com/sports/football/r...,https://media.assettype.com/outlookindia/2024-...,Outlookindia
5,giancarlo stanton,2024-08-31T20:16:00+00:00,Giancarlo Stanton misses game-tying grand slam...,The Yanks have yet to have a signature comebac...,https://www.nydailynews.com/2024/08/31/yankees...,,New York Daily News
6,giancarlo stanton,2024-08-25T20:19:00+00:00,Umpire carted off field after getting hit in n...,Home plate umpire Nick Mahrley exited Sunday a...,https://www.cbssports.com/mlb/news/umpire-cart...,https://sportshub.cbsistatic.com/i/r/2024/08/2...,CBSSports.com
7,giancarlo stanton,2024-08-26T00:46:00+00:00,Umpire Nick Mahrley carted off after being hit...,The Yankees' 10-3 win over the Rockies took a ...,https://nypost.com/2024/08/25/sports/umpire-ca...,,New York Post
8,giancarlo stanton,2024-08-27T00:00:00+00:00,Daily Dinger: Best MLB Home Run Picks Today (M...,Here's who to bet to go yard on Tuesday. Ryan ...,https://www.si.com/betting/daily-dinger-best-m...,https://images2.minutemediacdn.com/image/uploa...,Sports Illustrated
9,giancarlo stanton,2024-08-26T02:39:00+00:00,VIDEO: Umpire Carted Off After Yankees Giancar...,A frightening moment occurred at Yankee Stadiu...,https://www.totalprosports.com/mlb/video-umpir...,,Total Pro Sports


In [13]:
# update google trends dataframe with ddg news articles urls and titles

# An empty DDG frame has no columns at all, so there is nothing to merge.
if not trends_ddg_news_df.empty:
    for i, trend in enumerate(google_trends_df.trend_kws):
        # Select this trend's DDG rows once instead of filtering twice.
        ddg_rows = trends_ddg_news_df[trends_ddg_news_df["trend_kws"] == trend]

        urls = google_trends_df.loc[i, "url"]
        titles = google_trends_df.loc[i, "title"]

        # Skip URLs that are already listed: this keeps the cell idempotent
        # (re-running it does not append the same articles again) and avoids
        # scraping an article twice when Google and DDG return the same link.
        seen_urls = set(urls)
        for new_url, new_title in zip(ddg_rows["url"], ddg_rows["title"]):
            if new_url in seen_urls:
                continue
            seen_urls.add(new_url)
            urls.append(new_url)
            titles.append(new_title)

In [14]:
google_trends_df.title[0]

['Real Madrid - Betis en directo | LaLiga EA Sports hoy en vivo',
 'Septiembre trae una final',
 'Alineaciones del Real Madrid vs Betis',
 'La Liga Soccer Livestream: How to Watch Real Madrid vs. Real Betis From Anywhere',
 'Real Madrid vs Real Betis LIVE: Possible lineups and latest updates - LaLiga 24/25',
 'Real Madrid vs Real Betis: Match preview',
 'Real Madrid Vs. Real Betis Line Up: Two Changes From Ancelotti For Must Win',
 'Real Madrid Vs Real Betis Live Streaming: When And Where To Watch La Liga RMA Vs BET Match']

In [15]:
google_trends_df.url[0]

['https://www.marca.com/futbol/laliga-ea-sports/r-madrid-betis/2024/09/01/01_0101_20240901_186_185-directo.html',
 'https://as.com/futbol/primera/septiembre-trae-una-final-n/',
 'https://espndeportes.espn.com/futbol/espana/nota/_/id/14115008/real-madrid-vs-betis-alineaciones-laliga-jornada-4',
 'https://www.cnet.com/tech/services-and-software/la-liga-soccer-livestream-how-to-watch-real-madrid-vs-real-betis-from-anywhere/',
 'https://www.marca.com/en/soccer/laliga/r-madrid-betis/2024/09/01/01_0101_20240901_186_185-live.html',
 'https://sports.yahoo.com/real-madrid-vs-real-betis-124300136.html',
 'https://www.forbes.com/sites/tomsanderson/2024/09/01/real-madrid-vs-real-betis-line-up-two-changes-from-ancelotti-for-must-win/',
 'https://www.outlookindia.com/sports/football/real-madrid-vs-real-betis-live-streaming-when-and-where-to-watch-la-liga-rma-vs-bet-match']

## Performing RAG (Retrieval-Augmented Generation)

In [16]:
# import libraries for rag with langchain

from langchain_community.document_loaders import SeleniumURLLoader, UnstructuredURLLoader, WebBaseLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_ollama import ChatOllama
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.pydantic_v1 import BaseModel
from langchain_core.utils.function_calling import convert_to_openai_tool
from langchain_core.runnables import RunnableLambda

USER_AGENT environment variable not set, consider setting it to identify your requests.


In [None]:
import nltk

nltk.download('punkt')  # needed for the UnstructuredURLLoader

In [17]:
# define embedding model
# (this object is later handed to FAISS.from_documents to embed the article chunks)
embeddings_model = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

# define llm
# (a chat model served through Ollama — see the setup step about installing and
# running Ollama; NOTE(review): presumably the "llama3.1" model must already be
# pulled locally — confirm)
llm = ChatOllama(
    model="llama3.1",
    temperature=0.2,
    keep_alive="2m",
    repeat_penalty=1.03
)


# define prompt template
# The system message asks for a summary plus a title and explicitly allows an
# empty response for incoherent input; the human message carries the article
# text through the {article} placeholder.
prompt_template = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            """
            You are a helpful assistant that specializes in article summarization.
            Your task is to summarize a given text article and generate a title for it.
            If the provided article doesn't contain coherent and meaningful content, just return an empty response.
            """,
        ),
        ("human", "Article: {article}"),
    ]
)


# define output schema for llm
# NOTE(review): the class name is spelled with three "m"; it is passed to
# convert_to_openai_tool below, so renaming it would presumably change the
# generated schema — confirm before fixing the spelling.
class SummmaryWithTitle(BaseModel):
    '''Article summary and title.'''
    # generated headline for the article
    title: str
    # generated summary of the article
    summary: str

# Convert the model class into a tool-style dict schema and bind it to the llm
# so that chain results expose 'title' and 'summary' keys.
dict_schema = convert_to_openai_tool(SummmaryWithTitle)
structured_output_llm = llm.with_structured_output(dict_schema)

  from tqdm.autonotebook import tqdm, trange


In [18]:
# define helper functions

def url_loader(url_list:list[str], loader_type:str="Unstructured"):
    """Load web pages into LangChain documents with the requested loader.

    Args:
        url_list: URLs of the pages to fetch.
        loader_type: Which loader to use: "Unstructured" (UnstructuredURLLoader),
            "Selenium" (SeleniumURLLoader) or "Base" (WebBaseLoader sent with a
            browser-like User-Agent header).

    Returns:
        The documents returned by the selected loader's ``load()`` call.

    Raises:
        ValueError: If ``loader_type`` is not one of the supported names.
    """
    if loader_type == "Unstructured":
        data = UnstructuredURLLoader(url_list).load()
    elif loader_type == "Selenium":
        # SeleniumURLLoader.load() starts and quits its own WebDriver. Calling
        # the private _get_driver() beforehand only launched a second, unused
        # browser that then had to be closed again, so it is not done here.
        data = SeleniumURLLoader(url_list).load()
    elif loader_type == "Base":
        header_template = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
        }
        data = WebBaseLoader(url_list, header_template=header_template).load()
    else:
        raise ValueError(f"Loader type {loader_type} not supported.")
    return data


def rec_splitter(url_doc_list:list, chunk_size:int=400, chunk_overlap:int=50):
    """Split loaded documents into overlapping chunks for embedding.

    Args:
        url_doc_list: Documents to split (as returned by ``url_loader``).
        chunk_size: Chunk size passed to the splitter; defaults to the
            previously hard-coded 400.
        chunk_overlap: Overlap between consecutive chunks; defaults to the
            previously hard-coded 50.

    Returns:
        The chunks produced by ``RecursiveCharacterTextSplitter.split_documents``;
        ``add_start_index=True`` is kept so each chunk records its start offset.
    """
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size,
                                                   chunk_overlap=chunk_overlap,
                                                   add_start_index=True)
    return text_splitter.split_documents(url_doc_list)


def reteived_docs_parser(ret_doc_list:list):
    """Merge retrieved chunks into one text made only of longer sentences.

    The ``page_content`` of every document is joined with newlines, blank-line
    gaps are turned into ". ", and the text is split after ".", "?", "!" or a
    newline (the lookbehinds skip some abbreviation-like patterns such as
    "Mr."). Pieces of five words or fewer are discarded and the remaining ones
    are returned joined by newlines.
    """
    sentence_boundary = r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<![A-Z][a-z][A-Z]\.)(?<=\.|\?|!|\n)\s*'
    merged_text = "\n".join(doc.page_content for doc in ret_doc_list).replace('\n\n', '. ')

    kept_sentences = []
    for sentence in re.split(sentence_boundary, merged_text):
        # Anything of five words or fewer is treated as noise and dropped.
        if len(sentence.split()) > 5:
            kept_sentences.append(sentence)
    return '\n'.join(kept_sentences)

# Wrap the parser in a RunnableLambda so it can be piped between the retriever
# and the prompt inside the RAG chain.
reteived_docs_parser_runnable = RunnableLambda(reteived_docs_parser)

In [19]:
# define functions to run the rag pipeline

def run_rag_chain_once(trend, google_df, ddg_df, embeddings_model, reteived_docs_parser_runnable, prompt_template, structured_output_llm):
    """Scrape, index and summarise the articles of a single trend.

    Args:
        trend: Trend keyword to process; must be present in ``google_df["trend_kws"]``.
        google_df: Dataframe with ``trend_kws``, ``url`` and ``title`` columns,
            where ``url`` and ``title`` hold lists.
        ddg_df: Dataframe of DDG news hits; its ``url`` and ``body`` columns are
            used to backfill pages that were scraped empty.
        embeddings_model: Embedding model used to build the FAISS index.
        reteived_docs_parser_runnable: Runnable turning retrieved chunks into the article text.
        prompt_template: Chat prompt with an ``{article}`` placeholder.
        structured_output_llm: LLM runnable returning the structured result.

    Returns:
        The result of the RAG chain, or a ``{'title', 'summary'}`` placeholder
        dict when no text could be extracted from any article.
    """
    print("# Starting RAG pipeline")
    start = time.time()

    print("## Scraping Articles")
    df_trend = google_df[google_df["trend_kws"]==trend]
    url_list = df_trend['url'].iloc[0]
    print(url_list)
    try:
        url_docs = url_loader(url_list, "Base")
        # Very little text overall suggests the plain loader was blocked or
        # served stubs, so retry with the Unstructured loader.
        if len(" ".join([doc.page_content for doc in url_docs])) < 5000:
            print("Using UnstructuredLoader")
            url_docs = url_loader(url_list, "Unstructured")
    except Exception as e:
        print("Exception BaseURLLoader:", e)
        print("Using UnstructuredLoader")
        url_docs = url_loader(url_list, "Unstructured")

    # Map each DDG url to its snippet once (first occurrence wins), pairing the
    # columns by position so the lookup does not depend on the frame's index.
    ddg_body_by_url = {}
    if {'url', 'body'} <= set(ddg_df.columns):
        for ddg_url, ddg_body in zip(ddg_df['url'], ddg_df['body']):
            ddg_body_by_url.setdefault(ddg_url, ddg_body)

    # Backfill pages that came back empty with the DDG snippet when one exists.
    for doc in url_docs:
        if doc.page_content == "":
            snippet = ddg_body_by_url.get(doc.metadata.get("source"))
            if isinstance(snippet, str):
                doc.page_content += snippet
    scraping_checkpoint = time.time()
    scraping_dur = scraping_checkpoint - start

    print("## Creating FAISS vectorstore")
    splits_docs = rec_splitter(url_docs)
    if not splits_docs:
        # A vector index cannot be built from zero chunks; report the trend as
        # lacking information instead of crashing the whole run.
        print("No text could be extracted from the articles")
        return {'title': trend, 'summary': 'No enough information yet!'}
    faiss_db = FAISS.from_documents(splits_docs, embeddings_model)
    faiss_retriever = faiss_db.as_retriever(search_type="similarity",
                                search_kwargs={'k': 5})
    faiss_checkpoint = time.time()
    faiss_dur = faiss_checkpoint - scraping_checkpoint

    print("## Performing RAG")
    # The article headlines serve as the retrieval query.
    ret_query = '\n'.join(df_trend['title'].iloc[0])
    rag_chain = (faiss_retriever
                    | { "article": reteived_docs_parser_runnable }
                    | prompt_template
                    | structured_output_llm)
    rag_results = rag_chain.invoke(ret_query)
    end = time.time()
    chain_dur = end - faiss_checkpoint
    print(f"Scrape: {scraping_dur}, Faiss: {faiss_dur}, Chain: {chain_dur}")
    return rag_results


def run_rag_chain_loop(google_df, ddg_df, embeddings_model, reteived_docs_parser_runnable, prompt_template, structured_output_llm):
    """Run the RAG pipeline for every trend and collect titles and summaries.

    Args:
        google_df: Dataframe with ``trend_kws`` and list-valued ``url``/``title`` columns.
        ddg_df: Dataframe of DDG news hits, forwarded to ``run_rag_chain_once``.
        embeddings_model: Embedding model forwarded to ``run_rag_chain_once``.
        reteived_docs_parser_runnable: Parser runnable forwarded to ``run_rag_chain_once``.
        prompt_template: Prompt forwarded to ``run_rag_chain_once``.
        structured_output_llm: LLM runnable forwarded to ``run_rag_chain_once``.

    Returns:
        A dict of three parallel lists under the keys ``Trend_kws``, ``Title``
        and ``Summary``, one entry per trend.
    """
    results={"Trend_kws":[], "Title":[], "Summary":[]}
    trend_kws = google_df.trend_kws.to_list()

    for i, trend_kw in tqdm(enumerate(trend_kws), total=len(trend_kws)):
        print(f"Trend number {i+1} of {len(trend_kws)}")
        if google_df[google_df["trend_kws"]==trend_kw]['url'].iloc[0]:
            rag_results = run_rag_chain_once(trend_kw, google_df, ddg_df, embeddings_model, reteived_docs_parser_runnable, prompt_template, structured_output_llm)
        else:
            # No article URLs for this trend: nothing to scrape.
            rag_results = None

        # The prompt lets the model answer with an empty response, so the
        # result may be missing or lack keys; fall back to placeholders rather
        # than crash mid-loop and lose everything collected so far.
        if not isinstance(rag_results, dict):
            rag_results = {}
        title = rag_results.get('title') or trend_kw
        summary = rag_results.get('summary') or 'No enough information yet!'

        print(f"""### Trend Keyword: {trend_kw}\n ### Title: {title}\n ### Summary: {summary}\n\n""")
        results['Trend_kws'].append(trend_kw)
        results['Title'].append(title)
        results['Summary'].append(summary)

    return results

In [20]:
# test the rag pipeline

results = run_rag_chain_loop(google_trends_df, trends_ddg_news_df, embeddings_model, reteived_docs_parser_runnable, prompt_template, structured_output_llm)

  0%|          | 0/10 [00:00<?, ?it/s]

Trend number 1 of 10
# Starting RAG pipeline
## Scraping Articles
['https://www.marca.com/futbol/laliga-ea-sports/r-madrid-betis/2024/09/01/01_0101_20240901_186_185-directo.html', 'https://as.com/futbol/primera/septiembre-trae-una-final-n/', 'https://espndeportes.espn.com/futbol/espana/nota/_/id/14115008/real-madrid-vs-betis-alineaciones-laliga-jornada-4', 'https://www.cnet.com/tech/services-and-software/la-liga-soccer-livestream-how-to-watch-real-madrid-vs-real-betis-from-anywhere/', 'https://www.marca.com/en/soccer/laliga/r-madrid-betis/2024/09/01/01_0101_20240901_186_185-live.html', 'https://sports.yahoo.com/real-madrid-vs-real-betis-124300136.html', 'https://www.forbes.com/sites/tomsanderson/2024/09/01/real-madrid-vs-real-betis-line-up-two-changes-from-ancelotti-for-must-win/', 'https://www.outlookindia.com/sports/football/real-madrid-vs-real-betis-live-streaming-when-and-where-to-watch-la-liga-rma-vs-bet-match']
## Creating FAISS vectorstore


  attn_output = torch.nn.functional.scaled_dot_product_attention(


## Performing RAG
Scrape: 12.958088874816895, Faiss: 1.4378890991210938, Chain: 23.619611024856567
### Trend Keyword: real madrid - betis
 ### Title: Real Madrid Vs Real Betis Live Streaming: When And Where To Watch La Liga RMA Vs BET Match
 ### Summary: Here is how you can watch Real Madrid Vs Real Betis La Liga encounter live in India. La Liga broadcast rights in the UK are once again with Premier Sports, which will be showing a minimum of five live matches per week from Spain


Trend number 2 of 10
# Starting RAG pipeline
## Scraping Articles
['https://www.espn.com/mlb/story/_/id/41078372/cardinals-beat-yankees-first-win-bronx-64-world-series', 'https://sports.yahoo.com/cardinals-notch-first-yankee-stadium-win-since--the-1964-world-series-232613915.html', 'https://www.mlb.com/cardinals/news/kyle-gibson-pitches-gem-as-cardinals-beat-yankees', 'https://www.nydailynews.com/2024/08/31/yankees-cardinals-giancarlo-stanton-aaron-judge-aaron-boone-will-warren/', 'https://www.cbssports.com/m