# Keep Up With The Trends

**Installation steps with conda**

1. conda create -n kuwtt python=3.10.14 (then activate it with `conda activate kuwtt`)
2. pip install -r requirements.txt 
3. pip install torch==2.3.1 --index-url https://download.pytorch.org/whl/cu118
4. [Download](https://ollama.com/download), Install & Run Ollama

In [1]:
# verify that CUDA is available
# (installation step 3 pulls the cu118 build of torch, so this is expected to print True on a GPU machine)

import torch

cuda_available = torch.cuda.is_available()
print(f"CUDA available: {cuda_available}")

CUDA available: True


In [2]:
# Import necessary libraries

import html
import json
import re
import time
from pprint import pprint

import pandas as pd
import requests
import xmltodict
from duckduckgo_search import DDGS
from tqdm.notebook import tqdm

## Scraping Trends Data

In [3]:
def fetch_and_parse_xml(url, timeout=30):
    """Download an XML document and convert it to a nested dictionary.

    Args:
        url: Address of the XML resource (here: a Google Trends RSS feed).
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` can block indefinitely on an
            unresponsive host and freeze the notebook.

    Returns:
        The dictionary produced by ``xmltodict.parse``, or ``None`` when the
        download or the parsing fails (the error is printed, not raised).
    """
    try:
        # Fetch the XML data; timeouts are RequestException subclasses and are
        # therefore reported by the handler below.
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()  # Raise an error for bad status codes

        # Parse the XML data and convert it to a dictionary
        return xmltodict.parse(response.content)

    except requests.exceptions.RequestException as e:
        print(f"Error fetching the XML data: {e}")
    except xmltodict.expat.ExpatError as e:
        print(f"Error parsing the XML data: {e}")

    # Make the failure result explicit for callers.
    return None

In [4]:
# URL for the XML data
rss_xml_url_daily = 'https://trends.google.fr/trends/trendingsearches/daily/rss?geo=US'
rss_xml_url_realtime = 'https://trends.google.com/trending/rss?geo=US'

# Fetch, parse, and print the XML data
trends_dict = fetch_and_parse_xml(rss_xml_url_realtime)

# fetch_and_parse_xml reports failures by printing and returning None. Stop
# here with a clear message instead of failing later with an obscure
# "'NoneType' object is not subscriptable" when the dataframe is built.
if trends_dict is None:
    raise RuntimeError("Could not fetch or parse the Google Trends RSS feed (see the error printed above).")

pprint(trends_dict)

{'rss': {'@version': '2.0',
         '@xmlns:atom': 'http://www.w3.org/2005/Atom',
         '@xmlns:ht': 'https://trends.google.com/trending/rss',
         'channel': {'atom:link': {'@href': 'https://trends.google.com/trending/rss?geo=US',
                                   '@rel': 'self',
                                   '@type': 'application/rss+xml'},
                     'description': 'Recent searches',
                     'item': [{'description': None,
                               'ht:approx_traffic': '2000+',
                               'ht:news_item': [{'ht:news_item_picture': 'https://encrypted-tbn2.gstatic.com/images?q=tbn:ANd9GcRMRAO3Rs1UNQ0C7TFDSOBMiTkvfFLBw10cTCtQk51NgX4-y9nVkmvbEXLw3mw',
                                                 'ht:news_item_snippet': None,
                                                 'ht:news_item_source': 'Marca.com',
                                                 'ht:news_item_title': 'El '
                                        

In [5]:
# create google trends dataframe

def _as_list(value):
    """Normalize an xmltodict child node to a list.

    xmltodict yields a list when an element is repeated, a single dict/str
    when it occurs exactly once, and no entry at all when it is absent.
    Iterating the single-dict form directly would walk its keys instead of
    the items, so everything is wrapped into a list first.
    """
    if value is None:
        return []
    return value if isinstance(value, list) else [value]


google_trends_dict = {"trend_kws":[], "traffic":[], "pubDate":[], "url":[], "title":[]}

for trend in _as_list(trends_dict['rss']['channel'].get('item')):
    # A trend may carry zero, one or several news items.
    news_items = _as_list(trend.get('ht:news_item'))
    google_trends_dict['trend_kws'].append(trend['title'])
    google_trends_dict['traffic'].append(trend['ht:approx_traffic'])
    google_trends_dict['pubDate'].append(trend['pubDate'])
    # url and title are parallel lists: position k in both refers to the same article.
    google_trends_dict['url'].append([news_item['ht:news_item_url'] for news_item in news_items])
    google_trends_dict['title'].append([news_item['ht:news_item_title'] for news_item in news_items])

google_trends_df = pd.DataFrame(google_trends_dict)

In [6]:
# clean data
# Decode every HTML entity left in the RSS headlines (e.g. "&#39;" -> "'",
# "&amp;" -> "&"), not just the apostrophe. Non-string titles (xmltodict uses
# None for empty elements) are passed through unchanged.

google_trends_df["title"] = google_trends_df["title"].map(
    lambda titles: [html.unescape(title) if isinstance(title, str) else title for title in titles]
)

In [7]:
# Preview the first rows of the trends table.
google_trends_df.head()

Unnamed: 0,trend_kws,traffic,pubDate,url,title
0,athletic - atlético madrid,2000+,"Sat, 31 Aug 2024 09:10:00 -0700",[https://www.marca.com/futbol/futbol-internaci...,[El curioso equipo que jugará la FA Cup con el...
1,brock bowers,2000+,"Sat, 31 Aug 2024 09:10:00 -0700",[https://raiderswire.usatoday.com/2024/08/29/a...,[Antonio Pierce gives update on Brock Bowers i...
2,stetson bennett,1000+,"Sat, 31 Aug 2024 09:10:00 -0700",[https://www.latimes.com/sports/rams/story/202...,"[Ready or not, Stetson Bennett will make his N..."
3,kj bolden,500+,"Sat, 31 Aug 2024 09:10:00 -0700",[https://www.dawgnation.com/football/kj-bolden...,"[While hype around KJ Bolden has cooled, Kirby..."
4,ladd mcconkey,1000+,"Sat, 31 Aug 2024 09:10:00 -0700",[https://www.espn.com/fantasy/football/story/_...,[Don't be surprised if ... : Fantasy predictio...


In [8]:
# define the list of domains to skip
# Entries are matched as plain substrings of each article URL (`domain in url`),
# both for the Google Trends links and for the DuckDuckGo news results.

domains_skip_list = ["msn.com", "washingtonpost.com"]

In [9]:
# remove urls that contain domains in domains_skip_list

# Rebuild each row's lists instead of popping while iterating: pop() inside
# enumerate() shifts the remaining items left, so the entry right after a
# removed one was never checked and consecutive blocked URLs survived.
for i in google_trends_df.index:
    urls = google_trends_df.loc[i, "url"]
    titles = google_trends_df.loc[i, "title"]
    kept_pairs = [
        (url, title)
        for url, title in zip(urls, titles)
        if not any(domain in url for domain in domains_skip_list)
    ]
    # Slice-assign so the list objects stored in the DataFrame are updated in
    # place and url/title stay aligned position by position.
    urls[:] = [url for url, _ in kept_pairs]
    titles[:] = [title for _, title in kept_pairs]

In [21]:
# Show up to 20 rows of the trends table.
google_trends_df.head(20)

Unnamed: 0,trend_kws,traffic,pubDate,url,title
0,athletic - atlético madrid,2000+,"Sat, 31 Aug 2024 09:10:00 -0700",[https://www.marca.com/futbol/futbol-internaci...,[El curioso equipo que jugará la FA Cup con el...
1,brock bowers,2000+,"Sat, 31 Aug 2024 09:10:00 -0700",[https://raiderswire.usatoday.com/2024/08/29/a...,[Antonio Pierce gives update on Brock Bowers i...
2,stetson bennett,1000+,"Sat, 31 Aug 2024 09:10:00 -0700",[https://www.latimes.com/sports/rams/story/202...,"[Ready or not, Stetson Bennett will make his N..."
3,kj bolden,500+,"Sat, 31 Aug 2024 09:10:00 -0700",[https://www.dawgnation.com/football/kj-bolden...,"[While hype around KJ Bolden has cooled, Kirby..."
4,ladd mcconkey,1000+,"Sat, 31 Aug 2024 09:10:00 -0700",[https://www.espn.com/fantasy/football/story/_...,[Don't be surprised if ... : Fantasy predictio...
5,kendall milton,500+,"Sat, 31 Aug 2024 09:10:00 -0700",[https://www.cincyjungle.com/2024/8/31/2423263...,[Bengals News (8/31): Kendall Milton a Practic...
6,will shipley,500+,"Sat, 31 Aug 2024 09:10:00 -0700",[https://theeagleswire.usatoday.com/2024/08/29...,[Eagles rookie RB Will Shipley switches jersey...
7,georgia football roster,500+,"Sat, 31 Aug 2024 09:10:00 -0700",[https://ugawire.usatoday.com/2024/08/31/bigge...,[Who is the biggest player on Georgia football...
8,muchova,500+,"Sat, 31 Aug 2024 09:10:00 -0700",[https://www.express.co.uk/sport/tennis/194261...,[US Open finalist gives tennis legend telling-...
9,indiana state football,200+,"Sat, 31 Aug 2024 09:10:00 -0700",[https://www.jconline.com/story/sports/college...,[Purdue football vs. Indiana State: Scout repo...


In [11]:
# get trends related news articles from ddg web search
# For every trend keyword: ask DuckDuckGo News for up to 15 articles, drop the
# ones hosted on a skipped domain, and keep the first 5 survivors tagged with
# the keyword they were found for.

trends_news = []
for trend_kw in tqdm(google_trends_df.trend_kws.to_list()):
    results = DDGS().news(keywords=trend_kw, region="wt-wt", safesearch="moderate", max_results=15)
    allowed_articles = [
        article
        for article in results
        if all(domain not in article['url'] for domain in domains_skip_list)
    ]
    for article in allowed_articles[:5]:
        trends_news.append({'trend_kws': trend_kw, **article})

  0%|          | 0/10 [00:00<?, ?it/s]

In [12]:
# create ddg news articles dataframe
# (one row per article dict collected in trends_news)

trends_ddg_news_df = pd.DataFrame(trends_news)
trends_ddg_news_df.head(10)

Unnamed: 0,trend_kws,date,title,body,url,image,source
0,athletic - atlético madrid,2024-08-31T12:24:00+00:00,"Athletic Bilbao coach Valverde ready for ""ambi...",Athletic Bilbao coach Ernesto Valverde expects...,https://www.tribalfootball.com/article/soccer-...,https://livesport-ott-images.ssl.cdn.cra.cz/r1...,Tribal Football
1,athletic - atlético madrid,2024-08-31T14:47:00+00:00,Diego Simeone Defends Atletico Madrid During T...,who was speaking ahead of their trip to face A...,https://www.outlookindia.com/sports/football/d...,https://media.assettype.com/outlookindia/2024-...,Outlookindia
2,athletic - atlético madrid,2024-08-30T17:49:00+00:00,Atletico boss Simeone in terse mood before Bil...,Aug 30 (Reuters) - Atletico Madrid manager Die...,https://www.reuters.com/sports/soccer/atletico...,,Reuters
3,athletic - atlético madrid,2024-08-28T22:56:00+00:00,Atletico Madrid struggles to find the net in 0...,Atletico Madrid couldn't find the net from 25 ...,https://apnews.com/article/atletico-madrid-esp...,,Associated Press
4,athletic - atlético madrid,2024-08-30T16:30:00+00:00,'The team is complete' - Simeone pulls Atletic...,During the press conference on the eve of the ...,https://sports.yahoo.com/team-complete-simeone...,https://media.zenfs.com/en/sempremilan_article...,Yahoo! Sports


In [22]:
# Show the first 10 DuckDuckGo news rows.
trends_ddg_news_df.head(10)

Unnamed: 0,trend_kws,date,title,body,url,image,source
0,athletic - atlético madrid,2024-08-31T12:24:00+00:00,"Athletic Bilbao coach Valverde ready for ""ambi...",Athletic Bilbao coach Ernesto Valverde expects...,https://www.tribalfootball.com/article/soccer-...,https://livesport-ott-images.ssl.cdn.cra.cz/r1...,Tribal Football
1,athletic - atlético madrid,2024-08-31T14:47:00+00:00,Diego Simeone Defends Atletico Madrid During T...,who was speaking ahead of their trip to face A...,https://www.outlookindia.com/sports/football/d...,https://media.assettype.com/outlookindia/2024-...,Outlookindia
2,athletic - atlético madrid,2024-08-30T17:49:00+00:00,Atletico boss Simeone in terse mood before Bil...,Aug 30 (Reuters) - Atletico Madrid manager Die...,https://www.reuters.com/sports/soccer/atletico...,,Reuters
3,athletic - atlético madrid,2024-08-28T22:56:00+00:00,Atletico Madrid struggles to find the net in 0...,Atletico Madrid couldn't find the net from 25 ...,https://apnews.com/article/atletico-madrid-esp...,,Associated Press
4,athletic - atlético madrid,2024-08-30T16:30:00+00:00,'The team is complete' - Simeone pulls Atletic...,During the press conference on the eve of the ...,https://sports.yahoo.com/team-complete-simeone...,https://media.zenfs.com/en/sempremilan_article...,Yahoo! Sports
5,brock bowers,2024-08-29T17:55:00+00:00,"Antonio Pierce feels ""really good"" about Brock...",Raiders rookie tight end Brock Bowers has miss...,https://sports.yahoo.com/antonio-pierce-feels-...,https://s.yimg.com/ny/api/res/1.2/99MQf5fIf2Aw...,Yahoo! Sports
6,brock bowers,2024-08-29T19:58:00+00:00,Raiders confident TE Brock Bowers (foot) will ...,The Las Vegas Raiders expect rookie tight end ...,https://www.djournal.com/sports/national/raide...,https://bloximages.newyork1.vip.townnews.com/d...,Daily Journal
7,brock bowers,2024-08-29T23:51:00+00:00,Raiders report: Rookie expected to play Week 1...,Brock Bowers has not practiced in two weeks be...,https://www.reviewjournal.com/sports/raiders/r...,https://www.reviewjournal.com/wp-content/uploa...,Las Vegas Review-Journal
8,brock bowers,2024-08-29T18:57:00+00:00,Raiders coach addresses Brock Bowers' status f...,Brock Bowers has not practiced for the Las Veg...,https://larrybrownsports.com/football/raiders-...,,Larry Brown Sports
9,brock bowers,2024-08-22T23:59:00+00:00,REPORT: 'Best/Worst-Case Scenario' for Raiders...,The Las Vegas Raiders added arguably the best ...,https://www.si.com/nfl/raiders/news/report-bes...,,Sports Illustrated


In [13]:
# update google trends dataframe with ddg news articles urls and titles
# Only URLs that are not already stored for the trend are appended, which makes
# this cell safe to re-run (previously every re-run appended the same articles
# again) and avoids scraping the same page twice later on.

# An empty DuckDuckGo result set produces a frame without these columns.
if {"trend_kws", "url", "title"}.issubset(trends_ddg_news_df.columns):
    for i, trend in zip(google_trends_df.index, google_trends_df.trend_kws):
        # Filter the news table once per trend instead of once per column.
        trend_news = trends_ddg_news_df[trends_ddg_news_df["trend_kws"] == trend]
        urls = google_trends_df.loc[i, "url"]
        titles = google_trends_df.loc[i, "title"]
        seen_urls = set(urls)
        for news_url, news_title in zip(trend_news["url"], trend_news["title"]):
            if news_url in seen_urls:
                continue
            seen_urls.add(news_url)
            # Append to both lists together so they stay aligned.
            urls.append(news_url)
            titles.append(news_title)

In [14]:
# Inspect the title list stored for the first trend.
google_trends_df.title[0]

['El curioso equipo que jugará la FA Cup con el antiguo escudo del Atlético',
 'Athletic - Atlético en directo | LaLiga EA Sports en vivo hoy',
 'Una final en agosto',
 'Athletic Bilbao coach Valverde ready for "ambitious" Atletico Madrid',
 'Diego Simeone Defends Atletico Madrid During Tense Press Conference',
 'Atletico boss Simeone in terse mood before Bilbao clash',
 'Atletico Madrid struggles to find the net in 0-0 draw with Espanyol in Spanish league',
 "'The team is complete' - Simeone pulls Atletico Madrid out of Bennacer race"]

In [15]:
# Inspect the URL list stored for the first trend.
google_trends_df.url[0]

['https://www.marca.com/futbol/futbol-internacional/2024/08/29/66d0852eca4741621a8b458b.html',
 'https://www.marca.com/futbol/laliga-ea-sports/athletic-atletico/2024/08/31/01_0101_20240831_174_175-directo.html',
 'https://as.com/futbol/primera/una-final-en-agosto-n/',
 'https://www.tribalfootball.com/article/soccer-laliga-athletic-bilbao-coach-valverde-ready-for-ambitious-atletico-madrid-897eb9f4-53dc-4621-9ace-09f272fdc9e9',
 'https://www.outlookindia.com/sports/football/diego-simeone-defends-atletico-madrid-during-tense-press-conference',
 'https://www.reuters.com/sports/soccer/atletico-boss-simeone-terse-mood-before-bilbao-clash-2024-08-30/',
 'https://apnews.com/article/atletico-madrid-espanyol-spanish-league-score-46eaa47c28290b7454ecc76ebdcf5d36',
 'https://sports.yahoo.com/team-complete-simeone-pulls-atletico-163000180.html']

## Performing RAG (Retrieval-Augmented Generation)

In [16]:
# import libraries for rag with langchain

from langchain_community.document_loaders import SeleniumURLLoader, UnstructuredURLLoader, WebBaseLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_ollama import ChatOllama
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.pydantic_v1 import BaseModel
from langchain_core.utils.function_calling import convert_to_openai_tool
from langchain_core.runnables import RunnableLambda

USER_AGENT environment variable not set, consider setting it to identify your requests.


In [17]:
# define embedding model
# (used later to embed the article chunks stored in the FAISS index)
embeddings_model = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

# define llm
# Served by the local Ollama instance from installation step 4.
# NOTE(review): presumably the "llama3.1" model must already be pulled in
# Ollama before this notebook is run — confirm.
llm = ChatOllama(
    model="llama3.1",
    temperature=0.2,
    keep_alive="2m",
    repeat_penalty=1.03,
    # other params...
)


# define prompt template
# The system message fixes the task (summary + title); the human message
# carries the article text through the {article} placeholder, which the RAG
# chain fills with the parsed retrieval results.
# Note that the model is explicitly allowed to answer with an empty response,
# so downstream code must cope with a missing title/summary.
prompt_template = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            """
            You are a helpful assistant that specializes in article summarization.
            Your task is to summarize a given text article and generate a title for it.
            If the provided article doesn't contain coherent and meaningful content, just return an empty response.
            """,
        ),
        ("human", "Article: {article}"),
    ]
)


# define output schema for llm
# The class is converted to an OpenAI-style tool schema and bound to the LLM so
# that answers come back with `title` and `summary` fields.
# NOTE(review): the class name and docstring are presumably forwarded to the
# model as the tool name/description, so editing them changes the prompt —
# confirm before renaming.
class SummmaryWithTitle(BaseModel):
    '''Article summary and title.'''
    title: str  # generated headline for the article
    summary: str  # generated summary text

dict_schema = convert_to_openai_tool(SummmaryWithTitle)
structured_output_llm = llm.with_structured_output(dict_schema)

  from tqdm.autonotebook import tqdm, trange


In [18]:
# define helper functions

def url_loader(url_list:list[str], loader_type:str="Unstructured"):
    """Load the pages behind ``url_list`` with the selected LangChain loader.

    Args:
        url_list: URLs of the articles to download.
        loader_type: One of ``"Unstructured"`` (UnstructuredURLLoader),
            ``"Selenium"`` (SeleniumURLLoader) or ``"Base"`` (WebBaseLoader
            with a desktop-browser User-Agent header).

    Returns:
        The list of documents returned by the loader's ``load()``.

    Raises:
        ValueError: If ``loader_type`` is not one of the supported names.
    """
    if loader_type == "Unstructured":
        return UnstructuredURLLoader(url_list).load()

    if loader_type == "Selenium":
        # SeleniumURLLoader.load() creates and quits its own webdriver. The
        # previous code additionally started a second browser through the
        # private _get_driver() only to close it again, which cost a full
        # browser start-up per call without affecting the load.
        return SeleniumURLLoader(url_list).load()

    if loader_type == "Base":
        # Some sites reject requests that do not look like a regular browser.
        header_template = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
        }
        return WebBaseLoader(url_list, header_template=header_template).load()

    raise ValueError(f"Loader type {loader_type} not supported.")


def rec_splitter(url_doc_list:list, chunk_size:int=400, chunk_overlap:int=50):
    """Split loaded documents into overlapping chunks for embedding.

    Args:
        url_doc_list: Documents returned by a URL loader.
        chunk_size: Maximum size of a chunk, forwarded to
            RecursiveCharacterTextSplitter (default 400, the value that was
            previously hard-coded).
        chunk_overlap: Overlap between consecutive chunks (default 50, the
            value that was previously hard-coded).

    Returns:
        The list of chunk documents; ``add_start_index=True`` asks the
        splitter to record each chunk's start offset in its metadata.
    """
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size,
                                                   chunk_overlap=chunk_overlap,
                                                   add_start_index=True)
    return text_splitter.split_documents(url_doc_list)


def reteived_docs_parser(ret_doc_list:list):
    """Turn retrieved chunks into one cleaned, newline-separated article text.

    The chunks' ``page_content`` values are joined with newlines, blank-line
    paragraph breaks are rewritten as ``'. '``, and the text is cut into
    sentence-like pieces at ``.``, ``?``, ``!`` or newline boundaries. Only
    pieces containing more than five whitespace-separated words are kept.

    Args:
        ret_doc_list: Objects exposing a ``page_content`` string.

    Returns:
        The kept pieces joined with ``'\\n'`` (an empty string if none remain).
    """
    sentence_boundary = re.compile(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<![A-Z][a-z][A-Z]\.)(?<=\.|\?|!|\n)\s*')

    joined_text = "\n".join(doc.page_content for doc in ret_doc_list).replace('\n\n', '. ')

    kept_sentences = []
    for sentence in sentence_boundary.split(joined_text):
        # Short fragments are dropped as not meaningful.
        if len(sentence.split()) > 5:
            kept_sentences.append(sentence)

    return '\n'.join(kept_sentences)

# Wrap the parser so it can be used as a step in the LangChain pipeline.
reteived_docs_parser_runnable = RunnableLambda(reteived_docs_parser)

In [19]:
# define functions to run the rag pipeline

def run_rag_chain_once(trend, google_df, ddg_df, embeddings_model, reteived_docs_parser_runnable, prompt_template, structured_output_llm):
    """Scrape, index and summarize the articles of a single trend.

    Steps: load the trend's article URLs (WebBaseLoader first,
    UnstructuredURLLoader as fallback), fill empty pages with the DuckDuckGo
    snippet for the same URL, build a FAISS index over the chunks, retrieve
    the chunks closest to the trend's article titles and ask the LLM for a
    title and summary.

    Args:
        trend: Trend keyword; must be present in ``google_df["trend_kws"]``.
        google_df: Trends table with list-valued ``url`` and ``title`` columns.
        ddg_df: DuckDuckGo news table (``url`` and ``body`` columns are used).
        embeddings_model: Embedding model for the FAISS index.
        reteived_docs_parser_runnable: Runnable turning retrieved docs into text.
        prompt_template: Chat prompt with an ``{article}`` placeholder.
        structured_output_llm: LLM bound to the title/summary schema.

    Returns:
        Whatever the chain returns (expected: a dict with ``title`` and
        ``summary``), or a placeholder dict with those two keys when no text
        could be scraped at all.
    """
    # Below this many characters the basic scrape is treated as failed.
    min_scraped_chars = 5000

    print("# Starting RAG pipeline")
    start = time.time()

    print("## Scraping Articles")
    df_trend = google_df[google_df["trend_kws"]==trend]
    url_list = df_trend['url'].iloc[0]
    print(url_list)

    # Decide on the fallback first and run it outside the try block, so an
    # error raised by the Unstructured loader is neither mislabelled as a
    # Base-loader error nor followed by a second identical scrape.
    url_docs = []
    try:
        url_docs = url_loader(url_list, "Base")
        use_unstructured = len(" ".join([doc.page_content for doc in url_docs])) < min_scraped_chars
    except Exception as e:
        print("Exception BaseURLLoader:", e)
        use_unstructured = True
    if use_unstructured:
        print("Using UnstructuredLoader")
        url_docs = url_loader(url_list, "Unstructured")

    # Map each DuckDuckGo URL to its snippet once (first occurrence wins)
    # instead of rebuilding the URL list for every document. Pairing by
    # position also avoids looking up a positional index as a row label.
    ddg_body_by_url = {}
    if {"url", "body"}.issubset(ddg_df.columns):
        for ddg_url, ddg_body in zip(ddg_df['url'], ddg_df['body']):
            ddg_body_by_url.setdefault(ddg_url, ddg_body)

    for doc in url_docs:
        if doc.page_content == "":
            fallback_body = ddg_body_by_url.get(doc.metadata["source"])
            # Missing snippets can be NaN/None; only text can be appended.
            if isinstance(fallback_body, str):
                doc.page_content += fallback_body
    scraping_checkpoint = time.time()
    scraping_dur = scraping_checkpoint - start

    print("## Creating FAISS vectorstore")
    splits_docs = rec_splitter(url_docs)
    if not splits_docs:
        # FAISS cannot build an index from zero documents; return the same
        # placeholder the caller uses for trends without any URL.
        print("No article text could be scraped for this trend")
        return {'title': trend, 'summary': 'No enough information yet!'}
    faiss_db = FAISS.from_documents(splits_docs, embeddings_model)
    faiss_retriever = faiss_db.as_retriever(search_type="similarity",
                                search_kwargs={'k': 5})
    faiss_checkpoint = time.time()
    faiss_dur = faiss_checkpoint - scraping_checkpoint

    print("## Performing RAG")
    # The retrieval query is the list of article headlines for this trend.
    ret_query = '\n'.join(df_trend['title'].iloc[0])
    rag_chain = (faiss_retriever
                    | { "article": reteived_docs_parser_runnable }
                    | prompt_template
                    | structured_output_llm)
    rag_results = rag_chain.invoke(ret_query)
    end = time.time()
    chain_dur = end - faiss_checkpoint
    print(f"Scrape: {scraping_dur}, Faiss: {faiss_dur}, Chain: {chain_dur}")
    return rag_results


def run_rag_chain_loop(google_df, ddg_df, embeddings_model, reteived_docs_parser_runnable, prompt_template, structured_output_llm):
    """Run the RAG pipeline for every trend and collect titles and summaries.

    Args:
        google_df: Trends table with ``trend_kws`` and list-valued ``url`` /
            ``title`` columns.
        ddg_df: DuckDuckGo news table, forwarded to ``run_rag_chain_once``.
        embeddings_model, reteived_docs_parser_runnable, prompt_template,
        structured_output_llm: Pipeline components, forwarded unchanged.

    Returns:
        A dict of three parallel lists: ``Trend_kws``, ``Title``, ``Summary``.
        Trends without URLs, or for which the model produced no usable
        answer, get the keyword as title and a placeholder summary.
    """
    results={"Trend_kws":[], "Title":[], "Summary":[]}
    trend_kws = google_df.trend_kws.to_list()
    fallback_summary = 'No enough information yet!'

    for i, trend_kw in tqdm(enumerate(trend_kws), total=len(trend_kws)):
        print(f"Trend number {i+1} of {len(trend_kws)}")
        rag_results = None
        if google_df[google_df["trend_kws"]==trend_kw]['url'].iloc[0]:
            rag_results = run_rag_chain_once(trend_kw, google_df, ddg_df, embeddings_model, reteived_docs_parser_runnable, prompt_template, structured_output_llm)

        # The prompt allows the model to return an empty response, so the
        # structured output may be None or miss a field. Fall back per field
        # instead of crashing and losing the results gathered so far.
        if not isinstance(rag_results, dict):
            rag_results = {}
        title = rag_results.get('title') or trend_kw
        summary = rag_results.get('summary') or fallback_summary

        print(f"""### Trend Keyword: {trend_kw}\n ### Title: {title}\n ### Summary: {summary}\n\n""")
        results['Trend_kws'].append(trend_kw)
        results['Title'].append(title)
        results['Summary'].append(summary)

    return results

In [20]:
# test the rag pipeline
# Runs scraping, indexing and summarization for every trend in google_trends_df;
# `results` holds the parallel Trend_kws / Title / Summary lists.

results = run_rag_chain_loop(google_trends_df, trends_ddg_news_df, embeddings_model, reteived_docs_parser_runnable, prompt_template, structured_output_llm)

  0%|          | 0/10 [00:00<?, ?it/s]

Trend number 1 of 10
# Starting RAG pipeline
## Scraping Articles
['https://www.marca.com/futbol/futbol-internacional/2024/08/29/66d0852eca4741621a8b458b.html', 'https://www.marca.com/futbol/laliga-ea-sports/athletic-atletico/2024/08/31/01_0101_20240831_174_175-directo.html', 'https://as.com/futbol/primera/una-final-en-agosto-n/', 'https://www.tribalfootball.com/article/soccer-laliga-athletic-bilbao-coach-valverde-ready-for-ambitious-atletico-madrid-897eb9f4-53dc-4621-9ace-09f272fdc9e9', 'https://www.outlookindia.com/sports/football/diego-simeone-defends-atletico-madrid-during-tense-press-conference', 'https://www.reuters.com/sports/soccer/atletico-boss-simeone-terse-mood-before-bilbao-clash-2024-08-30/', 'https://apnews.com/article/atletico-madrid-espanyol-spanish-league-score-46eaa47c28290b7454ecc76ebdcf5d36', 'https://sports.yahoo.com/team-complete-simeone-pulls-atletico-163000180.html']
## Creating FAISS vectorstore


  attn_output = torch.nn.functional.scaled_dot_product_attention(


## Performing RAG
Scrape: 19.020139455795288, Faiss: 1.300205945968628, Chain: 16.166972398757935
### Trend Keyword: athletic - atlético madrid
 ### Title: El curioso equipo que jugará la FA Cup con el antiguo escudo del Atlético
 ### Summary: The article discusses the upcoming match between Athletic and Atletico Madrid in the FA Cup, highlighting their past rivalry and Atletico's current struggles in LaLiga EA Sports. It also mentions the unique relationship between the two teams, with Athletic having a strong record against Atletico since Diego Simeone took over in 2011.


Trend number 2 of 10
# Starting RAG pipeline
## Scraping Articles
['https://raiderswire.usatoday.com/2024/08/29/antonio-pierce-update-brock-bowers-injury-status-las-vegas-raiders-season-opener-los-angeles-chargers/', 'https://www.bleacherreport.com/articles/10133037-fantasy-nfl-rumors-raiders-brock-bowers-giants-theo-johnson-eyed-for-major-roles', 'https://justblogbaby.com/posts/antonio-pierce-gives-key-injury-upda