In [1]:
import os

# Take the OpenAI key from the OPENAI_API_KEY environment variable so the
# secret never has to be pasted into (and saved with) the notebook. Falls back
# to the empty string when the variable is not set.
openai_api_key = os.environ.get("OPENAI_API_KEY", "")

In [2]:
import os
from dateutil.parser import parse
import requests
from bs4 import BeautifulSoup
from langchain_openai import ChatOpenAI
import pandas as pd
from langchain_core.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from langchain.vectorstores import FAISS

def fetch_feed(url, main_tag, link_tag, title_tag, image_tag, image_attr, category, website, date_tag):
    """Download an RSS/Atom feed and return one row per usable entry.

    Args:
        url: Address of the feed to download.
        main_tag: Name of the tag that wraps a single entry.
        link_tag: Tag inside an entry whose text is used as the article link.
        title_tag: Tag inside an entry whose text is used as the title.
        image_tag: Unused; accepted so callers can pass a full feed tuple.
        image_attr: Unused; accepted so callers can pass a full feed tuple.
        category: Label copied unchanged into every returned row.
        website: Publisher name copied unchanged into every returned row.
        date_tag: Tag inside an entry whose text is the publication date.

    Returns:
        A list of ``[title, link, category, website, published]`` lists.
        The list is empty when the feed cannot be downloaded. Entries that
        lack a title, link or date tag, or whose date cannot be parsed, are
        skipped instead of aborting the whole feed.
    """
    try:
        # A timeout keeps one unresponsive host from hanging the whole crawl.
        response = requests.get(url, timeout=30)
        response.raise_for_status()
    except requests.RequestException as exc:
        print(f"Failed to fetch feed {url}: {exc}")
        return []

    soup = BeautifulSoup(response.text, 'lxml-xml')
    entries = []

    for entry in soup.find_all(main_tag):
        title_node = entry.find(title_tag)
        link_node = entry.find(link_tag)
        date_node = entry.find(date_tag)
        # find() returns None for a missing tag; skip such malformed entries.
        if title_node is None or link_node is None or date_node is None:
            continue

        try:
            published = normalize_datetime_to_django_format(date_node.text)
        except (ValueError, OverflowError):
            # Unparseable date string: drop this entry, keep the rest.
            continue

        entries.append([title_node.text, link_node.text, category, website, published])

    return entries

def check_and_store_items(items, csv_filename):
    """Append feed rows whose link is not yet recorded to a CSV file.

    Args:
        items: Iterable of ``[title, link, category, website, published]``
            rows; the link is read from index 1.
        csv_filename: Path of the CSV file. It is created on the first write
            and left untouched when there is nothing new to add.

    A link counts as already stored when it appears in the CSV's ``link``
    column or earlier in ``items``, so the same article delivered by two
    feeds in one batch is written only once.
    """
    columns = ['title', 'link', 'category', 'website', 'published']

    if os.path.exists(csv_filename):
        df_existing = pd.read_csv(csv_filename)
        seen_links = set(df_existing['link'])
    else:
        df_existing = pd.DataFrame(columns=columns)
        seen_links = set()

    # Keep the first occurrence of every unseen link, preserving input order.
    new_items = []
    for item in items:
        link = item[1]
        if link in seen_links:
            continue
        seen_links.add(link)
        new_items.append(item)

    if new_items:
        df_new = pd.DataFrame(new_items, columns=columns)
        df_combined = pd.concat([df_existing, df_new], ignore_index=True)
        df_combined.to_csv(csv_filename, index=False)
        print(f"{len(new_items)} new items added to the CSV file.")
    else:
        print("No new items to add.")

# Feeds to crawl. Each tuple is passed positionally to fetch_feed, so its
# fields are, in order:
#   (url, main_tag, link_tag, title_tag, image_tag, image_attr,
#    category, website, date_tag)
# image_tag / image_attr are None for feeds where no image tag is configured.
rss_feed_details = [
    ('https://theathletic.com/team/liverpool/?rss=1', 'entry', 'id', 'title', 'link', 'href', 'Liverpool FC','The Athletic','published'),
    ('http://www.thisisanfield.com/feed/', 'item', 'link', 'title', 'enclosure', 'url', 'Liverpool FC','This is Anfield', 'pubDate'),
    ('http://www.theguardian.com/football/rss', 'item', 'link', 'title', 'media:content', 'url', 'Football', 'The Guardian', 'pubDate'),
    ('https://theathletic.com/premier-league/?rss', 'entry', 'id', 'title', 'link', 'href', 'Football','The Athletic', 'published'),
    ('https://theathletic.com/soccer/?rss', 'entry', 'id', 'title', 'link', 'href', 'Football','The Athletic','published'),
    ('https://theathletic.com/champions-league/?rss', 'entry', 'id', 'title', 'link', 'href', 'Football','The Athletic', 'published'),
    ('https://www.autosport.com/rss/feed/f1', 'item', 'link', 'title', 'enclosure', 'url', 'Formula 1', 'Autosport', 'pubDate'),
    ('https://the-race.com/category/formula-1/feed/', 'item', 'link', 'title','media:content', 'url', 'Formula 1', 'The Race', 'pubDate'),
    ('https://aeon.co/feed.rss', 'item', 'link', 'title', None, None, 'Self Dev', "Aeon", 'pubDate'),
    ('https://psyche.co/feed', 'item', 'link', 'title', None,None, 'Self Dev', "Psyche", 'pubDate'),
    ('http://www.nytimes.com/services/xml/rss/nyt/Opinion.xml', 'item', 'link', 'title', 'media:content', 'url', 'Self Dev', "New York Times", 'pubDate'),
    ('http://www.nytimes.com/services/xml/rss/nyt/Magazine.xml', 'item', 'link', 'title', 'media:content', 'url', 'Self Dev', "New York Times", 'pubDate'),
    ('http://www.nytimes.com/services/xml/rss/nyt/Science.xml', 'item', 'link', 'title', 'media:content', 'url', 'Science & Technology', "New York Times", 'pubDate'),
    ('https://www.popsci.com/rss', 'item', 'link', 'title', 'image', 'url', 'Science & Technology', "Popular Science", 'pubDate'),
    ('http://www.smithsonianmag.com/rss/innovation/', 'item', 'link', 'title', 'enclosure', 'url', 'Science & Technology', "Smithsonian", 'pubDate'),
    ('http://www.smithsonianmag.com/rss/latest_articles/', 'item', 'link', 'title', 'enclosure', 'url', 'Science & Technology', "Smithsonian", 'pubDate'),
    ('http://www.nytimes.com/services/xml/rss/nyt/Travel.xml', 'item', 'link', 'title', 'media:content', 'url', 'Travel', "New York Times", 'pubDate'),
    ('http://www.nytimes.com/services/xml/rss/nyt/Style.xml', 'item', 'link', 'title', 'media:content', 'url', 'Self Dev', "New York Times", 'pubDate'),
    ('http://www.nytimes.com/services/xml/rss/nyt/Technology.xml', 'item', 'link', 'title', 'media:content', 'url', 'Science & Technology', "New York Times", 'pubDate'),
    ('http://www.nytimes.com/services/xml/rss/nyt/Business.xml', 'item', 'link', 'title', 'media:content', 'url', 'Global News', "New York Times", 'pubDate'),
     ('http://www.nytimes.com/services/xml/rss/nyt/HomePage.xml', 'item', 'link', 'title', 'media:content', 'url', 'Global News', "New York Times", 'pubDate'),
     ('http://feeds.feedburner.com/dawn-news', 'item', 'link', 'title',  'media:content', 'url', 'Pakistan', "Dawn", 'pubDate'),
     ('https://feeds.feedburner.com/dawn-news-world', 'item', 'link', 'title',  'media:content', 'url', 'Global News', "Dawn", 'pubDate'),
    ('https://www.theverge.com/rss/reviews/index.xml', 'entry', 'id', 'title', None, None, 'Science & Technology', 'The Verge', 'published'),
    ('https://www.nytimes.com/wirecutter/rss/', 'item', 'link', 'title', 'description', 'src', 'Science & Technology', "New York Times Wirecutter", 'pubDate')
]

def normalize_datetime_to_django_format(dt_str):
    """Parse a date string and re-render it as ``YYYY-MM-DD HH:MM:SS`` plus UTC offset.

    Args:
        dt_str: Date/time text in any layout ``dateutil.parser.parse`` accepts.

    Returns:
        The parsed moment formatted with ``'%Y-%m-%d %H:%M:%S%z'``.
    """
    output_format = '%Y-%m-%d %H:%M:%S%z'
    return parse(dt_str).strftime(output_format)

def fetch_feed_with_details(feed_details):
    """Call ``fetch_feed`` with the elements of ``feed_details`` as positional arguments.

    Args:
        feed_details: Iterable holding the arguments of ``fetch_feed`` in
            positional order.

    Returns:
        Whatever ``fetch_feed`` returns for that feed.
    """
    positional_args = tuple(feed_details)
    return fetch_feed(*positional_args)

# Crawl every configured feed and collect its rows, then persist the rows
# whose link is not yet in "rss_file.csv".
items = []
for feed_index, feed_details in enumerate(rss_feed_details):
    # Print every tenth feed as a lightweight progress indicator.
    if feed_index % 10 == 0:
        print(feed_index, feed_details)
    items.extend(fetch_feed_with_details(feed_details))

check_and_store_items(items, "rss_file.csv")



0 ('https://theathletic.com/team/liverpool/?rss=1', 'entry', 'id', 'title', 'link', 'href', 'Liverpool FC', 'The Athletic', 'published')
10 ('http://www.nytimes.com/services/xml/rss/nyt/Opinion.xml', 'item', 'link', 'title', 'media:content', 'url', 'Self Dev', 'New York Times', 'pubDate')
20 ('http://www.nytimes.com/services/xml/rss/nyt/HomePage.xml', 'item', 'link', 'title', 'media:content', 'url', 'Global News', 'New York Times', 'pubDate')
No new items to add.


In [None]:


# Function to summarize text
def summarize_text(text, openai_api_key):
    """Ask the "gpt-4o-mini" chat model for the main body of an article.

    The system instruction tells the model to return the core article text,
    drop non-article content, and stay within 10 sentences.

    Args:
        text: Raw article text, sent as the human message.
        openai_api_key: API key handed to ``ChatOpenAI``.

    Returns:
        The object returned by ``ChatOpenAI.invoke``; callers read the
        generated text from its ``content`` attribute.
    """
    system_instruction = (
        "You are a text processing model. Your task is to extract and return the main body of text from the provided content. Focus on the core text of the article, preserving its paragraph structure. Ignore any extraneous elements such as advertisements, navigation links, or non-article content. Do not exceed the length of 10 sentences."
    )
    conversation = [("system", system_instruction), ("human", text)]

    chat_model = ChatOpenAI(
        model="gpt-4o-mini",
        temperature=0,
        max_tokens=None,
        timeout=None,
        max_retries=2,
        api_key=openai_api_key,
    )
    return chat_model.invoke(conversation)

# Function to fetch full article content
def fetch_full_article(url):
    """Download a web page and return the text of its ``<p>`` elements.

    Args:
        url: Address of the article page.

    Returns:
        The paragraph texts joined by single spaces. An empty string is
        returned when the request fails, the status code is not 200, or the
        page has no paragraph text.
    """
    try:
        # A timeout keeps one unresponsive site from hanging the whole run.
        response = requests.get(url, timeout=30)
    except requests.RequestException as exc:
        # Report and continue: callers treat "" as "no article text".
        print(f"Failed to fetch article at {url}: {exc}")
        return ""

    if response.status_code != 200:
        print(f"Failed to fetch article at {url}: Status code {response.status_code}")
        return ""

    soup = BeautifulSoup(response.text, 'html.parser')
    article_text = ' '.join(p.text for p in soup.find_all('p'))
    if not article_text:
        print(f"No content found at {url}")
    return article_text

def fetch_nyt(url):
    """Fetch an article through the ``https://r.jina.ai/`` prefix service.

    NOTE(review): presumably r.jina.ai returns a plain-text rendering of the
    target page; confirm against the service's documentation.

    Args:
        url: Address of the article; appended verbatim to the service prefix.

    Returns:
        The response body as text, or an empty string when the request fails
        or the status code is not 200. This matches ``fetch_full_article``,
        so an error page is never passed on as article text.
    """
    try:
        response = requests.get("https://r.jina.ai/" + url, timeout=60)
    except requests.RequestException as exc:
        print(f"Failed to fetch article at {url}: {exc}")
        return ""

    if response.status_code != 200:
        print(f"Failed to fetch article at {url}: Status code {response.status_code}")
        return ""
    return response.text

# Function to check if an article's summary already exists and process new items
def check_and_store_summarized_items(summarized_items, csv_filename, openai_api_key):
    """Summarize articles not yet recorded in a CSV file and append them to it.

    Args:
        summarized_items: Iterable of
            ``(title, link, category, website, published)`` rows.
        csv_filename: Path of the CSV holding earlier summaries. It is
            created on the first write and left untouched when nothing new
            was summarized.
        openai_api_key: API key forwarded to ``summarize_text``.

    Returns:
        A list of ``[title, link, summary, category, website, published]``
        rows for the articles summarized during this call only.

    A link is skipped when it is already in the CSV's ``link`` column or
    appeared earlier in ``summarized_items``, so no article is summarized
    twice. Links containing "nytimes" are fetched with ``fetch_nyt``, all
    others with ``fetch_full_article``.
    """
    columns = ['title', 'link', 'summary', 'category', 'website', 'published']

    if os.path.exists(csv_filename):
        df_existing = pd.read_csv(csv_filename)
        seen_links = set(df_existing['link'])
    else:
        df_existing = pd.DataFrame(columns=columns)
        seen_links = set()

    new_summarized_items = []
    for index, (title, link, category, website, published) in enumerate(summarized_items):
        print("Summarizing article ", index)
        if link in seen_links:
            print(f"Article '{title}' already summarized. Skipping...")
            continue
        # Record the link right away so a repeat later in this batch is skipped.
        seen_links.add(link)

        fetch_article = fetch_nyt if "nytimes" in link else fetch_full_article
        article_text = fetch_article(link)
        if article_text:
            print(f"Proceeding to summarize text for {title}...")
            summary_body = summarize_text(article_text, openai_api_key).content
            print(summary_body)
        else:
            summary_body = "No summary available"

        summary = "This is an article by " + website + ". " + summary_body
        new_summarized_items.append([title, link, summary, category, website, published])

    if new_summarized_items:
        df_new = pd.DataFrame(new_summarized_items, columns=columns)
        df_combined = pd.concat([df_existing, df_new], ignore_index=True)
        df_combined.to_csv(csv_filename, index=False)
        print(f"{len(new_summarized_items)} new summaries added to the CSV file.")
    else:
        print("No new summaries to add.")

    return new_summarized_items

# Summarize the fetched feed rows that are not yet in
# 'structured_documents.csv'. Only rows summarized during this run are
# returned, so `summarized_items` is empty when nothing new was found.
summarized_items =  check_and_store_summarized_items(items, 'structured_documents.csv', openai_api_key)


In [4]:
class Document:
    """Minimal document record: a text body plus a metadata mapping.

    Attributes:
        page_content: The text of the document.
        metadata: Descriptive fields stored alongside the text.
    """

    def __init__(self, page_content, metadata):
        self.page_content = page_content
        self.metadata = metadata

    def __repr__(self):
        # Readable form for notebook display and debugging.
        return (
            f"{type(self).__name__}(page_content={self.page_content!r}, "
            f"metadata={self.metadata!r})"
        )

# Wrap every summarized row in a Document: the summary is the text body and
# the remaining fields travel along as metadata.
structured_documents = [
    Document(
        page_content=summary,
        metadata={
            'source': link,
            'title': title,
            'description': summary,
            'category': category,
            'website': website,
            'published': published,
            'language': 'en-US',  # Assuming the language is English
        },
    )
    for title, link, summary, category, website, published in summarized_items
]

In [5]:


# Break the documents into overlapping chunks (chunk_size=1000,
# chunk_overlap=100) before they are embedded.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
all_splits = text_splitter.split_documents(structured_documents)


In [6]:

# Embed every chunk with the "text-embedding-3-small" OpenAI model and index
# the vectors in a FAISS store (used here in place of an earlier Chroma variant).
vectorstore = FAISS.from_documents(documents=all_splits, embedding=OpenAIEmbeddings(model="text-embedding-3-small", api_key=openai_api_key))

In [7]:
# Expose the vector store as a similarity-search retriever configured with k=6.
retriever = vectorstore.as_retriever(search_type="similarity", search_kwargs={"k": 6})

# Try the retriever on a sample query.
retrieved_docs = retriever.invoke("Martin Zubimendi")

# Notebook display expression: how many documents came back.
len(retrieved_docs)

6

In [8]:
# Chat model that generates the final answer in the RAG chain.
llm = ChatOpenAI(model="gpt-4o-mini", api_key=openai_api_key)

In [9]:
def format_docs(docs):
    """Join the ``page_content`` of each document into one string.

    Args:
        docs: Iterable of objects exposing a ``page_content`` attribute.

    Returns:
        The contents in input order, separated by a blank line.
    """
    separator = "\n\n"
    contents = [doc.page_content for doc in docs]
    return separator.join(contents)

In [10]:


# Prompt text for the answering step: {context} receives the retrieved text
# and {question} the user's query.
template = """Use the following pieces of context to answer the question at the end.
If you don't know the answer, just say that you don't know, don't try to make up an answer.
Use 10 sentences maximum and keep the answer as concise as possible.

{context}

Question: {question}

Helpful Answer:"""
custom_rag_prompt = PromptTemplate.from_template(template)

# Pipeline: the query goes to the retriever, whose documents are joined by
# format_docs into "context"; the same query is passed through unchanged as
# "question". The filled prompt is sent to the chat model and the reply is
# parsed into a plain string.
rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | custom_rag_prompt
    | llm
    | StrOutputParser()
)

# Example query against the indexed articles.
answer = rag_chain.invoke("Provide me all of the transfer news on Liverpool FC in bullets")
print(answer)

- Liverpool is interested in signing Valencia goalkeeper Giorgi Mamardashvili, with reports indicating active negotiations.
- A meeting has been scheduled to discuss Mamardashvili's potential transfer, which may include a loan agreement to ensure he remains a regular starter.
- Fábio Carvalho has completed his move from Liverpool to Brentford for a fee that could rise to £27.5 million.
- Carvalho spent last season on loan and attracted interest from multiple Premier League clubs, but Brentford met Liverpool’s valuation.
- Liverpool is considering an offer from Red Bull Salzburg for 19-year-old midfielder Bobby Clark, who is also sought after by other clubs.
- Virgil van Dijk noted there have been "no changes" to his contract situation, as players assess the club's direction under the new head coach.
- Van Dijk expressed confidence in the club's transfer market decisions and the need for new signings.
- Recent developments suggest Liverpool may not pursue a No. 6 this summer, following 

In [11]:
# Example query without a club name.
# NOTE(review): the query text contains typos ("Porivde", "teh"); possibly
# intentional to see how the chain copes with misspellings. Confirm before fixing.
answer = rag_chain.invoke("Porivde me all of teh transfer news in bullets")
print(answer)

- Aaron Wan-Bissaka has joined West Ham on a seven-year contract after leaving Manchester United.
- Manchester United is preparing medicals for Matthijs de Ligt and Noussair Mazraoui from Bayern Munich.
- Newcastle is awaiting a response to their third bid for Marc Guéhi.
- Conor Gallagher's move to Atlético Madrid has stalled.
- Manchester United is pursuing Manuel Ugarte.
- Chelsea is looking for a player who can score 45 goals before schools return.
- A winger has been loaned to Olympiakos and Urawa Reds after joining from Bodø/Glimt 18 months ago.
- A playmaker of North Macedonian heritage made his senior debut for Juventus last season.
- Villarreal paid £4m to sign a well-travelled winger for Spain's Euro 2024 squad.
- Dominic Solanke has moved to Spurs, and Chelsea signed Pedro Neto from Wolves.


In [12]:
# Example query on a different topic than the football queries above.
answer = rag_chain.invoke("latest pakistani news, in bullets")
print(answer)

- Pakistan's leaders, including President Zardari and PM Shehbaz Sharif, aim to bring economic stability as the country celebrates 77 years of independence.
- PM Shehbaz announced efforts to reduce inflation and electricity prices, along with a five-year economic program.
- At least 95 people were injured, and a child was killed due to celebratory aerial firing in Karachi on Independence Day.
- A total of 95 cases of aerial firing were reported across major hospitals in the city.
- Police arrested 14 suspects in Karachi's Central District and five in Korangi District related to the firing incidents.
- Tensions between Pakistan and Afghanistan escalated, with clashes reported, resulting in civilian and military casualties.
- Gen Munir emphasized the need for accurate information to maintain peace and good relations with Afghanistan.
- The army, along with the government, is committed to ensuring the security and welfare of the Baloch people.
- A remote-controlled bomb blast occurred at 

In [13]:
# Follow-up query asking for more detail on a single news item.
answer = rag_chain.invoke("I'd like to know more about: PM Shehbaz Sharif pledged economic stability and a five-year economic program for Pakistan")
print(answer)

Prime Minister Shehbaz Sharif has pledged to bring economic stability to Pakistan, particularly as the nation celebrates its independence. He emphasized the need for self-introspection to address past mistakes and called for a renewed effort to overcome current crises. During a flag-hoisting ceremony, he highlighted that hard work, honesty, and passion are crucial for the country's development. Sharif announced the launch of a comprehensive five-year economic program aimed at revamping the economy, reducing inflation, and lowering electricity prices. He also expressed a commitment to tackling issues such as electricity theft, which costs the country Rs500 billion annually. The government is working on providing low-cost electricity and discussing the use of locally sourced coal to reduce dependency on expensive imports. Additionally, the prime minister plans to implement reforms to enhance governance and stimulate export-led growth. Overall, his focus is on creating a sustainable econo