In [24]:
import os
import requests
import numpy as np
import pandas as pd
from zipfile import ZipFile

In [None]:
# Load the news articles.
# The file location can be overridden with the MIND_NEWS_PATH environment variable so the
# notebook is not tied to one machine; the default is the original hard-coded path.
path = os.environ.get(
    "MIND_NEWS_PATH",
    "/Users/josecastanedaforno/Library/CloudStorage/OneDrive-ScientificNetworkSouthTyrol/Thesis/Datasets/MIND/SmallMind/MINDSmall_train/news.tsv",
)
# The file is read with header=None, so the column names are supplied explicitly.
df = pd.read_csv(path, sep='\t', header=None, names=["news_id", "category", "sub_category", "title", "abstract", "url", "title_entities", "abstract_entities"])
# Keep only the identifier and the two text columns.
df = df[["news_id", "title", "abstract"]]
df.head()

Unnamed: 0,news_id,title,abstract
0,N55528,"The Brands Queen Elizabeth, Prince Charles, an...","Shop the notebooks, jackets, and more that the..."
1,N19639,50 Worst Habits For Belly Fat,These seemingly harmless habits are holding yo...
2,N61837,The Cost of Trump's Aid Freeze in the Trenches...,Lt. Ivan Molchanets peeked over a parapet of s...
3,N53526,I Was An NBA Wife. Here's How It Affected My M...,"I felt like I was a fraud, and being an NBA wi..."
4,N38324,"How to Get Rid of Skin Tags, According to a De...","They seem harmless, but there's a very good re..."


In [None]:
# Generate the news embeddings - https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2
from transformers import AutoModel, AutoTokenizer
import torch
import faiss

# Checkpoint identifier shared by the tokenizer and the model, so both are loaded
# from the same pretrained source.
MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME)

  from .autonotebook import tqdm as notebook_tqdm


In [12]:
def get_embedding(text):
    """Embed text with the loaded transformer using mask-aware mean pooling.

    Args:
        text: A string, or a list of strings to embed together as one batch.

    Returns:
        A NumPy array holding the pooled token states: 1-D for a single text,
        2-D (one row per text) when several texts are passed.
    """
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=128)
    with torch.no_grad():
        token_states = model(**inputs).last_hidden_state
    # Average only over real tokens. When several texts are batched, the shorter ones are
    # padded, and a plain mean over dim=1 would mix the padding positions into the result.
    # For a single text the mask is all ones, so this equals the plain mean.
    mask = inputs["attention_mask"].unsqueeze(-1).to(token_states.dtype)
    pooled = (token_states * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1e-9)
    return pooled.squeeze().numpy()

# Missing titles/abstracts are read as NaN, and str(NaN) would embed the literal text "nan".
# Replace missing values with empty strings before building the text to embed.
news_texts = (df["title"].fillna("").astype(str) + " " + df["abstract"].fillna("").astype(str)).str.strip()
embeddings = [get_embedding(text) for text in news_texts]

In [22]:
# Sanity check, one value per line: number of news, then embedding dimensions.
print(len(embeddings), len(embeddings[1]), sep="\n")

51282
384


In [25]:
# Build a FAISS flat L2 index holding every news embedding.
#  https://ai.meta.com/tools/faiss/ - https://github.com/facebookresearch/faiss/wiki/Getting-started

dimensions = len(embeddings[0])        # length of a single embedding vector
index = faiss.IndexFlatL2(dimensions)
index.add(np.stack(embeddings))        # one row per entry of `embeddings`, same order

In [26]:
# Use the embeddings to retrieve the news most relevant to the user query.
def retrieve_news(query, topk=3):
  """Return the rows of `df` whose embeddings are closest to the query embedding.

  Args:
    query: Text of the user query.
    topk: Maximum number of articles to return.

  Returns:
    The matching rows of `df`, in the order returned by the index search. Fewer
    than `topk` rows are returned when the index holds fewer than `topk` vectors.
  """
  query_embedding = get_embedding(query).reshape(1,-1)
  _, indices = index.search(query_embedding,topk)
  # FAISS fills missing neighbours with -1 when fewer than `topk` results exist.
  # df.iloc[-1] would silently return the last article, so drop those slots.
  hits = indices[0]
  result = df.iloc[hits[hits >= 0]]
  return result

In [29]:
# Use the LLM to interact with the user.
import ollama

def generate_response(query):
  """Answer `query` with the `mistral` model, using retrieved news as context.

  Calls `retrieve_news(query)`, joins each article's title and abstract into a
  context block, prints the resulting prompt, and returns the model's reply text.

  Args:
    query: The user's question.

  Returns:
    The content string of the chat response.
  """
  retrieved_docs = retrieve_news(query)
  # An article without an abstract holds NaN, and title + ": " + NaN is NaN (a float),
  # which makes str.join raise TypeError. Fill missing values with empty strings first.
  titles = retrieved_docs["title"].fillna("").astype(str)
  abstracts = retrieved_docs["abstract"].fillna("").astype(str)
  context = "\n".join(titles + ": " + abstracts)
  prompt =  f"You are a helpful assistant. Answer the question based on the following retrieved news articles:\n{context}\n\nQuestion: {query}"
  print(prompt)
  response = ollama.chat(model='mistral', messages=[{"role": "user", "content": prompt}])
  return response['message']['content']


In [33]:
# Test the system: run retrieval plus generation on a sample query and print the
# returned answer text.
query = "what are the latest sports news?"
response = generate_response(query)
print(response)

You are a helpful assistant. Answer the question based on the following retrieved news articles:
The Daily Twitter: 31 October 2019: News, links to articles, updates and more from DC area writers and national sports journalists
The Daily Twitter: 9 November 2019: News, links to articles, updates and more from DC area writers and national sports journalists
The Daily Twitter: 12 November 2019: News, links to articles, updates and more from DC area writers and national sports journalists

Question: what are the latest sports news?
 Based on the provided sources, The Daily Twitter posts from October 31st, November 9th, and November 12th, here are some of the latest sports news related to the DC area and national sports:

* On October 31st, there was news about the Washington Nationals' players visiting the White House. The Nationals won the World Series in 2019, and their visit to the White House took place on that date.
* Also on October 31st, a reporter shared a tweet from a DC-based sp

In [None]:
# Response without context - outdated
no_context_messages = [{"role": "user", "content": "what are the latest sports news?"}]
no_context_reply = ollama.chat(model='mistral', messages=no_context_messages)
print(no_context_reply['message']['content'])


1. NBA Finals: Milwaukee Bucks defeat Phoenix Suns in Game 6 to win the NBA championship. Giannis Antetokounmpo was named Finals MVP.

2. Tokyo Olympics: USA wins gold in women's soccer with a 1-0 victory over Canada, marking their seventh consecutive Olympic gold medal. Simone Biles withdraws from several events due to mental health concerns but later returns and wins bronze in the balance beam final.

3. UEFA Euro 2020: Italy defeats England on penalties (3-2) to win the UEFA European Football Championship, marking their second triumph in the tournament.

4. Wimbledon Tennis Championships: Novak Djokovic wins his sixth men's singles title at Wimbledon, defeating Matteo Berrettini in straight sets. Ashleigh Barty defeats Karolína Plíšková to win her first Wimbledon title and second Grand Slam singles title overall.

5. Formula One: Max Verstappen wins the Hungarian Grand Prix, extending his lead atop the drivers' standings over Lewis Hamilton.

6. Premier League: Manchester City clinc

## Use the title entities to make the system more 'context aware'

In [37]:
# Reload the news file, this time keeping the title_entities column as well.
# The file location can be overridden with the MIND_NEWS_PATH environment variable so the
# notebook is not tied to one machine; the default is the original hard-coded path.
path = os.environ.get(
    "MIND_NEWS_PATH",
    "/Users/josecastanedaforno/Library/CloudStorage/OneDrive-ScientificNetworkSouthTyrol/Thesis/Datasets/MIND/SmallMind/MINDSmall_train/news.tsv",
)
# The file is read with header=None, so the column names are supplied explicitly.
df2 = pd.read_csv(path, sep='\t', header=None, names=["news_id", "category", "sub_category", "title", "abstract", "url", "title_entities", "abstract_entities"])
df2 = df2[["news_id", "title", "abstract", "title_entities"]]
df2.head()

Unnamed: 0,news_id,title,abstract,title_entities
0,N55528,"The Brands Queen Elizabeth, Prince Charles, an...","Shop the notebooks, jackets, and more that the...","[{""Label"": ""Prince Philip, Duke of Edinburgh"",..."
1,N19639,50 Worst Habits For Belly Fat,These seemingly harmless habits are holding yo...,"[{""Label"": ""Adipose tissue"", ""Type"": ""C"", ""Wik..."
2,N61837,The Cost of Trump's Aid Freeze in the Trenches...,Lt. Ivan Molchanets peeked over a parapet of s...,[]
3,N53526,I Was An NBA Wife. Here's How It Affected My M...,"I felt like I was a fraud, and being an NBA wi...",[]
4,N38324,"How to Get Rid of Skin Tags, According to a De...","They seem harmless, but there's a very good re...","[{""Label"": ""Skin tag"", ""Type"": ""C"", ""WikidataI..."


In [None]:
import json

def extract_entity_text(entity_json):
  """Join the "Label" values of a JSON-encoded entity list into one string.

  Args:
    entity_json: A string holding a JSON list of objects, each expected to carry
      a "Label" key. Non-string input (e.g. NaN for a missing cell) is accepted.

  Returns:
    The labels separated by single spaces, or "" when the input is not a string,
    cannot be parsed, is not a list, or contains no string labels.
  """
  if not isinstance(entity_json, str):
    return ""
  try:
    # Parse the text unchanged first: rewriting apostrophes up front would corrupt
    # valid JSON whose labels contain one (e.g. "Ocean's Eleven").
    entities = json.loads(entity_json)
  except json.JSONDecodeError:
    try:
      # Fallback for single-quoted input, which the earlier implementation accepted.
      entities = json.loads(entity_json.replace("'", '"'))
    except json.JSONDecodeError:
      return ""
  if not isinstance(entities, list):
    return ""
  labels = [e["Label"] for e in entities if isinstance(e, dict) and isinstance(e.get("Label"), str)]
  return " ".join(labels)