In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
!pip install accelerate transformers tokenizers
!pip install bitsandbytes einops
!pip install xformers
!pip install langchain
!pip install faiss-gpu
!pip install sentence_transformers
!pip install --upgrade torch


Collecting bitsandbytes
  Downloading bitsandbytes-0.43.1-py3-none-manylinux_2_24_x86_64.whl.metadata (2.2 kB)
Collecting einops
  Downloading einops-0.8.0-py3-none-any.whl.metadata (12 kB)
Downloading bitsandbytes-0.43.1-py3-none-manylinux_2_24_x86_64.whl (119.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m119.8/119.8 MB[0m [31m13.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading einops-0.8.0-py3-none-any.whl (43 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.2/43.2 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: einops, bitsandbytes
Successfully installed bitsandbytes-0.43.1 einops-0.8.0
Collecting xformers
  Downloading xformers-0.0.26.post1-cp310-cp310-manylinux2014_x86_64.whl.metadata (1.0 kB)
Collecting torch==2.3.0 (from xformers)
  Downloading torch-2.3.0-cp310-cp310-manylinux1_x86_64.whl.metadata (26 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch==2.3.0->xformers)
  Down

In [2]:
from torch import cuda, bfloat16
import transformers

model_id = 'Nexusflow/Starling-LM-7B-beta'
 
device = f'cuda:{cuda.current_device()}' if cuda.is_available() else 'cpu'

# set quantization configuration to load large model with less GPU memory
# this requires the `bitsandbytes` library
bnb_config = transformers.BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type='nf4',
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=bfloat16
)

# begin initializing HF items, you need an access token
hf_auth = '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'
model_config = transformers.AutoConfig.from_pretrained(
    model_id,
    use_auth_token=hf_auth
)

model = transformers.AutoModelForCausalLM.from_pretrained(
    model_id,
    trust_remote_code=True,
    config=model_config,
    quantization_config=bnb_config,
    device_map='auto',
    use_auth_token=hf_auth
)

# enable evaluation mode to allow model inference
model.eval()

print(f"Model loaded on {device}")



config.json:   0%|          | 0.00/671 [00:00<?, ?B/s]



model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/3 [00:00<?, ?it/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.94G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/4.54G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/158 [00:00<?, ?B/s]

Model loaded on cuda:0


In [3]:
tokenizer = transformers.AutoTokenizer.from_pretrained(
    model_id,
    use_auth_token=hf_auth
)



tokenizer_config.json:   0%|          | 0.00/1.63k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/536 [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [4]:
stop_list = ['\nHuman:', '\n```\n']

stop_token_ids = [tokenizer(x)['input_ids'] for x in stop_list]
stop_token_ids


[[1, 28705, 13, 28769, 6366, 28747], [1, 28705, 13, 13940, 28832, 13]]

In [5]:
import torch

stop_token_ids = [torch.LongTensor(x).to(device) for x in stop_token_ids]
stop_token_ids

[tensor([    1, 28705,    13, 28769,  6366, 28747], device='cuda:0'),
 tensor([    1, 28705,    13, 13940, 28832,    13], device='cuda:0')]

In [6]:
from transformers import StoppingCriteria, StoppingCriteriaList

# define custom stopping criteria object
class StopOnTokens(StoppingCriteria):
    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
        for stop_ids in stop_token_ids:
            if torch.eq(input_ids[0][-len(stop_ids):], stop_ids).all():
                return True
        return False

stopping_criteria = StoppingCriteriaList([StopOnTokens()])

In [7]:
generate_text = transformers.pipeline(
    model=model,
    tokenizer=tokenizer,
    return_full_text=True,  # langchain expects the full text
    task='text-generation',
    # we pass model parameters here too
    stopping_criteria=stopping_criteria,  # without this model rambles during chat
    temperature=0.1,  # 'randomness' of outputs, 0.0 is the min and 1.0 the max
    max_new_tokens=512,  # max number of tokens to generate in the output
    repetition_penalty=1.1  # without this output begins repeating
)

2024-06-11 16:10:20.871484: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-06-11 16:10:20.871627: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-06-11 16:10:20.994400: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [8]:
#just to check if thsi is working well so far
res = generate_text("Provide me today's football news")
print(res[0]["generated_text"])




Provide me today's football news, kennisjager.info | Sitemap | Indeed, the.

The first is that the two-party system has become so polarized that it is difficult for a third party to gain traction. The second is that the electoral system itself is designed to make it difficult for third parties to succeed.

In the United States, the electoral system is set up in such a way that only two parties are likely to win the presidency. This is because of the way that electoral votes are allocated, with each state receiving a certain number of votes based on its population. In order to win the presidency, a candidate must win a majority of these electoral votes, which means they need to win a majority of the states.

This system favors the two major parties because they have established networks and infrastructure in place to win elections. Third parties often struggle to build this kind of support, especially in a country as large and diverse as the United States.

Additionally, the two-party s

In [9]:
!pip install langchain-community langchain-core


  pid, fd = os.forkpty()
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting langchain-community
  Downloading langchain_community-0.2.4-py3-none-any.whl.metadata (2.4 kB)
Downloading langchain_community-0.2.4-py3-none-any.whl (2.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.2/2.2 MB[0m [31m44.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: langchain-community
Successfully installed langchain-community-0.2.4


In [10]:
from langchain.llms import HuggingFacePipeline
 
llm = HuggingFacePipeline(pipeline=generate_text)

# checking again that everything is working fine
llm(prompt="Provide me today's football news")

  warn_deprecated(
  warn_deprecated(


"Provide me today's football news, kennisjager.info | Sitemap | Indeed, the.\n\nThe first is that the two-party system has become so polarized that it is difficult for a third party to gain traction. The second is that the electoral system itself is designed to make it difficult for third parties to succeed.\n\nIn the United States, the electoral system is set up in such a way that only two parties are likely to win the presidency. This is because of the way that electoral votes are allocated, with each state receiving a certain number of votes based on its population. In order to win the presidency, a candidate must win a majority of these electoral votes, which means they need to win a majority of the states.\n\nThis system favors the two major parties because they have established networks and infrastructure in place to win elections. Third parties often struggle to build this kind of support, especially in a country as large and diverse as the United States.\n\nAdditionally, the tw

In [11]:
rss_feeds = ['https://theathletic.com/team/liverpool/?rss=1',
'http://www.thisisanfield.com/feed/',
'http://www.theguardian.com/football/rss',
'https://theathletic.com/premier-league/?rss',
'https://theathletic.com/soccer/?rss',
'https://theathletic.com/champions-league/?rss',
'https://www.autosport.com/rss/feed/f1',
'https://the-race.com/category/formula-1/feed/',
'https://aeon.co/feed.rss',
'https://psyche.co/feed',
'http://www.nytimes.com/services/xml/rss/nyt/Opinion.xml',
'http://www.nytimes.com/services/xml/rss/nyt/Magazine.xml',
'http://www.nytimes.com/services/xml/rss/nyt/Science.xml',
'https://www.popsci.com/rss',
'http://www.smithsonianmag.com/rss/innovation/',
'http://www.smithsonianmag.com/rss/latest_articles/',
'http://www.nytimes.com/services/xml/rss/nyt/Travel.xml',
'http://www.nytimes.com/services/xml/rss/nyt/Style.xml',
'https://www.theverge.com/rss/reviews/index.xml',
'https://feeds.feedburner.com/dawn-news-world',
'http://feeds.feedburner.com/dawn-news',
'http://www.nytimes.com/services/xml/rss/nyt/Technology.xml',
'http://www.nytimes.com/services/xml/rss/nyt/Business.xml',
'http://www.nytimes.com/services/xml/rss/nyt/HomePage.xml']

In [12]:
from dateutil.parser import parse
import requests
from bs4 import BeautifulSoup

def fetch_feed(url, main_tag, link_tag, title_tag, image_tag, image_attr, category, website, date_tag):
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'lxml-xml')
    entries = []

    for entry in soup.find_all(main_tag):
        title = entry.find(title_tag).text
        link = entry.find(link_tag).text
        date = entry.find(date_tag).text
        published = normalize_datetime_to_django_format(date)

        entries.append([title, link, category, website, published])

    return entries

rss_feed_details = [
    ('https://theathletic.com/team/liverpool/?rss=1', 'entry', 'id', 'title', 'link', 'href', 'Liverpool FC','The Athletic','published'),
    ('http://www.thisisanfield.com/feed/', 'item', 'link', 'title', 'enclosure', 'url', 'Liverpool FC','This is Anfield', 'pubDate'),
    ('http://www.theguardian.com/football/rss', 'item', 'link', 'title', 'media:content', 'url', 'Football', 'The Guardian', 'pubDate'),
    ('https://theathletic.com/premier-league/?rss', 'entry', 'id', 'title', 'link', 'href', 'Football','The Athletic', 'published'),
    ('https://theathletic.com/soccer/?rss', 'entry', 'id', 'title', 'link', 'href', 'Football','The Athletic','published'),
    ('https://theathletic.com/champions-league/?rss', 'entry', 'id', 'title', 'link', 'href', 'Football','The Athletic', 'published'),
    ('https://www.autosport.com/rss/feed/f1', 'item', 'link', 'title', 'enclosure', 'url', 'Formula 1', 'Autosport', 'pubDate'),
    #('https://the-race.com/category/formula-1/feed/', 'item', 'link', 'title','media:content', 'url', 'Formula 1', 'The Race', 'pubDate'),
    ('https://aeon.co/feed.rss', 'item', 'link', 'title', None, None, 'Self Dev', "Aeon", 'pubDate'),
    ('https://psyche.co/feed', 'item', 'link', 'title', None,None, 'Self Dev', "Psyche", 'pubDate'),
    ('http://www.nytimes.com/services/xml/rss/nyt/Opinion.xml', 'item', 'link', 'title', 'media:content', 'url', 'Self Dev', "New York Times", 'pubDate'),
    ('http://www.nytimes.com/services/xml/rss/nyt/Magazine.xml', 'item', 'link', 'title', 'media:content', 'url', 'Self Dev', "New York Times", 'pubDate'),
    ('http://www.nytimes.com/services/xml/rss/nyt/Science.xml', 'item', 'link', 'title', 'media:content', 'url', 'Science & Technology', "New York Times", 'pubDate'),
    ('https://www.popsci.com/rss', 'item', 'link', 'title', 'image', 'url', 'Science & Technology', "Popular Science", 'pubDate'),
    ('http://www.smithsonianmag.com/rss/innovation/', 'item', 'link', 'title', 'enclosure', 'url', 'Science & Technology', "Smithsonian", 'pubDate'),
    ('http://www.smithsonianmag.com/rss/latest_articles/', 'item', 'link', 'title', 'enclosure', 'url', 'Science & Technology', "Smithsonian", 'pubDate'),
    ('http://www.nytimes.com/services/xml/rss/nyt/Travel.xml', 'item', 'link', 'title', 'media:content', 'url', 'Travel', "New York Times", 'pubDate'),
    ('http://www.nytimes.com/services/xml/rss/nyt/Style.xml', 'item', 'link', 'title', 'media:content', 'url', 'Self Dev', "New York Times", 'pubDate'),
    ('http://www.nytimes.com/services/xml/rss/nyt/Technology.xml', 'item', 'link', 'title', 'media:content', 'url', 'Science & Technology', "New York Times", 'pubDate'),
    ('http://www.nytimes.com/services/xml/rss/nyt/Business.xml', 'item', 'link', 'title', 'media:content', 'url', 'Global News', "New York Times", 'pubDate'),
     ('http://www.nytimes.com/services/xml/rss/nyt/HomePage.xml', 'item', 'link', 'title', 'media:content', 'url', 'Global News', "New York Times", 'pubDate'),
     ('http://feeds.feedburner.com/dawn-news', 'item', 'link', 'title',  'media:content', 'url', 'Pakistan', "Dawn", 'pubDate'),
     ('https://feeds.feedburner.com/dawn-news-world', 'item', 'link', 'title',  'media:content', 'url', 'Global News', "Dawn", 'pubDate'),
    ('https://www.theverge.com/rss/reviews/index.xml', 'entry', 'id', 'title', None, None, 'Science & Technology', 'The Verge', 'published'),
    ('https://www.nytimes.com/wirecutter/rss/', 'item', 'link', 'title', 'description', 'src', 'Science & Technology', "New York Times Wirecutter", 'pubDate')
]

def normalize_datetime_to_django_format(dt_str):
    dt = parse(dt_str)
    # Format the datetime object as a string in the desired format
    return dt.strftime('%Y-%m-%d %H:%M:%S%z')

def fetch_feed_with_details(feed_details):
    return fetch_feed(*feed_details)

items = []
for item in rss_feed_details:
  items.extend(fetch_feed_with_details(item))

i = 0
for item in items:
    i+=1
    if (i%10 == 0):
        print(item)


['Legendary player, coach & supporter – Sammy Lee’s Liverpool Life', 'https://www.thisisanfield.com/2024/06/legendary-player-coach-supporter-sammy-lees-liverpool-life/', 'Liverpool FC', 'This is Anfield', '2024-06-11 06:00:06+0000']
['The best way for England to approach Euro 2024? All-out attack | Karen Carney', 'https://www.theguardian.com/football/article/2024/jun/11/the-best-way-for-england-to-approach-euro-2024-all-out-attack', 'Football', 'The Guardian', '2024-06-11 12:00:02+0000']
['Women’s football holds immense potential as a lever for climate action | Amy James-Turner', 'https://www.theguardian.com/football/article/2024/jun/11/womens-football-climate-change-action', 'Football', 'The Guardian', '2024-06-11 07:00:37+0000']
['Euro 2024 team guides part 15: the Netherlands', 'https://www.theguardian.com/football/article/2024/jun/10/euro-2024-team-guides-part-16-netherlands', 'Football', 'The Guardian', '2024-06-09 23:01:27+0000']
["Khaldoon al-Mubarak: Manchester City chairman ca

In [13]:
import pandas as pd

def load_existing_summaries(file_path):
    """ Load the existing summaries from a CSV file into a DataFrame. """
    try:
        return pd.read_csv(file_path)
    except FileNotFoundError:
        return pd.DataFrame(columns=['source', 'title', 'description', 'category', 'website', 'published', 'language', 'page_content'])

def check_and_update_new_content(new_items, existing_df):
    """ Filter out new items that are already summarized in the existing DataFrame. """
    new_data = pd.DataFrame(new_items)
    combined_df = pd.concat([existing_df, new_data]).drop_duplicates(subset=['source'], keep='first')
    new_only_df = combined_df[~combined_df['source'].isin(existing_df['source'])]
    return new_only_df

# Usage example
existing_summaries_df = load_existing_summaries('/kaggle/working/structured_documents.csv')

# Assuming `new_items` is a list of dictionaries with new content to be checked and added
new_items = [
    {'source': 'http://example.com/new-article', 'title': 'New Article', 'description': 'Summary of the new article', 'category': 'News', 'website': 'Example', 'published': '2024-06-12', 'language': 'en-US', 'page_content': 'Complete article content here'}
]

new_content_df = check_and_update_new_content(new_items, existing_summaries_df)

# Here you would integrate your summarization and scraping logic for `new_content_df`

# Finally, append new summaries to the existing DataFrame and save it


In [14]:
from dateutil.parser import parse
import requests
from bs4 import BeautifulSoup
from transformers import pipeline
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, T5Tokenizer, T5ForConditionalGeneration
from transformers import T5Tokenizer, T5ForConditionalGeneration, DistilBertTokenizer, DistilBertForSequenceClassification
from transformers import BartTokenizer, BartForConditionalGeneration
import torch
 

tokenizer = BartTokenizer.from_pretrained('facebook/bart-large')
model = BartForConditionalGeneration.from_pretrained('facebook/bart-large')

model = model.to('cuda')


def normalize_datetime_to_django_format(dt_str):
    dt = parse(dt_str)
    return dt.strftime('%Y-%m-%d %H:%M:%S%z')

def fetch_feed(url, main_tag, link_tag, title_tag, image_tag, image_attr, category, website, date_tag):
    response = requests.get(url)
    if response.status_code != 200:
        print(f"Failed to fetch {url}: Status code {response.status_code}")
        return []
    soup = BeautifulSoup(response.text, 'xml')
    entries = []

    for entry in soup.find_all(main_tag):
        if not entry.find(title_tag) or not entry.find(date_tag):
            print(f"Missing required tags in {url}")
            continue
        title = entry.find(title_tag).text.strip()
        link = entry.find(link_tag)['href'] if entry.find(link_tag) and entry.find(link_tag).has_attr('href') else url
        date = entry.find(date_tag).text.strip()
        published = normalize_datetime_to_django_format(date)
        entries.append([title, link, category, website, published])

    return entries

def fetch_full_article(url):
    response = requests.get(url)
    if response.status_code != 200:
        print(f"Failed to fetch article at {url}: Status code {response.status_code}")
        return ""
    soup = BeautifulSoup(response.text, 'html.parser')
    article_text = ' '.join(p.text for p in soup.find_all('p'))
    if not article_text:
        print(f"No content found at {url}")
    return article_text

def summarize_text(article_text):
    if not article_text or type(article_text) != str:
        print("Received invalid input.")
        return "Invalid input or empty article"

    # Tokenize and prepare tensors
    inputs = tokenizer(article_text, return_tensors="pt", truncation=True, max_length=1024, padding="max_length")

    # Move tensors to GPU
    if torch.cuda.is_available():
        inputs = {k: v.to('cuda') for k, v in inputs.items()}

    # Generate summary using the model
    with torch.no_grad():
        summary_ids = model.generate(**inputs, num_beams=4, max_length=500, min_length=30, early_stopping=True)

    # Decode and print summary
    summary = tokenizer.decode(summary_ids[0].to('cpu'), skip_special_tokens=True)
    print(f"Summary: {summary}")
    return summary


# Process items for summarization
summarized_items = []
i = 0
for title, link, category, website, published in items:
    article_text = fetch_full_article(link)
    if article_text:
        i+=1
        if (i%20 == 0):
            print("Proceeding to summarize text...", i,"/",len(items))
        summary = summarize_text(article_text)
    else:
        summary = "No summary available"
    summarized_items.append([title, link, summary, category, website, published])




tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]



config.json:   0%|          | 0.00/1.63k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.02G [00:00<?, ?B/s]

Summary: This Is Anfield 2024: Ibrahima Konate and Trent Alexander-Arnold have both been given major boosts ahead of Euro 2024, while Luis Diaz has discussed his Liverpool future.   France arguably go into the Euros as favourites, not least because of the plethora of top-quality centre-backs at their disposal. Konate is one such figure, and it appears he has got the nod to start for Les Bleus ahead of their opener against Austria next Monday. According to La Parisien, the Liverpool defender will start ahead of Arsenal‘s William Saliba, partnering Bayern Munich’s Dayot Upamecano at the back. This is a big show of faith from Didier Deschamps, especially as Konate lost his place to Jarell Quansah at the end of the season.         What a season it has been for the young Ghanaian defender, who has made the step up from League One to the first team in just his second full season at Anfield.   It was a breakthrough season for QuansAH and our latest season review focuses on the 21-year-old, wi

In [15]:
class Document:
    def __init__(self, page_content, metadata):
        self.page_content = page_content
        self.metadata = metadata

# New list to hold the structured documents
structured_documents = []

for item in summarized_items:
    title, link, summary, category, website, published = item
    metadata = {
        'source': link,
        'title': title,
        'description': summary,
        'category': category,
        'website': website,
        'published': published,
        'language': 'en-US'  # Assuming the language is English
    }
    # Create a Document instance
    document = Document(page_content=summary, metadata=metadata)
    structured_documents.append(document)

In [16]:
import csv

def save_to_csv(documents, filename):
    # Open the file in write mode
    with open(filename, mode='w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        
        # Write the headers based on Document structure
        headers = ['page_content', 'source', 'title', 'description', 'category', 'website', 'published', 'language']
        writer.writerow(headers)
        
        # Write data
        for doc in documents:
            # Flatten the metadata dictionary and prepare the row
            row = [
                doc.page_content,
                doc.metadata['source'],
                doc.metadata['title'],
                doc.metadata['description'],
                doc.metadata['category'],
                doc.metadata['website'],
                doc.metadata['published'],
                doc.metadata['language']
            ]
            writer.writerow(row)

    print(f"Data successfully saved to {filename}")

save_to_csv(structured_documents, '/kaggle/working/structured_documents.csv')


Data successfully saved to /kaggle/working/structured_documents.csv


In [17]:
updated_df = pd.concat([existing_summaries_df, new_content_df])
updated_df.to_csv('/kaggle/working/structured_documents.csv', index=False)

In [18]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=20)
all_splits = text_splitter.split_documents(structured_documents)


In [19]:
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS

model_name = "sentence-transformers/all-mpnet-base-v2"
model_kwargs = {"device": "cuda"}

embeddings = HuggingFaceEmbeddings(model_name=model_name, model_kwargs=model_kwargs)

# storing embeddings in the vector store
vectorstore = FAISS.from_documents(all_splits, embeddings)

  warn_deprecated(


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [20]:
from langchain.chains import ConversationalRetrievalChain

chain = ConversationalRetrievalChain.from_llm(llm, vectorstore.as_retriever(), return_source_documents=True)


In [21]:
chat_history = []

query = "Provide me the latest news on F1. Do not include tags. Only use information present in the context that I have provided."
result = chain({"question": query, "chat_history": chat_history})

print(result['answer'])

  warn_deprecated(


Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.


Get quick access to your favorite articles Manage alerts on breaking news and favorite drivers Make your voice heard with article commenting.  │    The high-stakes design theory that Sauber hopes is key to 2024 F1 progress   ファイナル   α   β   π   γ   ო   サ   Sauber to add 'important' Imola upgrades to cure F1 pitstop woes   ففقف ف   F1 points system like “putting a plaster on a big cut” - Ocon                           Mooted F1 point system like "putting an  ice cube on a large cut" - O Con   م   SOURCE: F1.com  │ │ │  α α  β α  φ   │ α  α β  ρ  チ  α │  ポ  α π ნ  α Sauber C44, leaves the garage Photo by: Sam Bloxham / Motorsport Images "Qualifying confirms that we are lacking performance in the single lap," Sauber team representative Alessandro Alunni Bravi stated. "We have previously shown that we can deliver a better race 

In [22]:
print(result['source_documents'])


