# **Лабораторная работа №5 (Поиск по векторной БД)**

In [None]:
!pip install openai
!pip install evaluate
!pip install llama-cpp-python
!pip install pinecone-client
!pip install langchain==0.0.300
!pip install chromadb==0.4.12
!pip install sentence-transformers==2.2.2

In [None]:
from langchain.document_loaders import PDFMinerLoader, TextLoader, CSVLoader, UnstructuredWordDocumentLoader, UnstructuredHTMLLoader
from langchain.embeddings import HuggingFaceEmbeddings, OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from multiprocessing.pool import ThreadPool
from langchain.vectorstores import Chroma
from langchain.schema import Document
from chromadb.config import Settings
from sentence_transformers import SentenceTransformer
from typing import Any
from tqdm import tqdm
from nltk.tokenize import sent_tokenize

In [None]:
import pandas as pd
import numpy as np
import statistics
import pinecone
import glob
import os
import chromadb
import nltk

In [None]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
# Read the CSV as header-less data and name its three columns explicitly
df = pd.read_csv('/content/gdrive/MyDrive/ITMO/nlp/train.csv', header=None, names = ['topic','title', 'text'])
# Keep the row index as an explicit ID column; it is later passed to the collections as the document id
df['ID'] = df.index

In [None]:
df

Unnamed: 0,topic,title,text,ID
0,3,Wall St. Bears Claw Back Into the Black (Reuters),"Reuters - Short-sellers, Wall Street's dwindli...",0
1,3,Carlyle Looks Toward Commercial Aerospace (Reu...,Reuters - Private investment firm Carlyle Grou...,1
2,3,Oil and Economy Cloud Stocks' Outlook (Reuters),Reuters - Soaring crude prices plus worries\ab...,2
3,3,Iraq Halts Oil Exports from Main Southern Pipe...,Reuters - Authorities have halted oil export\f...,3
4,3,"Oil prices soar to all-time record, posing new...","AFP - Tearaway world oil prices, toppling reco...",4
...,...,...,...,...
119995,1,Pakistan's Musharraf Says Won't Quit as Army C...,KARACHI (Reuters) - Pakistani President Perve...,119995
119996,2,Renteria signing a top-shelf deal,Red Sox general manager Theo Epstein acknowled...,119996
119997,2,Saban not going to Dolphins yet,The Miami Dolphins will put their courtship of...,119997
119998,2,Today's NFL games,PITTSBURGH at NY GIANTS Time: 1:30 p.m. Line: ...,119998


In [None]:
nltk.download('punkt')

# функция разбивает текст на предложения
def cut_text_by_sent(text, fragment_len=200, overlay=100):
    """Split ``text`` into fragments of at most ``fragment_len`` characters.

    Whole sentences (as produced by ``sent_tokenize``) are packed greedily
    into a fragment, joined by a single space, as long as the joined string
    stays within ``fragment_len``. A single sentence that is longer than
    ``fragment_len`` is cut into fixed-size character windows instead.

    Args:
        text: source text.
        fragment_len: maximum length of one fragment, in characters.
        overlay: distance between the starts of consecutive windows when an
            over-long sentence is cut. With the defaults (200 / 100) two
            neighbouring windows share 100 characters. Values larger than
            ``fragment_len`` leave gaps between windows.

    Returns:
        List of fragment strings, in document order. Empty for empty text.

    Raises:
        ValueError: if ``fragment_len`` or ``overlay`` is not positive
            (a non-positive step would never advance through the text).
    """
    if fragment_len <= 0 or overlay <= 0:
        raise ValueError('fragment_len and overlay must be positive')

    sentences = sent_tokenize(text)
    fragments = []
    current_fragment = []
    current_len = 0
    for sent in sentences:
        # Count the space that ' '.join() inserts before every sentence but
        # the first, so the joined fragment really fits into fragment_len.
        added_len = len(sent) + (1 if current_fragment else 0)
        if current_len + added_len <= fragment_len:
            current_fragment.append(sent)
            current_len += added_len
        else:
            # The sentence does not fit: close the current fragment and
            # start a new one with this sentence.
            if current_fragment:
                fragments.append(' '.join(current_fragment))
            current_fragment = [sent]
            current_len = len(sent)
    # Flush the fragment that was still being filled when the loop ended;
    # without this the end of the text would be lost.
    if current_fragment:
        fragments.append(' '.join(current_fragment))

    final_fragments = []
    for fragment in fragments:
        if len(fragment) <= fragment_len:
            final_fragments.append(fragment)
            continue
        # Over-long single sentence: slide a window over it and always emit
        # the last (possibly shorter) window so the tail is kept.
        start = 0
        while True:
            final_fragments.append(fragment[start:start + fragment_len])
            if start + fragment_len >= len(fragment):
                break
            start += overlay

    return final_fragments

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [None]:
# Проверка
text = 'Необходимо записать ваш датасет в векторную базу данных и выполнить эксперименты по поиску схожих фрагментов текста, соответствующих запросу. Дополнительные баллы: провести эксперименты с разными системами векторизации и алгоритмами similarity. Сравнить средний порядковый номер требуемого фрагмента в отсортированном по релевантности спике результатов. Примеры классов, которые могут потребоваться для выполнения данного задания описаны в ноутбуке.'
text_fragments = cut_text_by_sent(text)

In [None]:
text_fragments

['Необходимо записать ваш датасет в векторную базу данных и выполнить эксперименты по поиску схожих фрагментов текста, соответствующих запросу.',
 'Дополнительные баллы: провести эксперименты с разными системами векторизации и алгоритмами similarity.',
 'Сравнить средний порядковый номер требуемого фрагмента в отсортированном по релевантности спике результатов.']

In [None]:
[len(f) for f in text_fragments]

[141, 102, 108]

# **paraphrase-multilingual-mpnet-base-v2**

In [None]:
class Loader:
  """Placeholder for document loading; neither method is implemented yet."""

  def load_single_document(self, file_path: str):
    """Stub for loading one document from ``file_path``; does nothing and returns None."""
    return None

  def load_documents(self, source_dir: str):
    """Stub for loading all documents from ``source_dir``; does nothing and returns None."""
    return None

class Embedder():
  """Sentence vectoriser backed by a SentenceTransformer model."""

  def __init__(self):
    # Multilingual paraphrase model, loaded by its Hugging Face model name.
    self.model = SentenceTransformer('sentence-transformers/paraphrase-multilingual-mpnet-base-v2')

  def get_embeddings(self, sentences):
    """Return one embedding (a list of Python floats) per input sentence.

    Args:
        sentences: a single string or any iterable of strings (list, tuple,
            pandas Series, generator, ...).

    Returns:
        A list of embeddings in input order; ``[]`` for empty input. A single
        string yields a list with exactly one embedding.
    """
    # Materialise the input as a plain list before handing it to the model:
    # a pandas Series would otherwise be indexed by label inside encode()
    # (which breaks for a non-default index), and a generator has no len().
    if isinstance(sentences, str):
      sentences = [sentences]
    else:
      sentences = list(sentences)
    if not sentences:
      return []
    # Convert every component to a built-in float so the result contains
    # only plain Python types.
    return [[float(value) for value in vector] for vector in self.model.encode(sentences)]

class ChromaDB():
  """Thin wrapper around a persistent ChromaDB client."""

  def __init__(self, path="/content/gdrive/MyDrive/nlp"):
    """Open (or create) the on-disk database stored under ``path``.

    Args:
        path: directory used by ``chromadb.PersistentClient``. Defaults to
            the Google Drive folder this notebook was written for.
    """
    # chromadb.Client() would give a client without on-disk persistence;
    # the persistent client is used so collections are kept under `path`.
    self.client = chromadb.PersistentClient(path=path)

  def clear(self, name):
    """Delete the collection called ``name`` and return the remaining collections."""
    self.client.delete_collection(name=name)
    return self.client.list_collections()

  def get_collection(self, name):
    """Return the existing collection called ``name``."""
    return self.client.get_collection(name=name)

  def get_collections(self):
    """Return the list of collections known to the client."""
    return self.client.list_collections()

class ChromaCollection():
  """A named Chroma collection created with a fixed distance space."""

  def __init__(self, collection_name, similarity, client):
    """Get or create ``collection_name`` on ``client`` with ``similarity`` as its "hnsw:space"."""
    self.client = client
    self.collection_name = collection_name
    self.similarity = similarity
    space_config = {"hnsw:space": similarity}
    self.collection = client.get_or_create_collection(name=collection_name, metadata=space_config)

  def add(self, embeddings, texts, topics, ids):
    """Store documents with their embeddings; each topic is saved as string metadata.

    Every id is stored as the string 'id <value>'.
    """
    metadata_rows = []
    for topic in topics:
      metadata_rows.append({"source": "df", "topic": f"{topic}"})
    record_ids = [f'id {i}' for i in ids]
    self.collection.add(
        embeddings=embeddings,
        documents=texts,
        metadatas=metadata_rows,
        ids=record_ids,
    )

  def query(self, embeddings, n_results):
    """Return the ``n_results`` nearest stored documents for every query embedding."""
    hits = self.collection.query(query_embeddings=embeddings, n_results=n_results)
    return hits

  def get(self):
    """Return everything stored in the collection."""
    stored = self.collection.get()
    return stored

  def count(self):
    """Return the number of documents in the collection."""
    total = self.collection.count()
    return total

In [None]:
# создается объект класса Embedder и присваивается переменной embedder
embedder = Embedder()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


.gitattributes:   0%|          | 0.00/690 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/4.10k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/723 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.08M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/402 [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

In [None]:
embeds = embedder.get_embeddings(df['text'][:30000])
#embeds

In [None]:
# объявление объекта класса ChromaDB, который может быть использован для выполнения операций в векторной БД
client = ChromaDB()
client.get_collections()

[Collection(name=l2_sim), Collection(name=cos_sim), Collection(name=Ip_sim)]

* **Косинусное сходство (Cosine Similarity)**: Этот алгоритм измеряет косинус угла между двумя векторами, представляющими текстовые фрагменты. Более высокое значение косинусного сходства указывает на более близкое сходство между фрагментами.
* **Евклидово расстояние (Euclidean Distance)**: Этот алгоритм измеряет расстояние между двумя точками в n-мерном пространстве. Для текстовых фрагментов, которые представлены как точки в пространстве, меньшее значение евклидова расстояния указывает на более близкое сходство.
* **Скалярное произведение (Inner Product, `ip`)**: Этот алгоритм сравнивает векторы по их скалярному произведению $\sum_i a_i b_i$: чем оно больше, тем ближе фрагменты. В Chroma пространство `ip` возвращает расстояние $d = 1 - \sum_i a_i b_i$, поэтому меньшее значение указывает на большее сходство; для нормированных векторов результат совпадает с косинусным расстоянием.

In [None]:
# 'cos_sim', 'l2_sim' and 'Ip_sim' are the names of the collections that are fetched or created in ChromaDB
# 'cosine', 'l2' and 'ip' are the similarity types passed as "hnsw:space" for each collection:
#   cosine - cosine distance
#   l2     - Euclidean distance
#   ip     - inner (dot) product
# client.client is the underlying ChromaDB client object used to access the database
collection_cos = ChromaCollection('cos_sim', 'cosine', client.client)
collection_l2 = ChromaCollection('l2_sim', 'l2', client.client)
collection_Ip = ChromaCollection('Ip_sim', 'ip', client.client)

In [None]:
# Insert the same first 30000 rows into each of the three collections:
#   embeds  - embedding vectors of the documents
#   text    - document texts
#   topic   - document topics (stored as metadata)
#   ID      - document identifiers
_doc_texts = df['text'].values.tolist()[0:30000]
_doc_topics = df['topic'].values.tolist()[0:30000]
_doc_ids = df['ID'].values.tolist()[0:30000]
for _collection in (collection_cos, collection_l2, collection_Ip):
  _collection.add(embeds[0:30000], _doc_texts, _doc_topics, _doc_ids)

In [None]:
# получение всех документов, хранящихся в коллекции ChromaDB
# collection_cos.get()

In [None]:
print(collection_cos.count())
print(collection_l2.count())
print(collection_Ip.count())

30000
30000
30000


In [None]:
# Each question is written from one dataset row; the source text is quoted in the comment above the question.
# NOTE(review): the rank cells below look up 'id {i}' for the i-th question, i.e. they assume that
# question i was written from dataframe row i. Only rows 0-4 are visible in the displayed dataframe
# (and match the first five questions) - confirm the row index of every other question.
questions = [
    # Short-sellers, Wall Street's dwindling\band of ultra-cynics, are seeing green again.
    'What are short-sellers seeing again?',
    # Private investment firm Carlyle Group,\which has a reputation for making well-timed and occasionally\controversial plays in the defense industry, has quietly placed\its bets on another part of the market.
    'What is Carlyle Group known for?',
    # Soaring crude prices plus worries\about the economy and the outlook for earnings are expected to\hang over the stock market next week during the depth of the\summer doldrums.
    'What factors are expected to hang over the stock market next week?',
    # Authorities have halted oil export\flows from the main pipeline in southern Iraq after\intelligence showed a rebel militia could strike\infrastructure, an oil official said on Saturday.
    'Why have authorities halted oil export flows from the main pipeline in southern Iraq?',
    # Tearaway world oil prices, toppling records and straining wallets, present a new economic menace barely three months before the US presidential elections.
    'What economic menace do world oil prices present?',

    # Was absenteeism a little high\on Tuesday among the guys at the office? EA Sports would like\to think it was because "Madden NFL 2005" came out that day,\and some fans of the football simulation are rabid enough to\take a sick day to play it.
    'Why was the absenteeism high on Tuesday among the guys at the office?',
    # A group of technology companies\including Texas Instruments Inc. (TXN.N), STMicroelectronics\(STM.PA) and Broadcom Corp. (BRCM.O), on Thursday said they\will propose a new wireless networking standard up to 10 times\the speed of the current generation.
    'Which technology companies are proposing a new wireless networking standard with speeds up to 10 times faster than the current generation?',
    # America Online on Thursday said it\plans to sell a low-priced PC targeting low-income and minority\households who agree to sign up for a year of dialup Internet\service.
    'What is the plan of America Online to target low-income and minority households with a low-priced PC and a year of dial-up Internet service?',
    # A group of consumer electronics\makers said on Wednesday they approved the format for a new\generation of discs that can store five times the data of DVDs\at the same cost -- enough to put a full season of "The\Sopranos" on one disc.
    'Which consumer electronics makers have approved the format for new discs capable of storing five times more data than DVDs?',
    # The mystery of what went wrong for the\software industry in late June when sales stalled at more than\20 brand-name companies is not even close to being solved\although the third quarter is nearly halfway over.
    'What is the current status of solving the mystery behind the software industry sales slump in late June, despite being halfway through the third quarter?',

    # Michael Phelps took care of qualifying for the Olympic 200-meter freestyle semifinals Sunday, and then found out he had been added to the American team for the evening's 400 freestyle relay final. Phelps' rivals Ian Thorpe and Pieter van den Hoogenband and teammate Klete Keller were faster than the teenager in the 200 free preliminaries.
    'Who did take care of qualifying for the Olympic 200-meter freestyle semifinals Sunday?',
    # Wily Mo Pena homered twice and drove in four runs, helping the Cincinnati Reds beat the San Diego Padres 11-5 on Saturday night. San Diego was knocked out of a share of the NL wild-card lead with the loss and Chicago's victory over Los Angeles earlier in the day.
    'How did Wily Mo Pena contribute to the Cincinnati Reds victory over the San Diego Padres?',
    # National Basketball Association players trying to win a fourth consecutive Olympic gold medal for the United States have gotten the wake-up call that the "Dream Team" days are done even if supporters have not.
    'What realization have National Basketball Association players had about the chances of winning a fourth consecutive Olympic gold medal?',
    # The Cleveland Indians pulled within one game of the AL Central lead, scoring four runs in the first inning and beating the Minnesota Twins 7-1 Saturday night behind home runs by Travis Hafner and Victor Martinez.
    'How did the Cleveland Indians narrow the gap in the AL Central standings with their win over the Minnesota Twins?',
    # Kevin Hartman made seven saves for Los Angeles, and Jon Busch had two saves for Columbus as the Galaxy and Crew played to a 0-0 tie Saturday night.
    'How did the goalkeepers Kevin Hartman and Jon Busch influence the outcome of the match between the Los Angeles Galaxy and Columbus Crew?',

    # Venezuelans turned out early\and in large numbers on Sunday to vote in a historic referendum\that will either remove left-wing President Hugo Chavez from\office or give him a new mandate to govern for the next two\years.
    'What is the purpose of the historic referendum in Venezuela that drew a large voter turnout?',
    # South Korean police used water cannon in\central Seoul Sunday to disperse at least 7,000 protesters\urging the government to reverse a controversial decision to\send more troops to Iraq.
    'Why did South Korean police use water cannon to disperse thousands of protesters in central Seoul?',
    # Thousands of Palestinian\prisoners in Israeli jails began a hunger strike for better\conditions Sunday, but Israel's security minister said he\didn't care if they starved to death.
    'What initiated the hunger strike by thousands of Palestinian prisoners in Israeli jails, despite concerns over their well-being?',
    # Sporadic gunfire and shelling took place overnight in the disputed Georgian region of South Ossetia in violation of a fragile ceasefire, wounding seven Georgian servicemen.
    'What happened in the disputed Georgian region of South Ossetia overnight, violating the fragile ceasefire and causing injuries to Georgian servicemen?',
    # Dozens of Rwandan soldiers flew into Sudan's troubled Darfur region Sunday, the first foreign armed force deployed in the area since Arab militiamen began a rampage against black African farmers, killing thousands.
    'What is the significance of the deployment of Rwandan soldiers to Sudan troubled Darfur region amidst the ongoing violence between Arab militiamen and black African farmers?'
    ]

In [None]:
# генерация эмбедингов для списка вопросов
q_embeds = embedder.get_embeddings(questions)

In [None]:
# ожидаемое максимальное кол-во результатов поиска до 1000
results_cos = collection_cos.query(q_embeds,1000)
# вывод первых 10-ти результатов
results_cos['documents'][0][:10]

[" NEW YORK (Reuters) - Short-sellers, Wall Street's dwindling  band of ultra-cynics, are seeing green again.",
 "Reuters - Short-sellers, Wall Street's dwindling\\band of ultra-cynics, are seeing green again.",
 'Some shoddy reports trickling in Thursday night threw a wrench into an otherwise uneventful second-quarter earnings season for retailers. At least one analyst thinks its a harbinger, as consumers get cautious about spending money on anything that isn #39;t ...',
 'Many sellers and a few longtime analysts of eBay say the company is generating fewer of the bidding frenzies that once translated into profits.',
 'The company saw an improvement in its trades, but will this market be kind to the brokerages?',
 'The retail sector overall may be reporting a sluggish start to the season, but holiday shoppers are scooping up tech goods at a brisk pace -- and they\'re scouring the Web for bargains more than ever. &lt;FONT face="verdana,MS Sans Serif,arial,helvetica" size="-2" color="#66

In [None]:
# Position (0-based) of the expected document in the result list of every question.
# NOTE(review): question i is assumed to originate from the document stored as 'id {i}' - confirm.
ind_cos = []
for i, res in enumerate(results_cos['ids']):
  try:
    ind_cos.append(res.index(f'id {i}'))
  except ValueError:
    # list.index raises ValueError when the id is not among the returned hits;
    # such a miss is counted as 1000 (the n_results requested in the query).
    # Any other exception is a real error and must not be swallowed.
    ind_cos.append(1000)

print(sum(ind_cos) / len(ind_cos))

748.6


In [None]:
results_l2 = collection_l2.query(q_embeds,1000)
results_l2['documents'][0][:10]

[" NEW YORK (Reuters) - Short-sellers, Wall Street's dwindling  band of ultra-cynics, are seeing green again.",
 "Reuters - Short-sellers, Wall Street's dwindling\\band of ultra-cynics, are seeing green again.",
 'Some shoddy reports trickling in Thursday night threw a wrench into an otherwise uneventful second-quarter earnings season for retailers. At least one analyst thinks its a harbinger, as consumers get cautious about spending money on anything that isn #39;t ...',
 'The retail sector overall may be reporting a sluggish start to the season, but holiday shoppers are scooping up tech goods at a brisk pace -- and they\'re scouring the Web for bargains more than ever. &lt;FONT face="verdana,MS Sans Serif,arial,helvetica" size="-2" color="#666666"&gt;&lt;B&gt;-washingtonpost.com&lt;/B&gt;&lt;/FONT&gt;',
 'Many sellers and a few longtime analysts of eBay say the company is generating fewer of the bidding frenzies that once translated into profits.',
 "By ANNE D'INNOCENZIO    NEW YORK 

In [None]:
# Position (0-based) of the expected document in the result list of every question.
# NOTE(review): question i is assumed to originate from the document stored as 'id {i}' - confirm.
ind_l2 = []
for i, res in enumerate(results_l2['ids']):
  try:
    ind_l2.append(res.index(f'id {i}'))
  except ValueError:
    # list.index raises ValueError when the id is not among the returned hits;
    # such a miss is counted as 1000 (the n_results requested in the query).
    # Any other exception is a real error and must not be swallowed.
    ind_l2.append(1000)
sum(ind_l2) / len(ind_l2)

733.6

In [None]:
results_ip = collection_Ip.query(q_embeds,1000)
results_ip['documents'][0][:10]

[" NEW YORK (Reuters) - Short-sellers, Wall Street's dwindling  band of ultra-cynics, are seeing green again.",
 "Reuters - Short-sellers, Wall Street's dwindling\\band of ultra-cynics, are seeing green again.",
 'Sales were lower than expected, and the quality of earnings is in question.',
 'Something odd happened on the way to global recovery. The shoppers are not turning out the way they normally do when things start to look up.',
 'The company saw an improvement in its trades, but will this market be kind to the brokerages?',
 'Many sellers and a few longtime analysts of eBay say the company is generating fewer of the bidding frenzies that once translated into profits.',
 'When a company with a small float comes unhinged from reality, look out.',
 'The California department store operator might look interesting to acquirers.',
 'Economy: With consumer confidence down and spending up, shoppers wrestle with  #39;stop-and-go #39; mixed signals. By Andrea K. Walker.',
 'Irrational mark

In [None]:
# Position (0-based) of the expected document in the result list of every question.
# NOTE(review): question i is assumed to originate from the document stored as 'id {i}' - confirm.
ind_ip = []
for i, res in enumerate(results_ip['ids']):
  try:
    ind_ip.append(res.index(f'id {i}'))
  except ValueError:
    # list.index raises ValueError when the id is not among the returned hits;
    # such a miss is counted as 1000 (the n_results requested in the query).
    # Any other exception is a real error and must not be swallowed.
    ind_ip.append(1000)

sum(ind_ip) / len(ind_ip)

738.15

# **Лабораторная работа №6 (Question Answering)**

In [None]:
# Questions for the QA experiment: 20 written from dataset rows followed by
# 10 general-knowledge questions. `answers` holds the reference answer for
# the question at the same position, so both lists must have equal length.
questions = [
    'What are short-sellers seeing again?',
    'What is Carlyle Group known for?',
    'What factors are expected to hang over the stock market next week?',
    'Why have authorities halted oil export flows from the main pipeline in southern Iraq?',
    'What economic menace do world oil prices present?',

    'Why was the absenteeism high on Tuesday among the guys at the office?',
    'Which technology companies are proposing a new wireless networking standard with speeds up to 10 times faster than the current generation?',
    'What is the plan of America Online to target low-income and minority households with a low-priced PC and a year of dial-up Internet service?',
    'Which consumer electronics makers have approved the format for new discs capable of storing five times more data than DVDs?',
    'What is the current status of solving the mystery behind the software industry sales slump in late June, despite being halfway through the third quarter?',

    'Who did take care of qualifying for the Olympic 200-meter freestyle semifinals Sunday?',
    'How did Wily Mo Pena contribute to the Cincinnati Reds victory over the San Diego Padres?',
    'What realization have National Basketball Association players had about the chances of winning a fourth consecutive Olympic gold medal?',
    'How did the Cleveland Indians narrow the gap in the AL Central standings with their win over the Minnesota Twins?',
    'How did the goalkeepers Kevin Hartman and Jon Busch influence the outcome of the match between the Los Angeles Galaxy and Columbus Crew?',

    'What is the purpose of the historic referendum in Venezuela that drew a large voter turnout?',
    'Why did South Korean police use water cannon to disperse thousands of protesters in central Seoul?',
    'What initiated the hunger strike by thousands of Palestinian prisoners in Israeli jails, despite concerns over their well-being?',
    'What happened in the disputed Georgian region of South Ossetia overnight, violating the fragile ceasefire and causing injuries to Georgian servicemen?',
    # The trailing comma matters: without it Python concatenates this string
    # with the next one and every later question/answer pair is shifted.
    'What is the significance of the deployment of Rwandan soldiers to Sudan troubled Darfur region amidst the ongoing violence between Arab militiamen and black African farmers?',

    'Who won the FIFA World Cup in 2018?',
    'Who is the 46th President of the United States?',
    'Which country hosted the 2020 Summer Olympics?',
    'What is the most popular social media platform worldwide?',
    'Which sport requires the use of a shuttlecock?',
    'Who is the CEO of Tesla Inc.?',
    'What is the currency of Japan?',
    'What is the fastest land animal?',
    'What is the capital of Australia?',
    'Who wrote the play "Romeo and Juliet"?'
]

answers = ['Green',
           'Making well-timed and occasionally controversial plays in the defense industry',
           'Soaring crude prices, worries about the economy, and the outlook for earnings',
           'Due to intelligence showing a potential strike on infrastructure by a rebel militia',
           'They present a new economic threat/menace before the US presidential elections',

           'Because of the release of "Madden NFL 2005" football simulation game',
           'Texas Instruments Inc',
           'To sell a low-priced PC to low-income and minority households who sign up for a year of dial-up Internet service',
           'A group of consumer electronics makers',
           'The mystery is still far from being solved',

           'Michael Phelps',
           'He homered twice and drove in four runs',
           'The "Dream Team" days are done',
           'They scored four runs in the first inning and got home runs by Travis Hafner and Victor Martinez',
           'They made saves for their respective teams, resulting in a 0-0 tie',

           'To remove left-wing President Hugo Chavez from office or give him a new mandate to govern for the next two years',
           'The protesters were urging the government to reverse a controversial decision to send more troops to Iraq',
           'The prisoners initiated the hunger strike to demand better conditions',
           'Sporadic gunfire and shelling took place, resulting in injuries to seven Georgian servicemen',
           # Per the source text the soldiers are the first foreign armed force
           # in the area; the farmers were the victims, not the attackers.
           'They are the first foreign armed force deployed in the area',

           'France',
           'Joe Biden',
           'Japan (Tokyo)',
           'Facebook',
           'Badminton',
           'Elon Musk',
           'Japanese yen',
           'Cheetah',
           'Canberra',
           'William Shakespeare']

In [None]:
# Initialise the question-answering pipeline with the deepset/roberta-base-squad2 model;
# the same model name is used for both the model and its tokenizer
from transformers import AutoModelForQuestionAnswering, AutoTokenizer, pipeline
model_name = "deepset/roberta-base-squad2"
nlp = pipeline('question-answering', model=model_name, tokenizer=model_name)

In [None]:
# embedding vectors for the given questions
q_embeds = embedder.get_embeddings(questions)
# retrieve the 5 closest documents for every question
results = collection_cos.query(q_embeds,5)
# show all 5 retrieved documents for the first question
results['documents'][0]

[" NEW YORK (Reuters) - Short-sellers, Wall Street's dwindling  band of ultra-cynics, are seeing green again.",
 "Reuters - Short-sellers, Wall Street's dwindling\\band of ultra-cynics, are seeing green again.",
 'Some shoddy reports trickling in Thursday night threw a wrench into an otherwise uneventful second-quarter earnings season for retailers. At least one analyst thinks its a harbinger, as consumers get cautious about spending money on anything that isn #39;t ...',
 'Many sellers and a few longtime analysts of eBay say the company is generating fewer of the bidding frenzies that once translated into profits.',
 'The company saw an improvement in its trades, but will this market be kind to the brokerages?']

In [None]:
!pip install evaluate
!pip install bert_score
from evaluate import load
# предобученная модель для оценки качества
bertscore = load("bertscore")

In [None]:
import warnings

# zip() silently stops at the shorter list, so a length mismatch would pair
# questions with the wrong reference answers without raising any error.
if len(questions) != len(answers):
  warnings.warn(f'questions ({len(questions)}) and answers ({len(answers)}) differ in length; question/answer pairs may be misaligned')

# BERTScore result of every question, in question order
bs_all = []
# q - question, a - reference answer, index - position of the question
for index, (q, a) in enumerate(zip(questions, answers)):
  # the context is the concatenation of the documents retrieved for this question
  QA_input = {'question': q,
             'context': ' '.join(results['documents'][index])}
  res = nlp(QA_input)
  # BERTScore between the predicted answer and the reference answer
  bs = bertscore.compute(predictions=[res['answer']], references=[a], lang="en")
  bs_all.append(bs)

  # per-question report
  print(f'Question: {q}\nAnswer: {res["answer"]}\nUser answer: {a}\nScore: {bs["f1"][0]}\n ')

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Question: What are short-sellers seeing again?
Answer: green
User answer: Green
Score: 0.9647288918495178
 
Question: What is Carlyle Group known for?
Answer: making well-timed and occasionally\controversial plays in the defense industry
User answer: Making well-timed and occasionally controversial plays in the defense industry
Score: 0.9816475510597229
 
Question: What factors are expected to hang over the stock market next week?
Answer: Soaring crude prices plus worries  about the economy and the outlook for earnings
User answer: Soaring crude prices, worries about the economy, and the outlook for earnings
Score: 0.9631632566452026
 
Question: Why have authorities halted oil export flows from the main pipeline in southern Iraq?
Answer: after\intelligence showed a rebel militia could strike\infrastructure
User answer: Due to intelligence showing a potential strike on infrastructure by a rebel militia
Score: 0.9038518071174622
 
Question: What economic menace do world oil prices presen



Question: How did the goalkeepers Kevin Hartman and Jon Busch influence the outcome of the match between the Los Angeles Galaxy and Columbus Crew?
Answer: 
User answer: They made saves for their respective teams, resulting in a 0-0 tie
Score: 0.0
 
Question: What is the purpose of the historic referendum in Venezuela that drew a large voter turnout?
Answer: will keep him in power
User answer: To remove left-wing President Hugo Chavez from office or give him a new mandate to govern for the next two years
Score: 0.8706334233283997
 
Question: Why did South Korean police use water cannon to disperse thousands of protesters in central Seoul?
Answer: the government to reverse a controversial decision to\send more troops to Iraq
User answer: The protesters were urging the government to reverse a controversial decision to send more troops to Iraq
Score: 0.9283509850502014
 
Question: What initiated the hunger strike by thousands of Palestinian prisoners in Israeli jails, despite concerns over

In [None]:
# mean BERTScore F1 over all questions (the first F1 value of every stored result)
f1_scores = [bs['f1'][0] for bs in bs_all]
sum(f1_scores)/len(f1_scores)

0.8582716234799089

# **Gradio**

In [None]:
!pip install gradio==3.48.0
import gradio as gr

In [None]:
# функция принимает вопрос, ищет подходящий контент и генерирует ответ на вопрос
def echo(question, history):
    """Answer ``question`` from the 5 closest documents of the cosine collection.

    The question is embedded, the 5 nearest documents are retrieved and joined
    into one context string, and the QA pipeline's answer text is returned.
    ``history`` is not used; presumably it is supplied by the chat interface.
    """
    query_vectors = embedder.get_embeddings([question])
    hits = collection_cos.query(query_vectors, 5)
    context = ' '.join(hits['documents'][0])
    prediction = nlp({'question': question, 'context': context})
    return prediction['answer']

In [None]:
# graphical chat interface that passes every message to echo()
demo = gr.ChatInterface(fn=echo, examples=["hello", "hola", "merhaba"], title="QA Bot")
demo.launch()

Setting queue=True in a Colab notebook requires sharing enabled. Setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
Running on public URL: https://0e0c6121748b0eb742.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)


