In [86]:
import pandas as pd
from transformers import AutoTokenizer, TFAutoModelForQuestionAnswering, pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [66]:
# Task 1: QA model performance analysis

# NOTE: this is a machine-specific absolute path; point DATA_PATH at your
# local copy of the CSV before running the notebook elsewhere.
DATA_PATH = '/Users/ilia/ATAT/Sem9/Без названия 2.csv'
TEXT_COLUMN = 'Текст'
QUESTION_COLUMNS = ['Вопрос1', 'Вопрос2', 'Вопрос3']

# Load the semicolon-separated dataset: one text per row with three questions each
data = pd.read_csv(DATA_PATH, sep=';')

# Fail early with a readable message if the file does not have the expected layout
missing_columns = [col for col in [TEXT_COLUMN, *QUESTION_COLUMNS] if col not in data.columns]
if missing_columns:
    raise KeyError(f"Columns missing from {DATA_PATH!r}: {missing_columns}")

# Some question cells carry stray whitespace/newlines (e.g. "...completed?\n"),
# which leaks blank lines into the printed results; normalise them once here.
# Non-string cells (e.g. NaN) are left untouched.
for question_column in QUESTION_COLUMNS:
    data[question_column] = data[question_column].map(
        lambda cell: cell.strip() if isinstance(cell, str) else cell
    )

print(data)
contexts = data[TEXT_COLUMN].tolist()             # texts the questions are asked about
questions_1 = data[QUESTION_COLUMNS[0]].tolist()  # first question for each text
questions_2 = data[QUESTION_COLUMNS[1]].tolist()  # second question for each text
questions_3 = data[QUESTION_COLUMNS[2]].tolist()  # third question for each text

                                               Текст  \
0  The sun is the star at the center of our solar...   
1  Elephants are the largest land animals. They h...   
2  The Eiffel Tower was completed in 1889 as the ...   
3  Water boils at 100°C at sea level. However, at...   
4  The Amazon Rainforest is home to millions of s...   
5  The Great Wall of China stretches over 13,000 ...   
6  William Shakespeare was an English playwright ...   
7  The Pacific Ocean is the largest and deepest o...   
8  The human brain consists of billions of neuron...   
9  The Mona Lisa is a famous painting by Leonardo...   

                                             Вопрос1  \
0                                   What is the sun?   
1                   What is the largest land animal?   
2             When was the Eiffel Tower completed?\n   
3  At what temperature does water boil at sea lev...   
4            Where is the Amazon Rainforest located?   
5             How long is the Great Wall of Chi

In [70]:
# Load the MiniLM checkpoint and its tokenizer, then wrap both in a
# question-answering pipeline. `from_pt=True` asks the TF model class to
# load the checkpoint's PyTorch weights.
model_id_minilm = 'deepset/minilm-uncased-squad2'
tokenizer_minilm = AutoTokenizer.from_pretrained(model_id_minilm)
model_minilm = TFAutoModelForQuestionAnswering.from_pretrained(
    model_id_minilm,
    from_pt=True,
)
qa_pipeline_minilm = pipeline(
    'question-answering',
    model=model_minilm,
    tokenizer=tokenizer_minilm,
)

# Run question-answering inference over the dataset (no training happens here)
def get_answers(contexts, questions, qa_pipeline=None):
    """Print the answer found for every question asked about every context.

    Args:
        contexts: Sequence of context strings.
        questions: Sequence of question lists; ``questions[j][i]`` is the
            j-th question asked about ``contexts[i]``.
        qa_pipeline: Callable accepting ``question``, ``context`` and
            ``handle_impossible_answer`` keyword arguments and returning a
            mapping with ``answer``, ``score``, ``start`` and ``end`` keys.
            Defaults to the module-level ``qa_pipeline_minilm``.

    Returns:
        None. Results are written to stdout.
    """
    if qa_pipeline is None:
        qa_pipeline = qa_pipeline_minilm

    for i, context in enumerate(contexts):
        print(f"Текст {i + 1}: {context[:100]}...")  # show only the start of the text
        for j, question_list in enumerate(questions):
            # Use the questions passed in rather than notebook globals
            question = question_list[i]
            if isinstance(question, str):
                # Trailing newlines in the data would otherwise break the printed layout
                question = question.strip()
            output = qa_pipeline(question=question, context=context, handle_impossible_answer=True)
            # An answer span with start == end is treated as "no answer"
            if output['start'] != output['end']:
                print(f"  Вопрос {j + 1}: {question}")
                print(f"  Ответ: {output['answer']} (Оценка: {output['score']:.1%})")
            else:
                print(f"  Вопрос {j + 1}: {question} -> Нет ответа")
        print("-" * 50)

# Run the model over every context, passing the three per-context question lists
get_answers(contexts, [questions_1, questions_2, questions_3])

All PyTorch model weights were used when initializing TFBertForQuestionAnswering.

All the weights of TFBertForQuestionAnswering were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertForQuestionAnswering for predictions without further training.
Device set to use 0


Текст 1: The sun is the star at the center of our solar system. It provides the energy necessary for life on ...
  Вопрос 1: What is the sun?
  Ответ: the star at the center of our solar system (Оценка: 54.5%)
  Вопрос 2: How does the sun provide energy?

  Ответ: nuclear fusion (Оценка: 60.8%)
  Вопрос 3: What process occurs in the sun?
  Ответ: nuclear fusion (Оценка: 96.4%)
--------------------------------------------------
Текст 2: Elephants are the largest land animals. They have strong social bonds and communicate using infrasou...
  Вопрос 1: What is the largest land animal?
  Ответ: Elephants (Оценка: 99.6%)
  Вопрос 2: How do elephants communicate?

  Ответ: infrasound (Оценка: 64.0%)
  Вопрос 3: What is infrasound?
  Ответ: inaudible to humans (Оценка: 81.0%)
--------------------------------------------------
Текст 3: The Eiffel Tower was completed in 1889 as the entrance arch for the World's Fair in Paris. It remain...
  Вопрос 1: When was the Eiffel Tower completed?

  Отве

In [90]:
# Task 2: testing other models
# Load the BERT-large SQuAD checkpoint and its tokenizer, then wrap both in a
# question-answering pipeline. `from_pt=True` asks the TF model class to
# load the checkpoint's PyTorch weights.
model_id_bert = 'bert-large-uncased-whole-word-masking-finetuned-squad'
tokenizer_bert = AutoTokenizer.from_pretrained(model_id_bert)
model_bert = TFAutoModelForQuestionAnswering.from_pretrained(
    model_id_bert,
    from_pt=True,
)
qa_pipeline_bert = pipeline(
    'question-answering',
    model=model_bert,
    tokenizer=tokenizer_bert,
)

# Функция для тестирования модели BERT
def test_bert(contexts, questions):
    for i, context in enumerate(contexts):
        print(f"Текст {i + 1}: {context[:100]}...")  # Печатаем начало текста
        for j, question in enumerate([questions_1[i], questions_2[i], questions_3[i]]):
            output = qa_pipeline_bert(question=question, context=context, handle_impossible_answer=True)
            if output['start'] != output['end']:
                print(f"  Вопрос {j + 1}: {question}")
                print(f"  Ответ: {output['answer']} (Оценка: {output['score']:.1%})")
            else:
                print(f"  Вопрос {j + 1}: {question} -> Нет ответа")
        print("-" * 50)

# Run the BERT evaluation, passing the three per-context question lists
test_bert(contexts, [questions_1, questions_2, questions_3])

All PyTorch model weights were used when initializing TFBertForQuestionAnswering.

All the weights of TFBertForQuestionAnswering were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertForQuestionAnswering for predictions without further training.
Device set to use 0


Текст 1: The sun is the star at the center of our solar system. It provides the energy necessary for life on ...
  Вопрос 1: What is the sun?
  Ответ: the star at the center of our solar system (Оценка: 63.2%)
  Вопрос 2: How does the sun provide energy?

  Ответ: nuclear fusion (Оценка: 71.8%)
  Вопрос 3: What process occurs in the sun?
  Ответ: nuclear fusion (Оценка: 90.9%)
--------------------------------------------------
Текст 2: Elephants are the largest land animals. They have strong social bonds and communicate using infrasou...
  Вопрос 1: What is the largest land animal?
  Ответ: Elephants (Оценка: 99.9%)
  Вопрос 2: How do elephants communicate?

  Ответ: infrasound (Оценка: 52.1%)
  Вопрос 3: What is infrasound?
  Ответ: inaudible to humans (Оценка: 90.4%)
--------------------------------------------------
Текст 3: The Eiffel Tower was completed in 1889 as the entrance arch for the World's Fair in Paris. It remain...
  Вопрос 1: When was the Eiffel Tower completed?

  Отве

In [None]:
# Task 3

# Load the QA model and tokenizer used for the TF-IDF retrieval experiment.
# NOTE(review): the *_minilm names are reused here, but the checkpoint
# assigned below is BERT-large, so after this cell runs these names refer
# to the BERT model and pipeline.
model_id_minilm = 'bert-large-uncased-whole-word-masking-finetuned-squad'
tokenizer_minilm = AutoTokenizer.from_pretrained(model_id_minilm)
model_minilm = TFAutoModelForQuestionAnswering.from_pretrained(
    model_id_minilm,
    from_pt=True,
)
qa_pipeline_minilm = pipeline(
    'question-answering',
    model=model_minilm,
    tokenizer=tokenizer_minilm,
)

# Fit a TF-IDF vectorizer on the contexts and keep the resulting matrix
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(contexts)

# Select the most relevant contexts for a query by TF-IDF cosine similarity
def get_best_contexts_tfidf(query, contexts, vectorizer, max_matches=5, context_matrix=None):
    """Return up to ``max_matches`` contexts ranked by similarity to ``query``.

    Args:
        query: Query string.
        contexts: Sequence of context strings to rank.
        vectorizer: Already fitted vectorizer exposing ``transform``.
        max_matches: Maximum number of contexts to return (default 5).
        context_matrix: Optional precomputed ``vectorizer.transform(contexts)``.
            Pass it to avoid re-vectorizing the contexts on every call; when
            omitted it is computed from ``contexts``.

    Returns:
        List of contexts, most similar first. Empty when ``max_matches`` is
        not positive or ``contexts`` is empty.
    """
    # Guard: a slice of `[-0:]` would otherwise return *every* context
    if max_matches <= 0 or len(contexts) == 0:
        return []

    # Score against the contexts actually passed in, not a notebook global,
    # so the returned indices always line up with `contexts`
    if context_matrix is None:
        context_matrix = vectorizer.transform(contexts)

    query_vector = vectorizer.transform([query])
    scores = cosine_similarity(query_vector, context_matrix).flatten()
    # argsort is ascending: take the last `max_matches` and reverse them
    top_indices = scores.argsort()[-max_matches:][::-1]
    return [contexts[idx] for idx in top_indices]

# Print the answers extracted from the TF-IDF-selected contexts
def show_answers_tfidf(questions, contexts, vectorizer):
    """Print, for each question, the answers found in its retrieved contexts.

    Contexts are chosen with ``get_best_contexts_tfidf`` and each one is
    passed to the module-level ``qa_pipeline_minilm``. Every result whose
    ``start`` differs from ``end`` is printed; if none qualifies, a
    "no answer" line is printed instead.

    Args:
        questions: Iterable of question strings.
        contexts: Sequence of context strings to retrieve from.
        vectorizer: Vectorizer forwarded to ``get_best_contexts_tfidf``.

    Returns:
        None. Results are written to stdout.
    """
    separator = "-" * 50
    for number, query in enumerate(questions, start=1):
        candidates = get_best_contexts_tfidf(query, contexts, vectorizer)
        print(f"Вопрос {number}: {query}")
        answers_printed = 0
        for passage in candidates:
            result = qa_pipeline_minilm(question=query, context=passage, handle_impossible_answer=True)
            # Equal start/end offsets are treated as "no answer"
            if result['start'] == result['end']:
                continue
            print(f"  Ответ: {result['answer']} (Оценка: {result['score']:.1%})")
            answers_printed += 1
        if answers_printed == 0:
            print("  Нет ответа")
        print(separator)

# Exercise the TF-IDF retriever on every question from all three question lists
all_questions = [*questions_1, *questions_2, *questions_3]
show_answers_tfidf(all_questions, contexts, vectorizer)

All PyTorch model weights were used when initializing TFBertForQuestionAnswering.

All the weights of TFBertForQuestionAnswering were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertForQuestionAnswering for predictions without further training.
Device set to use 0


Вопрос 1: What is the sun?
  Ответ: the star at the center of our solar system (Оценка: 63.2%)
  Ответ: mysterious smile (Оценка: 8.9%)
  Ответ: Pacific Ocean is the largest and deepest ocean on Earth (Оценка: 5.9%)
  Ответ: Eiffel Tower (Оценка: 2.3%)
  Ответ: oxygen (Оценка: 13.4%)
--------------------------------------------------
Вопрос 2: What is the largest land animal?
  Ответ: Elephants (Оценка: 99.9%)
  Ответ: Pacific Ocean (Оценка: 13.2%)
  Ответ: The Mona Lisa (Оценка: 11.0%)
  Ответ: The sun (Оценка: 13.6%)
--------------------------------------------------
Вопрос 3: When was the Eiffel Tower completed?

  Ответ: 1889 (Оценка: 99.4%)
  Ответ: 7th century BC (Оценка: 40.1%)
  Ответ: nuclear fusion. (Оценка: 3.2%)
  Ответ: Shakespeare (Оценка: 1.8%)
  Ответ: oxygen (Оценка: 2.6%)
--------------------------------------------------
Вопрос 4: At what temperature does water boil at sea level?

  Ответ: 100°C (Оценка: 99.6%)
  Ответ: nuclear fusion (Оценка: 2.1%)
  Ответ: chemical