# FTN Q&A Chatbot

This notebook is written to run in Google Colab. To run it on a local machine, it needs to be slightly modified (for example, the Google Drive mounting step).

## Basic libraries importing

In [1]:
import os
import re

from google.colab import drive
from IPython.display import HTML, display

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

## Constants

In [2]:
# --- Filesystem locations (placeholders: fill these in before running) ---
PROJECT_PATH = 'project path'                      # directory the notebook changes into
DATA_PATH = 'data path'                            # folder scanned for the context PDFs
QA_BASE_PATH = 'questions_answers_data.csv path'   # CSV read into the question/answer table
DATABASE_PATH = 'database path'                    # where the FAISS base is saved / loaded

# --- Context embedding and vector-base creation ---
BASE_CREATION_FLAG = 1   # truthy: build the base from the PDFs; falsy: load the saved one
EMBEDDING_MODEL = 'sentence-transformers/all-mpnet-base-v2'
DEVICE = 'cpu'
CHUNK_SIZE = 1_000
CHUNK_OVERLAP = 200

# --- Chatbot parameters ---
CHATBOT_FLAG = 1         # truthy: run the cells that create and call the chatbot
REPLICATE_API_TOKEN = 'your replicate token'
# Alternative (smaller) model, kept for reference:
# CHATBOT_MODEL = "meta/llama-2-13b-chat:f4e2de70d66816a838a89eeeb621910adffb0dd0baba3976c96980970978018d"
CHATBOT_MODEL = (
    'meta/llama-2-70b-chat:'
    '2d19859030ff705a87c746f7e96eea03aefb71f166725aee39692f1476566d48'
)
TEMPERATURE = 0.01       # sent to the model as "temperature"
TOP_PERCENT = 0.5        # sent to the model as "top_p"
MAX_NEW_TOKENS = 1_000   # sent to the model as "max_new_tokens"

# --- Initial chat history: (question, answer) pairs ---
CHAT_HISTORY = [
    (
        "U kom gradu se nalazi Fakultet tehničkih nauka?",
        "U Novom Sadu.",
    ),
    (
        "Kad je osnovan?",
        "1960.",
    ),
    (
        "Koja je web stranica fakulteta?",
        "www.ftn.uns.ac.rs",
    ),
]

# --- Separator lines used when printing cell output ---
LINE = '-' * 150
DOUBLE_LINE = '=' * 150

## Loading drive

In [3]:
# Mount Google Drive and switch the working directory to the project folder.
print(DOUBLE_LINE)

# Makes the Drive contents available under /content/drive in the Colab runtime.
drive.mount("/content/drive")

print(LINE)

print("Project path:")
# IPython magic: change the working directory to PROJECT_PATH, so that the
# relative paths used later (e.g. requirements.txt) are resolved from there.
%cd {PROJECT_PATH}

print(DOUBLE_LINE)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
------------------------------------------------------------------------------------------------------------------------------------------------------
Project path:
/content/drive/MyDrive/FTN_Chatbot


## Loading additional libraries

### Installing additional libraries

In [4]:
!pip install -r requirements.txt



### Importing additional libraries

In [5]:
import replicate

import pypdf
import PyPDF2
from transliterate import translit

import langchain
from langchain.llms import Replicate
from langchain.document_loaders import PyPDFDirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.chains import ConversationalRetrievalChain
from langchain.chains import RetrievalQA
from langchain.prompts.prompt import PromptTemplate

## Data preprocessing

### Question-answer data loading and showing

In [6]:
# Read the question/answer pairs that are later used to test the chatbot.
qa_df = pd.read_csv(QA_BASE_PATH)

# Show the first 20 rows, framed by two separator lines.
print(DOUBLE_LINE, "Question-answer base:", sep="\n")
display(qa_df.head(20))
print(DOUBLE_LINE)

Question-answer base:


Unnamed: 0,Pitanje,Odgovor,Dokument
0,Do kada traje upis?,"""Upis u skolsku 2023/24. godinu potrebno je iz...",Informator_2023/2024
1,Kako izgleda ispit za upis na arhitekturu?,"""Prijemni ispit za studije arhitekture obuhvat...",OAS22
2,Koliko bodova treba da imam za upis na budzet?,"""Pravo na buzdet imaju svi studenti koji su u ...",Informator_2023/2024
3,Koliki su troskovi upisa u skolsku 2023/2024?,"""Troškovi upisa za studente bez zaostalih pred...",Informator_2023/2024
4,Koliko studenti placaju nepolozene ispite?,"""Nepolozene ispite studenti placaju po bodu. C...",Informator_2023/2024
5,Sta se desava sa studentima kojima istice stat...,"""Studenti kojima ističe status studenta zbog i...",Informator_2023/2024
6,Sta je vizija fakulteta?,"""Vizija fakulteta je zasnovana na dugoročnoj o...",Strategija_fakulteta
7,Koje se oblasti izucavaju na Fakultetu tehnick...,"""Na Fakultetu tehničkih nauka izucavaju se sle...",OAS22
8,Koliko traju osnovne akademske studije masinstav?,"""Osnovne akademsk studije traju cetiri godine ...",OAS22
9,Da li je Fakutet tehnickih nauka akreditovan?,""" Fakultet tehnickih nauka akreditovan je za o...",OAS22




### Context data showing

In [7]:
def get_pdf_page_count(file_path):
    """Return the number of pages of the PDF at ``file_path``.

    Any exception raised while opening or parsing the file is caught: an
    error message is printed and the string "N/A" is returned instead of a
    number, so the caller's listing can continue.
    """
    try:
        with open(file_path, 'rb') as pdf_stream:
            page_total = len(PyPDF2.PdfReader(pdf_stream).pages)
    except Exception as err:
        print(f"Error reading '{file_path}': {err}")
        return "N/A"

    return page_total

# Collect a (file name, page count) pair for every PDF directly inside DATA_PATH.
files_info = []

for file_name in os.listdir(DATA_PATH):
    # The extension check is case-insensitive, so ".PDF" files are listed too.
    if not file_name.lower().endswith('.pdf'):
        continue
    file_path = os.path.join(DATA_PATH, file_name)
    page_count = get_pdf_page_count(file_path)
    files_info.append((file_name, page_count))

# Header of the listing.
print(DOUBLE_LINE, f'PDF files inside path "{PROJECT_PATH}/{DATA_PATH}":', LINE, sep="\n")

# One numbered line per file, with a separator line between consecutive entries.
for i, (file_name, page_count) in enumerate(files_info, start=1):
  if i != 1:
    print(LINE)
  print(f'{i}. "{file_name}" - Pages: {page_count}')

print(DOUBLE_LINE)

PDF files inside path "/content/drive/MyDrive/FTN_Chatbot/data":
------------------------------------------------------------------------------------------------------------------------------------------------------
1. "FTN_monografija.pdf" - Pages: 730
------------------------------------------------------------------------------------------------------------------------------------------------------
2. "OAS22.pdf" - Pages: 56
------------------------------------------------------------------------------------------------------------------------------------------------------
3. "Informator_2023_2024.pdf" - Pages: 2
------------------------------------------------------------------------------------------------------------------------------------------------------
4. "FTNnovine67.pdf" - Pages: 24
------------------------------------------------------------------------------------------------------------------------------------------------------
5. "Strategija_fakulteta.PDF" - Pages: 6


### Context data loading and script converting

In [8]:
def convert_to_latin(sentence):
    """Return ``sentence`` transliterated with the 'sr' language pack in reversed mode.

    The work is delegated to ``transliterate.translit(sentence, 'sr', reversed=True)``.
    """
    return translit(sentence, 'sr', reversed=True)

# Only needed when the vector base is (re)built from the PDFs.
if BASE_CREATION_FLAG:
  # Load the documents from the DATA_PATH directory via LangChain's PDF directory loader.
  loader = PyPDFDirectoryLoader(DATA_PATH)
  documents = loader.load()

  # Overwrite each document's text in place with its transliterated version,
  # so the text that gets split and embedded later is in a single script.
  for document in documents:
      document.page_content = convert_to_latin(document.page_content)

### Context data splitting and embedding

In [9]:
# Split the loaded documents into overlapping chunks (only when building the base).
if BASE_CREATION_FLAG:
  text_splitter = RecursiveCharacterTextSplitter(chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP)
  splits = text_splitter.split_documents(documents)

# Created unconditionally: the embedding model is needed both to build a new
# base and to load a previously saved one.
embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL, model_kwargs={"device": DEVICE})

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


### Creating the database

In [10]:
# Build the FAISS base from the chunks and save it, or load a previously saved one.
if BASE_CREATION_FLAG:
  database = FAISS.from_documents(splits, embeddings)
  # Persist the base so later runs can set BASE_CREATION_FLAG to a falsy value.
  database.save_local(DATABASE_PATH)
else:
  # NOTE(review): newer LangChain releases reportedly require
  # allow_dangerous_deserialization=True for load_local - confirm against the
  # version pinned in requirements.txt.
  database = FAISS.load_local(DATABASE_PATH,embeddings)

## Model creating and testing

### Replicate API activation

In [11]:
# Expose the Replicate token through the process environment.
# NOTE(review): presumably this is where the replicate client reads its
# credentials from - confirm against the client documentation.
if CHATBOT_FLAG:
  os.environ["REPLICATE_API_TOKEN"] = REPLICATE_API_TOKEN

### Model creation

In [12]:
# Create the LangChain wrapper around the Replicate-hosted chat model.
if CHATBOT_FLAG:
  chatbot = Replicate(
      model=CHATBOT_MODEL,
      # Generation settings come from the constants cell.
      model_kwargs={"temperature": TEMPERATURE,"top_p": TOP_PERCENT, "max_new_tokens":MAX_NEW_TOKENS}
  )

### Q&A chain creation

In [13]:
# Combine the chat model with a retriever over the FAISS base.
# return_source_documents=True makes the chain output carry a
# "source_documents" entry, which the testing cell reads.
# NOTE(review): "k": 5 is presumably the number of chunks retrieved per
# question - confirm against the retriever documentation.
if CHATBOT_FLAG:
  qa_chain = ConversationalRetrievalChain.from_llm(chatbot, database.as_retriever(search_kwargs={"k": 5}), return_source_documents=True)

### Pipeline creation

In [14]:
if CHATBOT_FLAG:
  def translator(prompt, chatbot):
    """Ask the hosted model to translate/shorten ``prompt`` and return the result.

    Args:
      prompt: Text to translate; sent as the "prompt" input of the model.
      chatbot: Model identifier passed as the first argument of ``replicate.run``.

    Returns:
      The last non-empty line of the model output, passed through
      ``convert_to_latin``. An empty string is returned when the model
      produces no text.
    """
    output = replicate.run(
      chatbot,
      input={"system_prompt": "Prevesti prompt na srpski jezik ako već nije na srpskom i skratiti ga na jednu rečenicu, bez dodatnog teksta.",
              "prompt": prompt,
              "max_new_tokens":500}
    )

    # Strip the whole text before taking the last line: when the output ends
    # with a newline, the last element of split("\n") would otherwise be an
    # empty string and an empty answer would end up in the chat history.
    text = "".join(output).strip()
    last_line = text.split("\n")[-1].strip()

    return convert_to_latin(last_line)

  def faithfulness_estimation(question, answer, sources, chatbot):
    """Ask the hosted model to grade how faithful ``answer`` is to ``sources``.

    A judge prompt (rubric with scores 1-5 plus two examples) is filled with
    the arguments and sent through ``replicate.run``. The reply is scanned
    line by line for a line whose first word contains "score" and one whose
    first word contains "justification".

    Args:
      question: The question that was asked; interpolated into the prompt.
      answer: The answer to grade; interpolated into the prompt.
      sources: Retrieved context; interpolated into the prompt via its string form.
      chatbot: Model identifier passed as the first argument of ``replicate.run``.

    Returns:
      A ``(score, justification)`` tuple. ``score`` is the first number found
      on the score line (``int``, or ``float`` if it has a decimal part), or
      ``np.nan`` when the reply has no score line. ``justification`` is the
      text after the label, or ``''`` when the reply has no such line.

    Raises:
      ValueError: If a score line is present but contains no number.
    """
    prompt = f"""
      You must return the following fields in your response in two lines, one below the other:
      score: Your numerical score for the model's faithfulness based on the rubric
      justification: Your reasoning about the model's faithfulness score

      You are an impartial judge. You will be given an input that was sent to a machine
      learning model, and you will be given an output that the model produced. You
      may also be given additional information that was used by the model to generate the output.

      Your task is to determine a numerical score called faithfulness based on the input which is question and output which contain answer and sources.
      A definition of faithfulness and a grading rubric are provided below.
      You must use the grading rubric to determine your score. You must also justify your score.

      Examples could be included below for reference. Make sure to use them as references and to
      understand them before completing the task.

      Input question:
      {question}

      Output (answer):
      {answer}

      Output (sources):
      {sources}

      Metric definition:
      Faithfulness is only evaluated with the provided output and provided context, please ignore the provided input entirely when scoring faithfulness. Faithfulness assesses how much of the provided output is factually consistent with the provided context. A higher score indicates that a higher proportion of claims present in the output can be derived from the provided context. Faithfulness does not consider how much extra information from the context is not present in the output.

      Grading rubric:
      Faithfulness: Below are the details for different scores:
      - Score 1: None of the claims in the output can be inferred from the provided context.
      - Score 2: Some of the claims in the output can be inferred from the provided context, but the majority of the output is missing from, inconsistent with, or contradictory to the provided context.
      - Score 3: Half or more of the claims in the output can be inferred from the provided context.
      - Score 4: Most of the claims in the output can be inferred from the provided context, with very little information that is not directly supported by the provided context.
      - Score 5: All of the claims in the output are directly supported by the provided context, demonstrating high faithfulness to the provided context.

      Examples:

      1. example
        Example answer:
        Cena jednog boda za predmete koji se prenose u narednu godinu dobija se kada se cena skolarine podeli sa 180.

        Additional information used by the model (sources):
        The university's policy states that the cost per point for subjects carried over is calculated by dividing the total tuition fee by 180.

        Example score: 5
        Example justification: The output precisely matches the university's documented policy on the cost per point for subjects carried over to the next year.

      2. example
        Example answer:
        Za skolsku 2023/24. godinu, cena boda za predmete koji se prenose u narednu godinu iznosi 37 eura po bodu, dok je najvisa moguca cena za upis 90 eura po bodu, uz mogucnost da se ova cena poveca do maksimuma od 90 eura po bodu u zavisnosti od politike fakulteta.​

        Additional information used by the model (sources):
        The document specifies a method for calculating the cost per point for carried-over subjects, which does not align with the detailed prices provided in the output.

        Example score: 1
        Example justification: The output introduces specific cost figures and policies not mentioned in the original document, potentially leading to confusion or misinformation.


      You must return the following fields in your response in two lines, one below the other:
      score: Your numerical score for the model's faithfulness based on the rubric
      justification: Your reasoning about the model's faithfulness score

      Do not add additional new lines. Do not add any other fields.
    """

    output = replicate.run(
      chatbot,
      input={"system_prompt": "Calculate faithfulness score based on prompt and justify it.",
              "prompt": prompt,
              "max_new_tokens":500}
    )

    # Defaults used when the reply does not contain the expected lines.
    score = np.nan
    justification = ''

    for line in ''.join(output).split("\n"):
        words = line.split()
        # Skip short lines, and whitespace-only lines (which have no first word).
        if len(line) <= 5 or not words:
            continue

        label = words[0].lower()
        if "score" in label:
            # The reply is untrusted model output: extract the number with a
            # regular expression instead of eval(). Taking the first number
            # also reads "4/5" or "4 out of 5" as 4.
            number = re.search(r"\d+(?:\.\d+)?", line)
            if number is None:
                raise ValueError(f"Score not found in line: {line}")
            token = number.group()
            score = float(token) if "." in token else int(token)
        elif "justification" in label:
            justification = ' '.join(words[1:])

    return score, justification

  # Start from a copy of the initial history: the testing loop appends to
  # chat_history, and aliasing would mutate the CHAT_HISTORY constant, so
  # re-running this cell would no longer reset the conversation.
  chat_history = list(CHAT_HISTORY)


### Model testing

In [16]:
if CHATBOT_FLAG:
  for i, row in enumerate(qa_df.iterrows()):
    # iterrows() yields (index, Series) pairs; the Series holds the CSV columns.
    record = row[1]
    question = record["Pitanje"]
    real_answer = record["Odgovor"]
    source_document = record["Dokument"]

    # Ask the retrieval chain, passing the conversation so far.
    output = qa_chain({"question": question, "chat_history": chat_history})
    answer, source_documents = output["answer"], output["source_documents"]

    # Let the same hosted model judge the answer against the retrieved sources.
    faithfulness_score, faithfulness_justification = faithfulness_estimation(
        question, answer, source_documents, CHATBOT_MODEL
    )

    # The translated answer (not the raw one) is what goes into the history.
    answer_translated = translator(answer, CHATBOT_MODEL)
    chat_history.append((question, answer_translated))

    report_lines = [
        DOUBLE_LINE,
        f" {i+1}. question: {question}",
        LINE,
        f" raw answer: {answer}",
        f" faithfulness score: {faithfulness_score}",
        f" faithfulness justification: {faithfulness_justification}",
        f" answer translated: {answer_translated}",
        f" real answer: {real_answer}",
    ]
    print("\n".join(report_lines))

    # Only the first question is evaluated.
    break

  print(DOUBLE_LINE)

 1. question: Do kada traje upis?
------------------------------------------------------------------------------------------------------------------------------------------------------
 raw answer:  Based on the information provided, the enrollment period for undergraduate studies at the Faculty of Technical Sciences in Novi Sad is not explicitly stated. However, we can infer that the enrollment period is approximately one month, as the text mentions that students can apply for studies during the month of June and July. Additionally, the text states that the academic year begins in September, which suggests that the enrollment period is likely to be within those months.
 faithfulness score: 4
 faithfulness justification: The output provides accurate and relevant information about the cost of carrying over credits from one academic year to the next, using specific figures and a clear explanation of how the cost is calculated. While some details may be inferred from the context, the outp