In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install -q accelerate langchain langchain_community unstructured sentence-transformers chromadb gradio openai langchain-openai tqdm ragas

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m280.0/280.0 kB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m806.2/806.2 kB[0m [31m25.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m34.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m44.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m156.5/156.5 kB[0m [31m16.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m525.5/525.5 kB[0m [31m33.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m16.9/16.9 MB[0m [31m45.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m227.4/227.4 kB[0m [31m22.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━

In [None]:
from langchain.document_loaders import DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores.chroma import Chroma
import os
import shutil
import torch
import openai
from getpass import getpass
import re

In [None]:
openai.api_key = getpass("Please provide your OpenAI Key: ")
os.environ["OPENAI_API_KEY"] = openai.api_key

Please provide your OpenAI Key: ··········


In [None]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Build the Vector DB

In [None]:
%%capture
CHROMA_PATH = "./chroma"
DATA_PATH = "./datasets"

embeding_model_name = "sentence-transformers/LaBSE"
embeding_model_kwargs = {'device': device}
embeding_encode_kwargs = {'normalize_embeddings': False}

embedding_function = HuggingFaceEmbeddings(model_name=embeding_model_name,model_kwargs=embeding_model_kwargs,encode_kwargs=embeding_encode_kwargs)

## Naive Document Splitting

In [None]:
with open('/content/business_law.txt', 'r') as file:
  text = file.read()

In [None]:
def generate_data_store():
    documents = load_documents()
    chunks = split_text(documents)
    save_to_chroma(chunks)


def load_documents():
    loader = DirectoryLoader(DATA_PATH, glob="*.txt")
    documents = loader.load()
    return documents


def split_text(documents: list[Document]):
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=300,
        chunk_overlap=100,
        length_function=len,
        add_start_index=True,
    )
    chunks = text_splitter.split_documents(documents)
    print(f"Split {len(documents)} documents into {len(chunks)} chunks.")

    return chunks


def save_to_chroma(chunks: list[Document]):
    if os.path.exists(CHROMA_PATH):
        shutil.rmtree(CHROMA_PATH)

    db = Chroma.from_documents(
        chunks,
        embedding_function,
        persist_directory=CHROMA_PATH
    )

    db.persist()
    print(f"Saved {len(chunks)} chunks to {CHROMA_PATH}.")

In [None]:
generate_data_store()

In [None]:
# backup the sqlite database

## Chunking based on Law Articles

In [None]:
BUSINESS_LAW_TXT_PATH = '/content/drive/MyDrive/IRI_LAW/datasets/business_law.txt'

with open(BUSINESS_LAW_TXT_PATH, 'r') as file:
  text = file.read()

# preprocess
temp = ''
for c in text:
  if c != '\u200c':
    temp += c

text = temp

In [None]:
pattern = r'(^ماده) (\d+)'

matches = re.finditer(pattern, text, re.MULTILINE)
indexes = [m.start() for m in matches]

splitted = []
for i, j in zip([0] + indexes, indexes + [len(text)]):
  splitted.append(text[i:j])

In [None]:
from langchain.docstore.document import Document

chunks = []
for d in splitted:
  chunks.append(Document(page_content=d))

In [None]:
from langchain_openai import OpenAIEmbeddings
embedding_function = OpenAIEmbeddings()

In [None]:
db = Chroma.from_documents(
    chunks,
    embedding_function,
    # persist_directory='/content/'
)

In [None]:
question = 'شرایط تاجر بودن چیست؟'
r = db.similarity_search_with_relevance_scores(question, k=5)
r

[(Document(page_content='ماده 415 - ورشكستگي تاجر بحكم محكمه بدايت در موارد ذيل اعلام مي شود:\nالف) بر حسب اظهار خود تاجر.\nب) بموجب تقاضاي يك يا چند نفر از طلبكارها.\nج) بر حسب تقاضاي مدعيالعموم بدايت.\n\n'),
  0.8090258016076803),
 (Document(page_content='ماده 350 - هر گاه معامله مشروط بشرط تعليقي باشد دلال پس از حصول شرط مستحق اجرت خواهد بود.\n\n'),
  0.8085643989836073),
 (Document(page_content='ماده 475 - حكم فوق درباره دعوي خياراتي نيز مجري خواهد بود كه تاجر ورشكسته نسبت باموال متصرفي خود يا ديگران دارد مشروط براينكه بر ضرر طلبكارها نباشد.\n\nفصل هفتم - در قرارداد ارفاقي و تصفيه حساب تاجر ورشكسته\n\nمبحث اول - در دعوت طلبكارها و مجمع عمومي آنها\n\n'),
  0.8084869752228929),
 (Document(page_content='ماده 1 - تاجر كسي است كه شغل معمولي خود را معاملات تجارتي قرار بدهد.\n\n'),
  0.8081855198889187),
 (Document(page_content='ماده 533 - هر گاه كسي مال التجارة بتاجر ورشكسته فروخته وليكن هنوز آن جنس نه بخود تاجر ورشكسته تسليم شده و نه بكس ديگر كه بحساب او بياورد آن كس ميتواند باندازة كه 

In [None]:
def find_relevant_results(query, k=3):
  results = db.similarity_search_with_relevance_scores(query, k=k)

  return results

In [None]:
# db.persist()

In [None]:
# backup the sqlite database

# Load the Vector DB

In [None]:
!cp -r /content/drive/MyDrive/IRI_LAW/chroma/Main/ ./chroma

In [None]:
from langchain_openai import OpenAIEmbeddings
embedding_function = OpenAIEmbeddings()

In [None]:
CHROMA_PATH = "./chroma"
db = Chroma(persist_directory=CHROMA_PATH, embedding_function=embedding_function)

# Configure RAG



## Init Language Model

## MaralGPT-7B

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer, GenerationConfig

model_name_or_id = "MaralGPT/Maral-7B-alpha-1"
tokenizer = AutoTokenizer.from_pretrained(model_name_or_id)
llm = AutoModelForCausalLM.from_pretrained(model_name_or_id, torch_dtype=torch.bfloat16, device_map="auto")

In [None]:
from typing import Any, List, Mapping, Optional

from langchain_core.callbacks.manager import CallbackManagerForLLMRun
from langchain_core.language_models.llms import LLM


class CustomLLM(LLM):

    @property
    def _llm_type(self) -> str:
        return "MistralForCausalLM"

    def _call(
        self,
        prompt: str,
        stop: Optional[List[str]] = None,
        run_manager: Optional[CallbackManagerForLLMRun] = None,
        **kwargs: Any,
    ) -> str:
      inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
      outputs = llm.generate(**inputs, generation_config=generation_config)
      return tokenizer.decode(outputs[0], skip_special_tokens=True)

llm = CustomLLM()

In [None]:
generation_config = GenerationConfig(
    do_sample=True,
    top_k=1,
    temperature=0.5,
    max_new_tokens=300,
    pad_token_id=tokenizer.eos_token_id
)
def process_query(query):
    results = find_relevant_results(query)
    if len(results) == 0 or results[0][1] < 0.25:
        return ("اطلاعاتی که مرتبط با سوال شما باشد را در پایگاه دانش خود پیدا نکردم!")
    context = "\n\n---\n\n".join([doc.page_content for doc, _score in results])
    prompt = prompt.format(context=context, question=query)
    inputs = tokenizer(prompt, return_tensors="pt").to(device)
    outputs = llm.generate(**inputs, generation_config=generation_config)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

## GPT 3.5-turbu

In [None]:
from langchain_openai import ChatOpenAI

llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0)

## Create Prompt Template

In [None]:
from langchain.prompts import ChatPromptTemplate

PROMPT_TEMPLATE = """
فقط بر اساس متن زیر به سوال پاسخ دهید:

{context}

---

با توجه به متن بالا به سوال پاسخ دهید:
{question}

پاسخ:
"""

prompt = ChatPromptTemplate.from_template(PROMPT_TEMPLATE)

## Chain All Elements

In [None]:
retriever = db.as_retriever(search_type="similarity_score_threshold", search_kwargs={"score_threshold": .25, "k": 5})

In [None]:
from operator import itemgetter
from langchain_core.runnables import RunnablePassthrough

qa_chain = (
    {"context": itemgetter("question") | retriever, "question": itemgetter("question")}
    | RunnablePassthrough.assign(context=itemgetter("context"))
    | {"response": prompt | llm, "context": itemgetter("context")}
)

# Process a Single Question

## Serve in Gradio

In [None]:
import gradio as gr

def run_query(query):
  resp = qa_chain.invoke({"question" : query})
  answer = resp["response"].content
  context = [context.page_content for context in resp["context"]]
  if len(context) == 0:
    context = ['ماده مرتبط با سوال شما یافت نشد.']
  return 'زمینه: \n' + '\n'.join(context) + '\nپاسخ:\n' + answer

with gr.Blocks() as demo:
    query = gr.Textbox(label="سوال", rtl=True, lines=5)
    output = gr.Textbox(label="پاسخ", rtl=True, lines=10)
    greet_btn = gr.Button("Query")
    greet_btn.click(fn=run_query, inputs=query, outputs=output, api_name="run_query")

demo.launch();

Setting queue=True in a Colab notebook requires sharing enabled. Setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
Running on public URL: https://a35a7594ce11e9a47b.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)


In [None]:
demo.close()

Closing server running on port: 7860


# Evaluation

In [None]:
# import locale
# locale.getpreferredencoding = lambda: "UTF-8"

In [None]:
!cp -r /content/drive/MyDrive/IRI_LAW/evaluation ./evaluation

In [None]:
import pandas as pd

BUSSINESS_LAW_PATH = './evaluation/business_law/'

dfs = []

for file_name in os.listdir(BUSSINESS_LAW_PATH):
  if '_' in file_name:
    file_path = os.path.join(BUSSINESS_LAW_PATH, file_name)
    print(f'{file_name}: {len(pd.read_csv(file_path))}')
    dfs.append(pd.read_csv(file_path))

df = pd.concat(dfs)

df.head()

hard_new.csv: 28
easy_new.csv: 29
medium_new.csv: 28


Unnamed: 0,Question,Answer
0,چه شرایطی برای تشکیل و ثبت یک شرکت با مسئولیت ...,برای تشکیل یک شرکت با مسئولیت محدود نیاز به پر...
1,در یک شرکت با مسئولیت محدود، چگونه مدیریت انجا...,مدیریت شرکت با مسئولیت محدود توسط یک یا چند مد...
2,چه فرآیندی باید برای انحلال یک شرکت با مسئولیت...,انحلال شرکت ممکن است در صورت تصمیم شرکاء یا بر...
3,تفاوت‌های اصلی بین شرکت‌های با مسئولیت محدود و...,در شرکت با مسئولیت محدود، مدیریت می‌تواند توسط...
4,چگونه می‌توان یک شرکت تضامنی را به شرکت سهامی ...,تبدیل یک شرکت تضامنی به شرکت سهامی نیاز به تصو...


In [None]:
len(df)

85

In [51]:
test_questions = df["Question"].values.tolist()
test_groundtruths = df["Answer"].values.tolist()

In [52]:
from tqdm import tqdm

answers = []
contexts = []

for question in tqdm(test_questions, desc='Processing'):
  response = qa_chain.invoke({"question" : question})
  answers.append(response["response"].content)
  contexts.append([context.page_content for context in response["context"]])

Processing: 100%|██████████| 28/28 [01:50<00:00,  3.95s/it]


In [54]:
from ragas import evaluate
from ragas.metrics import (
    faithfulness,
    answer_relevancy,
    answer_correctness,
    context_recall,
    context_precision,
)

metrics = [
    faithfulness,
    answer_relevancy,
    context_recall,
    context_precision,
    answer_correctness,
]

In [55]:
from datasets import Dataset

response_dataset = Dataset.from_dict({
    "question" : test_questions,
    "answer" : answers,
    "contexts" : contexts,
    "ground_truth" : test_groundtruths
})
results = evaluate(response_dataset, metrics)

Evaluating:   0%|          | 0/28 [00:00<?, ?it/s]

In [49]:
res_df = results.to_pandas()
res_df.to_csv('results.csv', index=True)

In [None]:
import csv

data = []
for q, ans, c, gt in zip(test_questions, answers, contexts, test_groundtruths):
  data.append({
      'Question': q,
      'Answer': ans,
      'Contexts': c,
      'Ground Truth': gt
  })
with open("output2.csv", mode='w', newline='') as file:
    writer = csv.DictWriter(file, fieldnames=list(data[0].keys()))
    writer.writeheader()
    for row in data:
        writer.writerow(row)