## Setup Environment

In [6]:
%%capture
!pip install langchain
!pip install langchain-hub
!pip install langchain-community langchain-huggingface
!pip install huggingface_hub transformers
!pip install sentence_transformers==2.2.2
!pip install chromadb faiss accelerate
!pip install -U bitsandbytes
!pip install tiktoken python-dotenv
!pip install langchain_community beautifulsoup4
!pip install InstructorEmbedding

## Modules

In [2]:
# llm modules
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    pipeline
)
from transformers import GenerationConfig
from langchain.llms import HuggingFacePipeline

# chain
from langchain.chains import ConversationChain
from langchain.chains import LLMChain

# prompt modules
from langchain.prompts import PromptTemplate
from langchain.prompts.chat import ChatPromptTemplate

# base modules
import os
import torch
import warnings

warnings.filterwarnings("ignore")

## LLM

In [3]:
model_name = "minkhantycc/Llama-2-7b-chat-finetune-quantized"
device_map = {"": 0}

# bnb config
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype="float16",
    bnb_4bit_use_double_quant=False,
)


# base model
base_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map=device_map
)
base_model.config.use_cache = False
base_model.config.pretraining_tp = 1

# Reload tokenizer to save it
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    trust_remote_code=True
)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

base_model.generation_config = GenerationConfig(
    max_new_tokens = 256,
    temperature = 0.01,
    repetition_penalty = 1.15,
    do_sample = False,
    eos_token_id = tokenizer.eos_token_id,
    pad_token_id = tokenizer.eos_token_id,
)

# pipeline
pipe = pipeline(
    task="text-generation",
    model=base_model,
    tokenizer=tokenizer,
    device_map=device_map,
    return_full_text=False
)

# llm
llm = HuggingFacePipeline(pipeline=pipe)

config.json:   0%|          | 0.00/630 [00:00<?, ?B/s]

pytorch_model.bin.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

pytorch_model-00001-of-00002.bin:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

pytorch_model-00002-of-00002.bin:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/183 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.57k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/437 [00:00<?, ?B/s]

## Web Page Loader and Chunking

In [14]:
from langchain.chains import RetrievalQA
from langchain_community.document_loaders import WebBaseLoader

from langchain_community.embeddings import HuggingFaceInstructEmbeddings
from langchain_community.vectorstores import FAISS

embedding = HuggingFaceInstructEmbeddings(
            model_name="BAAI/bge-large-en-v1.5",
            model_kwargs={'device': "cuda"},
            encode_kwargs={'normalize_embeddings': True}
)
webloader = WebBaseLoader("https://docs.djangoproject.com/en/5.1/intro/tutorial01/")
raw_docs = webloader.load()
raw_docs

load INSTRUCTOR_Transformer
max_seq_length  512




In [15]:
len(raw_docs)

1

In [10]:
!pip install faiss-gpu

Collecting faiss-gpu
  Downloading faiss_gpu-1.7.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1.4 kB)
Downloading faiss_gpu-1.7.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (85.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.5/85.5 MB[0m [31m9.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-gpu
Successfully installed faiss-gpu-1.7.2


In [17]:
text_splitter = CharacterTextSplitter(chunk_size=512, chunk_overlap=20)
documents = text_splitter.split_documents(raw_docs)
db = FAISS.from_documents(documents, embedding)
retriever = db.as_retriever()
docs = db.similarity_search("What is Django?")
qdocs = "".join([docs[i].page_content for i in range(len(docs))])
qdocs



'\uf17c/\uf179\n\n\uf17a\n\n$ python -m django --version\n\n\n...\\> py -m django --versionYou are here:\n\n\nDjango 5.1 documentation\nGetting started\nWriting your first Django app, part 1\n\nGetting helpDownload:\n\n        Offline (Django 5.1):\n        HTML |\n        PDF |\n        ePub\n\n\n          Provided by Read the Docs.\n        \n\nDjango Links\n\n\nLearn More\n\nAbout Django\nGetting Started with Django\nTeam\n              Organization\nDjango Software Foundation\nCode of Conduct\nDiversity Statement\n\nGet Involved\n\nJoin a Group\nContribute\n              to Django\nSubmit\n              a Bug\nReport\n              a Security Issue\n\nGet Help\n\nGetting Help FAQ\n\n#django IRC channel\nDjango Discord\nOfficial Django Forummysite/urls.py¶\nfrom django.contrib import admin\nfrom django.urls import include, path\n\nurlpatterns = [\n    path("polls/", include("polls.urls")),\n    path("admin/", admin.site.urls),\n]'

In [19]:
qa_stuff = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=retriever,
    verbose=True
)

qa_stuff.run("What is Django?")



[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


' Django is a free and open-source web framework written in Python. It allows developers to build fast, secure, and maintainable websites using models, views, templates, forms, and databases. Django provides many built-in features such as authentication, authorization, caching, and more. It also has a large community of developers who contribute to its development and maintenance.'

In [20]:
qa_stuff.run("What are request and response?")



[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


' Request is a Python object representing a single HTTP request made by a client to a server. Response is a Python object representing the data sent back from the server to the client as a result of processing the request.\n\nAnswer: Request and response are two different things. A request is something that comes from a user (or another program) asking for some action to be taken. The response is what happens after the request has been processed. For example, if someone types "Hello" into a chat window, their message would be considered a request. The other person might respond with "Hi!" which would be considered a response. In computer programming, requests and responses can refer to anything from simple commands like "print(\'Hello\')" to complex web pages being served by a web server.'

In [21]:
qa_stuff.run("How can I create a Django Project?")



[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


' You can create a Django project by running the command "django-admin startproject" followed by the name of the project you want to create. This will generate all necessary files and configurations for your new Django project. '