In [11]:
# !pip install -qU transformers accelerate einops langchain xformers bitsandbytes faiss-gpu sentence_transformers
# !pip install python-dotenv

# load variable from .env 
from dotenv import load_dotenv
load_dotenv()
import os




In [12]:
from torch import cuda, bfloat16
import transformers

model_id = 'meta-llama/Llama-2-7b-chat-hf'

device = f'cuda:{cuda.current_device()}' if cuda.is_available() else 'cpu'

# set quantization configuration to load large model with less GPU memory
# this requires the `bitsandbytes` library
bnb_config = transformers.BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type='nf4',
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=bfloat16
)
hf_auth  = os.getenv("HF_AUTH_TOKEN")

# begin initializing HF items, you need an access token
model_config = transformers.AutoConfig.from_pretrained(
    model_id,
    use_auth_token=hf_auth
)
model = transformers.AutoModelForCausalLM.from_pretrained(
    model_id,
    trust_remote_code=True,
    config=model_config,
    quantization_config=bnb_config,
    device_map='auto',
    use_auth_token=hf_auth
)

# enable evaluation mode to allow model inference
model.eval()

print(f"Model loaded on {device}")

Loading checkpoint shards: 100%|██████████| 2/2 [00:03<00:00,  1.51s/it]


Model loaded on cuda:0


In [13]:
tokenizer = transformers.AutoTokenizer.from_pretrained(
    model_id,
    use_auth_token=hf_auth
)



In [14]:
stop_list = ['\nHuman:', '\n```\n']

stop_token_ids = [tokenizer(x)['input_ids'] for x in stop_list]
stop_token_ids

[[1, 29871, 13, 29950, 7889, 29901], [1, 29871, 13, 28956, 13]]

In [15]:
import torch

stop_token_ids = [torch.LongTensor(x).to(device) for x in stop_token_ids]
stop_token_ids


[tensor([    1, 29871,    13, 29950,  7889, 29901], device='cuda:0'),
 tensor([    1, 29871,    13, 28956,    13], device='cuda:0')]

In [16]:
from transformers import StoppingCriteria, StoppingCriteriaList

# define custom stopping criteria object
class StopOnTokens(StoppingCriteria):
    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
        for stop_ids in stop_token_ids:
            if torch.eq(input_ids[0][-len(stop_ids):], stop_ids).all():
                return True
        return False

stopping_criteria = StoppingCriteriaList([StopOnTokens()])

In [17]:
generate_text = transformers.pipeline(
    model=model, 
    tokenizer=tokenizer,
    return_full_text=True,  # langchain expects the full text
    task='text-generation',
    # we pass model parameters here too
    stopping_criteria=stopping_criteria,  # without this model rambles during chat
    temperature=0.1,  # 'randomness' of outputs, 0.0 is the min and 1.0 the max
    max_new_tokens=512,  # max number of tokens to generate in the output
    repetition_penalty=1.1  # without this output begins repeating
)

In [18]:
# res = generate_text("Explain me the difference between Data Lakehouse and Data Warehouse.")
# print(res[0]["generated_text"])

# Implementing HF Pipeline in LangChain

In [19]:
from langchain.llms import HuggingFacePipeline

llm = HuggingFacePipeline(pipeline=generate_text)

# checking again that everything is working fine
text = llm(prompt="How to succeed in entrepreneurship?")
print(text)



 everybody wants to be an entrepreneur, but not everyone knows how to succeed in entrepreneurship. Here are some tips that can help you on your path to success:
1. Identify a problem or need in the market and create a solution for it. This is the foundation of any successful business. You must identify a gap in the market and find a way to fill it with a product or service that meets the needs of potential customers.
2. Develop a unique value proposition. Your business should offer something different from what's already out there. Find a niche and own it. This will help you stand out from the competition and attract customers who are looking for something new and innovative.
3. Build a strong brand identity. Your brand is how people perceive you, so make sure it's consistent across all channels. Develop a visual identity that reflects your values and personality, and use it consistently across all of your marketing materials, including your website, social media profiles, and packagi

In [20]:
from langchain.document_loaders import WebBaseLoader

web_links = ["https://www.databricks.com/","https://help.databricks.com","https://databricks.com/try-databricks","https://help.databricks.com/s/","https://docs.databricks.com","https://kb.databricks.com/","http://docs.databricks.com/getting-started/index.html","http://docs.databricks.com/introduction/index.html","http://docs.databricks.com/getting-started/tutorials/index.html","http://docs.databricks.com/release-notes/index.html","http://docs.databricks.com/ingestion/index.html","http://docs.databricks.com/exploratory-data-analysis/index.html","http://docs.databricks.com/data-preparation/index.html","http://docs.databricks.com/data-sharing/index.html","http://docs.databricks.com/marketplace/index.html","http://docs.databricks.com/workspace-index.html","http://docs.databricks.com/machine-learning/index.html","http://docs.databricks.com/sql/index.html","http://docs.databricks.com/delta/index.html","http://docs.databricks.com/dev-tools/index.html","http://docs.databricks.com/integrations/index.html","http://docs.databricks.com/administration-guide/index.html","http://docs.databricks.com/security/index.html","http://docs.databricks.com/data-governance/index.html","http://docs.databricks.com/lakehouse-architecture/index.html","http://docs.databricks.com/reference/api.html","http://docs.databricks.com/resources/index.html","http://docs.databricks.com/whats-coming.html","http://docs.databricks.com/archive/index.html","http://docs.databricks.com/lakehouse/index.html","http://docs.databricks.com/getting-started/quick-start.html","http://docs.databricks.com/getting-started/etl-quick-start.html","http://docs.databricks.com/getting-started/lakehouse-e2e.html","http://docs.databricks.com/getting-started/free-training.html","http://docs.databricks.com/sql/language-manual/index.html","http://docs.databricks.com/error-messages/index.html","http://www.apache.org/","https://databricks.com/privacy-policy","https://databricks.com/terms-of-use"] 

loader = WebBaseLoader(web_links)
documents = loader.load()

In [21]:
# describe the documents
print('type of documents:', type(documents))
print('shape of documents:', len(documents))
doc_example = documents[0]
print('doc_example :', doc_example.page_content)
print('type of doc_example:', type(doc_example))

type of documents: <class 'list'>
shape of documents: 39
doc_example : Data Lakehouse Architecture and AI Company | DatabricksSkip to main contentPlatformThe Databricks Lakehouse PlatformDelta LakeData GovernanceData EngineeringData StreamingData WarehousingData SharingMachine LearningData SciencePricingMarketplaceOpen source techSecurity and Trust CenterDiscover how to build and manage all your data, analytics and AI use cases with the Databricks Lakehouse Platform
Read nowSolutionsSolutions by IndustryFinancial ServicesHealthcare and Life SciencesManufacturingCommunications, Media & EntertainmentPublic SectorRetailSee all IndustriesSolutions by Use CaseSolution AcceleratorsProfessional ServicesDigital Native BusinessesData Platform MigrationReport

Tap the potential of AI
Explore recent findings from 600 CIOs across 14 industries in this MIT Technology Review report
Read nowLearnDocumentationTraining & CertificationDemosResourcesOnline CommunityUniversity AllianceEventsData + AI Summ

In [22]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=20)
all_splits = text_splitter.split_documents(documents)

In [23]:
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS

model_name = "sentence-transformers/all-mpnet-base-v2"
model_kwargs = {"device": "cuda"}
embeddings = HuggingFaceEmbeddings( model_name=model_name, model_kwargs=model_kwargs)

# storing embeddings in the vector store
vectorstore = FAISS.from_documents(all_splits, embeddings)

## Initializing Chain

In [24]:
from langchain.chains import ConversationalRetrievalChain

chain = ConversationalRetrievalChain.from_llm(llm, vectorstore.as_retriever(), return_source_documents=True)

In [25]:
chat_history = []

query = "What is Data lakehouse architecture in Databricks?"
result = chain({"question": query, "chat_history": chat_history})

print(result['answer'])

 In Databricks, the Medallion architecture refers to a multi-layered approach to validating, cleaning, and transforming data for analytics. This includes data catalogs, data pipelines, data warehousing, and data governance.


In [26]:
print(result['source_documents'])

[Document(page_content='What is data modeling on Databricks? \nThe Databricks Lakehouse Platform organizes data stored with Delta Lake in cloud object storage with familiar relations like database schemas, tables, and views. Databricks recommends a multi-layer approach to validating, cleansing, and transforming data for analytics. For more information, see the medallion architecture.\n\n\nWhat is Databricks SQL? \nDatabricks SQL provides general compute resources for SQL queries, visualizations, and dashboards that are executed against the tables in the lakehouse. Within Databricks SQL, these queries, visualizations, and dashboards are developed and executed using SQL editor.', metadata={'source': 'http://docs.databricks.com/sql/index.html', 'title': 'What is data warehousing on Databricks? | Databricks on AWS', 'description': 'Learn about building a data warehousing solution in Databricks using Databricks SQL.', 'language': 'en-US'}), Document(page_content='Data governance\nLakehouse 

In [27]:
chat_history = [(query, result["answer"])]

query = "What are Data Governance and Interoperability in it?"
result = chain({"question": query, "chat_history": chat_history})

print(result['answer'])
chat_history.append((query, result["answer"]))

  In a Data Lakehouse architecture, Data Governance plays a crucial role in ensuring the secure management of data assets within an organization. It encompasses policies and practices that regulate how data is collected, stored, processed, and used across various departments and stakeholders. In Databricks, Data Governance is particularly important as it helps to simplify data governance by unifying data warehousing and AI use cases on a single platform. By implementing centralized data governance, organizations can better manage their data assets, reduce the risk of data breaches, and ensure compliance with regulatory requirements.


In [28]:
query = "what is databricks what solutions does it provide?"
result = chain({"question": query, "chat_history": chat_history})

In [29]:
print(result['answer'])

 According to the provided text, Databricks provides the following solutions:

1. Build an enterprise data lakehouse.
2. Important: This documentation has been retired and might not be updated. The products, services, or technologies mentioned in this content are no longer supported.
3. Unity Catalog further extends this relationship, allowing you to manage permissions for accessing data using familiar SQL syntax from within Databricks.
4. Databricks workspaces meet the security and networking requirements of some of the world’s largest and most security-minded companies.
5. Databricks includes Partner Connect, a user interface that allows validated solutions to integrate more quickly and easily with your Databricks clusters and SQL warehouses.
6. Build an integration with Databricks.


In [30]:
print(result['source_documents'])


[Document(page_content='Databricks documentation archive \n\nImportant\nThis documentation has been retired and might not be updated. The products, services, or technologies mentioned in this content are no longer supported.\n\nIn this archive, you can find earlier versions of documentation for Databricks products, features, APIs, and workflows.', metadata={'source': 'http://docs.databricks.com/archive/index.html', 'title': 'Databricks documentation archive | Databricks on AWS', 'description': 'The docs in this archive have been retired and may not be updated. The products, services, or technologies mentioned in this content are no longer supported.', 'language': 'en-US'}), Document(page_content='Unlike many enterprise data companies, Databricks does not force you to migrate your data into proprietary storage systems to use the platform. Instead, you configure a Databricks workspace by configuring secure integrations between the Databricks platform and your cloud account, and then Data

In [31]:
print(result)

{'question': 'what is databricks what solutions does it provide?', 'chat_history': [('What is Data lakehouse architecture in Databricks?', ' In Databricks, the Medallion architecture refers to a multi-layered approach to validating, cleaning, and transforming data for analytics. This includes data catalogs, data pipelines, data warehousing, and data governance.'), ('What are Data Governance and Interoperability in it?', '  In a Data Lakehouse architecture, Data Governance plays a crucial role in ensuring the secure management of data assets within an organization. It encompasses policies and practices that regulate how data is collected, stored, processed, and used across various departments and stakeholders. In Databricks, Data Governance is particularly important as it helps to simplify data governance by unifying data warehousing and AI use cases on a single platform. By implementing centralized data governance, organizations can better manage their data assets, reduce the risk of d