In [1]:
# !pip install -qU transformers accelerate einops langchain xformers bitsandbytes faiss-gpu sentence_transformers


In [2]:
from torch import cuda, bfloat16
import transformers

model_id = 'meta-llama/Llama-2-7b-chat-hf'

device = f'cuda:{cuda.current_device()}' if cuda.is_available() else 'cpu'

# set quantization configuration to load large model with less GPU memory
# this requires the `bitsandbytes` library
bnb_config = transformers.BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type='nf4',
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=bfloat16
)

# begin initializing HF items, you need an access token
hf_auth = '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'
model_config = transformers.AutoConfig.from_pretrained(
    model_id,
    use_auth_token=hf_auth
)

model = transformers.AutoModelForCausalLM.from_pretrained(
    model_id,
    trust_remote_code=True,
    config=model_config,
    quantization_config=bnb_config,
    device_map='auto',
    use_auth_token=hf_auth
)

# enable evaluation mode to allow model inference
model.eval()

print(f"Model loaded on {device}")

  from .autonotebook import tqdm as notebook_tqdm
Loading checkpoint shards: 100%|██████████| 2/2 [00:03<00:00,  1.95s/it]


Model loaded on cuda:0


In [3]:
tokenizer = transformers.AutoTokenizer.from_pretrained(
    model_id,
    use_auth_token=hf_auth
)



In [4]:
stop_list = ['\nHuman:', '\n```\n']

stop_token_ids = [tokenizer(x)['input_ids'] for x in stop_list]
stop_token_ids

[[1, 29871, 13, 29950, 7889, 29901], [1, 29871, 13, 28956, 13]]

In [5]:
import torch

stop_token_ids = [torch.LongTensor(x).to(device) for x in stop_token_ids]
stop_token_ids


[tensor([    1, 29871,    13, 29950,  7889, 29901], device='cuda:0'),
 tensor([    1, 29871,    13, 28956,    13], device='cuda:0')]

In [6]:
from transformers import StoppingCriteria, StoppingCriteriaList

# define custom stopping criteria object
class StopOnTokens(StoppingCriteria):
    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
        for stop_ids in stop_token_ids:
            if torch.eq(input_ids[0][-len(stop_ids):], stop_ids).all():
                return True
        return False

stopping_criteria = StoppingCriteriaList([StopOnTokens()])

In [7]:
generate_text = transformers.pipeline(
    model=model, 
    tokenizer=tokenizer,
    return_full_text=True,  # langchain expects the full text
    task='text-generation',
    # we pass model parameters here too
    stopping_criteria=stopping_criteria,  # without this model rambles during chat
    temperature=0.1,  # 'randomness' of outputs, 0.0 is the min and 1.0 the max
    max_new_tokens=512,  # max number of tokens to generate in the output
    repetition_penalty=1.1  # without this output begins repeating
)

In [8]:
# res = generate_text("Explain me the difference between Data Lakehouse and Data Warehouse.")
# print(res[0]["generated_text"])

# Implementing HF Pipeline in LangChain

In [9]:
from langchain.llms import HuggingFacePipeline

llm = HuggingFacePipeline(pipeline=generate_text)

# checking again that everything is working fine
text = llm(prompt="How to succeed in entrepreneurship?")
print(text)



 everybody has a different definition of success, but here are some common traits and strategies that successful entrepreneurs tend to have:

1. Passion and drive: Successful entrepreneurs are often driven by a deep passion for their business or industry. They have a clear vision for what they want to achieve and are willing to put in the hard work necessary to make it happen.
2. Adaptability: The ability to adapt quickly is crucial for entrepreneurs. As markets change and new challenges arise, successful entrepreneurs are able to pivot and adjust their strategy accordingly.
3. Resilience: Entrepreneurship can be a difficult and unpredictable journey. Successful entrepreneurs have the resilience to bounce back from setbacks and failures, learning from each experience and using it as an opportunity for growth.
4. Networking skills: Building a strong network of contacts and connections is essential for entrepreneurs. Successful entrepreneurs know how to build and leverage these networks

In [10]:
from langchain.document_loaders import WebBaseLoader

web_links = ["https://www.databricks.com/","https://help.databricks.com","https://databricks.com/try-databricks","https://help.databricks.com/s/","https://docs.databricks.com","https://kb.databricks.com/","http://docs.databricks.com/getting-started/index.html","http://docs.databricks.com/introduction/index.html","http://docs.databricks.com/getting-started/tutorials/index.html","http://docs.databricks.com/release-notes/index.html","http://docs.databricks.com/ingestion/index.html","http://docs.databricks.com/exploratory-data-analysis/index.html","http://docs.databricks.com/data-preparation/index.html","http://docs.databricks.com/data-sharing/index.html","http://docs.databricks.com/marketplace/index.html","http://docs.databricks.com/workspace-index.html","http://docs.databricks.com/machine-learning/index.html","http://docs.databricks.com/sql/index.html","http://docs.databricks.com/delta/index.html","http://docs.databricks.com/dev-tools/index.html","http://docs.databricks.com/integrations/index.html","http://docs.databricks.com/administration-guide/index.html","http://docs.databricks.com/security/index.html","http://docs.databricks.com/data-governance/index.html","http://docs.databricks.com/lakehouse-architecture/index.html","http://docs.databricks.com/reference/api.html","http://docs.databricks.com/resources/index.html","http://docs.databricks.com/whats-coming.html","http://docs.databricks.com/archive/index.html","http://docs.databricks.com/lakehouse/index.html","http://docs.databricks.com/getting-started/quick-start.html","http://docs.databricks.com/getting-started/etl-quick-start.html","http://docs.databricks.com/getting-started/lakehouse-e2e.html","http://docs.databricks.com/getting-started/free-training.html","http://docs.databricks.com/sql/language-manual/index.html","http://docs.databricks.com/error-messages/index.html","http://www.apache.org/","https://databricks.com/privacy-policy","https://databricks.com/terms-of-use"] 

loader = WebBaseLoader(web_links)
documents = loader.load()

In [11]:
# describe the documents
print('type of documents:', type(documents))
print('shape of documents:', len(documents))
doc_example = documents[0]
print('doc_example :', doc_example.page_content)
print('type of doc_example:', type(doc_example))

type of documents: <class 'list'>
shape of documents: 39
doc_example : Data Lakehouse Architecture and AI Company | DatabricksSkip to main contentPlatformThe Databricks Lakehouse PlatformDelta LakeData GovernanceData EngineeringData StreamingData WarehousingData SharingMachine LearningData SciencePricingMarketplaceOpen source techSecurity and Trust CenterDiscover how to build and manage all your data, analytics and AI use cases with the Databricks Lakehouse Platform
Read nowSolutionsSolutions by IndustryFinancial ServicesHealthcare and Life SciencesManufacturingCommunications, Media & EntertainmentPublic SectorRetailSee all IndustriesSolutions by Use CaseSolution AcceleratorsProfessional ServicesDigital Native BusinessesData Platform MigrationReport

Tap the potential of AI
Explore recent findings from 600 CIOs across 14 industries in this MIT Technology Review report
Read nowLearnDocumentationTraining & CertificationDemosResourcesOnline CommunityUniversity AllianceEventsData + AI Summ

In [22]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=20)
all_splits = text_splitter.split_documents(documents)

In [23]:
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS

model_name = "sentence-transformers/all-mpnet-base-v2"
model_kwargs = {"device": "cuda"}
embeddings = HuggingFaceEmbeddings( model_name=model_name, model_kwargs=model_kwargs)

# storing embeddings in the vector store
vectorstore = FAISS.from_documents(all_splits, embeddings)

## Initializing Chain

In [25]:
from langchain.chains import ConversationalRetrievalChain

chain = ConversationalRetrievalChain.from_llm(llm, vectorstore.as_retriever(), return_source_documents=True)

In [26]:
chat_history = []

query = "What is Data lakehouse architecture in Databricks?"
result = chain({"question": query, "chat_history": chat_history})

print(result['answer'])

 The medallion lakehouse architecture is a multi-layer approach to validating, cleansing, and transforming data for analytics. It includes the following layers:
1. Data ingestion layer: This layer handles the ingestion of raw data from various sources into the lakehouse.
2. Data storage layer: This layer stores the ingested data in a scalable and durable manner, using technologies such as Apache Hadoop and Amazon S3.
3. Data processing layer: This layer processes the stored data using various techniques, including data transformation, aggregation, and filtering.
4. Data governance layer: This layer enforces data governance policies and standards across the organization, ensuring data quality and consistency.
5. Data visualization and analytics layer: This layer provides tools for visualizing and analyzing the processed data, enabling insights and decision-making.
6. Machine learning and AI layer: This layer enables the use of machine learning and artificial intelligence techniques on t

In [32]:
print(result['source_documents'])

[Document(page_content='What is data modeling on Databricks? \nThe Databricks Lakehouse Platform organizes data stored with Delta Lake in cloud object storage with familiar relations like database schemas, tables, and views. Databricks recommends a multi-layer approach to validating, cleansing, and transforming data for analytics. For more information, see the medallion architecture.\n\n\nWhat is Databricks SQL? \nDatabricks SQL provides general compute resources for SQL queries, visualizations, and dashboards that are executed against the tables in the lakehouse. Within Databricks SQL, these queries, visualizations, and dashboards are developed and executed using SQL editor.', metadata={'source': 'http://docs.databricks.com/sql/index.html', 'title': 'What is data warehousing on Databricks? | Databricks on AWS', 'description': 'Learn about building a data warehousing solution in Databricks using Databricks SQL.', 'language': 'en-US'}), Document(page_content='Data governance\nLakehouse 

In [33]:
chat_history = [(query, result["answer"])]

query = "What are Data Governance and Interoperability in it?"
result = chain({"question": query, "chat_history": chat_history})

print(result['answer'])
chat_history.append((query, result["answer"]))

  Of course! In a Data Lakehouse architecture, data governance plays a crucial role in ensuring the quality, security, and compliance of data. It involves establishing policies, procedures, and standards for managing data across various stakeholders, including data creators, data consumers, and data stewards.
Some key aspects of data governance in a Data Lakehouse include:

* Data quality management: Ensuring that data is accurate, complete, and consistent across different sources and systems.
* Data security management: Implementing access controls, encryption, and other security measures to protect sensitive data from unauthorized access or breaches.
* Compliance management: Ensuring that data is handled in accordance with relevant laws, regulations, and industry standards, such as GDPR, HIPAA, or CCPA.
* Data lineage management: Tracking the origin, movement, and usage of data throughout its lifecycle to maintain transparency and accountability.
* Data catalog management: Creating a

In [34]:
query = "what is databricks what solutions does it provide?"
result = chain({"question": query, "chat_history": chat_history})

In [35]:
print(result['answer'])

 Databricks is a cloud-based platform that provides various solutions for data engineering, data warehousing, and data science. Its main features include data ingestion, data transformation, data visualization, and data governance. Additionally, Databricks offers a range of tools and services for building and deploying machine learning models, as well as integrating with popular programming languages such as Python and R. Overall, Databricks aims to simplify the process of working with large datasets and enabling organizations to extract insights and value from their data.


In [36]:
print(result['source_documents'])


[Document(page_content='Databricks provides a number of custom tools for data ingestion, including Auto Loader, an efficient and scalable tool for incrementally and idempotently loading data from cloud object storage and data lakes into the data lakehouse.', metadata={'source': 'http://docs.databricks.com/introduction/index.html', 'title': 'What is Databricks? | Databricks on AWS', 'description': '‘Learn what Databricks is and what it is used for: tools and use cases of the Databricks Lakehouse Platform.’', 'language': 'en-US'}), Document(page_content='What is data modeling on Databricks? \nThe Databricks Lakehouse Platform organizes data stored with Delta Lake in cloud object storage with familiar relations like database schemas, tables, and views. Databricks recommends a multi-layer approach to validating, cleansing, and transforming data for analytics. For more information, see the medallion architecture.\n\n\nWhat is Databricks SQL? \nDatabricks SQL provides general compute resourc

In [37]:
print(result)

{'question': 'what is databricks what solutions does it provide?', 'chat_history': [('What is Data lakehouse architecture in Databricks?', ' The medallion lakehouse architecture is a multi-layer approach to validating, cleansing, and transforming data for analytics. It includes the following layers:\n1. Data ingestion layer: This layer handles the ingestion of raw data from various sources into the lakehouse.\n2. Data storage layer: This layer stores the ingested data in a scalable and durable manner, using technologies such as Apache Hadoop and Amazon S3.\n3. Data processing layer: This layer processes the stored data using various techniques, including data transformation, aggregation, and filtering.\n4. Data governance layer: This layer enforces data governance policies and standards across the organization, ensuring data quality and consistency.\n5. Data visualization and analytics layer: This layer provides tools for visualizing and analyzing the processed data, enabling insights 