<a href="https://colab.research.google.com/github/HARDIK218/Large-Language-Model/blob/main/reway_llm.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install -q -U torch datasets transformers tensorflow langchain playwright html2text sentence_transformers faiss-cpu
!pip install -q accelerate==0.21.0 peft==0.4.0 bitsandbytes==0.40.2 trl==0.4.7

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m670.2/670.2 MB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m507.1/507.1 kB[0m [31m52.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.2/8.2 MB[0m [31m108.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m475.2/475.2 MB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m794.4/794.4 kB[0m [31m70.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m37.2/37.2 MB[0m [31m40.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.0/86.0 kB[0m [31m13.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m17.6/17.6 MB[0m [31m84.

In [None]:
import os
import torch
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    pipeline
)
from datasets import load_dataset
from peft import LoraConfig, PeftModel

from langchain.text_splitter import CharacterTextSplitter
from langchain.document_transformers import Html2TextTransformer
from langchain.document_loaders import AsyncChromiumLoader

from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS

from langchain.prompts import PromptTemplate
from langchain.schema.runnable import RunnablePassthrough
from langchain.llms import HuggingFacePipeline
from langchain.chains import LLMChain

In [None]:
# ---------------------------------------------------------------
# Tokenizer
# ---------------------------------------------------------------

# Hugging Face Hub id of the checkpoint; the same id is reused when the model
# weights are loaded.
model_name = "mistralai/Mistral-7B-Instruct-v0.1"

tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    trust_remote_code=True,
)
# Pad on the right, reusing the end-of-sequence token as the padding token.
tokenizer.padding_side = "right"
tokenizer.pad_token = tokenizer.eos_token

# ---------------------------------------------------------------
# bitsandbytes settings
# ---------------------------------------------------------------

use_4bit = True                     # load the base model in 4-bit precision
bnb_4bit_quant_type = "nf4"         # quantization type: "fp4" or "nf4"
bnb_4bit_compute_dtype = "float16"  # name of the torch dtype used for 4-bit compute
use_nested_quant = False            # nested (double) quantization on/off

# ---------------------------------------------------------------
# Quantization config
# ---------------------------------------------------------------

# Resolve the dtype name into the corresponding attribute of the torch module.
compute_dtype = getattr(torch, bnb_4bit_compute_dtype)

bnb_config = BitsAndBytesConfig(
    bnb_4bit_use_double_quant=use_nested_quant,
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_quant_type=bnb_4bit_quant_type,
    load_in_4bit=use_4bit,
)

# Hint when the GPU could run bfloat16 instead of the configured float16.
# The torch.cuda.is_available() guard is needed because
# torch.cuda.get_device_capability() raises when no CUDA device is present.
if compute_dtype == torch.float16 and use_4bit and torch.cuda.is_available():
    major, _ = torch.cuda.get_device_capability()
    # A compute-capability major version of 8 or higher is treated as
    # bfloat16-capable.
    if major >= 8:
        print("=" * 80)
        print("Your GPU supports bfloat16: accelerate training with bf16=True")
        print("=" * 80)

# ---------------------------------------------------------------
# Load the pre-trained model
# ---------------------------------------------------------------

# The quantization settings in bnb_config are applied while loading.
model = AutoModelForCausalLM.from_pretrained(model_name, quantization_config=bnb_config)

tokenizer_config.json:   0%|          | 0.00/1.47k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/72.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/25.1k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.94G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/4.54G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

In [None]:
def print_number_of_trainable_model_parameters(model):
    """Summarize how many of ``model``'s parameters are trainable.

    Despite the name, nothing is printed: the summary is returned as a string
    so the caller decides how to display it.

    Args:
        model: Any object exposing ``named_parameters()`` that yields
            ``(name, parameter)`` pairs, where each parameter provides
            ``numel()`` and a ``requires_grad`` attribute.

    Returns:
        A three-line string with the trainable parameter count, the total
        parameter count, and the trainable percentage formatted to two
        decimals. The percentage is reported as ``0.00%`` when the model has
        no parameters at all.
    """
    trainable_model_params = 0
    all_model_params = 0
    for _, param in model.named_parameters():
        param_count = param.numel()
        all_model_params += param_count
        if param.requires_grad:
            trainable_model_params += param_count

    # Avoid a ZeroDivisionError for a model without any parameters.
    if all_model_params:
        trainable_percentage = 100 * trainable_model_params / all_model_params
    else:
        trainable_percentage = 0.0

    return f"trainable model parameters: {trainable_model_params}\nall model parameters: {all_model_params}\npercentage of trainable model parameters: {trainable_percentage:.2f}%"

# The helper only builds the summary string, so print it explicitly.
print(print_number_of_trainable_model_parameters(model))

trainable model parameters: 262410240
all model parameters: 3752071168
percentage of trainable model parameters: 6.99%


In [None]:
# Text-generation pipeline around the loaded model and tokenizer.
# `temperature` is only used by sample-based decoding, so sampling is enabled
# explicitly; otherwise the 0.2 setting would be ignored under the default
# decoding strategy.
text_generation_pipeline = pipeline(
    model=model,
    tokenizer=tokenizer,
    task="text-generation",
    do_sample=True,
    temperature=0.2,
    repetition_penalty=1.1,
    return_full_text=True,
    max_new_tokens=1000,
)

In [None]:
# Wrap the transformers pipeline in LangChain's HuggingFacePipeline so it can
# be passed as the `llm` of a LangChain chain.
mistral_llm = HuggingFacePipeline(pipeline=text_generation_pipeline)

In [None]:
# Download the browser builds Playwright drives (the recorded output shows
# Chromium), then install the system packages those browsers depend on.
!playwright install
!playwright install-deps

Downloading Chromium 120.0.6099.28 (playwright build v1091)[2m from https://playwright.azureedge.net/builds/chromium/1091/chromium-linux.zip[22m
[1G153.1 Mb [] 0% 0.0s[0K[1G153.1 Mb [] 0% 12.8s[0K[1G153.1 Mb [] 0% 7.9s[0K[1G153.1 Mb [] 0% 6.7s[0K[1G153.1 Mb [] 1% 4.8s[0K[1G153.1 Mb [] 1% 4.3s[0K[1G153.1 Mb [] 2% 3.9s[0K[1G153.1 Mb [] 2% 4.0s[0K[1G153.1 Mb [] 3% 4.0s[0K[1G153.1 Mb [] 3% 3.6s[0K[1G153.1 Mb [] 4% 3.7s[0K[1G153.1 Mb [] 5% 3.7s[0K[1G153.1 Mb [] 5% 3.5s[0K[1G153.1 Mb [] 6% 3.4s[0K[1G153.1 Mb [] 7% 3.3s[0K[1G153.1 Mb [] 7% 3.2s[0K[1G153.1 Mb [] 8% 3.2s[0K[1G153.1 Mb [] 9% 3.1s[0K[1G153.1 Mb [] 10% 3.0s[0K[1G153.1 Mb [] 11% 3.0s[0K[1G153.1 Mb [] 12% 2.9s[0K[1G153.1 Mb [] 13% 2.8s[0K[1G153.1 Mb [] 14% 2.8s[0K[1G153.1 Mb [] 14% 2.9s[0K[1G153.1 Mb [] 15% 2.8s[0K[1G153.1 Mb [] 15% 2.9s[0K[1G153.1 Mb [] 16% 2.7s[0K[1G153.1 Mb [] 17% 2.7s[0K[1G153.1 Mb [] 18% 2.6s[0K[1G153.1 Mb [] 19% 2.5s[0K[1G153.1 Mb [] 20% 2.4s[0K[

In [None]:
# # NO NEED TO RUN AGAIN
# import nest_asyncio
# nest_asyncio.apply()

# articles=[
#     "https://en.wikipedia.org/wiki/Electronic_waste_recycling",
#     "https://www.treehugger.com/what-is-e-waste-and-why-is-it-a-problem-5186270",
#     "https://indianexpress.com/article/world/climate-change/recycling-gone-up-last-5-years-67-e-waste-remains-unprocessed-8530613/",
#     "https://recykal.com/blog/growing-concern-about-e-waste-in-india/",
#     "https://www.downtoearth.org.in/blog/waste/recycling-of-e-waste-in-india-and-its-potential-64034",
#     'https://www.sciencedirect.com/topics/earth-and-planetary-sciences/extended-producer-responsibility#:~:text=EPR%20is%20a%20policy%20strategy,and%20environmental%20impact%20in%20general',
#     'https://pib.gov.in/PressReleasePage.aspx?PRID=1799170',
#     'https://recykal.com/blog/a-guide-to-epr-compliance-in-india/',
#     'https://www.shaktiplasticinds.com/extended-producer-responsibility-in-india-epr/',
#     'https://www.ewaste1.com/what-is-e-waste/',
#     'https://www.meity.gov.in/writereaddata/files/EWaste_Sep11_892011.pdf',
#     'https://www.drishtiias.com/daily-updates/daily-news-analysis/e-waste-management-in-india',
#     'https://hindrise.org/resources/e-waste-management-in-india/',
#     'https://recykal.com/blog/growing-concern-about-e-waste-in-india/',
#     'https://recykal.com/blog/how-to-safely-dispose-your-organizations-it-waste/',
#     'https://recykal.com/blog/differences-between-it-asset-disposal-vs-it-asset-disposition/',
#     'https://recykal.com/blog/why-you-need-to-meet-the-cpcb-deadline-for-filing-epr-plastic-returns/',
#     'https://recykal.com/blog/importance-of-epr-compliance-for-producers-importers-and-brand-owners/',
#     'https://recykal.com/blog/all-you-need-to-know-about-epr-compliance-for-tyre-waste/',
#     'https://recykal.com/blog/all-you-need-to-know-about-epr-compliance-for-e-waste/',
#     'https://recykal.com/blog/how-businesses-can-ensure-continuous-compliance-with-epr-regulations/',
#     'https://recykal.com/blog/who-should-have-an-epr-certificate/',
#     'https://recykal.com/blog/responsibilities-of-importers-for-epr-under-pwm/',
#     'https://recykal.com/blog/responsibilities-of-brands-for-epr-under-pwm/',
#     'https://recykal.com/blog/all-you-need-to-know-about-epr-compliance-for-plastic-waste/',
#     'https://recykal.com/blog/all-you-need-to-know-about-epr-compliance-for-battery-waste/',
#     'https://recykal.com/blog/why-every-business-must-have-an-it-asset-disposal-strategy/',
#     'https://recykal.com/blog/why-not-to-resell-your-organizations-used-it-assets/',
#     'https://recykal.com/blog/major-itam-risks-every-business-must-be-aware-of/',
#     'https://recykal.com/blog/what-are-the-benefits-of-it-asset-disposal/',
#     'https://recykal.com/blog/what-are-the-risks-of-it-asset-disposal/',
#     'https://recykal.com/blog/what-is-an-epr-certificate/',
#     'https://recykal.com/blog/what-is-the-cost-of-an-epr-certificate/',
#     'https://recykal.com/blog/how-do-i-get-an-epr-certificate/',
#     'https://recykal.com/blog/why-industrial-waste-management-is-important-for-business/',
#     'https://recykal.com/blog/why-plastic-neutrality-is-important-for-d2c-brands/',
#     'https://recykal.com/blog/all-you-need-to-know-about-plastic-neutrality/',
#     'https://recykal.com/blog/a-guide-to-epr-compliance-in-india/',
#     'https://recykal.com/blog/latest-epr-guidelines-for-tyre/',
#     'https://recykal.com/blog/why-are-all-d2c-brands-racing-towards-plastic-neutrality/',
#     'https://recykal.com/blog/exploring-the-challenges-and-opportunities-of-plastic-credits/',
#     'https://recykal.com/blog/an-introduction-to-plastic-credits/',
#     'https://recykal.com/blog/how-can-organizations-reduce-their-laptops-carbon-footprint/',
#     'https://recykal.com/blog/single-use-plastic-ban-what-will-change-from-july-1/',
#     'https://recykal.com/blog/what-is-industrial-waste/',
#     'https://recykal.com/blog/epr-and-sustainability-as-revenue-catalyst-2/',
#     'https://recykal.com/blog/pwm-rules-secondamendment-2021/',
#     'https://recykal.com/blog/pwm-rules-secondamendment-2021-2/',
#     'https://recykal.com/blog/plastic-waste-management-rules-amendment/',
#     'https://recykal.com/blog/plastic-recycling-epr-fulfillment/',
#     'https://recykal.com/blog/extended-producer-responsibility-status-around-the-world/',
#     'https://recykal.com/blog/how-is-plastic-waste-resource/',
#     'https://recykal.com/blog/epr-certification-mandatory-india/',
#     'https://recykal.com/blog/epr-registration-plastic-waste/',
#     'https://recykal.com/blog/pros-role-of-stakeholders-in-epr/',
#     'https://recykal.com/blog/recyclers-role-of-stakeholders-in-epr/',
#     'https://recykal.com/blog/an-introduction-to-e-waste/',
#     'https://recykal.com/blog/role-stakeholders-in-epr-producers-importers-brandowners/',
#     'https://recykal.com/blog/role-of-stakeholders-in-epr-consumers/',
#     'https://recykal.com/blog/stakeholders-in-epr-national-authority/',
#     'https://recykal.com/blog/global-plastic-pollution-epr/',
#     'https://recykal.com/blog/epr-challenges-in-india/',
#     'https://recykal.com/blog/waste-management-steps-bulk-waste-generators/',
#     'https://recykal.com/blog/plastic-waste-management-epr-fulfilment-2/',
#     'https://recykal.com/blog/epr-benefits-brands/',
#     'https://recykal.com/blog/cpcb-amends-sop-pibos-under-pwm-rules/',
#     'https://recykal.com/blog/data-quality-for-epr/',
#     'https://recykal.com/blog/all-you-need-to-know-about-epr-action-plan-2/',
#     'https://recykal.com/blog/plastic-waste-management-epr-fulfilment/',
#     'http://swachhbharaturban.gov.in/writereaddata/SBM%20Plastic%20Waste%20Book.pdf',
#     'http://swachhbharaturban.gov.in/writereaddata/SBM%20Plastic%20Waste%20Book.pdf',
#     'https://www.shaktiplasticinds.com/how-to-manage-plastic-waste/',
#     'https://www.shaktiplasticinds.com/what-are-the-methods-of-plastic-waste-management/',
#     'https://timesofindia.indiatimes.com/blogs/voices/plastic-waste-and-its-management-strategies-for-environmental-sustainability/',
#     'https://www.drishtiias.com/daily-updates/daily-news-analysis/battery-waste-management-rules-2022',
#     'https://www.drishtiias.com/daily-updates/daily-news-analysis/battery-waste-management-and-handling-rules-2022',
#     'http://www.eprbatterycpcb.in/upload/adminDoc/Frequently%20Asked%20Questions%20(General).pdf',
#     'https://www.karosambhav.com/epr-compliance-for-battery-waste',
#     'https://www.corpseed.com/service/epr-for-waste-tyres',
#     'https://ssrana.in/articles/amendment-rules-extended-producer-responsibility-waste-tyre/',
#     'https://www.professionalutilities.com/epr-registration-for-tyre-waste/delhi.php',
#     'https://www.sciencedirect.com/science/article/abs/pii/S0141391021002809',
#     'https://link.springer.com/article/10.1007/s10163-022-01554-y',
#     'https://www.environmentalpollution.in/waste-management/rubber-industry/waste-from-rubber-industry-and-its-disposal/6938',
#     'https://climatepromise.undp.org/news-and-stories/what-is-circular-economy-and-how-it-helps-fight-climate-change',
#     'https://www.sciencedirect.com/topics/social-sciences/circular-economy',
#     'https://www.sustainability.com/thinking/creating-a-circular-economy-for-plastics/',
#     'https://www.sciencedirect.com/topics/earth-and-planetary-sciences/extended-producer-responsibility#:~:text=EPR%20is%20a%20policy%20strategy,and%20environmental%20impact%20in%20general',
#     'https://pib.gov.in/PressReleasePage.aspx?PRID=1799170',
#     'https://recykal.com/blog/a-guide-to-epr-compliance-in-india/',
#     'https://www.shaktiplasticinds.com/extended-producer-responsibility-in-india-epr/',
#     'https://www.ewaste1.com/what-is-e-waste/',
#     'https://www.meity.gov.in/writereaddata/files/EWaste_Sep11_892011.pdf',
#     'https://www.drishtiias.com/daily-updates/daily-news-analysis/e-waste-management-in-india',
#     'https://hindrise.org/resources/e-waste-management-in-india/',
#     'https://recykal.com/blog/growing-concern-about-e-waste-in-india/',
#     'https://recykal.com/blog/how-to-safely-dispose-your-organizations-it-waste/',
#     'https://recykal.com/blog/differences-between-it-asset-disposal-vs-it-asset-disposition/',
#     'https://recykal.com/blog/why-you-need-to-meet-the-cpcb-deadline-for-filing-epr-plastic-returns/',
#     'https://recykal.com/blog/importance-of-epr-compliance-for-producers-importers-and-brand-owners/',
#     'https://recykal.com/blog/all-you-need-to-know-about-epr-compliance-for-tyre-waste/',
#     'https://recykal.com/blog/all-you-need-to-know-about-epr-compliance-for-e-waste/',
#     'https://recykal.com/blog/how-businesses-can-ensure-continuous-compliance-with-epr-regulations/',
#     'https://recykal.com/blog/who-should-have-an-epr-certificate/',
#     'https://recykal.com/blog/responsibilities-of-importers-for-epr-under-pwm/',
#     'https://recykal.com/blog/responsibilities-of-brands-for-epr-under-pwm/',
#     'https://recykal.com/blog/all-you-need-to-know-about-epr-compliance-for-plastic-waste/',
#     'https://recykal.com/blog/all-you-need-to-know-about-epr-compliance-for-battery-waste/',
#     'https://recykal.com/blog/why-every-business-must-have-an-it-asset-disposal-strategy/',
#     'https://recykal.com/blog/why-not-to-resell-your-organizations-used-it-assets/',
#     'https://recykal.com/blog/major-itam-risks-every-business-must-be-aware-of/',
#     'https://recykal.com/blog/what-are-the-benefits-of-it-asset-disposal/',
#     'https://recykal.com/blog/what-are-the-risks-of-it-asset-disposal/',
#     'https://recykal.com/blog/what-is-an-epr-certificate/',
#     'https://recykal.com/blog/what-is-the-cost-of-an-epr-certificate/',
#     'https://recykal.com/blog/how-do-i-get-an-epr-certificate/',
#     'https://recykal.com/blog/why-industrial-waste-management-is-important-for-business/',
#     'https://recykal.com/blog/why-plastic-neutrality-is-important-for-d2c-brands/',
#     'https://recykal.com/blog/all-you-need-to-know-about-plastic-neutrality/',
#     'https://recykal.com/blog/a-guide-to-epr-compliance-in-india/',
#     'https://recykal.com/blog/latest-epr-guidelines-for-tyre/',
#     'https://recykal.com/blog/why-are-all-d2c-brands-racing-towards-plastic-neutrality/',
#     'https://recykal.com/blog/exploring-the-challenges-and-opportunities-of-plastic-credits/',
#     'https://recykal.com/blog/an-introduction-to-plastic-credits/',
#     'https://recykal.com/blog/how-can-organizations-reduce-their-laptops-carbon-footprint/',
#     'https://recykal.com/blog/single-use-plastic-ban-what-will-change-from-july-1/',
#     'https://recykal.com/blog/what-is-industrial-waste/',
#     'https://recykal.com/blog/epr-and-sustainability-as-revenue-catalyst-2/',
#     'https://recykal.com/blog/pwm-rules-secondamendment-2021/',
#     'https://recykal.com/blog/pwm-rules-secondamendment-2021-2/',
#     'https://recykal.com/blog/plastic-waste-management-rules-amendment/',
#     'https://recykal.com/blog/plastic-recycling-epr-fulfillment/',
#     'https://recykal.com/blog/extended-producer-responsibility-status-around-the-world/',
#     'https://recykal.com/blog/how-is-plastic-waste-resource/',
#     'https://recykal.com/blog/epr-certification-mandatory-india/',
#     'https://recykal.com/blog/epr-registration-plastic-waste/',
#     'https://recykal.com/blog/pros-role-of-stakeholders-in-epr/',
#     'https://recykal.com/blog/recyclers-role-of-stakeholders-in-epr/',
#     'https://recykal.com/blog/an-introduction-to-e-waste/',
#     'https://recykal.com/blog/role-stakeholders-in-epr-producers-importers-brandowners/',
#     'https://recykal.com/blog/role-of-stakeholders-in-epr-consumers/',
#     'https://recykal.com/blog/stakeholders-in-epr-national-authority/',
#     'https://recykal.com/blog/global-plastic-pollution-epr/',
#     'https://recykal.com/blog/epr-challenges-in-india/',
#     'https://recykal.com/blog/waste-management-steps-bulk-waste-generators/',
#     'https://recykal.com/blog/plastic-waste-management-epr-fulfilment-2/',
#     'https://recykal.com/blog/epr-benefits-brands/',
#     'https://recykal.com/blog/cpcb-amends-sop-pibos-under-pwm-rules/',
#     'https://recykal.com/blog/data-quality-for-epr/',
#     'https://recykal.com/blog/all-you-need-to-know-about-epr-action-plan-2/',
#     'https://recykal.com/blog/plastic-waste-management-epr-fulfilment/',
#     'http://swachhbharaturban.gov.in/writereaddata/SBM%20Plastic%20Waste%20Book.pdf',
#     'http://swachhbharaturban.gov.in/writereaddata/SBM%20Plastic%20Waste%20Book.pdf',
#     'https://www.shaktiplasticinds.com/how-to-manage-plastic-waste/',
#     'https://www.shaktiplasticinds.com/what-are-the-methods-of-plastic-waste-management/',
#     'https://timesofindia.indiatimes.com/blogs/voices/plastic-waste-and-its-management-strategies-for-environmental-sustainability/',
#     'https://www.drishtiias.com/daily-updates/daily-news-analysis/battery-waste-management-rules-2022',
#     'https://www.drishtiias.com/daily-updates/daily-news-analysis/battery-waste-management-and-handling-rules-2022',
#     'http://www.eprbatterycpcb.in/upload/adminDoc/Frequently%20Asked%20Questions%20(General).pdf',
#     'https://www.karosambhav.com/epr-compliance-for-battery-waste',
#     'https://www.corpseed.com/service/epr-for-waste-tyres',
#     'https://ssrana.in/articles/amendment-rules-extended-producer-responsibility-waste-tyre/',
#     'https://www.professionalutilities.com/epr-registration-for-tyre-waste/delhi.php',
#     'https://www.sciencedirect.com/science/article/abs/pii/S0141391021002809',
#     'https://link.springer.com/article/10.1007/s10163-022-01554-y',
#     'https://www.environmentalpollution.in/waste-management/rubber-industry/waste-from-rubber-industry-and-its-disposal/6938',
#     'https://climatepromise.undp.org/news-and-stories/what-is-circular-economy-and-how-it-helps-fight-climate-change',
#     'https://www.sciencedirect.com/topics/social-sciences/circular-economy',
#     'https://www.sustainability.com/thinking/creating-a-circular-economy-for-plastics/'
# ]

# loader = AsyncChromiumLoader(articles)
# docs = loader.load()

In [None]:
# #NO NEED TO RUN AGAIN
# # Converts HTML to plain text
# html2text = Html2TextTransformer()
# docs_transformed = html2text.transform_documents(docs)

# # Chunk text
# text_splitter = CharacterTextSplitter(chunk_size=200,
#                                       chunk_overlap=0)
# chunked_documents = text_splitter.split_documents(docs_transformed)

# # Load chunked documents into the FAISS index
# # db = FAISS.from_documents(chunked_documents,
# #                           HuggingFaceEmbeddings(model_name='sentence-transformers/all-mpnet-base-v2'))

# # retriever = db.as_retriever()

In [None]:
!pip install pinecone-client

Collecting pinecone-client
  Downloading pinecone_client-2.2.4-py3-none-any.whl (179 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/179.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━[0m [32m174.1/179.4 kB[0m [31m5.5 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m179.4/179.4 kB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
Collecting loguru>=0.5.0 (from pinecone-client)
  Downloading loguru-0.7.2-py3-none-any.whl (62 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.5/62.5 kB[0m [31m9.8 MB/s[0m eta [36m0:00:00[0m
Collecting dnspython>=2.0.0 (from pinecone-client)
  Downloading dnspython-2.4.2-py3-none-any.whl (300 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m300.4/300.4 kB[0m [31m15.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: loguru, dnspython, pinecone-client
Successfully insta

In [None]:
from getpass import getpass

from langchain.vectorstores import Pinecone
import pinecone

# Credentials must not live in the notebook. The API key is read from the
# PINECONE_API_KEY environment variable, with an interactive prompt as a
# fallback when it is not set.
# NOTE(review): the key that was previously committed here is exposed in
# version history and should be revoked in the Pinecone console.
pinecone.init(
    api_key=os.environ.get("PINECONE_API_KEY") or getpass("Pinecone API key: "),
    # Environment can be overridden; defaults to the value used originally.
    environment=os.environ.get("PINECONE_ENVIRONMENT", "gcp-starter"),
)
index_name = 'reway-mistral-llm'

In [None]:
# Get a handle to the index named by `index_name` and display its statistics
# (the bare last expression is rendered as the cell output).
index = pinecone.Index(index_name)
index.describe_index_stats()

{'dimension': 768,
 'index_fullness': 0.07957,
 'namespaces': {'': {'vector_count': 7957}},
 'total_vector_count': 7957}

In [None]:
# Name of the field handed to the Pinecone vector store as its text key.
text_field = "text"

# Embedding model; its `embed_query` method is what the vector store calls.
embed = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")

.gitattributes:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

data_config.json:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

train_script.py:   0%|          | 0.00/13.1k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

In [None]:
# Expose the existing Pinecone index as a LangChain vector store, then derive
# a retriever from it for use in the RAG chain.
vectorstore = Pinecone(index, embed.embed_query, text_field)
retriever = vectorstore.as_retriever()



In [None]:
# # DO NOT RUN AGAIN
# docsearch = Pinecone.from_documents(chunked_documents, HuggingFaceEmbeddings(model_name='sentence-transformers/all-mpnet-base-v2'), index_name=index_name)
# retriever = docsearch.as_retriever()

.gitattributes:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

data_config.json:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

train_script.py:   0%|          | 0.00/13.1k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

In [None]:
# query = 'What is the importance of e-waste?'
# docs = docsearch.similarity_search(query, include_metadeta=False)

In [None]:
# Prompt sent to the model: {context} and {question} are the two placeholders
# filled in when the chain runs.
prompt_template = """
### [INST] Instruction: Answer the question based on your e-waste knowledge. Here is context to help:

{context}

### QUESTION:
{question} [/INST]
"""

# Wrap the raw template, declaring the placeholders it expects.
prompt = PromptTemplate(
    template=prompt_template,
    input_variables=["context", "question"],
)

# Chain that formats the prompt and passes it to the Mistral LLM wrapper.
llm_chain = LLMChain(prompt=prompt, llm=mistral_llm)

In [None]:
# Retrieval-augmented chain: the input is routed to the retriever to produce
# "context" and passed through unchanged as "question", then both are piped
# into the LLM chain.
rag_chain = {"context": retriever, "question": RunnablePassthrough()} | llm_chain

result = rag_chain.invoke("How can I recycle e-waste?")

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


In [None]:
# Inspect what the chain stored under "context" for this question.
result['context']

[Document(page_content='How to properly recycle electronics\n\nData Liability and security\n\nHow to Prepare your Business Electronics for Recycling\n\nWhat is e-waste? And how do we dispose of it?', metadata={'source': 'https://www.ewaste1.com/what-is-e-waste/'}),
 Document(page_content='How to properly recycle electronics\n\nData Liability and security\n\nHow to Prepare your Business Electronics for Recycling\n\nWhat is e-waste? And how do we dispose of it?', metadata={'source': 'https://www.ewaste1.com/what-is-e-waste/'}),
 Document(page_content='## How is E-waste in India recycled?', metadata={'source': 'https://recykal.com/blog/growing-concern-about-e-waste-in-india/'}),
 Document(page_content='## How is E-waste in India recycled?', metadata={'source': 'https://recykal.com/blog/growing-concern-about-e-waste-in-india/'})]

In [None]:
# Print the generated answer stored under "text".
print(result['text'])


To recycle e-waste, you should first gather all of your electronic devices and remove any batteries or other removable components. Then, you can either take your e-waste to a certified e-waste recycling facility or dispose of it through an authorized e-waste collection program. It's important to ensure that your e-waste is properly handled and disposed of to prevent environmental harm and protect your personal data.
