In [1]:
# Mount Google Drive into the Colab filesystem at /content/drive.
# A later cell reads its CSV from under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
!pip install -q -U torch datasets transformers tensorflow langchain playwright html2text sentence_transformers faiss-cpu
!pip install -q accelerate==0.21.0 peft==0.4.0 bitsandbytes==0.40.2 trl==0.4.7

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m670.2/670.2 MB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m507.1/507.1 kB[0m [31m30.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.2/8.2 MB[0m [31m48.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m475.2/475.2 MB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m794.4/794.4 kB[0m [31m58.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m37.2/37.2 MB[0m [31m14.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.0/86.0 kB[0m [31m9.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m17.6/17.6 MB[0m [31m32.0 

In [3]:
import os
import torch
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    pipeline
)
from datasets import load_dataset
from peft import LoraConfig, PeftModel

from langchain.text_splitter import CharacterTextSplitter
from langchain.document_transformers import Html2TextTransformer
from langchain.document_loaders import AsyncChromiumLoader

from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS

from langchain.prompts import PromptTemplate
from langchain.schema.runnable import RunnablePassthrough
from langchain.llms import HuggingFacePipeline
from langchain.chains import LLMChain

In [4]:
# ---------------------------------------------------------------
# Tokenizer
# ---------------------------------------------------------------

# Hugging Face Hub id of the checkpoint; the model-loading code below reuses it.
model_name = 'mistralai/Mistral-7B-Instruct-v0.1'

tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    trust_remote_code=True,
)

# Pad on the right-hand side, using the end-of-sequence token as the pad token.
tokenizer.padding_side = "right"
tokenizer.pad_token = tokenizer.eos_token

# ---------------------------------------------------------------
# bitsandbytes quantization settings
# ---------------------------------------------------------------

use_4bit = True                     # load the base model in 4-bit precision
bnb_4bit_quant_type = "nf4"         # quantization type: "fp4" or "nf4"
bnb_4bit_compute_dtype = "float16"  # name of the torch dtype used for 4-bit compute
use_nested_quant = False            # nested (double) quantization on/off

# Resolve the dtype name to the corresponding attribute of the torch module.
compute_dtype = getattr(torch, bnb_4bit_compute_dtype)

# Bundle the settings above into the config object handed to the model loader.
bnb_config = BitsAndBytesConfig(
    bnb_4bit_use_double_quant=use_nested_quant,
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_quant_type=bnb_4bit_quant_type,
    load_in_4bit=use_4bit,
)

# Informational check: tell the user when the GPU could use bfloat16 instead of
# the configured float16. A compute-capability major version of 8 or higher is
# treated as "supports bfloat16".
# The torch.cuda.is_available() guard keeps this purely advisory check from
# raising on a runtime that has no CUDA device.
if compute_dtype == torch.float16 and use_4bit and torch.cuda.is_available():
    major, _ = torch.cuda.get_device_capability()
    if major >= 8:
        print("=" * 80)
        print("Your GPU supports bfloat16: accelerate training with bf16=True")
        print("=" * 80)

#################################################################
# Load pre-trained model
#################################################################
# Loads the causal-LM checkpoint named by `model_name`, passing `bnb_config`
# as its quantization configuration.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
)

tokenizer_config.json:   0%|          | 0.00/1.47k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/72.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/25.1k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.94G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/4.54G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

In [5]:
def print_number_of_trainable_model_parameters(model):
    """Summarise how many of a model's parameters are trainable.

    Despite the name, nothing is printed: the report is returned as a string
    for the caller to print or display.

    Args:
        model: Any object exposing ``named_parameters()`` that yields
            ``(name, parameter)`` pairs, where each parameter provides
            ``numel()`` and a ``requires_grad`` attribute.

    Returns:
        A three-line string with the trainable parameter count, the total
        parameter count and the trainable percentage (two decimals). A model
        without parameters reports ``0.00%`` instead of raising
        ``ZeroDivisionError``.
    """
    trainable_model_params = 0
    all_model_params = 0
    for _, param in model.named_parameters():
        count = param.numel()
        all_model_params += count
        if param.requires_grad:
            trainable_model_params += count
    # Guard the division so a parameter-free model still yields a report.
    if all_model_params:
        percentage = 100 * trainable_model_params / all_model_params
    else:
        percentage = 0.0
    return (
        f"trainable model parameters: {trainable_model_params}\n"
        f"all model parameters: {all_model_params}\n"
        f"percentage of trainable model parameters: {percentage:.2f}%"
    )

# Print the parameter summary string for the model loaded above.
print(print_number_of_trainable_model_parameters(model))

trainable model parameters: 262410240
all model parameters: 3752071168
percentage of trainable model parameters: 6.99%


In [6]:
# Text-generation pipeline built from the loaded model and its tokenizer.
text_generation_pipeline = pipeline(
    model=model,
    tokenizer=tokenizer,
    task="text-generation",
    # `temperature` only takes effect in sampling mode; without do_sample=True
    # generation is greedy and the temperature setting is ignored.
    do_sample=True,
    temperature=0.2,
    repetition_penalty=1.1,
    # Return the prompt together with the completion, not the completion alone.
    return_full_text=True,
    max_new_tokens=1000,
)

In [7]:
# Wrap the transformers pipeline in LangChain's HuggingFacePipeline so it can
# be passed as the `llm` of a chain.
mistral_llm = HuggingFacePipeline(pipeline=text_generation_pipeline)

In [8]:
# Install the Playwright browser binaries and their system dependencies.
# NOTE(review): AsyncChromiumLoader is imported but not used in the visible
# cells (documents come from a CSV), so this step may be unnecessary — confirm.
!playwright install
!playwright install-deps

Downloading Chromium 120.0.6099.28 (playwright build v1091)[2m from https://playwright.azureedge.net/builds/chromium/1091/chromium-linux.zip[22m
[1G153.1 Mb [] 0% 0.0s[0K[1G153.1 Mb [] 0% 22.5s[0K[1G153.1 Mb [] 0% 13.0s[0K[1G153.1 Mb [] 0% 7.9s[0K[1G153.1 Mb [] 0% 6.4s[0K[1G153.1 Mb [] 1% 5.2s[0K[1G153.1 Mb [] 1% 4.8s[0K[1G153.1 Mb [] 2% 4.3s[0K[1G153.1 Mb [] 3% 4.4s[0K[1G153.1 Mb [] 4% 3.9s[0K[1G153.1 Mb [] 5% 3.8s[0K[1G153.1 Mb [] 5% 3.5s[0K[1G153.1 Mb [] 6% 3.4s[0K[1G153.1 Mb [] 6% 3.5s[0K[1G153.1 Mb [] 7% 3.2s[0K[1G153.1 Mb [] 8% 3.2s[0K[1G153.1 Mb [] 9% 3.0s[0K[1G153.1 Mb [] 10% 3.0s[0K[1G153.1 Mb [] 11% 2.9s[0K[1G153.1 Mb [] 12% 2.8s[0K[1G153.1 Mb [] 13% 2.7s[0K[1G153.1 Mb [] 14% 2.7s[0K[1G153.1 Mb [] 15% 2.7s[0K[1G153.1 Mb [] 16% 2.7s[0K[1G153.1 Mb [] 17% 2.6s[0K[1G153.1 Mb [] 18% 2.5s[0K[1G153.1 Mb [] 19% 2.4s[0K[1G153.1 Mb [] 20% 2.3s[0K[1G153.1 Mb [] 21% 2.2s[0K[1G153.1 Mb [] 22% 2.2s[0K[1G153.1 Mb [] 23% 2.1s[0K

In [19]:
import pandas as pd
from bs4 import BeautifulSoup
from html2text import HTML2Text

# Location of the comments CSV on the mounted Google Drive.
csv_file_path = '/content/drive/MyDrive/data/chori data - Sheet1.csv'

# Column whose text is converted into retrieval documents.
column_name = 'Comments'

# Read the first 500 rows of the CSV file into a DataFrame
df = pd.read_csv(csv_file_path, nrows=500)

# Converter instance reused for every row.
html2text = HTML2Text()

# Plain-text version of each non-empty comment, in row order.
docs_transformed = []

# Empty cells are read by pandas as float NaN, which BeautifulSoup cannot
# parse, so they are skipped; str() covers any remaining non-string values.
for html_content in df[column_name].dropna():
    soup = BeautifulSoup(str(html_content), 'html.parser')
    plain_text = html2text.handle(str(soup))
    docs_transformed.append(plain_text)

class PageContentWrapper:
    """Minimal document object exposing ``page_content`` and ``metadata``.

    Args:
        page_content: The text of the document.
        metadata: Optional dict of metadata. When omitted, each instance gets
            its own fresh empty dict; a literal ``{}`` default would be a
            single dict shared (and mutated) by every instance.
    """

    def __init__(self, page_content, metadata=None):
        self.page_content = page_content
        self.metadata = {} if metadata is None else metadata

# Give every plain-text comment the page_content/metadata shape of a document.
docs_transformed_wrapped = list(map(PageContentWrapper, docs_transformed))

# Split the wrapped documents into chunks with no overlap.
# NOTE(review): CharacterTextSplitter appears to break only at its separator,
# so chunks may exceed chunk_size=100 (the retrieved documents printed later
# in this notebook are longer) — confirm this is intended.
text_splitter = CharacterTextSplitter(chunk_overlap=0, chunk_size=100)
chunked_documents = text_splitter.split_documents(docs_transformed_wrapped)


In [20]:
# Embedding model applied to every chunk.
embeddings = HuggingFaceEmbeddings(model_name='sentence-transformers/all-mpnet-base-v2')

# Build the FAISS index from the chunked documents.
db = FAISS.from_documents(chunked_documents, embeddings)

# Retriever interface over the index, used as the context source of the chain.
retriever = db.as_retriever()

.gitattributes:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

data_config.json:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

train_script.py:   0%|          | 0.00/13.1k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

In [32]:
# Prompt template with two placeholders: {context} and {question}.
# The context is placed before the question, and the whole instruction is
# wrapped in [INST] ... [/INST] markers.
prompt_template = """
### [INST] Instruction: Answer the question based on your docs knowledge. Here is context to help:

{context}

### QUESTION:
{question} [/INST]
 """

# Wrap the template string in a PromptTemplate that declares both placeholders
# as its input variables.
prompt = PromptTemplate(
    input_variables=["context", "question"],
    template=prompt_template,
)

In [33]:
# Create the LLM chain: it combines `prompt` with the `mistral_llm` wrapper.
llm_chain = LLMChain(llm=mistral_llm, prompt=prompt)

In [41]:
# Inputs for the LLM chain: the retriever supplies "context", while the query
# string itself is forwarded unchanged as "question".
chain_inputs = {"context": retriever, "question": RunnablePassthrough()}
rag_chain = chain_inputs | llm_chain

result = rag_chain.invoke("will bjp win 2024 election?")

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


In [42]:
# Display the documents stored under the 'context' key of the chain result.
result['context']

[Document(page_content='nl u have lost the charm repeating every time the same thing see yourself what\nyou used to show 2 3 year back and now only you are behind modi and anjana a\nif they are only thing left to show'),
 Document(page_content="as a common citizen all i want is a magic which can reform india bcz\npoliticians are extramely selfish lying and cheating poor indian people we\nneed to give bribe even to get community certificate or ration card no safety\nfor children and women am from tn i wish if bjp fully rules tn than our fate\nmay change but surely people wo n't vote bcz still they badly trust state\npolitical parties god please save tn and india who ever win pls honestly wrk\nfor our nation"),
 Document(page_content='worst pm in the world nomorenamo no single individual authority or system is\nleft in india ( banking cbi ed ec president is one more dummy supreme court\ncheif justice facing serialbiggest chor calls opposition anti nationalist to\nexploit kashmir issue ec

In [43]:
# The generated answer is stored under the 'text' key of the chain result.
text = result['text']

# Collapse existing line breaks into spaces, then insert a blank line after
# every ". " so each sentence is printed as its own paragraph.
formatted_text = text.replace('\n', ' ').replace('. ', '.\n\n')

print(formatted_text)

 Based on the provided documents, it is difficult to predict with certainty whether BJP will win the 2024 election.

However, some of the documents suggest that there may be support for BJP among certain groups of people.

For example, one document mentions that the speaker of the Tamil Nadu assembly believes that if BJP fully rules Tamil Nadu, the fate of the state may change.

Another document suggests that some people may not vote due to their distrust of political parties, regardless of who wins the election.

Additionally, some documents express dissatisfaction with the current government and its policies, which could potentially lead to support for alternative parties.
