In [None]:
import torch 
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline


Loading the multilingual speech recognition model (Whisper large-v3)


In [None]:
# Run on the first CUDA GPU when one is available, otherwise fall back to the CPU.
device='cuda:0' if torch.cuda.is_available() else 'cpu'
# float16 on GPU, float32 on CPU.
torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32

# Hugging Face Hub id of the Whisper checkpoint used for speech recognition.
model_id_whisper = "openai/whisper-large-v3"

# Load the speech seq2seq model in the dtype selected above, then move it to the device.
model_whisper = AutoModelForSpeechSeq2Seq.from_pretrained(
    model_id_whisper, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True
)
model_whisper.to(device)

# The processor supplies the tokenizer and feature extractor handed to the pipeline below.
processor_whisper = AutoProcessor.from_pretrained(model_id_whisper)

# Speech-recognition pipeline: 30-second chunks, batches of 16, timestamps returned,
# and at most 128 new tokens per generation call.
pipe = pipeline(
    "automatic-speech-recognition",
    model=model_whisper,
    tokenizer=processor_whisper.tokenizer,
    feature_extractor=processor_whisper.feature_extractor,
    max_new_tokens=128,
    chunk_length_s=30,
    batch_size=16,
    return_timestamps=True,
    torch_dtype=torch_dtype,
    device=device,
)


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Input file 

In [None]:
# Convert the input video to an audio file for the speech-recognition step.
import moviepy.editor as me

# NOTE(review): machine-specific absolute path — point this at your own video file.
video_path = r"E:\TensorGo Assigment\test_video\tamil_audio.mp4"
video=me.VideoFileClip(video_path)
try:
    audio=video.audio
    if audio is None:
        # Fail with a clear message instead of an AttributeError on None.
        raise ValueError(f"No audio track found in {video_path!r}")
    audio.write_audiofile('audio.mp3')
finally:
    # Release the resources held by the clip even when the export fails.
    video.close()

MoviePy - Writing audio in audio.mp3


                                                                                                                       

MoviePy - Done.




In [5]:
# The file could not be passed to the model directly, so it is loaded here and
# handed over as a dict holding the path, the sample array and the sampling rate.
import librosa
import numpy as np

file_path = 'audio.mp3'
audio_data, sampling_rate = librosa.load(file_path, sr=16000)  # load at 16 kHz (sr=None would keep the file's original sampling rate)

# Input payload for the speech-recognition pipeline.
audio_info = {
    'path': file_path,
    'array': audio_data,
    'sampling_rate': sampling_rate
}



Extracting text 

In [None]:

# Run the pipeline with the "translate" task; keep the text for the later cells.
result = pipe(audio_info, generate_kwargs={"task": "translate"})
text = result["text"]
print(text)

 There should be a goal in life There should be a goal in life You can achieve anything There should be a goal in life for that The book is to search for the knowledge that is between the goals There should be a good habit You should not be afraid of problems You should be a part of the problem There should be good people in life If you have them, you will definitely succeed


Embedding the data

In [42]:
# RAG implementation

from nltk.tokenize import sent_tokenize

In [46]:
# Split the transcript text into a list of sentences.
sentences_of_text=sent_tokenize(text)

In [49]:
# Maximum number of sentences grouped into one chunk.
num_sentence_chunk_size = 10 
def split_list(input_list, slice_size):
    """Split a list into consecutive slices of at most ``slice_size`` items.

    Args:
        input_list: The list to split.
        slice_size: Maximum number of items per slice; must be at least 1.

    Returns:
        A list of sub-lists in the original order. The last sub-list may be
        shorter than ``slice_size``. An empty input yields an empty list.

    Raises:
        ValueError: If ``slice_size`` is less than 1.
    """
    # A negative step would make range() empty and silently drop every item.
    if slice_size < 1:
        raise ValueError(f"slice_size must be a positive integer, got {slice_size!r}")
    return [input_list[start:start + slice_size] for start in range(0, len(input_list), slice_size)]
# Group the sentences into chunks of num_sentence_chunk_size and record how many chunks resulted.
sentence_chunks = split_list(sentences_of_text,num_sentence_chunk_size)
num_chunks = len(sentence_chunks)

In [50]:
# Display the sentence chunks for a visual sanity check.
sentence_chunks

[[' The British came to India for trading.',
  "They made their offices and forts in various parts of India But eventually the British East India Company became the major force in India the company's troops led by Robert Clive defeated Siraj Udulla the ruler of Bengal in 1757 in the battle of policy and that was the beginning of British rule known as British Raj in India.",
  'Under the British rule, the Indians were deprived of basic needs, socially and economically and legally discriminated against, misbehaved, mistreated and tortured.',
  "India's first war of independence was a revolt of Indian soldiers and rulers against British rule.",
  'The rebellion by Indian troops of the British Raj started in May 1857 and continued until December 1858.',
  "During this period of India's first war of independence, many social and religious leaders worked to inspire the Indian society.",
  'They included men like Swami Vivekananda, Ramkrishnadeva, Sri Aurobindo, Subramanian Bharti, Bumkin Cha

In [53]:
import re

# Build one record per sentence chunk: the joined text plus simple size statistics.
chunks = []
for sentence_chunk in sentence_chunks:
    # Join with a space so consecutive sentences are never glued together,
    # whatever punctuation ends one sentence or character starts the next.
    joined_sentence_chunk = " ".join(sentence_chunk).replace("  ", " ").strip()
    # Also insert the missing space in ".A" patterns that occur inside a sentence.
    joined_sentence_chunk = re.sub(r'\.([A-Z])', r'. \1', joined_sentence_chunk)
    chunks.append({
        "sentence_chunk": joined_sentence_chunk,
        "chunk_char_count": len(joined_sentence_chunk),
        "chunk_word_count": len(joined_sentence_chunk.split(" ")),
        # Estimated as character count / 4; this is not a real tokenizer count.
        "chunk_token_count": len(joined_sentence_chunk) / 4,
    })
len(chunks)

4

In [54]:
# Display the chunk records for a visual sanity check.
chunks

[{'sentence_chunk': "The British came to India for trading. They made their offices and forts in various parts of India But eventually the British East India Company became the major force in India the company's troops led by Robert Clive defeated Siraj Udulla the ruler of Bengal in 1757 in the battle of policy and that was the beginning of British rule known as British Raj in India. Under the British rule, the Indians were deprived of basic needs, socially and economically and legally discriminated against, misbehaved, mistreated and tortured. India's first war of independence was a revolt of Indian soldiers and rulers against British rule. The rebellion by Indian troops of the British Raj started in May 1857 and continued until December 1858. During this period of India's first war of independence, many social and religious leaders worked to inspire the Indian society. They included men like Swami Vivekananda, Ramkrishnadeva, Sri Aurobindo, Subramanian Bharti, Bumkin Chandra Chatterj

In [56]:
import pandas as pd
# Put the chunk records in a DataFrame and show summary statistics rounded to 2 decimals.
df = pd.DataFrame(chunks)
df.describe().round(2)

Unnamed: 0,chunk_char_count,chunk_word_count,chunk_token_count
count,4.0,4.0,4.0
mean,971.75,157.5,242.94
std,260.07,39.7,65.02
min,776.0,127.0,194.0
25%,804.5,133.75,201.12
50%,883.0,144.0,220.75
75%,1050.25,167.75,262.56
max,1345.0,215.0,336.25


In [59]:
# Keep only chunks whose estimated token count is above this threshold.
min_token_length = 30
# Filter the DataFrame, then convert back to a list of per-chunk dicts.
chunks_over_min_token_len = df[df["chunk_token_count"] > min_token_length].to_dict(orient="records")
chunks_over_min_token_len

[{'sentence_chunk': "The British came to India for trading. They made their offices and forts in various parts of India But eventually the British East India Company became the major force in India the company's troops led by Robert Clive defeated Siraj Udulla the ruler of Bengal in 1757 in the battle of policy and that was the beginning of British rule known as British Raj in India. Under the British rule, the Indians were deprived of basic needs, socially and economically and legally discriminated against, misbehaved, mistreated and tortured. India's first war of independence was a revolt of Indian soldiers and rulers against British rule. The rebellion by Indian troops of the British Raj started in May 1857 and continued until December 1858. During this period of India's first war of independence, many social and religious leaders worked to inspire the Indian society. They included men like Swami Vivekananda, Ramkrishnadeva, Sri Aurobindo, Subramanian Bharti, Bumkin Chandra Chatterj

In [55]:
from sentence_transformers import SentenceTransformer
# Create the sentence-embedding model on the CPU; a later cell moves it before encoding.
embedding_model = SentenceTransformer(model_name_or_path="all-mpnet-base-v2", 
                                      device="cpu")



In [60]:
%%time
embedding_model.to("cuda")
for i in tqdm(chunks_over_min_token_len):
    i["embedding"] = embedding_model.encode(i["sentence_chunk"])

  0%|          | 0/4 [00:00<?, ?it/s]

CPU times: total: 2.69 s
Wall time: 3.62 s


In [61]:
# Persist the chunk records (including the "embedding" column) to a CSV file.
# NOTE(review): the loading cell parses the "embedding" column back from its
# string form, so the CSV round-trip goes through text — confirm this is intended.
text_chunks_and_embeddings_df = pd.DataFrame(chunks_over_min_token_len)
embeddings_df_save_path = "text_chunks_and_embeddings_df.csv"
text_chunks_and_embeddings_df.to_csv(embeddings_df_save_path, index=False)

In [95]:
import random
import torch
import numpy as np 
import pandas as pd

# Re-select the device; note this rebinds `device` to "cuda" / "cpu".
device = "cuda" if torch.cuda.is_available() else "cpu"

# Read the saved chunk records back from disk.
text_chunks_and_embedding_df = pd.read_csv("text_chunks_and_embeddings_df.csv")

# Each embedding comes back as a string: strip the surrounding brackets and
# parse the space-separated numbers into a NumPy array.
text_chunks_and_embedding_df["embedding"] = text_chunks_and_embedding_df["embedding"].apply(lambda x: np.fromstring(x.strip("[]"), sep=" "))

# `chunks` is rebound here to the records loaded from the CSV.
chunks = text_chunks_and_embedding_df.to_dict(orient="records")
# Stack all embeddings into one float32 tensor on the selected device.
embeddings = torch.tensor(np.array(text_chunks_and_embedding_df["embedding"].tolist()), dtype=torch.float32).to(device)
embeddings.shape

torch.Size([4, 768])

In [63]:
from sentence_transformers import util, SentenceTransformer

# Re-create the embedding model directly on the selected device; this rebinds
# `embedding_model`, which the retrieval functions below read as a global.
embedding_model = SentenceTransformer(model_name_or_path="all-mpnet-base-v2", 
                                      device=device)



Retrieving the answer from the embeddings

In [65]:
query = "when india got freedom"
print(f"Query: {query}")
# Embed the query as a tensor so it can be scored against the stored embeddings.
query_embedding = embedding_model.encode(query, convert_to_tensor=True)
from time import perf_counter as timer
# Time only the dot-product scoring step.
start_time = timer()
dot_scores = util.dot_score(a=query_embedding, b=embeddings)[0]
end_time = timer()
print(f"Time take to get scores on {len(embeddings)} embeddings: {end_time-start_time:.5f} seconds.")
# Keep only the single highest-scoring chunk (k=1).
top_results_dot_product = torch.topk(dot_scores,k=1)
top_results_dot_product

Query: when india got freedom
Time take to get scores on 4 embeddings: 0.00084 seconds.


torch.return_types.topk(
values=tensor([0.6329], device='cuda:0'),
indices=tensor([1], device='cuda:0'))

In [96]:
def retriever_score(query,embeddings, k=4):
    """Score ``query`` against ``embeddings`` and return the top matches.

    Relies on the notebook globals ``embedding_model`` and ``util``.

    Args:
        query: The query text to embed.
        embeddings: Tensor of chunk embeddings, one row per chunk.
        k: Maximum number of matches to return (default 4). It is clamped to
            the number of available embeddings.

    Returns:
        A ``(scores, indices)`` pair of tensors from ``torch.topk`` over the
        dot-product scores.
    """
    query_embedding = embedding_model.encode(query, convert_to_tensor=True)
    dot_scores = util.dot_score(a=query_embedding, b=embeddings)[0]
    # torch.topk raises when k exceeds the number of candidates, so clamp it.
    top_k = min(k, dot_scores.shape[0])
    scores, indices = torch.topk(input=dot_scores, k=top_k)
    return scores,indices
def retriever(query,embedding):
    """Print the text of each chunk retrieved for ``query``.

    Chunks are printed in the order returned by ``retriever_score``; the text
    is looked up in the notebook-global ``chunks`` list.
    """
    _, top_indices = retriever_score(query, embedding)
    for chunk_position in top_indices:
        print(chunks[chunk_position]["sentence_chunk"])

In [85]:
# Print the retrieved chunks for the current query.
retriever(query,embeddings)

Many Indians, including the princes and rich people of India, contributed money and materials to the war funds of the United Kingdom, anticipating change of dominion status and home rule in return. The World War I ended in 1919 and Britain won with the help of Indian soldiers. India was denied its promised reward. Many Indian soldiers died in foreign lands. In India, flu spread like an epidemic killing many people. The tax rates increased in India and prices also increased. As a result, the Indians became restless. Gandhiji returned to India from South Africa in 1915. Under the leadership of Gandhiji, Indians began to use a different method of gate freedom over the next few years. In December 1929, the Indian National Congress Party agreed to start a movement for complete independence from British rule.
As a result of many movements in small and large scale of several years, sacrifice of the heroes of Mother India finally, finally, on midnight of 15 15 August 1947, Britain handed India

Loading gemma-2b-it LLM model 

In [88]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers import BitsAndBytesConfig
# Load the LLM in 4-bit precision, using float16 as the compute dtype.
quantization_config = BitsAndBytesConfig(load_in_4bit=True,
                                         bnb_4bit_compute_dtype=torch.float16)

In [91]:
# Hugging Face Hub id of the LLM used for answering and summarising.
model_id = "google/gemma-2b-it"
# Attention implementation passed to from_pretrained when the model is loaded.
attn_implementation = "sdpa"

In [93]:
# Load the tokenizer and the causal LM for `model_id`, applying the quantization
# config and attention implementation defined in the preceding cells.
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=model_id)
llm_model = AutoModelForCausalLM.from_pretrained(pretrained_model_name_or_path=model_id, 
                                                 torch_dtype=torch.float16, # dtype requested for the model
                                                 quantization_config=quantization_config ,
                                                 low_cpu_mem_usage=False, # low-CPU-memory loading is switched off
                                                 attn_implementation=attn_implementation)

`config.hidden_act` is ignored, you should use `config.hidden_activation` instead.
Gemma's activation function will be set to `gelu_pytorch_tanh`. Please, use
`config.hidden_activation` if you want to override this behaviour.
See https://github.com/huggingface/transformers/pull/29402 for more details.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [94]:
# Display the loaded model's module structure.
llm_model


GemmaForCausalLM(
  (model): GemmaModel(
    (embed_tokens): Embedding(256000, 2048, padding_idx=0)
    (layers): ModuleList(
      (0-17): 18 x GemmaDecoderLayer(
        (self_attn): GemmaSdpaAttention(
          (q_proj): Linear4bit(in_features=2048, out_features=2048, bias=False)
          (k_proj): Linear4bit(in_features=2048, out_features=256, bias=False)
          (v_proj): Linear4bit(in_features=2048, out_features=256, bias=False)
          (o_proj): Linear4bit(in_features=2048, out_features=2048, bias=False)
          (rotary_emb): GemmaRotaryEmbedding()
        )
        (mlp): GemmaMLP(
          (gate_proj): Linear4bit(in_features=2048, out_features=16384, bias=False)
          (up_proj): Linear4bit(in_features=2048, out_features=16384, bias=False)
          (down_proj): Linear4bit(in_features=16384, out_features=2048, bias=False)
          (act_fn): PytorchGELUTanh()
        )
        (input_layernorm): GemmaRMSNorm()
        (post_attention_layernorm): GemmaRMSNorm()
    

In [101]:
def prompt_formatter_rag(query,context_items):
    """Build the chat-formatted RAG prompt for ``query``.

    Args:
        query: The user question.
        context_items: Iterable of dicts, each with a "sentence_chunk" entry;
            the chunk texts are rendered as a "- " bulleted list.

    Returns:
        The prompt string produced by the global tokenizer's chat template
        (not tokenized, generation prompt appended).
    """
    chunk_texts = [item["sentence_chunk"] for item in context_items]
    context = "- " + "\n- ".join(chunk_texts)
    template = """Based on the following context items, please answer the query.
Give yourself room to think by extracting relevant passages from the context before answering the query.
Don't return the thinking, only return the answer.
Make sure your answers are as explanatory as possible.
\nNow use the following context items to answer the user query:
{context}
\nRelevant passages: <extract relevant passages from the context here>
User query: {query}
Answer:"""
    user_turn = {
        "role": "user",
        "content": template.format(context=context, query=query),
    }
    return tokenizer.apply_chat_template(conversation=[user_turn],
                                         tokenize=False,
                                         add_generation_prompt=True)

In [102]:
def rag_answers(query, 
        temperature=0.7,
        max_new_tokens=512,
        format_answer_text=True, 
        return_answer_only=True):
    """Answer ``query`` with the LLM, using retrieved chunks as context.

    Relies on the notebook globals ``embeddings``, ``chunks``, ``tokenizer``
    and ``llm_model``, and on ``retriever_score`` / ``prompt_formatter_rag``.

    Args:
        query: The user question.
        temperature: Sampling temperature passed to ``generate``.
        max_new_tokens: Maximum number of tokens to generate.
        format_answer_text: When true, strip the prompt, the ``<bos>`` /
            ``<eos>`` markers and a boilerplate lead-in from the output.
        return_answer_only: When true, return only the answer text.

    Returns:
        The answer text, or ``(answer_text, context_items)`` when
        ``return_answer_only`` is false. Each context item is a copy of a
        chunk record with an added "score" entry.
    """
    scores, indices = retriever_score(query,embeddings)
    # Copy each retrieved record before attaching its score so the shared
    # `chunks` records are not mutated by a query.
    context_items = [
        {**chunks[chunk_index], "score": scores[position].cpu()}
        for position, chunk_index in enumerate(indices)
    ]
    prompt = prompt_formatter_rag(query,context_items)
    # Put the inputs on the model's own device instead of assuming CUDA.
    input_ids = tokenizer(prompt, return_tensors="pt").to(llm_model.device)
    outputs = llm_model.generate(**input_ids,
                                 temperature=temperature,
                                 do_sample=True,
                                 max_new_tokens=max_new_tokens)
    output_text = tokenizer.decode(outputs[0])
    if format_answer_text:
        # The decoded sequence includes the prompt; remove it and the markers.
        output_text = output_text.replace(prompt, "").replace("<bos>", "").replace("<eos>", "").replace("Sure, here is the answer to the user query:\n\n", "")
    if return_answer_only:
        return output_text

    return output_text, context_items

Passing the retrieved data to the model and getting the answers

In [103]:
# NOTE(review): "wher" looks like a typo for "when" (an earlier cell queries
# "when india got freedom") — confirm before re-running.
query = 'wher india got freedom'
print(f"Query: {query}")

# Answer the query and also get back the context items that were used.
answer, context_items = rag_answers(query=query, 
                            temperature=0.7,
                            max_new_tokens=512,
                            return_answer_only=False)

print(f"Answer:\n")
print(answer)

Query: wher india got freedom
[INFO] Time taken to get scores on 4 embeddings: 0.00018 seconds.
Answer:

According to the context, India was denied its promised reward for its contribution to the war funds of the United Kingdom.


In [104]:
def prompt_formatter_summ(Text):
    """Build the chat-formatted summarisation prompt for ``Text``.

    Args:
        Text: The text to summarise.

    Returns:
        The prompt string produced by the global tokenizer's chat template
        (not tokenized, generation prompt appended).
    """
    base_prompt = """Based on the following text, please give the summary of the text.
Don't return the thinking, only return the answer.
Make sure your answers are as explanatory as possible.
\nNow use the following text to summarize the text:
{text}
Answer:"""
    # Format with the argument; the previous code read the global `text`
    # and silently ignored whatever the caller passed in.
    base_prompt = base_prompt.format(text=Text)
    dialogue_template = [
        {"role": "user",
        "content": base_prompt}
    ]
    prompt = tokenizer.apply_chat_template(conversation=dialogue_template,
                                          tokenize=False,
                                          add_generation_prompt=True)
    return prompt

In [107]:
def summarizer(temperature=0.7,
        max_new_tokens=512,
        format_answer_text=True, 
        return_answer_only=True):
    """Generate a summary of the transcript with the LLM.

    Builds the prompt from the notebook-global ``result['text']`` and relies
    on the globals ``tokenizer`` and ``llm_model``.

    Args:
        temperature: Sampling temperature passed to ``generate``.
        max_new_tokens: Maximum number of tokens to generate.
        format_answer_text: When true, strip the prompt, the ``<bos>`` /
            ``<eos>`` markers and a boilerplate lead-in from the output.
        return_answer_only: Kept for interface compatibility; the summary
            text is returned either way.

    Returns:
        The generated summary text.
    """
    prompt = prompt_formatter_summ(result['text'])
    # Put the inputs on the model's own device instead of assuming CUDA.
    input_ids = tokenizer(prompt, return_tensors="pt").to(llm_model.device)
    outputs = llm_model.generate(**input_ids,
                                 temperature=temperature,
                                 do_sample=True,
                                 max_new_tokens=max_new_tokens)
    output_text = tokenizer.decode(outputs[0])
    if format_answer_text:
        # The decoded sequence includes the prompt; remove it and the markers.
        output_text = output_text.replace(prompt, "").replace("<bos>", "").replace("<eos>", "").replace("Sure, here is the answer to the user query:\n\n", "")
    # Both original branches returned the same value, so one return suffices.
    return output_text

Summary of the file 

In [108]:
# Generate the summary with the default settings and print it.
answer=summarizer()
print(answer)

Sure, here's the summary of the text:

The British came to India for trading and established offices and forts throughout the country. During British rule, the Indians were deprived of basic needs, socially and economically and legally discriminated against, and subjected to mistreatment and torture. However, the Indian National Congress party led by Gandhiji emerged and fought for India's independence. India finally gained its independence on August 15, 1947, after a long struggle against British rule.
