In [62]:
from langchain import PromptTemplate 
from langchain.chains import RetrievalQA
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import PyPDFLoader,DirectoryLoader
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Pinecone
from langchain.document_loaders import PyMuPDFLoader,DirectoryLoader
from langchain.prompts import PromptTemplate
from langchain.llms import CTransformers

In [63]:
def load_pdf(data):
    loader = DirectoryLoader(data, glob="*.pdf", loader_cls=PyPDFLoader)
    documents = loader.load()
    return documents


In [64]:
extracted_data = load_pdf('data/')

In [65]:
def text_split(extracted_data):
    text_splitter=RecursiveCharacterTextSplitter(chunk_size=200,chunk_overlap=20)
    text_chunks=text_splitter.split_documents(extracted_data)
    
    return text_chunks

In [66]:
text_chunks=text_split(extracted_data)
print(len(text_chunks))

7313


In [67]:
model_name = 'sentence-transformers/all-MiniLM-L6-v2'

#download embedding model
def download_hugging_face_embeddings():
    embeddings=HuggingFaceEmbeddings(model_name=model_name) 
    return embeddings


In [68]:
embeddings=download_hugging_face_embeddings()

In [12]:
embeddings

HuggingFaceEmbeddings(client=SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
  (2): Normalize()
), model_name='sentence-transformers/all-MiniLM-L6-v2', cache_folder=None, model_kwargs={}, encode_kwargs={})

In [None]:
from langchain.vectorstores import FAISS

DB_FAISS_PATH = "vectorstores/db_faiss"
db = FAISS.from_documents(text_chunks, embeddings)
db.save_local(DB_FAISS_PATH)

FINE TUNING T-5 SMALL MODEL ON CSV DATA

In [2]:
import pandas as pd
from datasets import Dataset
from transformers import T5TokenizerFast, T5ForConditionalGeneration, Trainer, TrainingArguments
from peft import get_peft_model, LoraConfig, TaskType

In [3]:
csv_file_path = 'intents_data/amod_mental_health_convo_train.csv'
csv_data = pd.read_csv(csv_file_path)

csv_data.dropna(subset=['Context', 'Response'], inplace=True)
print(csv_data.head(10))


                                             Context  \
0  I'm going through some things with my feelings...   
1  I'm going through some things with my feelings...   
2  I'm going through some things with my feelings...   
3  I'm going through some things with my feelings...   
4  I'm going through some things with my feelings...   
5  I'm going through some things with my feelings...   
6  I'm going through some things with my feelings...   
7  I'm going through some things with my feelings...   
8  I'm going through some things with my feelings...   
9  I'm going through some things with my feelings...   

                                            Response  
0  If everyone thinks you're worthless, then mayb...  
1  Hello, and thank you for your question and see...  
2  First thing I'd suggest is getting the sleep y...  
3  Therapy is essential for those that are feelin...  
4  I first want to let you know that you are not ...  
5  Heck, sure thing, hun!Feelings of 'depression'... 

In [4]:
dataset = Dataset.from_pandas(csv_data)

In [6]:
dataset_dict = dataset.train_test_split(test_size=0.2, seed=42)
train_dataset = dataset_dict["train"]
eval_dataset = dataset_dict["test"]

In [7]:
input_column = "Context"
target_column = "Response"
model_name = "google-t5/t5-small"

In [9]:
tokenizer = T5TokenizerFast.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

In [10]:
def preprocess_function(examples):
    inputs = examples[input_column]
    targets = examples[target_column]
    model_inputs = tokenizer(inputs, max_length=128, truncation=True, padding="max_length")
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(targets, max_length=128, truncation=True, padding="max_length")
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [11]:
tokenized_train_dataset = train_dataset.map(preprocess_function, batched=True)
tokenized_eval_dataset = eval_dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/2806 [00:00<?, ? examples/s]



Map:   0%|          | 0/702 [00:00<?, ? examples/s]

In [None]:
print("Example from tokenized_train_dataset:")
print(tokenized_train_dataset[0])

In [None]:
print("Example from tokenized_eval_dataset:")
print(tokenized_eval_dataset[0])

In [14]:
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["q", "v"],
    lora_dropout=0.1,
    bias="none",
    task_type=TaskType.SEQ_2_SEQ_LM
)

peft_model = get_peft_model(model, lora_config)

The installed version of bitsandbytes was compiled without GPU support. 8-bit optimizers, 8-bit multiplication, and GPU quantization are unavailable.


In [15]:
training_args = TrainingArguments(
    output_dir="./output",
    eval_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=5,
    weight_decay=0.01,
)

In [16]:
trainer = Trainer(
    model=peft_model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_eval_dataset,
)

In [17]:
trainer.train()

  0%|          | 0/1755 [00:00<?, ?it/s]

  0%|          | 0/88 [00:00<?, ?it/s]

{'eval_loss': 4.9701151847839355, 'eval_runtime': 279.922, 'eval_samples_per_second': 2.508, 'eval_steps_per_second': 0.314, 'epoch': 1.0}
{'loss': 5.8242, 'grad_norm': 0.5609720945358276, 'learning_rate': 1.4301994301994305e-05, 'epoch': 1.42}




  0%|          | 0/88 [00:00<?, ?it/s]

{'eval_loss': 4.030130863189697, 'eval_runtime': 285.691, 'eval_samples_per_second': 2.457, 'eval_steps_per_second': 0.308, 'epoch': 2.0}
{'loss': 4.2704, 'grad_norm': 2.343942880630493, 'learning_rate': 8.603988603988605e-06, 'epoch': 2.85}




  0%|          | 0/88 [00:00<?, ?it/s]

{'eval_loss': 3.593163013458252, 'eval_runtime': 283.0366, 'eval_samples_per_second': 2.48, 'eval_steps_per_second': 0.311, 'epoch': 3.0}


  0%|          | 0/88 [00:00<?, ?it/s]

{'eval_loss': 3.519881010055542, 'eval_runtime': 180.9188, 'eval_samples_per_second': 3.88, 'eval_steps_per_second': 0.486, 'epoch': 4.0}
{'loss': 3.9217, 'grad_norm': 0.6347969770431519, 'learning_rate': 2.9059829059829063e-06, 'epoch': 4.27}




  0%|          | 0/88 [00:00<?, ?it/s]

{'eval_loss': 3.5053179264068604, 'eval_runtime': 182.2619, 'eval_samples_per_second': 3.852, 'eval_steps_per_second': 0.483, 'epoch': 5.0}
{'train_runtime': 11679.0112, 'train_samples_per_second': 1.201, 'train_steps_per_second': 0.15, 'train_loss': 4.558935407763533, 'epoch': 5.0}


TrainOutput(global_step=1755, training_loss=4.558935407763533, metrics={'train_runtime': 11679.0112, 'train_samples_per_second': 1.201, 'train_steps_per_second': 0.15, 'total_flos': 481066745856000.0, 'train_loss': 4.558935407763533, 'epoch': 5.0})

In [42]:
model.save_pretrained('./results/fine_tuned_t5_small')
tokenizer.save_pretrained('./results/fine_tuned_t5_small')

('./results/fine_tuned_t5_small\\tokenizer_config.json',
 './results/fine_tuned_t5_small\\special_tokens_map.json',
 './results/fine_tuned_t5_small\\spiece.model',
 './results/fine_tuned_t5_small\\added_tokens.json')

In [18]:
import os
results_dir = './results/fine_tuned_t5_small'
os.makedirs(results_dir, exist_ok=True)

In [19]:
peft_model.save_pretrained(results_dir)
tokenizer.save_pretrained(results_dir)



('./results/fine_tuned_t5_small\\tokenizer_config.json',
 './results/fine_tuned_t5_small\\special_tokens_map.json',
 './results/fine_tuned_t5_small\\spiece.model',
 './results/fine_tuned_t5_small\\added_tokens.json',
 './results/fine_tuned_t5_small\\tokenizer.json')

In [34]:

model_path = "./results/fine_tuned_t5_small"
fine_tuned_model = T5ForConditionalGeneration.from_pretrained(model_path)
fine_tuned_tokenizer = T5TokenizerFast.from_pretrained(model_path)


def generate_response(input_text, max_length=50):
    input_ids = fine_tuned_tokenizer.encode(input_text, return_tensors="pt")
    output_ids = fine_tuned_model.generate(input_ids, max_length=max_length, do_sample=True)
    response = fine_tuned_tokenizer.decode(output_ids[0], skip_special_tokens=True)
    return response


test_input = "How can I overcome anxiety?"
generated_response = generate_response(test_input)
print(generated_response)





In [35]:
from transformers import pipeline
generator = pipeline("text2text-generation", model='./results/fine_tuned_t5_small', tokenizer=tokenizer)

# Test the model with a sample input
response = generator("What should I do if I feel anxious?")
print(f"Generated response: {response}")



Generated response: [{'generated_text': 'Was sollte ich tun, wenn ich mich ängst mache?'}]


In [37]:
prompt="""
Use the following piece of code to answer the user's question.
If you don't know the answer, just say that you don't know, don't try to make up an answer.

Context: {context}
Question: {question}

Only return the helpful answer below and nothing else.
Helpful answer:
"""

In [39]:
from langchain.prompts import PromptTemplate

template = PromptTemplate(
    input_variables=["context"],
    template="You are a mental health assistant. Respond to the following query in a supportive and understanding manner:\n\nContext: {context}\nResponse:"
)


In [40]:
from transformers import T5TokenizerFast, T5ForConditionalGeneration

model_path = "./results/fine_tuned_t5_small"
tokenizer = T5TokenizerFast.from_pretrained(model_path)
model = T5ForConditionalGeneration.from_pretrained(model_path)


In [48]:
def generate_response_with_template(context, max_length=100):
    prompt = template.format(context=context)
    input_ids = tokenizer.encode(prompt, return_tensors="pt")
    output_ids = model.generate(input_ids, max_length=max_length, do_sample=True)
    response = tokenizer.decode(output_ids[0], skip_special_tokens=True)
    return response

# Test inference
test_input = "I think I need help"
generated_response = generate_response_with_template(test_input)
print(generated_response)


Answer: Are you a mental therapist or therapist? Please include questions with your name and your location below or click the button below to email address.


In [None]:
conversation = LLMChain(llm='./results/fine_tuned_t5_small', prompt=prompt, verbose=True)

In [49]:
import os
results_dir = './openvino/fine_tuned_t5_small_openvino'
os.makedirs(results_dir, exist_ok=True)

In [51]:
tokenizer = T5TokenizerFast.from_pretrained('t5-small')
tokenizer.save_pretrained('./results/fine_tuned_t5_small')

('./results/fine_tuned_t5_small\\tokenizer_config.json',
 './results/fine_tuned_t5_small\\special_tokens_map.json',
 './results/fine_tuned_t5_small\\spiece.model',
 './results/fine_tuned_t5_small\\added_tokens.json',
 './results/fine_tuned_t5_small\\tokenizer.json')

In [None]:
from optimum.intel.openvino import OVModelForSeq2SeqLM


ov_model = OVModelForSeq2SeqLM.from_pretrained('./results/fine_tuned_t5_small', export=True)
tokenizer = T5TokenizerFast.from_pretrained('./results/fine_tuned_t5_small')
tokenizer.save_pretrained('./results/fine_tuned_t5_small_openvino')
ov_model.save_pretrained('./results/fine_tuned_t5_small_openvino')




In [69]:
ov_model = OVModelForSeq2SeqLM.from_pretrained('./results/fine_tuned_t5_small_openvino')

Compiling the encoder to CPU ...
Compiling the decoder to CPU ...
Compiling the decoder to CPU ...


In [62]:
def search_faiss_all_texts(query, db, embeddings, k=5):
    query_vector = embeddings.embed_query(query)
    query_vector = np.array([query_vector])
    D, I = db.index.search(query_vector, k)
    print(f"Distances: {D}")
    print(f"Indices: {I}")
    return I[0]

query = "I am sad"
indices = search_faiss_all_texts(query, db, embeddings, k=5)
print(f"Number of texts: {len(text_chunks)}")
print(f"Indices: {indices}")
results = [text_chunks[i] for i in indices]
print(f"Top results: {results}")


Number of texts: 7313
Indices: [5842 5701 1279 4969 4824]
Top results: [Document(page_content='and often wishes he could die so that\nhe will not have to feel the loneliness,\nemptiness, and the general feeling of\nbeing down and hopeless. He says thathe does not have much to look forward', metadata={'source': 'data\\textbook.pdf', 'page': 309}), Document(page_content='progressive disability triggers mourning for the loss of function. The individual\nexpresses feelings of grief and despair.', metadata={'source': 'data\\textbook.pdf', 'page': 302}), Document(page_content='mood, a reduction of negative feelings and negative self-concept, and an increase in\nenergy and confidence. The euphoric mood can shift quite suddenly to sadness. Some¬', metadata={'source': 'data\\textbook.pdf', 'page': 74}), Document(page_content='Lorea, 2007; Roffman & Stephens, 2005).\nFernandez-Montalvo et al. (2007) reported that coping with negative emotional', metadata={'source': 'data\\textbook.pdf', 'page': 

In [56]:
prompt_template="""
Use the following piece of code to answer the user's question.
If you don't know the answer, just say that you don't know, don't try to make up an answer.

Context: {context}
Question: {question}

Only return the helpful answer below and nothing else.
Helpful answer:
"""

In [57]:
model="./results/fine_tuned_t5_small"

In [58]:
PROMPT=PromptTemplate(template=prompt_template,input_variables=["context","question"])
chain_type_kwargs={"prompt":PROMPT}

In [59]:
from langchain.llms import HuggingFacePipeline
from transformers import pipeline

def load_llm():
    generator = pipeline("text2text-generation", model=model, tokenizer=tokenizer)
    llm = HuggingFacePipeline(pipeline=generator)
    return llm

llm = load_llm()



In [None]:
retriever = db.as_retriever()

retrieval_chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=retriever,
    chain_type_kwargs=chain_type_kwargs
)

In [90]:
def generate_answer(question):
    result = retrieval_chain({"query": question})
    print(result)  
    return result.get('answer', 'Answer not found')

In [91]:
question = "I am feeling anxious. What should I do?"
answer = generate_answer(question)
print("Answer:", answer)

{'query': 'I am feeling anxious. What should I do?', 'result': '1 1 1 1 1 1'}
Answer: Answer not found
