In [81]:
# Scratch check: character count of a sample patient description.
# The recorded output of this cell is 1224.
toCount = 'Jill, a 32-year-old Afghanistan war veteran, had been experiencing PTSD symptoms for more than five years. She consistently avoided thoughts and images related to witnessing her fellow service members being hit by an improvised explosive device (IED) while driving a combat supply truck. Over the years, Jill became increasingly depressed and began using alcohol on a daily basis to help assuage her PTSD symptoms. She had difficulties in her employment, missing many days of work, and she reported feeling disconnected and numb around her husband and children. In addition to a range of other PTSD symptoms, Jill had a recurring nightmare of the event in which she was the leader of a convoy and her lead truck broke down. She waved the second truck forward, the truck that hit the IED, while she and her fellow service members on the first truck worked feverishly to repair it. Consistent with the traumatic event, her nightmare included images of her and the service members on the first truck smiling and waving at those on the second truck, and the service members on the second truck making fun of the broken truck and their efforts to fix it — “Look at that piece of junk truck — good luck getting that clunker fixed.'
len(toCount)

1224

In [34]:
import pandas as pd

## Load the general guideline catalogue from a CSV file
QA_catalogue = './data/Mental_Health_FAQ.csv'
qa_df = pd.read_csv(QA_catalogue)

# Keep the columns at positions 1 and 2 (the question and answer columns) and
# glue each row's two cells into a single string, with no separator between them.
df = qa_df
selected_columns = df.iloc[:, [1, 2]]
combined_columns = selected_columns.apply(''.join, axis=1)

# Plain Python list with one "question + answer" string per catalogue row.
row_list = list(combined_columns)

## Load client information and the past conversation, then split it into speaker turns
past_conversations = "./data/transcript2"
with open(past_conversations, "r") as file:
    file_content = file.read()

# Replace every occurrence of "client" with "Jill" and flatten newlines/tabs to
# spaces so each turn becomes a single-line string.
updated_content = file_content.replace('client', 'Jill').replace('\n', ' ').replace('\t', ' ')

# Everything before the first "Therapist:" marker is the general patient
# information; every later chunk starts with a therapist utterance.
dialogues = updated_content.split('Therapist:')
general_info, dialogues = [dialogues[0]], dialogues[1:]

# Re-attach the speaker label to every turn.
# After splitting a chunk on "Jill:", element 0 is always the therapist's text
# and every following element is one of Jill's replies. The label is therefore
# decided by position. Guessing it from leading whitespace mislabels Jill's turns
# whenever her text also starts with a space (the usual "Name: text" layout).
processed_dialogues = []
for dialogue in dialogues:
    parts = dialogue.split('Jill:')
    for position, part in enumerate(parts):
        speaker = 'Therapist:' if position == 0 else 'Jill:'
        processed_dialogues.append(speaker + part)

## Concatenate the FAQ rows and the labelled dialogue turns into one retrieval corpus
combined_infos = row_list + processed_dialogues

In [36]:
from numpy.linalg import norm
from transformers import AutoModel, AutoTokenizer

def cos_sim(a, b):
    """Return the cosine similarity of `a` and `b`: (a @ b.T) / (norm(a) * norm(b))."""
    return (a @ b.T) / (norm(a) * norm(b))


model_name = 'jinaai/jina-embeddings-v2-small-en'
tokenizer = AutoTokenizer.from_pretrained(model_name)
# trust_remote_code is needed to use the model's encode method
model = AutoModel.from_pretrained(model_name, trust_remote_code=True)

# One embedding per entry of the retrieval corpus.
embeddings = model.encode(combined_infos)
# Quick sanity check: print(cos_sim(embeddings[0], embeddings[1]))
import faiss
import numpy as np
# Build a flat L2 index whose dimensionality is the embedding width,
# then load every corpus embedding into it.
embedding_dim = embeddings.shape[1]
index = faiss.IndexFlatL2(embedding_dim)
index.add(embeddings)

# Persist the index to disk ...
index_path = "embeddings.index"
faiss.write_index(index, index_path)

# ... and read it back, as a later session would do before searching.
index = faiss.read_index(index_path)

# Demo query: embed one sentence and look up its nearest neighbours.
query_embedding = model.encode('I feel lonely after returning from battle')

k = 10  # number of nearest neighbors to find
# index.search expects a 2-D batch, hence the reshape to a single row.
D, I = index.search(query_embedding.reshape(1, -1), k)

In [78]:
## from chatgpt

from dotenv import load_dotenv
import os
import tiktoken
from openai import OpenAI

# Chat model identifiers used as `model` arguments further down.
# NOTE(review): these are dated snapshot names; confirm they are still served.
gpt3 = 'gpt-3.5-turbo-0301'
gpt3_turbo = 'gpt-3.5-turbo'
gpt4 = 'gpt-4-1106-preview'

# Sets TOKENIZERS_PARALLELISM to "false".
# Presumably this silences the tokenizers fork warning; confirm.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Pull variables from a local .env file into the environment, then build the
# OpenAI client from OPENAI_API_KEY (None if the variable is not set).
load_dotenv()
openai_api_key = os.environ.get("OPENAI_API_KEY")
client = OpenAI(api_key=openai_api_key)

def retrieve_similar_sentences(query, k=3, filter_distance=41):
    """Look up the corpus entries closest to `query` in the FAISS index.

    The query is embedded with the module-level `model`, searched against the
    module-level `index`, and only hits whose distance is strictly below
    `filter_distance` are kept.

    Args:
        query: Text to embed and search for.
        k: Maximum number of neighbours requested from the index.
        filter_distance: Exclusive upper bound on the distance of a kept hit.

    Returns:
        A `(distances, indices)` pair of equally long lists, ordered as the
        index returned them. Both lists are empty when nothing is close enough.
    """
    query_embedding = model.encode(query)
    # The index expects a 2-D batch; a single sentence comes back as a 1-D vector.
    if len(query_embedding.shape) == 1:
        query_embedding = query_embedding.reshape(1, -1)
    D, I = index.search(query_embedding, k)

    # Keep hits closer than `filter_distance`. FAISS pads the result with label
    # -1 when fewer than k vectors are available; those placeholders must never
    # be returned, because -1 would silently address the last corpus entry.
    kept = [
        (distance, idx)
        for distance, idx in zip(D[0], I[0])
        if idx >= 0 and distance < filter_distance
    ]
    filtered_distances = [distance for distance, _ in kept]
    filtered_indices = [idx for _, idx in kept]
    print(filtered_distances, filtered_indices)
    return filtered_distances, filtered_indices


# Your chatbot system definition and start message
# Base system prompt.
# Built from adjacent string literals so the source indentation does not end up
# inside the prompt. Backslash line continuations inside a single literal would
# embed every continuation line's leading spaces in the text sent to the model.
system = (
    'You are a skillful and empathetic AI '
    'therapist named mendu chat who practices Cognitive Behavioral Therapy (CBT) to help her clients. '
    'You help your clients identify and change '
    'the irrational thoughts and beliefs that are causing them to suffer. '
    'You also help them learn and practice coping skills to help them '
    'better manage their stress and anxiety.'
)

# System prompt extended with the patient background, i.e. the transcript text
# that precedes the first "Therapist:" turn.
system_updated = '{} Information of your patient: {}'.format(system, general_info[0])

# Opening assistant line, shown to the user and seeded into every new chat log.
start_message = 'This is the Mendu Chat. How are you feeling today?'
def ask(question: str, chat_log: list, model=gpt3, temp=0.9):
    """Send one user turn to the chat model, augmented with retrieved context.

    Corpus entries similar to `question` are looked up with
    `retrieve_similar_sentences` and passed to the model as an extra system
    message for this request only; they are never stored in the chat log.

    Args:
        question: The user's message.
        chat_log: Message dicts from earlier turns. Any falsy value starts a
            fresh log seeded with the system prompt and the greeting.
        model: Name of the chat model to query. This parameter shadows the
            module-level embedding `model` inside this function only.
        temp: Sampling temperature forwarded to the API.

    Returns:
        `(answer, messages)`: the assistant's reply as a string and the chat log
        extended by the user turn and the reply. A non-empty `chat_log` is
        extended in place and returned.

    Raises:
        Whatever `client.chat.completions.create` raises. In that case
        `chat_log` is left exactly as it was passed in.
    """
    if not chat_log:
        messages = [{'role': 'system', 'content': system_updated},
                    {'role': 'assistant', 'content': start_message}]
    else:
        messages = chat_log

    user_message = {'role': 'user', 'content': question}

    # Build the request on a copy so the one-off retrieval context never enters
    # the persistent log, and a failed API call cannot leave the log with a
    # stray system message or an unanswered user turn.
    request_messages = list(messages)
    _, similar_indices = retrieve_similar_sentences(question)
    if similar_indices:
        retrieval_content = ' '.join(str(combined_infos[idx]) for idx in similar_indices)
        request_messages.append({'role': 'system', 'content': retrieval_content})
    request_messages.append(user_message)

    response = client.chat.completions.create(
        messages=request_messages,
        model=model,
        temperature=temp,
    )

    # The API may return no text content; normalise to an empty string so the
    # length bookkeeping below cannot fail on None.
    answer = response.choices[0].message.content or ''

    # Only a successful exchange is recorded.
    messages.append(user_message)
    messages.append({'role': 'assistant', 'content': answer})

    # Debug dump of the stored log: per-message length and content, then the total size.
    for message in messages:
        print(f'len: {len(message["content"])}')
        print(f"{message['role']}: {message['content']}")
        print('---')

    print(sum(len(message['content']) for message in messages))

    return str(answer), messages


In [79]:
if __name__ == '__main__':
    # Interactive console chat: keep asking until the user types "quit" (any casing).
    chat_log = []  # conversation history, grown by ask() on every turn
    print(start_message)

    question = input("You: ")
    while question.lower() != 'quit':
        answer, chat_log = ask(question, chat_log)
        print("Mendu Chat:", answer)
        question = input("You: ")

This is the Mendu Chat. How are you feeling today?
[] []
len: 2496
system: You are a skillful and empathetic AI         therapist named mendu chat who practices Cognitive Behavioral Therapy (CBT) to help her clients.         When applying CBT, you help your clients identify and change         the irrational thoughts and beliefs that are causing them to suffer.         You also help them learn and practice coping skills to help them         better manage their stress and anxiety. Information of your patient: Jill, a 32-year-old Afghanistan war veteran, had been experiencing PTSD symptoms for more than five years. She consistently avoided thoughts and images related to witnessing her fellow service members being hit by an improvised explosive device (IED) while driving a combat supply truck. Over the years, Jill became increasingly depressed and began using alcohol on a daily basis to help assuage her PTSD symptoms. She had difficulties in her employment, missing many days of work, and s

In [115]:
# Scripted three-turn conversation used to eyeball the replies and the chat log.
question = 'im so sad'
question2 = 'I have a third child and I cant deal with so many kids'
question3 = 'My older son is 5 years old and he is very naughty, he is trying to hurt his younger brother'

# Start from an empty log and feed each turn's returned log into the next call.
# Every question is sent exactly once, so the third turn uses question3 rather
# than repeating question2.
logs = []
for turn in (question, question2, question3):
    answer, logs = ask(turn, logs)
    print(answer)
    print('-----')
    print(logs)

I'm sorry to hear that you're feeling sad. It takes strength to reach out for help, and I'm here to listen and help you feel better. Can you tell me more about what's making you feel sad?
-----
[{'role': 'system', 'content': 'You are a compasionate, insightful, and empathetic AI         therapist named Joy who practices Cognitive Behavioral Therapy (CBT) to help her clients.         When applying CBT, you help your clients identify and change         the irrational thoughts and beliefs that are causing them to suffer.         You also help them learn and practice coping skills to help them         better manage their stress and anxiety. You are a very good therapist.'}, {'role': 'assistant', 'content': 'I am Joy, your AI therapist. How are you feeling today?'}, {'role': 'user', 'content': 'im so sad'}, {'role': 'assistant', 'content': "I'm sorry to hear that you're feeling sad. It takes strength to reach out for help, and I'm here to listen and help you feel better. Can you tell me mor

In [6]:
from sentence_transformers import SentenceTransformer

# Experiment with an alternative sentence-embedding model.
# The results get their own names on purpose: `model` and `embeddings` are the
# globals that retrieve_similar_sentences() and the FAISS index were built with.
# Rebinding them here would make later queries use a different encoder than the
# one the index was built with.
# NOTE(review): the two models' embedding widths presumably differ (384 vs 512), so searches would fail; confirm.
minilm_model = SentenceTransformer('sentence-transformers/all-MiniLM-L12-v2')
minilm_embeddings = minilm_model.encode(combined_infos)

In [None]:
# need to implement:
# 1. rules for the chatbot provided by demi, state of the chatbot, so it knows what to do next
# 2. save the chat log into a file, summarize and to be retrieved later as vector embeddings

# sentiment analysis
# summarization of the chat log


# ---------------------------------------
# NOTE: the code below uses the LangChain vector store; it still needs to be adapted and is not working yet.

In [1]:
import csv
from langchain.docstore.document import Document 
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Chroma

# Define the document to retrieve from.
# Here: the catalogue of standardized questions and answers.
# Target setup: the catalogue + user-specific data (age, gender, etc.) + past
# conversations (granularity still to be decided).
# The path is relative to the notebook, matching the pandas-based loading cell,
# so the notebook does not depend on one developer's home directory.
QA_catalogue = './data/Mental_Health_FAQ.csv'

# Columns whose text is embedded vs. columns carried along as metadata.
columns_to_embed = ["Questions", "Answers"]
columns_to_metadata = ["Questions", "Answers"]


# Turn every CSV row into a Document: the embeddable text goes into
# page_content and the raw column values into metadata, ready to be chunked.
docs = []
with open(QA_catalogue, newline="", encoding='utf-8-sig') as csvfile:
    csv_reader = csv.DictReader(csvfile)
    for row in csv_reader:
        to_metadata = {col: row[col] for col in columns_to_metadata if col in row}
        values_to_embed = {k: row[k] for k in columns_to_embed if k in row}
        # DictReader fills the missing cells of a short row with None; treat
        # those as empty text instead of failing on None.strip().
        to_embed = "\n".join(f"{k.strip()}: {(v or '').strip()}" for k, v in values_to_embed.items())
        newDoc = Document(page_content=to_embed, metadata=to_metadata)
        docs.append(newDoc)

# Split the documents with character splitting on newlines.
# The splitter only cuts at the separator, so a text without a newline in the
# right place stays longer than chunk_size; it logs "Created a chunk of size ..."
# for each such chunk.
splitter = CharacterTextSplitter(separator="\n",
                                 chunk_size=500,
                                 chunk_overlap=0,
                                 length_function=len)
documents = splitter.split_documents(docs)

# Generate embeddings for the chunks and store them in a Chroma vector database.
# Each stored vector keeps the metadata attached to its chunk. The single
# embeddings client created here is the one handed to Chroma.
embeddings_model = OpenAIEmbeddings()
db = Chroma.from_documents(documents, embeddings_model)

# Query the vector store. The hits get their own name so the source `docs`
# list built from the CSV is not overwritten.
query = "I have no friends and feel lonely."
similar_docs = db.similarity_search(query)
if similar_docs:
    print(similar_docs[0].page_content)
    print(similar_docs[0].metadata)
else:
    print("No matching documents found.")

Created a chunk of size 514, which is longer than the specified 500
Created a chunk of size 579, which is longer than the specified 500
Created a chunk of size 612, which is longer than the specified 500
Created a chunk of size 597, which is longer than the specified 500
Created a chunk of size 744, which is longer than the specified 500
Created a chunk of size 894, which is longer than the specified 500
Created a chunk of size 571, which is longer than the specified 500
Created a chunk of size 568, which is longer than the specified 500
Created a chunk of size 786, which is longer than the specified 500
Created a chunk of size 527, which is longer than the specified 500
Created a chunk of size 570, which is longer than the specified 500
Created a chunk of size 825, which is longer than the specified 500
Created a chunk of size 765, which is longer than the specified 500
Created a chunk of size 675, which is longer than the specified 500
Created a chunk of size 702, which is longer tha

In [5]:
from langchain.llms import OpenAI
from langchain.retrievers.self_query.base import SelfQueryRetriever
from langchain.chains.query_constructor.base import AttributeInfo

# Metadata schema mirroring the CSV columns: one string attribute per column,
# each with a short description for the query-constructing LLM.
metadata_field_info = [
    AttributeInfo(name=field_name, description=field_description, type="string")
    for field_name, field_description in (
        ("Questions", "A question the user or patient might ask"),
        ("Answers", "One or more answers to the question asked by the user or patient"),
    )
]
document_content_description = "standardized question and answer catalogue for mental health"

# Configure the retriever.
# NOTE: `OpenAI` here is the LangChain LLM wrapper imported at the top of this
# cell; it rebinds the name imported from the `openai` package earlier in the
# notebook.
llm = OpenAI(temperature=0)
retriever = SelfQueryRetriever.from_llm(
    llm, db, document_content_description, metadata_field_info, verbose=True)


# Retrieve values
retriever.get_relevant_documents("I feel lonely")

# The call returns a list of Document objects: each has a `page_content` string
# and a `metadata` dict holding the CSV column values (see the cell output).

[Document(page_content="Everyone feels lonely at times—maybe you recently moved to a new city, are changing your circle of friends, lost someone important in your life, or lost your job and also lost important social connections with coworkers. Other people may have physical connections to others but may feel like their emotional or social needs aren't met. Measures like social distancing or self-isolation can make loneliness feel worse no matter why you feel lonely now.", metadata={'Answers': "A lot of people are alone right now, but we don't have to be lonely. We're all in this together. \n While you may be physically separated from friends, family members, and other loved ones, it has never been more important to maintain those social connections. Social connections are an opportunity to seek and share support, talk through difficult feelings, share a laugh, keep up-to-date with loved ones, and help each other cope. This pandemic is a lot for one person to deal with on their own. Wh