In [1]:
# Install necessary libraries
#%pip install -q pypdf langchain langchain-community langchain_mistralai langchain-huggingface faiss-cpu sentence-transformers

In [1]:
import os
from pypdf import PdfReader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings
from langchain_mistralai import ChatMistralAI
from langchain_core.prompts import ChatPromptTemplate
from langchain.memory import ConversationSummaryBufferMemory

In [2]:
import getpass

# Ask for the key interactively only when the environment does not already
# supply one, so the secret is never written into the notebook itself.
if os.environ.get("MISTRAL_API_KEY") is None:
    os.environ["MISTRAL_API_KEY"] = getpass.getpass("Enter your Mistral API key: ")

In [3]:
# Function to extract text from PDF
def extract_text_from_pdf(pdf_path):
    """Return the text of every page of a PDF as one string.

    Pages are joined with a newline so that the last word of one page does
    not fuse with the first word of the next before the text is chunked.

    Args:
        pdf_path: Path (or file-like object) handed to ``PdfReader``.

    Returns:
        The extracted page texts joined by ``"\\n"``; an empty string when
        the document has no pages.
    """
    pdf_reader = PdfReader(pdf_path)
    # `or ""` keeps the join safe if a page yields no text (None or "").
    page_texts = [page.extract_text() or "" for page in pdf_reader.pages]
    return "\n".join(page_texts)

In [4]:
# Function to process PDF and create vector store
def process_pdf(pdf_path, chunk_size=500, chunk_overlap=50,
                embedding_model="all-MiniLM-L6-v2"):
    """Extract a PDF's text, split it into chunks and index them in FAISS.

    Args:
        pdf_path: Path to the PDF file to index.
        chunk_size: Maximum chunk length, measured with ``len`` (characters).
        chunk_overlap: Number of characters shared by consecutive chunks.
        embedding_model: Model name passed to ``HuggingFaceEmbeddings``.

    Returns:
        The FAISS vector store built from the text chunks.

    Raises:
        ValueError: If no text could be extracted from the PDF (for example
            a scanned, image-only document), since there is nothing to index.
    """
    # Extract text from PDF
    raw_text = extract_text_from_pdf(pdf_path)
    if not raw_text or not raw_text.strip():
        # Fail early with a clear message instead of letting the index
        # construction fail later on an empty list of texts.
        raise ValueError(f"No extractable text found in PDF: {pdf_path!r}")

    # Split text into chunks
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
    )
    texts = text_splitter.split_text(raw_text)

    # Create embeddings
    embeddings = HuggingFaceEmbeddings(model_name=embedding_model)

    # Create vector store
    return FAISS.from_texts(texts, embeddings)

In [5]:
# Chat model shared by the question-answering chain and the conversation memory.
# NOTE(review): presence_penalty / frequency_penalty are passed through as
# given — confirm the installed ChatMistralAI version accepts them.
llm = ChatMistralAI(
    # Model and client behaviour
    model="mistral-large-latest",
    max_retries=3,  # retry budget for failed requests
    safe_mode=False,  # safe mode switched off
    # Response length
    max_tokens=3000,  # cap on the length of each generated answer
    # Sampling
    temperature=0.7,  # allow some variety in wording
    random_seed=42,  # fixed seed for reproducibility
    presence_penalty=0.1,  # small penalty to reduce repetition
    frequency_penalty=0.1,  # small penalty to encourage varied vocabulary
)

In [6]:
# Build the vector store for the document that the questions below refer to.
pdf_path = "./docs/mtc.pdf"
vectorstore = process_pdf(pdf_path=pdf_path)

Ignoring wrong pointing object 6 0 (offset 0)
Ignoring wrong pointing object 8 0 (offset 0)
Ignoring wrong pointing object 13 0 (offset 0)
Ignoring wrong pointing object 18 0 (offset 0)
Ignoring wrong pointing object 20 0 (offset 0)
Ignoring wrong pointing object 22 0 (offset 0)
Ignoring wrong pointing object 26 0 (offset 0)
Ignoring wrong pointing object 29 0 (offset 0)
Ignoring wrong pointing object 31 0 (offset 0)
Ignoring wrong pointing object 41 0 (offset 0)
Ignoring wrong pointing object 44 0 (offset 0)
Ignoring wrong pointing object 46 0 (offset 0)
Ignoring wrong pointing object 55 0 (offset 0)
Ignoring wrong pointing object 63 0 (offset 0)
Ignoring wrong pointing object 65 0 (offset 0)
Ignoring wrong pointing object 67 0 (offset 0)
Ignoring wrong pointing object 72 0 (offset 0)
Ignoring wrong pointing object 74 0 (offset 0)
Ignoring wrong pointing object 76 0 (offset 0)
Ignoring wrong pointing object 78 0 (offset 0)
Ignoring wrong pointing object 80 0 (offset 0)
Ignoring wrong 

In [7]:
def get_relevant_chunks(query, vectorstore, k=5):
    """Return the ``k`` entries of ``vectorstore`` most similar to ``query``.

    Args:
        query: Text to search for.
        vectorstore: Object exposing ``similarity_search(query, k=...)``.
        k: Number of results to request (default 5).

    Returns:
        Whatever ``vectorstore.similarity_search`` returns for the query.
    """
    nearest_chunks = vectorstore.similarity_search(query, k=k)
    return nearest_chunks

def format_context(relevant_chunks):
    """Join the ``page_content`` of each chunk into one context string.

    Args:
        relevant_chunks: Iterable of objects with a ``page_content`` attribute.

    Returns:
        The chunk texts separated by a blank line; ``""`` for no chunks.
    """
    passages = (chunk.page_content for chunk in relevant_chunks)
    return "\n\n".join(passages)

In [8]:
# Conversation memory shared by every ask_question call. It is given the same
# chat model (llm) and a 1000-token limit, and returns its history as message
# objects rather than a single string.
memory = ConversationSummaryBufferMemory(
    return_messages=True,
    max_token_limit=1000,
    llm=llm,
)

  memory = ConversationSummaryBufferMemory(


In [9]:
# Prompt for the retrieval-augmented chat chain. Its variables must match the
# keys passed to chain.invoke() in ask_question: "context", "chat_history"
# and "question".
# NOTE(review): this cell previously held only a stray `s`, which raises
# NameError and left `prompt` undefined for the chain below. The wording of
# the template is a reconstruction — adjust it to taste.
prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            "You are a helpful assistant answering questions about a research "
            "paper. Answer using the provided context and the chat history. "
            "If the context does not contain the answer, say that you do not "
            "know.\n\nContext:\n{context}",
        ),
        # The memory is created with return_messages=True, so the history is
        # a list of message objects and is inserted here as messages.
        ("placeholder", "{chat_history}"),
        ("human", "{question}"),
    ]
)

In [10]:
# Compose the chain: the formatted prompt is passed on to the chat model.
chain = prompt.pipe(llm)

In [11]:
def ask_question(question):
    """Answer ``question`` from the indexed PDF, keeping conversation memory.

    Uses the module-level ``vectorstore``, ``memory`` and ``chain`` objects.

    Args:
        question: The user's question as a string.

    Returns:
        The model's reply text with surrounding whitespace stripped. The
        unstripped reply is what gets stored in memory.
    """
    # Retrieve supporting passages and flatten them into one context string.
    context = format_context(get_relevant_chunks(question, vectorstore))

    # Conversation so far, as held by the shared memory object.
    memory_variables = memory.load_memory_variables({})

    chain_inputs = {
        "context": context,
        "chat_history": memory_variables["history"],
        "question": question,
    }
    reply_text = chain.invoke(chain_inputs).content

    # Record this exchange so later questions can refer back to it.
    memory.save_context({"input": question}, {"output": reply_text})

    return reply_text.strip()

In [15]:
# Ask about the study's methodology and show the question with its answer.
question = "What is the method use in this study?"
answer = ask_question(question)
print(f"Question: {question}", f"Answer: {answer}", sep="\n")

Question: What is the method use in this study?
Answer: Based on the provided context and chat history, the study employs several methods for the multiclass mental illness classification task. Here’s a detailed overview:

1. **Machine Learning Models**:
   - **Logistic Regression**: Chosen for its simplicity and ease of interpretation.
   - **Multinomial Naive Bayes**: Effective for text classification tasks and works well with high-dimensional datasets.
   - **Linear Support Vector Machine (LSVM)**: Known for handling large feature spaces and strong performance in text classification tasks. Among these models, the TF-IDF with LSVM model performed best overall, achieving the highest accuracy (0.772) and F1-score (0.772).

2. **Deep Learning Models**:
   - **Bidirectional Long Short-Term Memory (BiLSTM)**: Designed with two bidirectional recurrent layers with 256 and 128 units, respectively, followed by dropout layers for regularization. It includes a dense layer with 64 units and recti

In [13]:
# Ask which machine learning models the study uses.
question = "What are the machine learning model they use in this study?"
answer = ask_question(question)
print(f"Question: {question}", f"Answer: {answer}", sep="\n")

Question: What are the machine learning model they use in this study?
Answer: Based on the provided context, the study employs three well-known machine learning models for the multiclass mental illness classification task:

1. **Logistic Regression**: This model is chosen for its simplicity and ease of interpretation.
2. **Multinomial Naive Bayes**: This model is effective for text classification tasks and works well with high-dimensional datasets.
3. **Linear Support Vector Machine (LSVM)**: This model is known for its ability to handle large feature spaces and is a strong performer in text classification tasks.

These models are used as a traditional baseline approach for the classification task. Among them, the TF-IDF with LSVM model performed best overall, achieving the highest accuracy (0.772) and F1-score (0.772).

If you have further questions or need more details about any specific model or their performance, feel free to ask!


In [14]:
# Ask which deep learning models the study uses.
question = "What are the deep learning model they use in this study?"
answer = ask_question(question)
print(f"Question: {question}", f"Answer: {answer}", sep="\n")

Question: What are the deep learning model they use in this study?
Answer: Based on the provided context, the study employs two types of deep learning models for the multiclass mental illness classification task:

1. **Bidirectional Long Short-Term Memory (BiLSTM)**: This model is designed with two bidirectional recurrent layers with 256 and 128 units, respectively, followed by dropout layers for regularization. The model also includes a dense layer with 64 units and rectified linear unit activation, preceding the final output layer, which uses softmax activation for multiclass classification. The model is optimized using the Adam optimizer with a configurable learning rate.

2. **Bidirectional Gated Recurrent Unit (BiGRU)**: This model shares the same architecture as the BiLSTM model, except for the type of recurrent layer used. It also comprises two bidirectional recurrent layers with 256 and 128 units, respectively, followed by dropout layers with a rate of 0.25 for regularization. 

In [16]:
# Ask for the title of the paper.
question = "What is the paper title in this study?"
answer = ask_question(question)
print(f"Question: {question}", f"Answer: {answer}", sep="\n")

Question: What is the paper title in this study?
Answer: Based on the provided context and chat history, the paper title in this study is:

**"MIRoBERTa: Mental Illness Text Classification with Transfer Learning on Subreddits"**

The authors of this paper are:
- MAVIN SAO
- HOI-JEONG LIM

Hoi-Jeong Lim is also identified as the corresponding author, with an email address of hjlim@jnu.ac.kr.

The study focuses on the application of machine learning and deep learning models for mental illness text classification, utilizing data from Reddit subreddits. If you have any further questions about the study or need additional details, feel free to ask!
