In [10]:
import pandas as pd

# Read each source CSV into its own frame (adjust file paths as needed)
medical_df = pd.read_csv('ai-medical-chatbot.csv')
mental_health_df = pd.read_csv('mental-health.csv')

# Stack the source frames row-wise into a single frame with a fresh 0..n-1 index
source_frames = [medical_df, mental_health_df]
df = pd.concat(source_frames, ignore_index=True)

In [11]:
# Optional preprocessing: normalise the description text to lowercase
description_lower = df['Description'].str.lower()
df['Description'] = description_lower

In [12]:
# Lowercase the question/answer text columns contributed by both source datasets
for text_col in ('Patient', 'Doctor', 'Context', 'Response'):
    df[text_col] = df[text_col].str.lower()

In [13]:
from sentence_transformers import SentenceTransformer

# Instantiate the sentence-embedding model from its checkpoint name
MODEL_NAME = 'all-MiniLM-L6-v2'
model = SentenceTransformer(MODEL_NAME)

In [None]:
import pandas as pd
import numpy as np
import joblib
from sentence_transformers import SentenceTransformer
from tqdm import tqdm
from joblib import Parallel, delayed

# Step 1: Describe each source file as (file, question column, answer column)
# and expand the tuples into the dict records the loading step reads.
datasets = [
    dict(file=csv_file, question_col=question_col, answer_col=answer_col)
    for csv_file, question_col, answer_col in (
        ('ai-medical-chatbot.csv', 'Patient', 'Doctor'),
        ('mental-health.csv', 'Context', 'Response'),
    )
]

# Step 2: Load and combine the datasets
# Loading is best-effort per file: a source that cannot be read is reported and
# skipped so the remaining sources can still be used.
dfs = []
for dataset in datasets:
    try:
        df_temp = pd.read_csv(dataset['file'], usecols=[dataset['question_col'], dataset['answer_col']])
    except (OSError, ValueError) as e:
        # OSError covers missing/unreadable files; ValueError covers pandas parser
        # errors, empty files, undecodable bytes and a `usecols` entry that is not
        # a column of the file. Anything else is a programming error and should
        # surface instead of being silently skipped.
        print(f"Error loading {dataset['file']}: {e}")
        continue
    # Map the per-dataset column names onto the shared question/answer schema
    df_temp = df_temp.rename(columns={dataset['question_col']: 'question', dataset['answer_col']: 'answer'})
    dfs.append(df_temp)

# Without this guard pd.concat([]) raises an opaque "No objects to concatenate";
# raise the same exception type with a message that names the files to check.
if not dfs:
    raise ValueError(
        "No datasets could be loaded; check that these files exist and contain "
        "the expected columns: " + ", ".join(d['file'] for d in datasets)
    )

# Combine all data into one DataFrame
df = pd.concat(dfs, ignore_index=True)

# Drop incomplete pairs *before* sampling. Otherwise `astype(str)` below turns a
# missing question into the literal text 'nan', which would be embedded as if it
# were a real question, and a missing answer would be saved as NaN.
df = df.dropna(subset=['question', 'answer'])

# Optional: Use only a random sample of 10,000 rows for faster processing
# (fixed random_state keeps the sample reproducible between runs)
df = df.sample(n=min(10000, len(df)), random_state=42)

# Step 3: Basic preprocessing (convert questions to lowercase)
# astype(str) guards against non-string values (e.g. numbers) in the column
df['question'] = df['question'].astype(str).str.lower()

# Step 4: Load a lightweight embedding model, identified by its checkpoint name
embedding_model_name = 'all-MiniLM-L6-v2'
model = SentenceTransformer(embedding_model_name)

# Step 5: Define a batch processing function
def process_batch(batch):
    """Encode one batch of questions with the module-level ``model``.

    Args:
        batch: The questions to encode; this notebook passes a list of strings.

    Returns:
        The value returned by ``model.encode`` for ``batch``. The encoder's own
        progress bar is disabled for the call.
    """
    batch_embeddings = model.encode(batch, show_progress_bar=False)
    return batch_embeddings

# Step 6: Split data into batches
batch_size = 2000  # Adjust based on available RAM
# Convert to a plain list once, then cut it into consecutive chunks of at most
# batch_size questions (the last chunk may be shorter).
questions = df['question'].tolist()
batches = [
    questions[start:start + batch_size]
    for start in range(0, len(questions), batch_size)
]

# Step 7: Generate embeddings in parallel (faster execution)
# Build one deferred process_batch call per batch; tqdm reports progress as the
# calls are handed to joblib, which runs them on all available cores (n_jobs=-1).
encode_jobs = (
    delayed(process_batch)(chunk)
    for chunk in tqdm(batches, desc="Generating Embeddings")
)
per_batch_embeddings = Parallel(n_jobs=-1)(encode_jobs)

# Stack the per-batch results row-wise into a single array
embeddings = np.vstack(per_batch_embeddings)

# Step 8: Persist the embeddings, then the answers as a plain list in the same
# row order as the frame the questions were taken from
joblib.dump(embeddings, 'embeddings.joblib')
answers = df['answer'].tolist()
joblib.dump(answers, 'answers.joblib')

print("✅ Embeddings and answers saved successfully!")


Generating Embeddings: 100%|██████████| 5/5 [00:00<00:00,  5.97it/s]


✅ Embeddings and answers saved successfully!
