# Medical Assistant Bot - Data Analysis

This notebook performs comprehensive data analysis to:
1. Load and examine our medical conversation dataset
2. Analyze data quality, distribution, and characteristics
3. Determine if dataset augmentation is needed
4. Make recommendations for preprocessing and model architecture


In [None]:
import sys
import os
sys.path.append('../src')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
import json

from dataset_loader import MedicalDatasetLoader
from data_processor import MedicalDataProcessor

# Plot configuration for the notebook: select matplotlib's "default" style
# sheet, then set seaborn's colour palette to "husl".
plt.style.use('default')
sns.set_palette("husl")

# Confirmation that the import/setup cell ran to completion.
print(" All libraries imported successfully")


In [None]:
# --- Load the datasets ---------------------------------------------------
# Fetch the 'sample' and 'synthetic' datasets through the project loader,
# ask the loader for their combination, and wrap the combined entries in a
# DataFrame (`df`) for the analysis that follows.
loader = MedicalDatasetLoader()

print(" Loading medical conversation dataset...")
data = loader.load_dataset('sample')
synthetic_data = loader.load_dataset('synthetic')
combined_data = loader.combine_datasets(['sample', 'synthetic'])

# Report how many entries each source contributed, then the combined total.
for _source_name, _entries in (("sample", data), ("synthetic", synthetic_data)):
    print(f"Loaded {len(_entries)} {_source_name} entries")
print(f"Combined total: {len(combined_data)} entries")

df = pd.DataFrame(combined_data)

print("\n Dataset Overview:")
print(f"Shape: {df.shape}")
print(f"Columns: {list(df.columns)}")

# Kept as the final expression so the notebook renders the first rows.
df.head()


In [None]:
# --- Dataset adequacy assessment -----------------------------------------
# Summarises the size and text lengths of `df`, decides whether the dataset
# should be augmented before training, and prints the distribution of the
# 'medical_category' and 'urgency' columns.

# Augmentation is recommended when the dataset falls below either threshold.
MIN_TOTAL_ENTRIES = 100
MIN_MEDICAL_CATEGORIES = 5


def _print_distribution(counts, total):
    """Print one ``label: count (pct%)`` line per entry of *counts*.

    Args:
        counts: Occurrence counts indexed by label, as produced by
            ``Series.value_counts()``.
        total: Denominator for the percentage (the number of rows in the
            frame the counts were taken from).
    """
    for label, count in counts.items():
        percentage = (count / total) * 100
        print(f"   {label}: {count} ({percentage:.1f}%)")


print(" DATASET ADEQUACY ASSESSMENT")
print("=" * 50)

total_entries = len(df)
unique_categories = df['medical_category'].nunique()
unique_conditions = df['condition'].nunique()

# Character lengths are stored on the frame as two new columns.
df['input_length'] = df['input'].str.len()
df['response_length'] = df['response'].str.len()
avg_input_length = df['input_length'].mean()
avg_response_length = df['response_length'].mean()

print("\n1. DATASET SIZE:")
print(f"   Total entries: {total_entries}")
print(f"   Medical categories: {unique_categories}")
print(f"   Unique conditions: {unique_conditions}")

print("\n2. TEXT QUALITY:")
print(f"   Avg input length: {avg_input_length:.0f} characters")
print(f"   Avg response length: {avg_response_length:.0f} characters")

needs_augmentation = (
    total_entries < MIN_TOTAL_ENTRIES
    or unique_categories < MIN_MEDICAL_CATEGORIES
)

print("\n3. RECOMMENDATION:")
if needs_augmentation:
    print("     Dataset needs augmentation for robust training")
    print("    Proceed to data augmentation phase")
else:
    print("    Dataset adequate for initial training")
    print("    Proceed to preprocessing phase")

# NOTE: value_counts() skips missing values by default while the percentage
# is taken over all rows, so the shares can sum to less than 100% if a
# column contains gaps.
print("\n4. MEDICAL CATEGORY DISTRIBUTION:")
category_counts = df['medical_category'].value_counts()
_print_distribution(category_counts, total_entries)

# Show urgency distribution
print("\n5. URGENCY LEVEL DISTRIBUTION:")
urgency_counts = df['urgency'].value_counts()
_print_distribution(urgency_counts, total_entries)

next_phase = 'Data Augmentation' if needs_augmentation else 'Data Preprocessing'
print("\n Analysis complete! Ready for next phase.")
print(f" Next: {next_phase}")
