# Medical Symptoms Checker - EDA and Preprocessing

This notebook explores the symptom data and tests the preprocessing pipeline.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sys
import os

# Add parent directory to path to import our modules.
# The relative '..' entry is resolved against the current working directory,
# so the `src` imports below presumably rely on the notebook being run from a
# direct sub-directory of the project root.
sys.path.append('..')

# Project-local names used throughout the notebook: the preprocessing agent,
# the symptom vocabulary (used to pick out symptom columns) and the triage
# label -> name mapping.
from src.preprocessing import PreprocessingAgent
from src.config import SYMPTOM_VOCABULARY, TRIAGE_LEVELS

# Set up plotting.
# NOTE(review): keep this order. Applying a matplotlib style after
# sns.set_palette() would presumably reset the colour cycle that the palette
# installs; confirm before reordering.
plt.style.use('default')
sns.set_palette("husl")

## 1. Load and Explore Data

In [None]:
# Read the sample symptom cases and print a quick structural overview:
# the (rows, columns) shape followed by the list of column names.
df = pd.read_csv('../data/sample_symptom_cases.csv')
print(
    f"Dataset shape: {df.shape}",
    f"\nColumns: {df.columns.tolist()}",
    "\nFirst few rows:",
    sep="\n",
)
# Left as the cell's final expression so the notebook renders the preview.
df.head()

In [None]:
# Basic statistics
print("Dataset Info:")
# DataFrame.info() writes its report straight to stdout and returns None, so
# it is called directly; wrapping it in print() would add a stray "None" line.
df.info()
print("\nTriage Label Distribution:")
print(df['triage_label'].value_counts().sort_index())

# Map labels to readable names
df['triage_name'] = df['triage_label'].map(TRIAGE_LEVELS)

# A label that TRIAGE_LEVELS cannot translate becomes NaN, and value_counts()
# leaves NaN out by default, so such rows would disappear from the summary
# without a trace. Surface them instead of failing silently.
unmapped_labels = df.loc[
    df['triage_name'].isna() & df['triage_label'].notna(), 'triage_label'
].unique()
if len(unmapped_labels) > 0:
    print(f"\nWarning: no triage name for label(s): {unmapped_labels.tolist()}")

print("\nTriage Distribution:")
print(df['triage_name'].value_counts())

In [None]:
# Two side-by-side panels: number of cases per triage name (left) and the
# spread of ages within each triage name (right).
fig, (counts_ax, age_ax) = plt.subplots(nrows=1, ncols=2, figsize=(12, 5))

# Left panel: bar chart of case counts per triage name.
triage_counts = df['triage_name'].value_counts()
triage_counts.plot.bar(ax=counts_ax)
counts_ax.set_title('Triage Level Distribution')
counts_ax.set_xlabel('Triage Level')
counts_ax.set_ylabel('Count')

# Right panel: one age box per triage name.
sns.boxplot(data=df, x='triage_name', y='age', ax=age_ax)
age_ax.set_title('Age Distribution by Triage Level')

# Tilt the category labels on both panels.
for panel in (counts_ax, age_ax):
    panel.tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.show()

## 2. Test Preprocessing Pipeline

In [None]:
# Build the agent once; the dataset-processing cell further down reuses it.
preprocessor = PreprocessingAgent()

# Free-text complaints used as a quick smoke test of the pipeline.
sample_texts = [
    "I have severe chest pain and can't breathe",
    "Running nose and mild headache",
    "High fever with body aches for 3 days"
]


def _show_preprocessing_steps(agent, number, text):
    """Print the output of each preprocessing stage for one complaint.

    Args:
        agent: Object providing ``clean_text``, ``extract_symptoms`` and
            ``extract_severity_indicators``.
        number: 1-based position of the example, used in the heading.
        text: Raw complaint text to run through the stages.
    """
    print(f"\nExample {number}: {text}")

    # Stage 1: text cleaning.
    cleaned_text = agent.clean_text(text)
    print(f"Cleaned: {cleaned_text}")

    # Stage 2: symptom extraction. The result is iterated with .items() and
    # only the keys whose value is truthy are reported.
    symptom_flags = agent.extract_symptoms(cleaned_text)
    present = [name for name, is_present in symptom_flags.items() if is_present]
    print(f"Detected symptoms: {present}")

    # Stage 3: severity indicators, printed as returned.
    severity_info = agent.extract_severity_indicators(cleaned_text)
    print(f"Severity indicators: {severity_info}")

    print("-" * 30)


print("Testing Preprocessing Pipeline:")
print("=" * 50)

for number, text in enumerate(sample_texts, start=1):
    _show_preprocessing_steps(preprocessor, number, text)

In [None]:
# Run the preprocessing agent over the whole frame, using the
# 'complaint_text' column as the text source.
print("Processing entire dataset...")
processed_df = preprocessor.preprocess_for_training(df, 'complaint_text')

# Report the resulting shape and column names.
print(
    f"Processed dataset shape: {processed_df.shape}",
    f"\nFeature columns: {processed_df.columns.tolist()}",
    "\nSample processed features:",
    sep="\n",
)
# Left as the cell's final expression so the notebook renders the preview.
processed_df.head()

## 3. Analyze Symptom Patterns

In [None]:
# Symptom columns are the processed columns whose name is a key of
# SYMPTOM_VOCABULARY; column order follows processed_df.
symptom_cols = [name for name in processed_df.columns if name in SYMPTOM_VOCABULARY]

# Column-wise totals over the whole dataset, largest first.
symptom_totals = processed_df[symptom_cols].sum()
symptom_freq = symptom_totals.sort_values(ascending=False)

print("Symptom Frequency:")
print(symptom_freq)

# Bar chart of the totals, drawn on a fresh figure.
plt.figure(figsize=(12, 6))
freq_ax = symptom_freq.plot.bar()
freq_ax.set_title('Symptom Frequency in Dataset')
freq_ax.set_xlabel('Symptoms')
freq_ax.set_ylabel('Frequency')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

In [None]:
# Analyze symptom patterns by triage level.
# Add triage labels to processed data.
# NOTE(review): these assignments align on the row index, so they assume
# processed_df still carries the same index as df - confirm that
# preprocess_for_training does not drop or re-index rows.
processed_df['triage_label'] = df['triage_label']
processed_df['triage_name'] = df['triage_name']

# Per-level symptom totals, keyed by triage name.
# Built with a comprehension so no loop variable leaks into the notebook
# namespace: in particular, the dataset-wide `symptom_freq` Series from the
# previous cell is no longer overwritten with one level's subset.
# Missing triage names are skipped because `== NaN` never matches a row, so
# such a column could only ever contain zeros.
triage_symptom_analysis = {
    level: processed_df.loc[processed_df['triage_name'] == level, symptom_cols].sum()
    for level in processed_df['triage_name'].dropna().unique()
}

# Create DataFrame for analysis: one row per symptom, one column per level.
symptom_by_triage = pd.DataFrame(triage_symptom_analysis).fillna(0)
print("Symptom frequency by triage level:")
print(symptom_by_triage)

In [None]:
# Visualize symptom patterns by triage level.
# symptom_by_triage has one row per symptom and one column per triage level.
# Transposing it puts triage levels on the heatmap rows (y-axis) and symptoms
# on the heatmap columns (x-axis), so the axis labels follow that orientation.
plt.figure(figsize=(14, 8))
sns.heatmap(symptom_by_triage.T, annot=True, cmap='YlOrRd', fmt='g')
plt.title('Symptom Patterns by Triage Level')
plt.xlabel('Symptoms')
plt.ylabel('Triage Level')
plt.tight_layout()
plt.show()

## 4. Feature Engineering Analysis

In [None]:
# Analyze feature distributions: one box plot per engineered feature, split
# by triage name.
feature_cols = ['symptom_count', 'text_length', 'pain_severity', 'duration_severity', 'intensity_severity']

# Only plot features that preprocessing actually produced, and say which ones
# were skipped instead of leaving an unexplained blank panel.
available_cols = [col for col in feature_cols if col in processed_df.columns]
missing_cols = [col for col in feature_cols if col not in processed_df.columns]
if missing_cols:
    print(f"Skipping features not present in processed data: {missing_cols}")

fig, axes = plt.subplots(2, 3, figsize=(15, 10))
axes = axes.ravel()

# Fill the grid from the first panel onwards so the plots stay contiguous.
for ax, col in zip(axes, available_cols):
    sns.boxplot(data=processed_df, x='triage_name', y=col, ax=ax)
    ax.set_title(f'{col} by Triage Level')
    ax.tick_params(axis='x', rotation=45)

# Remove every panel that received no plot, not just the last one.
for unused_ax in axes[len(available_cols):]:
    fig.delaxes(unused_ax)

plt.tight_layout()
plt.show()

## 5. Save Processed Data

In [None]:
# Persist the processed frame for model training. index=False leaves the row
# index out of the file; every column currently on processed_df is written.
output_path = '../data/processed_symptom_cases.csv'
processed_df.to_csv(output_path, index=False)
print(f"Processed dataset saved to: {output_path}")

# Closing summary: size, class balance, and two feature averages.
n_samples, n_features = processed_df.shape
print("\nDataset Summary:")
print(f"Total samples: {n_samples}")
print(f"Features: {n_features}")
print("Triage distribution:")
print(processed_df['triage_name'].value_counts())

mean_symptoms = processed_df['symptom_count'].mean()
print(f"\nAverage symptoms per case: {mean_symptoms:.2f}")

mean_text_length = processed_df['text_length'].mean()
print(f"Average text length: {mean_text_length:.2f} words")