# Cliniview Health Data Analysis and Model Development

This notebook demonstrates basic health data analysis and visualization, focusing on patient symptoms and health risk predictions for the Cliniview platform.

In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import json
import os
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, classification_report
import pickle

# Configure the global look of every figure produced below.
# The matplotlib style sheet is applied first and the seaborn theme second,
# preserving the original call order.
plt.style.use('ggplot')
sns.set_theme(style="whitegrid")

## 1. Load and Explore Sample Data

First, let's load our sample medical data and explore its structure.

In [None]:
# Both datasets are JSON documents. They are opened with an explicit UTF-8
# encoding so that loading does not depend on the platform's default text
# encoding (which would otherwise be used by open()).

# Load symptom mapping data
with open(os.path.join('data', 'symptom_mapping.json'), 'r', encoding='utf-8') as f:
    symptom_data = json.load(f)

# Load medical terms data
with open(os.path.join('data', 'medical_terms.json'), 'r', encoding='utf-8') as f:
    medical_terms = json.load(f)

# Display the top-level keys of each dataset to get a first idea of its layout
print("Symptom Data Keys:")
print(list(symptom_data.keys()))

print("\nMedical Terms Data Keys:")
print(list(medical_terms.keys()))

In [None]:
# Mapping of condition name -> list of symptoms, taken from the symptom dataset
symptom_combinations = symptom_data['symptom_combinations']

# One record per condition: its name, its symptom list and how many symptoms it has
combinations_df = pd.DataFrame(
    [
        {
            'condition': condition_name,
            'symptoms': listed_symptoms,
            'symptom_count': len(listed_symptoms),
        }
        for condition_name, listed_symptoms in symptom_combinations.items()
    ],
    columns=['condition', 'symptoms', 'symptom_count'],
)

# Show the conditions with the most symptoms first
combinations_df.sort_values('symptom_count', ascending=False)

## 2. Visualize Symptom-Condition Relationships

Let's visualize how symptoms are related to medical conditions.

In [None]:
# Invert the condition -> symptoms mapping into symptom -> list of conditions
symptom_to_conditions = {}
for condition, symptoms in symptom_combinations.items():
    for symptom in symptoms:
        symptom_to_conditions.setdefault(symptom, []).append(condition)

# Count the conditions per symptom and rank symptoms from most to least shared
symptom_condition_counts = {
    symptom: len(linked_conditions)
    for symptom, linked_conditions in symptom_to_conditions.items()
}
sorted_symptoms = sorted(
    symptom_condition_counts.items(),
    key=lambda pair: pair[1],
    reverse=True,
)

# Keep only the ten highest-ranked symptoms for the chart
leading_pairs = sorted_symptoms[:10]
top_symptoms = [name for name, count in leading_pairs]
top_counts = [count for name, count in leading_pairs]

# Horizontal bar chart: one bar per symptom, length = number of conditions
fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(x=top_counts, y=top_symptoms, ax=ax)
ax.set_title('Top 10 Symptoms by Number of Associated Conditions')
ax.set_xlabel('Number of Conditions')
ax.set_ylabel('Symptom')
fig.tight_layout()
plt.show()

## 3. Generate Synthetic Patient Data

Let's create synthetic patient data for training our health risk prediction model.

In [None]:
# Function to generate synthetic patient data
def generate_synthetic_patients(n=1000, seed=42):
    """Generate a reproducible table of synthetic patient records.

    Demographics and health metrics are drawn from clipped normal
    distributions, binary risk factors from fixed probabilities, and the two
    risk scores are computed from them with a simplified demo formula.

    Parameters
    ----------
    n : int, default 1000
        Number of patients to generate.
    seed : int or None, default 42
        Seed for the random generator. The same seed always yields the same
        table; ``None`` gives a different table on every call.

    Returns
    -------
    pandas.DataFrame
        One row per patient with the columns ``patient_id``, ``age``,
        ``gender``, ``bmi``, ``systolic_bp``, ``diastolic_bp``, ``glucose``,
        ``cholesterol``, ``smoking``, ``family_history_heart``,
        ``family_history_diabetes``, ``cardiovascular_risk``,
        ``diabetes_risk``, ``cv_risk_category`` and
        ``diabetes_risk_category``. Risk scores lie in [0, 1]; categories are
        'Low', 'Medium' or 'High'.
    """
    # Use a private generator instead of np.random.seed() so that calling this
    # function does not reset NumPy's global random state for the caller.
    rng = np.random.RandomState(seed)

    # Generate basic demographic data
    ages = rng.normal(50, 15, n)  # Mean age 50, std 15
    ages = np.clip(ages, 18, 95).astype(int)  # Clip to reasonable age range

    genders = rng.choice(['Male', 'Female'], n)

    # Generate health metrics
    bmis = rng.normal(26, 4, n)  # Mean BMI 26, std 4
    bmis = np.clip(bmis, 16, 45)  # Clip to reasonable BMI range

    systolic_bp = rng.normal(125, 15, n)  # Mean systolic BP 125, std 15
    systolic_bp = np.clip(systolic_bp, 90, 200).astype(int)  # Clip to reasonable range

    diastolic_bp = rng.normal(80, 10, n)  # Mean diastolic BP 80, std 10
    diastolic_bp = np.clip(diastolic_bp, 50, 120).astype(int)  # Clip to reasonable range

    glucose = rng.normal(100, 20, n)  # Mean glucose 100, std 20
    glucose = np.clip(glucose, 70, 300).astype(int)  # Clip to reasonable range

    cholesterol = rng.normal(190, 30, n)  # Mean cholesterol 190, std 30
    cholesterol = np.clip(cholesterol, 120, 300).astype(int)  # Clip to reasonable range

    # Generate binary health factors
    smoking = rng.choice([0, 1], n, p=[0.75, 0.25])  # 25% smokers
    family_history_heart = rng.choice([0, 1], n, p=[0.7, 0.3])  # 30% family history of heart disease
    family_history_diabetes = rng.choice([0, 1], n, p=[0.8, 0.2])  # 20% family history of diabetes

    # Calculate risk scores (simplified formula for demo).
    # Negative raw scores are clipped to exactly 0, scores above 1 to exactly 1.
    cv_risk = 0.01 * (ages - 40) + 0.02 * (bmis - 25) + 0.005 * (systolic_bp - 120) + \
              0.1 * smoking + 0.05 * family_history_heart
    cv_risk = np.clip(cv_risk, 0, 1)

    diabetes_risk = 0.03 * (bmis - 25) + 0.002 * (glucose - 100) + \
                    0.1 * family_history_diabetes + 0.005 * (ages - 40)
    diabetes_risk = np.clip(diabetes_risk, 0, 1)

    # Create the DataFrame
    df = pd.DataFrame({
        'patient_id': [f'P{i:04d}' for i in range(1, n+1)],
        'age': ages,
        'gender': genders,
        'bmi': bmis,
        'systolic_bp': systolic_bp,
        'diastolic_bp': diastolic_bp,
        'glucose': glucose,
        'cholesterol': cholesterol,
        'smoking': smoking,
        'family_history_heart': family_history_heart,
        'family_history_diabetes': family_history_diabetes,
        'cardiovascular_risk': cv_risk,
        'diabetes_risk': diabetes_risk
    })

    # Add categorical risk levels.
    # include_lowest=True is required: the bins start at 0 and many scores are
    # clipped to exactly 0, which would otherwise fall outside the first bin
    # and be labelled NaN instead of 'Low'.
    risk_bins = [0, 0.4, 0.7, 1]
    risk_labels = ['Low', 'Medium', 'High']

    df['cv_risk_category'] = pd.cut(
        df['cardiovascular_risk'],
        bins=risk_bins,
        labels=risk_labels,
        include_lowest=True
    )

    df['diabetes_risk_category'] = pd.cut(
        df['diabetes_risk'],
        bins=risk_bins,
        labels=risk_labels,
        include_lowest=True
    )

    return df

# Build the 1000-patient synthetic cohort and preview its first rows
patient_data = generate_synthetic_patients(n=1000)
patient_data.head()

## 4. Visualize Health Risk Distribution

Let's visualize the distribution of health risks in our synthetic data.

In [None]:
# Histograms (with a KDE overlay) of the two continuous risk scores, side by side
score_panels = [
    ('cardiovascular_risk', 'Distribution of Cardiovascular Risk'),
    ('diabetes_risk', 'Distribution of Diabetes Risk'),
]

fig, axes = plt.subplots(1, 2, figsize=(12, 5))
for ax, (score_column, panel_title) in zip(axes, score_panels):
    sns.histplot(patient_data[score_column], kde=True, ax=ax)
    ax.set_title(panel_title)
    ax.set_xlabel('Risk Score')
    ax.set_ylabel('Count')

fig.tight_layout()
plt.show()

# Bar charts counting the patients in each risk category, side by side
category_panels = [
    ('cv_risk_category', 'Cardiovascular Risk Categories'),
    ('diabetes_risk_category', 'Diabetes Risk Categories'),
]

fig, axes = plt.subplots(1, 2, figsize=(12, 5))
for ax, (category_column, panel_title) in zip(axes, category_panels):
    sns.countplot(x=category_column, data=patient_data, ax=ax)
    ax.set_title(panel_title)
    ax.set_xlabel('Risk Category')
    ax.set_ylabel('Count')

fig.tight_layout()
plt.show()

## 5. Analyze Correlations between Health Metrics

Let's examine how different health metrics correlate with risk scores.

In [None]:
# Columns taking part in the correlation analysis: the health metrics,
# the binary risk factors and the two continuous risk scores
numeric_columns = [
    'age',
    'bmi',
    'systolic_bp',
    'diastolic_bp',
    'glucose',
    'cholesterol',
    'smoking',
    'family_history_heart',
    'family_history_diabetes',
    'cardiovascular_risk',
    'diabetes_risk',
]

# Pairwise correlations between the selected columns
corr_matrix = patient_data[numeric_columns].corr()

# Annotated heatmap of the correlation matrix
fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(
    corr_matrix,
    annot=True,
    cmap='coolwarm',
    fmt='.2f',
    linewidths=0.5,
    ax=ax,
)
ax.set_title('Correlation Matrix of Health Metrics')
fig.tight_layout()
plt.show()

## 6. Build a Simple Risk Prediction Model

Let's train a machine learning model to predict cardiovascular risk categories based on health metrics.

In [None]:
# Prepare features and target for cardiovascular risk prediction
feature_columns = ['age', 'bmi', 'systolic_bp', 'diastolic_bp', 'cholesterol', 'smoking', 'family_history_heart']
risk_labels = ['Low', 'Medium', 'High']  # display order used for the confusion matrix

# A classifier cannot be fitted on missing targets, so only rows that carry a
# risk category are used. When every patient has a category this keeps all rows.
labeled_patients = patient_data.dropna(subset=['cv_risk_category'])
X = labeled_patients[feature_columns]
y = labeled_patients['cv_risk_category']

# Split the data into training and testing sets (75% / 25%, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

# Train a Random Forest classifier
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

# Make predictions
y_pred = rf_model.predict(X_test)

# Evaluate the model
print("Classification Report:")
print(classification_report(y_test, y_pred))

# Plot confusion matrix.
# labels= fixes the row/column order to Low, Medium, High. Without it the
# matrix is ordered by sorted label (High, Low, Medium) and would not match
# the tick labels below; it also keeps the matrix 3x3 when a class is absent
# from the test split.
plt.figure(figsize=(8, 6))
conf_mat = confusion_matrix(y_test, y_pred, labels=risk_labels)
sns.heatmap(conf_mat, annot=True, fmt='d', cmap='Blues',
            xticklabels=risk_labels,
            yticklabels=risk_labels)
plt.title('Confusion Matrix for Cardiovascular Risk Prediction')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.tight_layout()
plt.show()

## 7. Feature Importance Analysis

Let's analyze which health metrics are most important for predicting cardiovascular risk.

In [None]:
# Importance scores of the fitted forest, alongside the feature column names
importances = rf_model.feature_importances_
feature_names = X.columns

# Feature positions ordered from the most to the least important
indices = np.argsort(importances)[::-1]
ranked_names = [feature_names[position] for position in indices]
bar_positions = list(range(X.shape[1]))

# Bar chart of the importances in ranked order
fig, ax = plt.subplots(figsize=(10, 6))
ax.set_title('Feature Importances for Cardiovascular Risk Prediction')
ax.bar(bar_positions, importances[indices], align='center')
ax.set_xticks(bar_positions)
ax.set_xticklabels(ranked_names, rotation=45)
fig.tight_layout()
plt.show()

## 8. Save the Model for Production Use

Let's save our trained model for use in the Cliniview ML service.

In [None]:
# Make sure the output directory models/saved exists before writing into it
model_dir = os.path.join('models', 'saved')
os.makedirs(model_dir, exist_ok=True)

# Serialize the trained classifier with pickle
model_path = os.path.join(model_dir, 'risk_prediction.pkl')
with open(model_path, 'wb') as model_file:
    pickle.dump(rf_model, model_file)

print(f"Model saved to {model_path}")

## 9. Conclusion

In this notebook, we've demonstrated:

1. Loading and exploring medical terminology and symptom mapping data
2. Visualizing symptom-condition relationships
3. Generating synthetic patient data for analysis
4. Analyzing health risk distributions and correlations
5. Building a predictive model for cardiovascular risk
6. Identifying the most important factors for risk prediction
7. Saving the trained model for use in the Cliniview ML service

This analysis provides a foundation for the ML services in the Cliniview platform, which include:
- Symptom checking and condition prediction
- Health summary generation
- Health risk prediction

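The saved model can be integrated with the FastAPI service to provide risk predictions. Note that it was trained on synthetic data generated from a simplified demo formula, so it must be retrained and validated on real clinical data before it is used to provide risk predictions for real patients.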