# Healthcare Symptom Association Discovery
## Unsupervised Learning with Apriori Algorithm

**Author**: Your Name  
**Date**: 2024  
**Objective**: Discover symptom co-occurrence patterns and disease associations using Association Rule Mining

---

### Project Overview
- **Technique**: Apriori Algorithm (Association Rule Mining)
- **Application**: Medical symptom pattern discovery
- **Output**: Association rules, visualizations, mobile app model

### Key Metrics
- **Support**: Frequency of symptom combinations
- **Confidence**: Probability of consequent given antecedent
- **Lift**: Strength of association (>1 = positive correlation)

## 1. Setup and Installation

In [None]:
# Install required packages (uncomment if running on Kaggle)
# !pip install mlxtend networkx plotly kaleido -q

In [None]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import networkx as nx
import plotly.graph_objects as go
import plotly.express as px
from mlxtend.frequent_patterns import apriori, association_rules
from mlxtend.preprocessing import TransactionEncoder
import json
import warnings
warnings.filterwarnings('ignore')

# Set style
sns.set_style("whitegrid")
plt.rcParams['figure.figsize'] = (12, 8)

print("✓ Libraries imported successfully")

## 2. Data Generation

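This section provides the dataset for the analysis: either generate synthetic medical data with realistic symptom-disease patterns (using the `data_generator.py` code) or load an existing dataset. Either way, the result must be available as a DataFrame named `df` for the cells that follow.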

In [None]:
# Data source: this cell does not create `df` by itself. Before running it,
# either
#   * paste the data_generator.py code above this point, or
#   * load a dataset, e.g. on Kaggle:
#       df = pd.read_csv('/kaggle/input/disease-symptom-prediction/dataset.csv')

# Fail early with an actionable message instead of announcing success and then
# crashing with a bare NameError on `df.shape`.
if 'df' not in globals():
    raise NameError(
        "`df` is not defined. Load a dataset with pd.read_csv(...) or paste the "
        "data_generator.py code into this cell before running it."
    )

print("Data loaded successfully")
print(f"Shape: {df.shape}")
# Last expression of the cell: the notebook displays the first rows.
df.head()

## 3. Data Preprocessing

In [None]:
# Extract symptom columns: every column except the bookkeeping/label ones
_non_symptom_cols = ('patient_id', 'disease', 'num_symptoms', 'symptoms')
symptom_cols = list(filter(lambda name: name not in _non_symptom_cols, df.columns))

n_symptoms = len(symptom_cols)
print(f"Total symptoms: {n_symptoms}")
print(f"\nSample symptoms: {symptom_cols[:10]}")

In [None]:
# Create binary matrix for Apriori: an independent copy of the symptom columns
df_binary = df.loc[:, symptom_cols].copy()

matrix_shape = df_binary.shape
print(f"Binary matrix shape: {matrix_shape}")
print("\nSample:")
# Last expression of the cell: the notebook displays the first rows.
df_binary.head()

## 4. Exploratory Data Analysis

In [None]:
# Symptom frequency: per-column totals, largest first
symptom_freq = df_binary.sum().sort_values(ascending=False)

# Bar chart of the 20 most frequent symptoms
fig, ax = plt.subplots(figsize=(14, 6))
symptom_freq.head(20).plot(kind='bar', color='steelblue', alpha=0.8, ax=ax)
ax.set_title('Top 20 Most Common Symptoms', fontsize=14, fontweight='bold')
ax.set_xlabel('Symptom', fontsize=12)
ax.set_ylabel('Frequency', fontsize=12)
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
ax.grid(axis='y', alpha=0.3)
fig.tight_layout()
plt.show()

top_symptom = symptom_freq.index[0]
top_count = symptom_freq.iloc[0]
print(f"\nMost common symptom: {top_symptom} ({top_count} occurrences)")

In [None]:
# Disease distribution (only drawn when the dataset has a 'disease' column)
if 'disease' in df.columns:
    disease_counts = df['disease'].value_counts()

    fig, ax = plt.subplots(figsize=(12, 6))
    disease_counts.plot(kind='bar', color='coral', alpha=0.8, ax=ax)
    ax.set_title('Disease Distribution', fontsize=14, fontweight='bold')
    ax.set_xlabel('Disease', fontsize=12)
    ax.set_ylabel('Count', fontsize=12)
    plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
    ax.grid(axis='y', alpha=0.3)
    fig.tight_layout()
    plt.show()

## 5. Association Rule Mining with Apriori

In [None]:
# Mining thresholds used by the Apriori and rule-generation cells
MIN_SUPPORT = 0.05      # minimum support for a frequent itemset
MIN_CONFIDENCE = 0.6    # minimum confidence for a rule
MIN_LIFT = 1.2          # minimum lift for a rule

print("Mining parameters:")
for _label, _value in (
    ("Min Support", MIN_SUPPORT),
    ("Min Confidence", MIN_CONFIDENCE),
    ("Min Lift", MIN_LIFT),
):
    print(f"  {_label}: {_value}")

In [None]:
# Apply Apriori algorithm
# Mines the itemsets of df_binary using MIN_SUPPORT as the minimum support.
# use_colnames=True is passed so that itemsets are reported by column (symptom)
# name — presumably instead of positional column indices; confirm in mlxtend docs.
print("Mining frequent itemsets...")
frequent_itemsets = apriori(df_binary, min_support=MIN_SUPPORT, use_colnames=True)

print(f"\n✓ Found {len(frequent_itemsets)} frequent itemsets")
# Last expression of the cell: the notebook displays the first 10 itemsets.
frequent_itemsets.head(10)

In [None]:
# Generate association rules from the frequent itemsets
print("Generating association rules...")
candidate_rules = association_rules(
    frequent_itemsets,
    metric="confidence",
    min_threshold=MIN_CONFIDENCE,
)

# Keep rules at or above the lift threshold, strongest lift first
rules = (
    candidate_rules
    .loc[candidate_rules['lift'] >= MIN_LIFT]
    .sort_values('lift', ascending=False)
)

print(f"\n✓ Generated {len(rules)} association rules")
# Last expression of the cell: the notebook displays the first 10 rules.
rules.head(10)

## 6. Rule Analysis

In [None]:
# Display top rules in readable format
print("=" * 80)
print("TOP 10 ASSOCIATION RULES")
print("=" * 80)

# The index labels of `rules` are not ranks (the frame was filtered and sorted
# by lift without resetting its index), so number the rules by their position
# in the sorted table rather than by index label.
for rank, (_, row) in enumerate(rules.head(10).iterrows(), start=1):
    # Itemsets are unordered; sort the names so the printout is reproducible.
    antecedents = ', '.join(sorted(row['antecedents']))
    consequents = ', '.join(sorted(row['consequents']))

    print(f"\nRule {rank}:")
    print(f"  IF: {antecedents}")
    print(f"  THEN: {consequents}")
    print(f"  Support: {row['support']:.3f} | Confidence: {row['confidence']:.3f} | Lift: {row['lift']:.3f}")

## 7. Visualizations

In [None]:
# Support vs Confidence Scatter Plot (marker colour and size both follow lift)
fig, ax = plt.subplots(figsize=(12, 8))

lift_values = rules['lift']
scatter = ax.scatter(
    rules['support'],
    rules['confidence'],
    c=lift_values,
    s=lift_values * 50,
    alpha=0.6,
    cmap='viridis',
    edgecolors='black',
    linewidth=0.5,
)

ax.set_xlabel('Support', fontsize=12, fontweight='bold')
ax.set_ylabel('Confidence', fontsize=12, fontweight='bold')
ax.set_title('Association Rules: Support vs Confidence (sized by Lift)',
             fontsize=14, fontweight='bold')
fig.colorbar(scatter, ax=ax, label='Lift')
ax.grid(True, alpha=0.3)
fig.tight_layout()
plt.show()

In [None]:
# Top Rules Bar Chart
top_rules = rules.nlargest(20, 'lift').copy()

# Build the labels with a plain comprehension: DataFrame.apply(axis=1) on an
# empty frame can hand back a DataFrame instead of a Series, which cannot be
# assigned to a single column when no rule passed the filters.
# Item names are sorted so that the labels are reproducible between runs.
top_rules['rule'] = [
    f"{', '.join(sorted(ants))} → {', '.join(sorted(cons))}"
    for ants, cons in zip(top_rules['antecedents'], top_rules['consequents'])
]

plt.figure(figsize=(14, 10))
y_pos = np.arange(len(top_rules))
plt.barh(y_pos, top_rules['lift'], color='steelblue', alpha=0.8)
plt.yticks(y_pos, top_rules['rule'], fontsize=9)
plt.xlabel('Lift', fontsize=12, fontweight='bold')
plt.title('Top 20 Association Rules by Lift', fontsize=14, fontweight='bold')
plt.grid(axis='x', alpha=0.3)
plt.tight_layout()
plt.show()

In [None]:
# Symptom Co-occurrence Heatmap
# Cast to int before the matrix product: a product of boolean columns stays
# boolean (losing the counts), and fmt='d' below can only format integers.
symptom_counts = df_binary.astype(int)
# Entry (i, j) is the number of rows in which symptoms i and j are both set.
co_occurrence = symptom_counts.T.dot(symptom_counts)
top_symptoms = symptom_freq.head(20).index
co_occurrence_top = co_occurrence.loc[top_symptoms, top_symptoms]

plt.figure(figsize=(14, 12))
sns.heatmap(co_occurrence_top, annot=True, fmt='d', cmap='YlOrRd',
            square=True, linewidths=0.5, cbar_kws={'label': 'Co-occurrence Count'})
plt.title('Top 20 Symptom Co-occurrence Heatmap', fontsize=14, fontweight='bold')
plt.tight_layout()
plt.show()

In [None]:
# Network Graph
# Directed graph with one edge per (antecedent item -> consequent item) pair
# taken from the 30 highest-lift rules; an edge's weight is the sum of the
# lifts of all rules that contribute to it.
G = nx.DiGraph()

for _, row in rules.nlargest(30, 'lift').iterrows():
    for ant in row['antecedents']:
        for cons in row['consequents']:
            if G.has_edge(ant, cons):
                G[ant][cons]['weight'] += row['lift']
            else:
                G.add_edge(ant, cons, weight=row['lift'])

if G.number_of_edges() == 0:
    # Without rules there is nothing to draw, and max() of an empty weight
    # list below would raise ValueError.
    print("No association rules available to draw the network graph.")
else:
    pos = nx.spring_layout(G, k=2, iterations=50, seed=42)

    plt.figure(figsize=(16, 12))
    # Node size grows with the number of edges touching the node.
    node_sizes = [G.degree(node) * 300 for node in G.nodes()]
    nx.draw_networkx_nodes(G, pos, node_size=node_sizes,
                           node_color='lightblue', alpha=0.9,
                           edgecolors='darkblue', linewidths=2)

    edges = G.edges()
    weights = [G[u][v]['weight'] for u, v in edges]
    # Compute the maximum once; edge widths are scaled so the heaviest is 5.
    max_weight = max(weights)
    nx.draw_networkx_edges(G, pos, width=[w / max_weight * 5 for w in weights],
                           alpha=0.5, edge_color='gray',
                           arrows=True, arrowsize=20)

    nx.draw_networkx_labels(G, pos, font_size=10, font_weight='bold')

    plt.title('Symptom Association Network (Top 30 Rules)', fontsize=16, fontweight='bold')
    plt.axis('off')
    plt.tight_layout()
    plt.show()

## 8. Algorithm Performance Comparison

We will compare three algorithms: **Apriori**, **FP-Growth**, and **ECLAT** (custom implementation) in terms of execution time and memory usage.


In [None]:
# Custom ECLAT Implementation
import time

class ECLAT:
    """Frequent-itemset miner based on intersecting per-item transaction-id sets.

    Each item (column) is mapped to the set of row labels in which it equals 1;
    larger itemsets are grown depth-first by intersecting those sets.

    Attributes:
        min_support: Minimum support as a fraction of the number of rows.
        min_items: Stored as given; the mining code in this class does not
            read it.
        item_tid_sets: Maps each frequent itemset (frozenset) found by the last
            ``fit`` to the set of row labels containing it.
        frequent_itemsets: List of ``(itemset, support)`` tuples from the last
            ``fit``, where ``support`` is a fraction of the number of rows.
        start_time, end_time: ``time.time()`` stamps taken at the start and end
            of the last ``fit``.
    """

    def __init__(self, min_support=0.05, min_items=1):
        self.min_support = min_support
        self.min_items = min_items
        self.item_tid_sets = {}
        self.frequent_itemsets = []
        self.start_time = 0
        self.end_time = 0

    def fit(self, df_binary):
        """Mine all frequent itemsets of ``df_binary`` and return ``self``.

        Args:
            df_binary: DataFrame with one column per item; a cell equal to 1
                means the item is present in that row.

        Returns:
            This instance, with ``frequent_itemsets`` and ``item_tid_sets``
            describing only this call's input.
        """
        self.start_time = time.time()

        # Start from a clean slate so that calling fit() again does not mix
        # itemsets from a previous run into the new result.
        self.item_tid_sets = {}
        self.frequent_itemsets = []

        self.n_transactions = len(df_binary)
        self.min_support_count = self.min_support * self.n_transactions

        # With no rows there is nothing to mine, and the support computation
        # in _mine would divide by zero.
        if self.n_transactions == 0:
            self.end_time = time.time()
            return self

        # 1. Transform horizontal to vertical format (Item -> TID set)
        for col in df_binary.columns:
            tids = set(df_binary.index[df_binary[col] == 1].tolist())
            if len(tids) >= self.min_support_count:
                self.item_tid_sets[frozenset([col])] = tids

        # 2. Mine recursively
        self._mine(list(self.item_tid_sets.keys()))

        self.end_time = time.time()
        return self

    def _mine(self, itemsets):
        """Record every itemset in ``itemsets`` and recurse into its extensions.

        Args:
            itemsets: Frequent itemsets (keys of ``item_tid_sets``) that belong
                to the same level of the search.
        """
        for i, itemset_i in enumerate(itemsets):
            tids_i = self.item_tid_sets[itemset_i]
            self.frequent_itemsets.append(
                (itemset_i, len(tids_i) / self.n_transactions)
            )

            # Combine itemset_i only with itemsets that come after it, so each
            # combination is generated once.
            suffix_itemsets = []
            for itemset_j in itemsets[i + 1:]:
                tids_join = tids_i & self.item_tid_sets[itemset_j]

                if len(tids_join) >= self.min_support_count:
                    new_itemset = itemset_i | itemset_j
                    self.item_tid_sets[new_itemset] = tids_join
                    suffix_itemsets.append(new_itemset)

            if suffix_itemsets:
                self._mine(suffix_itemsets)


In [None]:
import tracemalloc
from mlxtend.frequent_patterns import fpgrowth

def run_apriori(df, min_support):
    """Run mlxtend's ``apriori`` on ``df`` and time it.

    Args:
        df: Binary item DataFrame passed straight to ``apriori``.
        min_support: Minimum support threshold passed to ``apriori``.

    Returns:
        Tuple ``(elapsed_seconds, n_itemsets)``.
    """
    # perf_counter is the clock intended for measuring short intervals; unlike
    # time.time() it is not affected by system clock adjustments.
    start = time.perf_counter()
    res = apriori(df, min_support=min_support, use_colnames=True)
    elapsed = time.perf_counter() - start
    return elapsed, len(res)

def run_fpgrowth(df, min_support):
    """Run mlxtend's ``fpgrowth`` on ``df`` and time it.

    Args:
        df: Binary item DataFrame passed straight to ``fpgrowth``.
        min_support: Minimum support threshold passed to ``fpgrowth``.

    Returns:
        Tuple ``(elapsed_seconds, n_itemsets)``.
    """
    # perf_counter is the clock intended for measuring short intervals; unlike
    # time.time() it is not affected by system clock adjustments.
    start = time.perf_counter()
    res = fpgrowth(df, min_support=min_support, use_colnames=True)
    elapsed = time.perf_counter() - start
    return elapsed, len(res)

def run_eclat(df, min_support):
    """Fit the custom ECLAT miner on ``df``.

    Returns:
        Tuple ``(elapsed_seconds, n_itemsets)``, where the elapsed time is the
        difference between the miner's own end and start stamps.
    """
    miner = ECLAT(min_support=min_support)
    miner.fit(df)
    elapsed = miner.end_time - miner.start_time
    n_itemsets = len(miner.frequent_itemsets)
    return elapsed, n_itemsets

def measure_performance(func, *args):
    """Call ``func(*args)`` while recording elapsed time and peak traced memory.

    Args:
        func: Callable to measure.
        *args: Positional arguments forwarded to ``func``.

    Returns:
        Tuple ``(elapsed_seconds, peak_mib, result)`` where ``peak_mib`` is the
        peak memory reported by ``tracemalloc`` divided by 1024 * 1024 and
        ``result`` is whatever ``func`` returned.

    Raises:
        Whatever ``func`` raises; tracing is stopped before the exception
        propagates.
    """
    tracemalloc.start()
    try:
        # The timed region runs with tracemalloc active, so the elapsed time
        # includes the tracing overhead.
        start_time = time.perf_counter()
        result = func(*args)
        elapsed = time.perf_counter() - start_time
        _, peak = tracemalloc.get_traced_memory()
    finally:
        # Always stop tracing, otherwise a failing func would leave tracemalloc
        # running for the rest of the session.
        tracemalloc.stop()
    return elapsed, peak / (1024 * 1024), result


In [None]:
# Run Comparison
supports = [0.2, 0.1, 0.05, 0.03]
# NOTE(review): `results` is initialised here but never filled in this cell.
results = {'Support': supports, 'Apriori': [], 'FP-Growth': [], 'ECLAT': []}
times = {'Apriori': [], 'FP-Growth': [], 'ECLAT': []}
# Peak traced memory per algorithm, as returned by measure_performance.
# It was previously measured and discarded.
peak_memory_mb = {'Apriori': [], 'FP-Growth': [], 'ECLAT': []}

print(f"{'Support':<8} | {'Apriori (s)':<12} | {'FP-Growth (s)':<12} | {'ECLAT (s)':<12}")
print("-" * 60)

for sup in supports:
    # The runner and its arguments are passed directly; measure_performance
    # forwards *args, so no lambda wrapper is needed.

    # Measure Apriori
    t_ap, m_ap, _ = measure_performance(run_apriori, df_binary, sup)
    times['Apriori'].append(t_ap)
    peak_memory_mb['Apriori'].append(m_ap)

    # Measure FP-Growth
    t_fp, m_fp, _ = measure_performance(run_fpgrowth, df_binary, sup)
    times['FP-Growth'].append(t_fp)
    peak_memory_mb['FP-Growth'].append(m_fp)

    # Measure ECLAT
    t_ec, m_ec, _ = measure_performance(run_eclat, df_binary, sup)
    times['ECLAT'].append(t_ec)
    peak_memory_mb['ECLAT'].append(m_ec)

    print(f"{sup:<8.2f} | {t_ap:.4f}s      | {t_fp:.4f}s      | {t_ec:.4f}s")

# Memory half of the comparison
print(f"\n{'Support':<8} | {'Apriori (MiB)':<15} | {'FP-Growth (MiB)':<15} | {'ECLAT (MiB)':<15}")
print("-" * 62)
for _sup, _mem_ap, _mem_fp, _mem_ec in zip(
    supports,
    peak_memory_mb['Apriori'],
    peak_memory_mb['FP-Growth'],
    peak_memory_mb['ECLAT'],
):
    print(f"{_sup:<8.2f} | {_mem_ap:<15.2f} | {_mem_fp:<15.2f} | {_mem_ec:<15.2f}")


In [None]:
# Plot Execution Time Comparison: one line per algorithm
plt.figure(figsize=(10, 6))
for _algo, _marker in (('Apriori', 'o'), ('FP-Growth', 's'), ('ECLAT', '^')):
    plt.plot(supports, times[_algo], marker=_marker, label=_algo, linewidth=2)

plt.title('Algorithm Execution Time Comparison', fontsize=14, fontweight='bold')
plt.xlabel('Minimum Support', fontsize=12)
plt.ylabel('Time (s)', fontsize=12)
plt.legend()
plt.grid(True, alpha=0.3)
# Reverse the x axis so the support threshold decreases from left to right.
plt.gca().invert_xaxis()
plt.show()


## 9. Model Export for Mobile App

In [None]:
# Export rules to JSON
# Itemsets are unordered, so their item names are sorted to make the exported
# file identical between runs on the same rules.
rules_list = [
    {
        'antecedents': sorted(row['antecedents']),
        'consequents': sorted(row['consequents']),
        'support': float(row['support']),
        'confidence': float(row['confidence']),
        'lift': float(row['lift']),
    }
    for _, row in rules.iterrows()
]

export_data = {
    'metadata': {
        'total_rules': len(rules),
        'min_support': MIN_SUPPORT,
        'min_confidence': MIN_CONFIDENCE,
        'min_lift': MIN_LIFT,
        'total_symptoms': len(symptom_cols)
    },
    'symptoms': sorted(symptom_cols),
    'rules': rules_list
}

# Save to JSON; the encoding is stated explicitly so the file does not depend
# on the platform's default text encoding.
with open('association_rules.json', 'w', encoding='utf-8') as f:
    json.dump(export_data, f, indent=2)

print(f"✓ Exported {len(rules_list)} rules to association_rules.json")
print(f"  Ready for Flutter mobile app integration!")

## 10. Summary Statistics

In [None]:
# Final report: dataset size, mining results and the five highest-ranked rules
banner = "=" * 80

print(banner)
print("ANALYSIS SUMMARY")
print(banner)

n_patients = len(df)
n_symptoms = len(symptom_cols)
avg_symptoms = df_binary.sum(axis=1).mean()
print("\nDataset:")
print(f"  Total patients: {n_patients}")
print(f"  Total symptoms: {n_symptoms}")
print(f"  Avg symptoms per patient: {avg_symptoms:.2f}")

avg_confidence = rules['confidence'].mean()
avg_lift = rules['lift'].mean()
print("\nAssociation Mining:")
print(f"  Frequent itemsets: {len(frequent_itemsets)}")
print(f"  Association rules: {len(rules)}")
print(f"  Avg confidence: {avg_confidence:.3f}")
print(f"  Avg lift: {avg_lift:.3f}")

print("\nTop Symptom Associations:")
for _, top_rule in rules.head(5).iterrows():
    ant = ', '.join(list(top_rule['antecedents']))
    cons = ', '.join(list(top_rule['consequents']))
    print(f"  {ant} → {cons} (lift: {top_rule['lift']:.2f})")

print("\n" + banner)
print("✓ Analysis complete! Download association_rules.json for mobile app.")
print(banner)