SECTION 1. Imports are placed here.

In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from scipy.stats import gaussian_kde

# Set seed for reproducibility
np.random.seed(1234)

SECTION 2. This part is the "Medication Adherence Analysis". Here we analyze when patients refill their prescriptions in order to understand adherence patterns and to help identify potential adherence issues.

In [None]:
# Define the MedicationAnalyzer class
class MedicationAnalyzer:
    """Analyzer for medication refill (adherence) patterns.

    Constructing an instance immediately reads the prescription-event CSV at
    ``data_path`` and keeps the normalized table on ``medication_data``.
    """

    def __init__(self, data_path='med_events.csv'):
        """Remember ``data_path`` and load the data from it right away."""
        self.data_path = data_path
        self.medication_data = None
        self.load_data()

    def load_data(self):
        """Read the CSV, rename its columns and parse the date column.

        The columns of the file are renamed by position to ``pnr``, ``eksd``,
        ``perday``, ``ATC`` and ``dur_original``; ``eksd`` is then converted
        with ``pd.to_datetime``. The number of loaded rows is printed.
        """
        expected_columns = ["pnr", "eksd", "perday", "ATC", "dur_original"]

        # Store a private copy first, then normalize it through a local alias.
        self.medication_data = pd.read_csv(self.data_path).copy()
        records = self.medication_data
        records.columns = expected_columns
        records['eksd'] = pd.to_datetime(records['eksd'])

        print(f"Loaded {len(records)} prescription records")

SECTION 3. This is where the "Helper Functions" for data visualization are located.

In [None]:
def _plot_ecdf(self, dfper_80, sorted_vals, ecdf_y, medication_code):
        """Draw the truncated (80%) and the full (100%) ECDF side by side.

        ``dfper_80`` supplies the ``x``/``y`` columns for the left panel;
        ``sorted_vals`` and ``ecdf_y`` supply the right panel. The figure is
        saved as ``<medication_code>_ecdf.png`` and then shown.
        """
        fig, (left_ax, right_ax) = plt.subplots(1, 2, figsize=(12, 5))

        # (axis, x values, y values, panel title) for each of the two panels.
        panels = (
            (left_ax, dfper_80['x'], dfper_80['y'],
             f"80% ECDF ({medication_code})"),
            (right_ax, sorted_vals, ecdf_y,
             f"100% ECDF ({medication_code})"),
        )
        for axis, xs, ys, panel_title in panels:
            axis.plot(xs, ys, marker='.', linestyle='none')
            axis.set_title(panel_title)
            axis.set_xlabel("Event Interval")
            axis.set_ylabel("ECDF")

        plt.tight_layout()
        plt.savefig(f"{medication_code}_ecdf.png")
        plt.show()
    
    def _plot_patient_counts(self, drug_p1, medication_code):
        """Bar chart of how often each ``pnr`` value occurs in ``drug_p1``.

        The figure is saved as ``<medication_code>_patient_counts.png`` and
        then shown.
        """
        rows_per_patient = drug_p1['pnr'].value_counts()
        output_file = f"{medication_code}_patient_counts.png"

        plt.figure(figsize=(8, 6))
        rows_per_patient.plot(kind='bar')

        # Decorate the current axes created by the bar plot above.
        plt.title(f"Frequency of pnr ({medication_code})")
        plt.xlabel("Patient ID")
        plt.ylabel("Count")

        plt.tight_layout()
        plt.savefig(output_file)
        plt.show()
    
    def _plot_log_density(self, log_vals, medication_code):
        """Plot a Gaussian kernel density estimate of the log intervals.

        The figure is saved as ``<medication_code>_log_density.png`` and then
        shown. A kernel density estimate needs at least two values that are
        not all identical; when that is not the case a notice is printed and
        no figure is produced, so this diagnostic plot cannot abort the
        analysis that called it.

        Args:
            log_vals: One-dimensional collection of log-transformed intervals.
            medication_code: Label used in the plot title and the file name.
        """
        values = np.asarray(log_vals, dtype=float)

        # gaussian_kde fails on fewer than two points and on zero-variance
        # data (singular covariance), so skip the plot in those cases.
        if values.size < 2 or values.min() == values.max():
            print(f"Skipping log-density plot for {medication_code}: "
                  "at least two distinct interval values are required")
            return

        density = gaussian_kde(values)
        x = np.linspace(values.min(), values.max(), 100)
        y = density(x)

        plt.figure(figsize=(8, 6))
        plt.plot(x, y)
        plt.title(f"Log(event interval) Density ({medication_code})")
        plt.xlabel("Log(event interval)")
        plt.ylabel("Density")
        plt.tight_layout()
        plt.savefig(f"{medication_code}_log_density.png")
        plt.show()
        
    def _find_optimal_clusters(self, data, max_clusters=10):
        """Pick the number of KMeans clusters with the best silhouette score.

        KMeans is fitted for every ``k`` from 2 up to
        ``min(max_clusters, len(data) - 1)``. The silhouette scores are
        plotted, saved as ``silhouette_scores.png`` and shown.

        Args:
            data: Samples to cluster, in the 2-D layout KMeans expects.
            max_clusters: Largest number of clusters to try.

        Returns:
            The ``k`` with the highest silhouette score (the smallest such
            ``k`` on ties), or 2 when no score could be computed.
        """
        silhouette_scores = {}

        # Try various cluster counts
        largest_k = min(max_clusters, len(data) - 1)
        for k in range(2, largest_k + 1):
            labels = KMeans(n_clusters=k, random_state=1234).fit_predict(data)

            try:
                silhouette_scores[k] = silhouette_score(data, labels)
            except ValueError:
                # silhouette_score rejects labelings whose number of distinct
                # labels is outside 2 .. n_samples - 1 (e.g. duplicate-heavy
                # data collapsing into fewer clusters). Only that case is
                # skipped; any other error is a real problem and propagates.
                continue

        # Plot silhouette scores
        plt.figure(figsize=(8, 6))
        plt.plot(list(silhouette_scores.keys()), list(silhouette_scores.values()), marker='o')
        plt.title("Silhouette Analysis")
        plt.xlabel("Number of Clusters")
        plt.ylabel("Silhouette Score")
        plt.grid(True, linestyle='--', alpha=0.7)
        plt.tight_layout()
        plt.savefig("silhouette_scores.png")
        plt.show()

        # Return optimal number of clusters
        return max(silhouette_scores, key=silhouette_scores.get) if silhouette_scores else 2

SECTION 4. This is where the "Cluster Visualization" Functions are located. 

def _plot_clusters(self, dfper, cluster_stats, medication_code):
        """Draw two cluster figures: an ECDF scatter and an interval histogram.

        ``dfper`` provides the ``x``, ``y`` and ``cluster`` columns;
        ``cluster_stats`` provides one ``Median`` per ``cluster``. The figures
        are saved as ``<medication_code>_clusters.png`` and
        ``<medication_code>_cluster_histogram.png`` and shown.
        """
        def median_of(label):
            # Median interval stored in cluster_stats for this cluster label.
            return cluster_stats[cluster_stats['cluster'] == label]['Median'].values[0]

        # ---- Figure 1: ECDF points coloured by cluster ----
        plt.figure(figsize=(12, 6))

        labels = dfper['cluster'].unique()
        palette = plt.cm.tab10(np.linspace(0, 1, len(labels)))
        # (label, colour, rows of that cluster), in order of first appearance.
        groups = [
            (label, palette[pos], dfper[dfper['cluster'] == label])
            for pos, label in enumerate(labels)
        ]

        for label, colour, members in groups:
            plt.scatter(members['x'], members['y'],
                        s=30, c=[colour], alpha=0.7,
                        label=f"Cluster {label}: {median_of(label):.1f} days")

        # Dashed marker at every cluster median.
        for label, colour, _ in groups:
            plt.axvline(x=median_of(label), color=colour, linestyle='--', alpha=0.7)

        # Reference markers for common prescription durations.
        plt.axvline(x=30, color='black', linestyle=':', alpha=0.5, label='30 days')
        plt.axvline(x=90, color='black', linestyle='-.', alpha=0.5, label='90 days')

        plt.title(f"Cluster Analysis of Prescription Intervals for {medication_code}")
        plt.xlabel("Interval (days)")
        plt.ylabel("ECDF")
        plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
        plt.grid(True, alpha=0.3)
        plt.tight_layout()
        plt.savefig(f"{medication_code}_clusters.png")
        plt.show()

        # ---- Figure 2: histogram of intervals per cluster ----
        plt.figure(figsize=(12, 6))

        for label, colour, members in groups:
            plt.hist(members['x'], bins=20, alpha=0.5,
                     color=colour, label=f"Cluster {label}")

        # Dashed, labelled marker at every cluster median.
        for label, colour, _ in groups:
            centre = median_of(label)
            plt.axvline(x=centre, color=colour, linestyle='--',
                        label=f"Median C{label}: {centre:.1f}")

        plt.title(f"Distribution of Intervals by Cluster for {medication_code}")
        plt.xlabel("Interval (days)")
        plt.ylabel("Frequency")
        plt.legend()
        plt.grid(True, alpha=0.3)
        plt.tight_layout()
        plt.savefig(f"{medication_code}_cluster_histogram.png")
        plt.show()

SECTION 5. The core function for analyzing medication adherence patterns is found here. It also identifies clusters of patients with similar refill behaviours.

In [None]:
def analyze_medication(self, medication_code):
        """
        Analyze refill-interval patterns for one medication.

        Similar to the See() function in the original code.

        Steps:
          1. Keep the rows whose ``ATC`` equals ``medication_code``.
          2. For every patient with a previous prescription, sample one
             prescription and compute the days since the previous one.
          3. Build the ECDF of the positive intervals and plot diagnostics.
          4. Cluster the positive intervals with KMeans, using the cluster
             count suggested by ``_find_optimal_clusters``.
          5. Assign each sampled patient to the cluster whose
             [Minimum, Maximum] interval range contains the patient's
             interval, and attach the result to every prescription row.

        Cluster labels are 1-based. ``Cluster == 0`` marks rows that could not
        be assigned to any cluster (patients without a previous prescription
        or without a positive interval); their ``Median`` is the median of the
        most frequently assigned cluster.

        Several figures are saved/shown and a text summary is printed.

        Args:
            medication_code: Value of the ``ATC`` column to analyze.

        Returns:
            DataFrame holding every prescription row of the medication plus
            the columns ``Median`` and ``Cluster``.

        Raises:
            ValueError: If no patient has a previous prescription for this
                medication, or if none of the sampled intervals is positive.
        """
        # Filter for the specified medication
        med_subset = self.medication_data[self.medication_data['ATC'] == medication_code].copy()

        # Untouched copy that receives the final Median/Cluster columns
        drug_p0 = med_subset.copy()

        # Calculate previous prescription date per patient
        drug_p1 = med_subset.sort_values(by=['pnr', 'eksd'])
        drug_p1['prev_eksd'] = drug_p1.groupby('pnr')['eksd'].shift(1)

        # Remove rows with no previous date
        drug_p1 = drug_p1.dropna(subset=['prev_eksd']).copy()
        if drug_p1.empty:
            raise ValueError(
                f"No patient has a previous prescription for medication "
                f"'{medication_code}'; cannot compute refill intervals"
            )

        # Sample 1 row per patient (each draw seeded on its own, so the
        # selection per patient is reproducible)
        drug_p1 = pd.concat(
            [group.sample(1, random_state=1234) for _, group in drug_p1.groupby('pnr')]
        ).reset_index(drop=True)

        # Keep needed columns and calculate interval
        drug_p1 = drug_p1[['pnr', 'eksd', 'prev_eksd']].copy()
        drug_p1['event_interval'] = (drug_p1['eksd'] - drug_p1['prev_eksd']).dt.days

        # Generate ECDF data from the positive intervals
        valid_intervals = drug_p1.loc[drug_p1['event_interval'] > 0, 'event_interval']
        sorted_vals = np.sort(valid_intervals)
        if len(sorted_vals) == 0:
            raise ValueError(
                f"No positive refill intervals found for medication '{medication_code}'"
            )
        ecdf_y = np.arange(1, len(sorted_vals) + 1) / len(sorted_vals)
        dfper = pd.DataFrame({'x': sorted_vals, 'y': ecdf_y})

        # Get 80% ECDF; ni is the largest interval inside that lower 80%
        dfper_80 = dfper[dfper['y'] <= 0.8].copy()
        ni = dfper_80['x'].max() if not dfper_80.empty else dfper['x'].max()

        # Plot ECDFs
        self._plot_ecdf(dfper_80, sorted_vals, ecdf_y, medication_code)

        # Plot patient counts
        self._plot_patient_counts(drug_p1, medication_code)

        # Density estimation of the log intervals within the lower 80%
        keep = (drug_p1['event_interval'] <= ni) & (drug_p1['event_interval'] > 0)
        log_vals = np.log(drug_p1.loc[keep, 'event_interval'])
        self._plot_log_density(log_vals, medication_code)

        # Calculate optimal number of clusters
        X_cluster = dfper['x'].values.reshape(-1, 1)
        max_cluster = self._find_optimal_clusters(X_cluster)
        # KMeans cannot form more clusters than there are samples.
        max_cluster = min(max_cluster, len(dfper))

        # Perform KMeans clustering. Labels are shifted to start at 1 so that
        # 0 stays free as the "unassigned" marker used further below.
        kmeans = KMeans(n_clusters=max_cluster, random_state=1234)
        dfper['cluster'] = kmeans.fit_predict(X_cluster) + 1

        # Calculate cluster statistics on the log scale
        dfper['log_x'] = np.log(dfper['x'])
        cluster_stats = dfper.groupby('cluster')['log_x'].agg(['min', 'max', 'median']).reset_index()

        # Range bounds come straight from the raw intervals: exp(log(x)) does
        # not always round-trip to exactly x, which would exclude patients
        # sitting on a cluster boundary from the range test below.
        raw_bounds = dfper.groupby('cluster')['x'].agg(['min', 'max'])
        cluster_stats['Minimum'] = raw_bounds['min'].values.astype(float)
        cluster_stats['Maximum'] = raw_bounds['max'].values.astype(float)
        # Median of the log values, converted back from the log scale
        cluster_stats['Median'] = np.exp(cluster_stats['median'])

        # Pair every sampled patient with every cluster ...
        results = pd.merge(drug_p1, cluster_stats, how='cross')

        # ... and keep the pairs where the interval lies inside the cluster range
        in_range = results['event_interval'].between(results['Minimum'], results['Maximum'])
        results = (
            results.loc[in_range, ['pnr', 'Median', 'cluster']]
            .rename(columns={'cluster': 'Cluster'})
        )

        # Find most frequent cluster; its median is the fallback value
        if results.empty:
            median_val = np.nan
        else:
            most_freq_cluster = results['Cluster'].value_counts().idxmax()
            median_val = cluster_stats.loc[
                cluster_stats['cluster'] == most_freq_cluster, 'Median'
            ].iloc[0]

        # Merge results back to the sampled patients
        drug_p1 = pd.merge(drug_p1, results, on='pnr', how='left')
        drug_p1['Median'] = drug_p1['Median'].fillna(median_val)
        drug_p1['Cluster'] = drug_p1['Cluster'].fillna(0)

        # Create final result by merging back to original data
        drug_p3 = drug_p1[['pnr', 'Median', 'Cluster']].copy()
        drug_p0 = pd.merge(drug_p0, drug_p3, on='pnr', how='left')
        drug_p0['Median'] = drug_p0['Median'].fillna(median_val)
        drug_p0['Cluster'] = drug_p0['Cluster'].fillna(0)

        # Summarize the results
        print(f"\nSummary of results for {medication_code}:")
        print(f"Total patients: {drug_p0['pnr'].nunique()}")
        print(f"Total prescriptions: {len(drug_p0)}")
        print(f"Mean prescription interval: {drug_p1['event_interval'].mean():.1f} days")
        print(f"Median prescription interval: {drug_p1['event_interval'].median():.1f} days")
        print(f"Number of clusters identified: {max_cluster}")
        print(f"Dominant cluster interval: {median_val:.1f} days")

        # Analyze clusters in detail
        print("\nDetailed Cluster Analysis:")
        # Map counts by label so they cannot get misaligned with the rows.
        cluster_stats['Count'] = cluster_stats['cluster'].map(dfper['cluster'].value_counts())
        cluster_stats['Percentage'] = (cluster_stats['Count'] / len(dfper) * 100).round(1)

        # Sort by median for easier interpretation
        cluster_stats = cluster_stats.sort_values('Median')

        # Print cluster statistics
        print("\nCluster Characteristics:")
        for _, row in cluster_stats.iterrows():
            # iterrows upcasts the whole row to float; restore integer display.
            cluster_id = int(row['cluster'])
            count = int(row['Count'])
            percentage = row['Percentage']
            median = row['Median']
            min_val = row['Minimum']
            max_val = row['Maximum']

            print(f"Cluster {cluster_id}: {count} patients ({percentage}%)")
            print(f"  - Interval range: {min_val:.1f} to {max_val:.1f} days")
            print(f"  - Median interval: {median:.1f} days")

            # Interpret the cluster
            if median < 20:
                print("  - Interpretation: Very frequent refills, possible underdosing or medication sharing")
            elif median < 40:
                print("  - Interpretation: Monthly prescription pattern (typical 30-day supply)")
            elif median < 65:
                print("  - Interpretation: Extended monthly pattern, possible bi-monthly schedule")
            elif median < 100:
                print("  - Interpretation: Quarterly prescription pattern (typical 90-day supply)")
            else:
                print("  - Interpretation: Infrequent refills, possible non-adherence or intermittent use")

        # Create a visualization of the clusters
        self._plot_clusters(dfper, cluster_stats, medication_code)

        return drug_p0

SECTION 6. This function is meant for creating "boxplots" that show how the prescription durations change by prescription number (e.g., 2nd, 3rd, etc.).

In [None]:
def analyze_prescription_sequence(self, medication_data, medication_code, expected_duration=30):
        """
        Create a boxplot of refill duration by prescription number.

        Equivalent to the see_assumption function. For every patient the
        prescriptions are numbered in date order; from the second one on, the
        days since the previous prescription are computed. The durations are
        plotted per prescription number (saved as
        ``<medication_code>_boxplot.png``), summarized in a printed table, and
        compared against ``expected_duration``.

        Args:
            medication_data: DataFrame with at least ``pnr`` and ``eksd``.
            medication_code: Label used in titles, messages and the file name.
            expected_duration: Intended days of supply per prescription; a
                refill within +/-20% of it counts as adherent.

        Returns:
            DataFrame with the columns ``pnr``, ``eksd``, ``prev_eksd``,
            ``p_number`` (as string) and ``Duration`` for all prescriptions
            that have a predecessor. It is empty, and no plot or statistics
            are produced, when no patient has a second prescription.
        """
        # Sort data and calculate previous prescription date
        df_sorted = medication_data.sort_values(by=['pnr', 'eksd']).copy()
        df_sorted['prev_eksd'] = df_sorted.groupby('pnr')['eksd'].shift(1)

        # Create prescription sequence number
        df_sorted['p_number'] = df_sorted.groupby('pnr').cumcount() + 1

        # Filter for prescriptions with previous date (p_number >= 2)
        subset = df_sorted.loc[
            df_sorted['p_number'] >= 2, ['pnr', 'eksd', 'prev_eksd', 'p_number']
        ].copy()

        # Calculate duration between prescriptions
        subset['Duration'] = (subset['eksd'] - subset['prev_eksd']).dt.days

        # Keep the numeric sequence numbers for ordering and grouping; the
        # returned column is a string, as callers filter it with string labels.
        sequence_numbers = subset['p_number'].copy()
        subset['p_number'] = subset['p_number'].astype(str)

        if subset.empty:
            # Nothing to plot, and the percentages below would divide by zero.
            print(f"\nNo repeat prescriptions found for {medication_code}; "
                  "skipping prescription sequence analysis.")
            return subset

        # Calculate patient-level medians and overall median
        patient_medians = subset.groupby('pnr')['Duration'].median()
        overall_median = patient_medians.median()

        # Create boxplot with the boxes in numeric prescription order
        plot_order = [str(number) for number in sorted(sequence_numbers.unique())]
        plt.figure(figsize=(12, 6))
        sns.boxplot(x='p_number', y='Duration', data=subset, order=plot_order)

        # Add reference line for overall median
        plt.axhline(y=overall_median, color='red', linestyle='--',
                    label=f"Median: {overall_median:.2f}")

        plt.title(f"Boxplot of Duration by p_number ({medication_code})")
        plt.xlabel("Prescription Number")
        plt.ylabel("Duration (days)")
        plt.legend()
        plt.grid(True, axis='y', alpha=0.3)
        plt.tight_layout()
        plt.savefig(f"{medication_code}_boxplot.png")
        plt.show()

        # Analyze the boxplot results
        print(f"\nAnalysis of prescription durations for {medication_code}:")
        print(f"Overall median duration: {overall_median:.1f} days")

        # Calculate statistics by prescription number. Grouping on the numeric
        # values keeps the table in 2, 3, ..., 10 order instead of the
        # lexicographic '10', '11', '2', ... order of the string labels.
        stats_by_p_number = subset.groupby(sequence_numbers)['Duration'].agg(
            ['count', 'mean', 'median', 'std', 'min', 'max']
        )
        print("\nDuration statistics by prescription number:")
        print(stats_by_p_number)

        # Identify trends in the durations (prescriptions 2-4 vs. later ones)
        is_early = sequence_numbers.isin([2, 3, 4])
        early_prescriptions = subset[is_early]
        later_prescriptions = subset[~is_early]

        if len(early_prescriptions) > 0 and len(later_prescriptions) > 0:
            early_median = early_prescriptions['Duration'].median()
            later_median = later_prescriptions['Duration'].median()

            # A difference of more than 10% is reported as a trend
            if early_median > later_median * 1.1:
                print("\nTrend observed: Earlier prescriptions have longer durations than later ones.")
                print(f"Early prescriptions median: {early_median:.1f} days")
                print(f"Later prescriptions median: {later_median:.1f} days")
                print("This may indicate stabilization of medication regimen over time.")
            elif later_median > early_median * 1.1:
                print("\nTrend observed: Later prescriptions have longer durations than earlier ones.")
                print(f"Early prescriptions median: {early_median:.1f} days")
                print(f"Later prescriptions median: {later_median:.1f} days")
                print("This may indicate reduced adherence or increased dosing intervals over time.")
            else:
                print("\nNo significant trend observed between early and later prescription durations.")

        # Analyze adherence based on expected prescription durations
        total = len(subset)
        lower_bound = expected_duration * 0.8
        upper_bound = expected_duration * 1.2
        adherent = subset[subset['Duration'].between(lower_bound, upper_bound)]
        early_refill = subset[subset['Duration'] < lower_bound]
        late_refill = subset[subset['Duration'] > upper_bound]

        print(f"\nAdherence analysis (assuming {expected_duration}-day intended duration):")
        print(f"Adherent refills: {len(adherent)} ({len(adherent)/total*100:.1f}%)")
        print(f"Early refills: {len(early_refill)} ({len(early_refill)/total*100:.1f}%)")
        print(f"Late refills: {len(late_refill)} ({len(late_refill)/total*100:.1f}%)")

        if len(adherent) / total < 0.6:
            print("Poor overall adherence detected. Consider investigating reasons for irregular refills.")

        return subset

SECTION 7. This section holds the function that runs the complete analysis for both medications and provides a comparative interpretation of the two.

In [None]:
def main():
    """Run the full medA/medB workflow and print a comparative report.

    Both medications are clustered and sequenced with a ``MedicationAnalyzer``
    built from the default data file; afterwards their median durations,
    variability, early-refill shares and early/late stability are printed.
    """
    def cv(durations):
        # Coefficient of variation: standard deviation relative to the mean.
        return durations.std() / durations.mean()

    def early_refill_percent(sequences):
        # Percentage of refills that happened after fewer than 25 days.
        return len(sequences[sequences['Duration'] < 25]) / len(sequences) * 100

    def early_and_late_cv(sequences):
        # CV of prescriptions 2-4 versus CV of all later prescriptions.
        first_refills = sequences['p_number'].isin(['2', '3', '4'])
        return (cv(sequences[first_refills]['Duration']),
                cv(sequences[~first_refills]['Duration']))

    banner = "="*50

    # Create analyzer
    analyzer = MedicationAnalyzer()

    # Cluster analysis for both medications
    print("\nAnalyzing medication A (medA)...")
    medA_results = analyzer.analyze_medication("medA")

    print("\nAnalyzing medication B (medB)...")
    medB_results = analyzer.analyze_medication("medB")

    # Prescription sequence boxplots
    print("\nCreating prescription sequence boxplot for medA...")
    medA_sequences = analyzer.analyze_prescription_sequence(medA_results, "medA")

    print("\nCreating prescription sequence boxplot for medB...")
    medB_sequences = analyzer.analyze_prescription_sequence(medB_results, "medB")

    # ---- Comparison of typical durations ----
    print("\n" + banner)
    print("COMPARISON BETWEEN MEDICATIONS")
    print(banner)

    medA_median = medA_sequences['Duration'].median()
    medB_median = medB_sequences['Duration'].median()

    print(f"medA median duration: {medA_median:.1f} days")
    print(f"medB median duration: {medB_median:.1f} days")

    median_gap = abs(medA_median - medB_median)
    if median_gap > 5:
        print(f"\nSignificant difference of {median_gap:.1f} days between medications.")
        longer, shorter = ("medA", "medB") if medA_median > medB_median else ("medB", "medA")
        print(f"{longer} has longer durations between prescriptions than {shorter}.")
        for line in ("This may indicate: ",
                     "- Different dosing schedules",
                     "- Different patient populations",
                     "- Different adherence patterns"):
            print(line)
    else:
        print("\nNo significant difference in durations between medications.")
        print("The medications show similar refill patterns despite potential differences in therapeutic use.")

    # ---- Comparison of variability ----
    medA_cv = cv(medA_sequences['Duration'])
    medB_cv = cv(medB_sequences['Duration'])

    print(f"\nCoefficient of variation (CV) for medA: {medA_cv:.2f}")
    print(f"Coefficient of variation (CV) for medB: {medB_cv:.2f}")

    if abs(medA_cv - medB_cv) > 0.1:
        print("\nSignificant difference in variability between medications.")
        noisier, steadier = ("medA", "medB") if medA_cv > medB_cv else ("medB", "medA")
        print(f"{noisier} shows more variable prescription patterns than {steadier}.")
        print("This may indicate less consistent use or more dose adjustments.")
    else:
        print("\nBoth medications show similar variability in prescription patterns.")

    # ---- Overall conclusions ----
    print("\n" + banner)
    print("OVERALL CONCLUSIONS")
    print(banner)

    print("1. Medication Adherence Patterns:")
    early_percent_medA = early_refill_percent(medA_sequences)
    early_percent_medB = early_refill_percent(medB_sequences)

    print(f"   - Early refills (< 25 days): medA {early_percent_medA:.1f}%, medB {early_percent_medB:.1f}%")

    print("2. Prescription Stability:")
    medA_early_cv, medA_late_cv = early_and_late_cv(medA_sequences)
    medB_early_cv, medB_late_cv = early_and_late_cv(medB_sequences)

    print(f"   - medA: Early CV {medA_early_cv:.2f}, Late CV {medA_late_cv:.2f}")
    print(f"   - medB: Early CV {medB_early_cv:.2f}, Late CV {medB_late_cv:.2f}")

    if medA_early_cv > medA_late_cv and medB_early_cv > medB_late_cv:
        print("   - Both medications show stabilization of prescription patterns over time")

    closing_notes = (
        "3. Cluster Implications:",
        "   - Identified clusters represent distinct prescription refill behaviors",
        "   - Common clusters around 30 and 90 days align with standard prescription durations",
        "   - Clusters with very short intervals may indicate dosage issues or medication sharing",
        "   - Clusters with very long intervals may indicate intermittent use or poor adherence",
        "4. Recommendations:",
        "   - Monitor patients with highly variable refill patterns for potential adherence issues",
        "   - Consider patient education on medication adherence importance",
        "   - Investigate factors contributing to early or late refills",
        "   - Create targeted interventions for patients in problematic clusters",
    )
    for line in closing_notes:
        print(line)

    print("\nAnalysis complete!")

In [None]:
SECTION 8. Code to run the analysis.

In [None]:
# Execute the main function to run the complete analysis.
# The __name__ guard keeps main() from running when this code is imported
# instead of being executed directly.
if __name__ == "__main__":
    main()

SECTION 9. Callable functions for "Testing". 

In [None]:
# Define callable functions to verify each analysis step
def test_data_loading():
    """Load the default data file, print a short preview and report success.

    Returns True when the analyzer holds a loaded table afterwards.
    """
    loaded = MedicationAnalyzer().medication_data

    print(f"Data loaded successfully with {len(loaded)} records")
    print(f"Data columns: {loaded.columns.tolist()}")
    print("\nFirst 5 rows:")
    print(loaded.head())

    return loaded is not None

def test_med_analysis(medication_code="medA"):
    """Run the cluster analysis for one medication and check its output.

    Prints the number of returned records, the number of unique patients and
    whether every record carries a cluster value.

    Args:
        medication_code: ``ATC`` value to analyze.

    Returns:
        True when the analysis returned at least one record and every record
        has a non-missing ``Cluster`` value, False otherwise. (Previously the
        function returned True unconditionally, so the reported check could
        never fail the test.)
    """
    analyzer = MedicationAnalyzer()
    results = analyzer.analyze_medication(medication_code)

    all_assigned = bool(results['Cluster'].notna().all())

    print(f"\nMedication analysis returned {len(results)} records")
    print(f"Unique patient count: {results['pnr'].nunique()}")
    print(f"Cluster assignment successful: {all_assigned}")

    return len(results) > 0 and all_assigned

def test_prescription_sequence(medication_code="medA"):
    """Run cluster and sequence analysis for one medication and preview it.

    Prints the number of sequence records and the first rows of the
    count/mean/median duration table per prescription number.
    """
    analyzer = MedicationAnalyzer()
    seq_results = analyzer.analyze_prescription_sequence(
        analyzer.analyze_medication(medication_code),
        medication_code,
    )

    print(f"\nSequence analysis returned {len(seq_results)} records")
    print("Duration statistics by prescription number:")
    duration_summary = seq_results.groupby('p_number')['Duration'].agg(['count', 'mean', 'median'])
    print(duration_summary.head())

    return seq_results is not None

def test_full_analysis():
    """Run the whole workflow for medA and medB and print their medians.

    Returns True once every step has run without raising.
    """
    analyzer = MedicationAnalyzer()

    # Cluster analysis first for both medications, then the sequence analysis.
    medA_results = analyzer.analyze_medication("medA")
    medB_results = analyzer.analyze_medication("medB")
    sequences = {
        "medA": analyzer.analyze_prescription_sequence(medA_results, "medA"),
        "medB": analyzer.analyze_prescription_sequence(medB_results, "medB"),
    }

    print("\nFull analysis completed successfully!")
    for code, seq in sequences.items():
        print(f"{code} median duration: {seq['Duration'].median():.1f} days")
    return True

# Run the tests one after another, keeping each outcome in its own flag
print("Testing data loading...")
data_loaded = test_data_loading()

print("\nTesting medication analysis...")
med_analysis_ok = test_med_analysis()

print("\nTesting prescription sequence analysis...")
seq_analysis_ok = test_prescription_sequence()

print("\nTesting full analysis workflow...")
full_analysis_ok = test_full_analysis()

# Report a check mark or a cross per step
print("\nAll tests completed.")
print("\n".join(
    f"{step}: {'✓' if passed else '✗'}"
    for step, passed in (
        ("Data loading", data_loaded),
        ("Medication analysis", med_analysis_ok),
        ("Sequence analysis", seq_analysis_ok),
        ("Full analysis", full_analysis_ok),
    )
))

SECTION 10. Validating the results and insights.

In [None]:
def validate_clinical_insights():
    """Run the medA analysis and check that it yields the expected insights.

    Prints one check mark or cross per insight, then the adherence rate, the
    dominant cluster and the number of clusters. Returns True only when every
    insight check passed.
    """
    # Run a simplified analysis
    analyzer = MedicationAnalyzer()
    medA_results = analyzer.analyze_medication("medA")
    medA_sequences = analyzer.analyze_prescription_sequence(medA_results, "medA")

    # Classify refills against a standard 30-day supply (+/-20% tolerance)
    expected_duration = 30
    lower_bound = expected_duration * 0.8
    upper_bound = expected_duration * 1.2
    durations = medA_sequences['Duration']

    adherent_count = int(durations.between(lower_bound, upper_bound).sum())
    early_count = int((durations < lower_bound).sum())
    late_count = int((durations > upper_bound).sum())

    adherence_rate = adherent_count / len(medA_sequences) * 100

    # Get cluster information
    cluster_counts = medA_results['Cluster'].value_counts()
    dominant_cluster = cluster_counts.idxmax()

    # Validate expected insights
    insights = {}
    insights["Patient count identified"] = medA_results['pnr'].nunique() > 0
    insights["Prescription sequence patterns detected"] = len(medA_sequences) > 0
    insights["Clusters identified"] = len(cluster_counts) > 1
    insights["Adherence patterns calculated"] = True
    insights["Early/late refills detected"] = early_count + late_count > 0

    print("Clinical Insight Validation:")
    for insight, valid in insights.items():
        mark = '✓' if valid else '✗'
        print(f"{insight}: {mark}")

    print(f"\nAdherence rate: {adherence_rate:.1f}%")
    print(f"Dominant cluster: {dominant_cluster}")
    print(f"Number of clusters identified: {len(cluster_counts)}")

    # Overall validation
    return all(insights.values())

# Run validation
# Calls validate_clinical_insights() and prints a check mark when it returned
# a truthy value, a cross otherwise.
insights_valid = validate_clinical_insights()
print(f"\nOverall validation: {'✓' if insights_valid else '✗'}")