In [None]:
# Import libraries
import warnings
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
# Silence library warnings so cell output stays readable.
# NOTE(review): this hides *all* warnings, including deprecation notices from
# matplotlib/seaborn/pandas -- consider narrowing the filter while developing.
warnings.filterwarnings('ignore')

# Set style
# matplotlib 3.6 renamed its bundled seaborn style sheets to 'seaborn-v0_8-*'
# and the old names were removed afterwards, so use whichever name this
# installation provides; if neither exists, fall back to seaborn's own style.
for style_name in ('seaborn-v0_8-whitegrid', 'seaborn-whitegrid'):
    if style_name in plt.style.available:
        plt.style.use(style_name)
        break
else:
    sns.set_style('whitegrid')
sns.set_palette('husl')
pd.set_option('display.max_columns', None)  # never truncate columns when displaying frames

print("✅ Libraries imported successfully!")

In [None]:
# Load data
# Read the raw CSV from its location relative to this notebook.
raw_csv_path = '../../data/raw/NSMES1988.csv'
df = pd.read_csv(raw_csv_path)

# Report the (rows, columns) shape, then end the cell on a preview so the
# notebook renders the first rows as the cell output.
print(f"Dataset loaded: {df.shape}")
df.head()

## Data Cleaning

In [None]:
# Clean data
# Build the cleaned frame in a single chain. Both steps return new objects, so
# the raw `df` is left untouched and re-running this cell gives the same result.
df_clean = (
    df
    .drop_duplicates()  # remove rows that exactly repeat an earlier row
    .dropna()           # remove rows with a missing value in any column
)

# Total count of missing cells left in the cleaned frame.
remaining_missing = df_clean.isnull().sum().sum()

print(f"Cleaned dataset: {df_clean.shape}")
print(f"\nMissing values: {remaining_missing}")

## Visualization 1: Health Status Distribution (Pie Chart)

In [None]:
# Health Status Pie Chart
# Number of records per health category; one wedge is drawn per category.
health_counts = df_clean['health'].value_counts()
n_wedges = len(health_counts)
colors = sns.color_palette('husl', n_colors=n_wedges)

fig, ax = plt.subplots(figsize=(10, 8))
ax.pie(
    health_counts,
    labels=health_counts.index,
    autopct='%1.1f%%',          # percentage label on each wedge, one decimal
    startangle=90,
    colors=colors,
    explode=[0.05] * n_wedges,  # offset every wedge slightly from the centre
)
ax.set_title('Health Status Distribution', fontsize=16, fontweight='bold', pad=20)
ax.axis('equal')  # equal axis scaling so the pie is drawn as a circle
fig.tight_layout()
plt.show()

# Plain-text counts to accompany the chart.
print("\nHealth Status Counts:")
print(health_counts)

## Visualization 2: Income by Region (Bar Chart)

In [None]:
# Income by Region Bar Chart
# NOTE(review): the y label and the '$' value labels assume `income` is stored
# in plain dollars. The published AER NMES1988 data records income in units of
# USD 10,000 -- confirm how this CSV is scaled before trusting the labels.
plt.figure(figsize=(12, 6))

# Mean income per region, highest first. This order drives both the bars and
# the value labels written above them.
income_by_region = df_clean.groupby('region')['income'].mean().sort_values(ascending=False)
region_order = list(income_by_region.index)

# `hue` mirrors `x` so the palette is applied through an explicit hue mapping
# (passing `palette` without `hue` is deprecated since seaborn 0.13).
# `order`/`hue_order` pin the bars to the sorted order the label loop relies on,
# and `dodge=False` keeps one full-width bar per region.
ax = sns.barplot(x=income_by_region.index, y=income_by_region.values,
                 hue=income_by_region.index, order=region_order,
                 hue_order=region_order, palette='viridis', dodge=False)

# Some seaborn versions add a legend for the hue variable; it would only
# repeat the x tick labels, so remove it when present.
if ax.get_legend() is not None:
    ax.get_legend().remove()

plt.title('Average Income by Region', fontsize=16, fontweight='bold', pad=20)
plt.xlabel('Region', fontsize=12, fontweight='bold')
plt.ylabel('Average Income ($)', fontsize=12, fontweight='bold')
plt.xticks(rotation=45, ha='right')

# Write each mean just above its bar (2% of the bar height as clearance).
for i, v in enumerate(income_by_region.values):
    ax.text(i, v + v*0.02, f'${v:,.0f}', ha='center', va='bottom', fontweight='bold')

plt.tight_layout()
plt.show()

print("\nAverage Income by Region:")
print(income_by_region)

## Visualization 3: Age vs Visits Correlation (Scatter Plot)

In [None]:
# Age vs Visits Scatter Plot
# NOTE(review): the x label assumes `age` is stored in years. The published AER
# NMES1988 data stores age divided by 10 -- confirm how this CSV is scaled.

# Correlation between the two columns; shown on the figure and printed below.
correlation = df_clean['age'].corr(df_clean['visits'])

fig, ax = plt.subplots(figsize=(12, 8))

# Individual records first, then a red fitted regression line on the same axes
# (scatter=False stops regplot from drawing the points a second time).
sns.scatterplot(data=df_clean, x='age', y='visits', alpha=0.6, s=50, ax=ax)
sns.regplot(
    data=df_clean, x='age', y='visits',
    scatter=False, color='red', line_kws={'linewidth': 2}, ax=ax,
)

ax.set_title('Age vs Doctor Visits Correlation', fontsize=16, fontweight='bold', pad=20)
ax.set_xlabel('Age (years)', fontsize=12, fontweight='bold')
ax.set_ylabel('Number of Doctor Visits', fontsize=12, fontweight='bold')
ax.grid(True, alpha=0.3)

# Position the annotation in axes-fraction coordinates so it stays in the
# top-left corner regardless of the data range.
annotation_box = {'boxstyle': 'round', 'facecolor': 'wheat', 'alpha': 0.5}
ax.text(
    0.05, 0.95, f'Correlation: {correlation:.3f}',
    transform=ax.transAxes, fontsize=12,
    verticalalignment='top', bbox=annotation_box,
)

fig.tight_layout()
plt.show()

print(f"\nCorrelation: {correlation:.4f}")

## Additional Visualizations

In [None]:
# Age Distribution
# Two side-by-side views of the same column: histogram on the left,
# box plot on the right.
fig, axes = plt.subplots(1, 2, figsize=(12, 5))
hist_ax, box_ax = axes

hist_ax.hist(df_clean['age'], bins=30, edgecolor='black', alpha=0.7, color='skyblue')
hist_ax.set_title('Age Distribution', fontsize=14, fontweight='bold')
hist_ax.set(xlabel='Age', ylabel='Frequency')

sns.boxplot(data=df_clean, y='age', ax=box_ax, color='lightcoral')
box_ax.set_title('Age Box Plot', fontsize=14, fontweight='bold')

fig.tight_layout()
plt.show()

In [None]:
# Visits by Health Status
# One box per health category showing the spread of `visits` within it.
plt.figure(figsize=(12, 6))

# `hue` mirrors `x` so the palette is applied through an explicit hue mapping
# (passing `palette` without `hue` is deprecated since seaborn 0.13);
# `dodge=False` keeps a single full-width box per category.
ax = sns.boxplot(data=df_clean, x='health', y='visits', hue='health',
                 palette='Set2', dodge=False)

# Some seaborn versions add a legend for the hue variable; it would only
# repeat the x tick labels, so remove it when present.
if ax.get_legend() is not None:
    ax.get_legend().remove()

plt.title('Doctor Visits by Health Status', fontsize=16, fontweight='bold', pad=20)
plt.xlabel('Health Status', fontsize=12, fontweight='bold')
plt.ylabel('Number of Visits', fontsize=12, fontweight='bold')
plt.xticks(rotation=45)

plt.tight_layout()
plt.show()

In [None]:
# Correlation Heatmap
# Only numeric columns take part in the pairwise correlation matrix.
numerical_cols = df_clean.select_dtypes(include=[np.number]).columns
correlation_matrix = df_clean.loc[:, numerical_cols].corr()

fig, ax = plt.subplots(figsize=(10, 8))

# Diverging colour map centred on zero, with each coefficient written in its
# cell to two decimals.
heatmap_options = dict(
    annot=True,
    fmt='.2f',
    cmap='coolwarm',
    center=0,
    square=True,
    linewidths=1,
    cbar_kws={"shrink": 0.8},
)
sns.heatmap(correlation_matrix, ax=ax, **heatmap_options)

ax.set_title('Correlation Matrix', fontsize=16, fontweight='bold', pad=20)
fig.tight_layout()
plt.show()

## Summary Statistics

In [None]:
# Summary
# Text recap of the headline numbers, computed directly from the cleaned frame
# so this cell does not depend on variables left behind by the plotting cells.
# NOTE(review): the "years" wording assumes `age` is stored in years. The
# published AER NMES1988 data stores age divided by 10 -- confirm the scaling.
banner = "=" * 70
age = df_clean['age']
visits = df_clean['visits']

print(banner)
print("SESSION 4: VISUALIZATION ANALYSIS - SUMMARY")
print(banner)

# The emoji and bullet characters below were previously stored as mis-decoded
# bytes (mojibake); they are written here as the intended Unicode characters.
print(f"\n📊 Dataset: {len(df_clean):,} records after cleaning")
print("\n📈 Key Findings:")
print(f"   • Age range: {age.min():.1f} - {age.max():.1f} years")
print(f"   • Average visits: {visits.mean():.2f}")
print(f"   • Age-Visits correlation: {age.corr(visits):.3f}")
print(f"   • Health categories: {df_clean['health'].nunique()}")
print(f"   • Regions analyzed: {df_clean['region'].nunique()}")

print("\n✅ All visualizations completed successfully!")
print(banner)

In [None]:
# Save cleaned data
output_path = '../../data/processed/NSMES1988_clean.csv'

# to_csv does not create missing folders, so make sure the target directory
# exists first (a no-op when it is already there).
Path(output_path).parent.mkdir(parents=True, exist_ok=True)

# index=False: write only the data columns, not the DataFrame's row index.
df_clean.to_csv(output_path, index=False)
print(f"✅ Cleaned data saved to: {output_path}")