# 🎯 AURA Project - Technical Skills Showcase

**Data Science Capstone | Milestone 1 Demonstration**

---

This notebook showcases key data science techniques applied to the NSMES1988 healthcare dataset, demonstrating proficiency in:

1. **Data Import & Quality Assessment**
2. **Memory Optimization Strategies**
3. **Statistical Analysis**
4. **Advanced Pandas Operations**
5. **Data Visualization**

---

## 📦 Environment Setup

In [None]:
# Core Libraries
import pandas as pd
import numpy as np
from scipy import stats

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Configuration
pd.set_option('display.max_columns', None)  # show every column of wide frames
pd.set_option('display.float_format', '{:.2f}'.format)  # render floats with 2 decimals
plt.style.use('seaborn-v0_8-whitegrid')
sns.set_palette('husl')

# Suppress warnings for cleaner output
# NOTE(review): with no category given this silences every warning, including
# library deprecation notices - consider narrowing it once the notebook is stable.
import warnings
warnings.filterwarnings('ignore')

# Confirmation banner; the check-mark literal was mis-encoded and is repaired here.
print("✅ Libraries loaded successfully")
print(f"   pandas: {pd.__version__}")
print(f"   numpy: {np.__version__}")

---

## 1️⃣ Data Import & Quality Assessment

### 1.1 Loading the Dataset

In [None]:
# Load the healthcare dataset (path is relative to the notebook's working directory)
df = pd.read_csv('NSMES1988.csv')

# Quick overview: row/column counts and the deep memory footprint (index included).
# The emoji and multiplication sign were mis-encoded in these strings and are repaired.
print(f"📊 Dataset Shape: {df.shape[0]:,} rows × {df.shape[1]} columns")
print(f"📦 Memory Usage: {df.memory_usage(deep=True).sum() / 1024:.2f} KB")

### 1.2 Data Structure Analysis

In [None]:
# Comprehensive data type summary
def analyze_dtypes(dataframe):
    """Summarize every column of a DataFrame in one table.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame to inspect. It is not modified.

    Returns
    -------
    pandas.DataFrame
        One row per input column with these fields: ``dtype``, ``non_null``
        (count of non-missing values), ``null_count``, ``null_pct``
        (percentage of rows that are missing, rounded to 2 decimals; NaN
        when the frame has no rows), ``unique`` (number of distinct
        non-missing values) and ``sample`` (the value in the first row, or
        NaN when the frame has no rows).
    """
    null_counts = dataframe.isnull().sum()  # computed once, used twice below
    row_total = len(dataframe)

    # ``iloc[0]`` raises IndexError on a frame without rows, so fall back to
    # an all-missing sample instead of crashing.
    if row_total:
        first_row = dataframe.iloc[0]
    else:
        first_row = pd.Series(index=dataframe.columns, dtype=object)

    dtype_summary = pd.DataFrame({
        'dtype': dataframe.dtypes,
        'non_null': dataframe.count(),
        'null_count': null_counts,
        'null_pct': (null_counts / row_total * 100).round(2),
        'unique': dataframe.nunique(),
        'sample': first_row,
    })
    return dtype_summary

# Last expression of the cell: the per-column summary is rendered as its output
analyze_dtypes(df)

### 1.3 Missing Value Assessment

In [None]:
# Check for missing values: per-column count and share of rows
missing = df.isnull().sum()
missing_pct = (missing / len(df) * 100).round(2)

if missing.sum() == 0:
    print("✅ No missing values detected - dataset is complete!")
else:
    print("⚠️ Missing values found:")
    # Show count and percentage side by side for the affected columns only,
    # so the percentage computed above is actually reported.
    print(pd.concat([missing, missing_pct], axis=1, keys=['missing', 'missing_pct'])[missing > 0])

---

## 2️⃣ Memory Optimization

### 2.1 Memory Analysis Before Optimization

In [None]:
def memory_usage_report(dataframe, name="DataFrame"):
    """Print the total memory footprint and return a per-column breakdown.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame to measure. It is not modified.
    name : str, default "DataFrame"
        Label used in the printed header line.

    Returns
    -------
    pandas.DataFrame
        One row per column, sorted by ``memory_bytes`` descending, with the
        fields ``dtype``, ``memory_bytes``, ``memory_kb`` (2 decimals) and
        ``pct_of_total`` (share of the summed column memory, 2 decimals).
        The index is excluded from every figure.
    """
    # index=False drops the index entry by name rather than by slicing off
    # the first element of the result.
    memory_per_col = dataframe.memory_usage(index=False, deep=True)
    total_memory = memory_per_col.sum()

    report = pd.DataFrame({
        'dtype': dataframe.dtypes,
        'memory_bytes': memory_per_col,
        'memory_kb': (memory_per_col / 1024).round(2),
        'pct_of_total': (memory_per_col / total_memory * 100).round(2),
    }).sort_values('memory_bytes', ascending=False)

    # Header emoji was mis-encoded in the original string; repaired here.
    print(f"📊 {name} Memory Report")
    print(f"   Total: {total_memory / 1024:.2f} KB ({total_memory / 1024 / 1024:.2f} MB)")
    return report

# The call prints the total; the returned per-column report is kept in memory_before
memory_before = memory_usage_report(df, "Original DataFrame")

### 2.2 Optimization Strategy

**Key Techniques:**
- Convert object columns to category dtype for low-cardinality strings
- Downcast numeric columns to smallest viable type
- Remove redundant index columns

In [None]:
def optimize_dataframe(dataframe):
    """Return a memory-optimized copy of ``dataframe``.

    Steps, applied to a copy so the input is left untouched:

    1. Drop the column named ``'Unnamed: 0'`` if present.
    2. Convert object columns to ``category`` when fewer than 50% of their
       values are unique.
    3. Downcast ``int64`` columns to the smallest integer type that fits.
    4. Downcast ``float64`` columns with ``pd.to_numeric(downcast='float')``.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame to optimize. A frame with zero rows is returned with its
        object columns unchanged.

    Returns
    -------
    pandas.DataFrame
        The optimized copy.
    """
    df_opt = dataframe.copy()

    # Remove unnamed index column if present
    if 'Unnamed: 0' in df_opt.columns:
        df_opt = df_opt.drop(columns='Unnamed: 0')

    # Convert object columns with low cardinality to category.
    # Comparing against 0.5 * rows (instead of dividing by the row count)
    # keeps the "less than 50% unique" rule and avoids a ZeroDivisionError
    # when the frame has no rows.
    max_unique = 0.5 * len(df_opt)
    for col in df_opt.select_dtypes(include=['object']).columns:
        if df_opt[col].nunique() < max_unique:
            df_opt[col] = df_opt[col].astype('category')

    # Downcast integers
    for col in df_opt.select_dtypes(include=['int64']).columns:
        df_opt[col] = pd.to_numeric(df_opt[col], downcast='integer')

    # Downcast floats
    for col in df_opt.select_dtypes(include=['float64']).columns:
        df_opt[col] = pd.to_numeric(df_opt[col], downcast='float')

    return df_opt

# Build the optimized copy; `df` itself is left as loaded for the comparison below
df_optimized = optimize_dataframe(df)
print("✅ Optimization complete!")

### 2.3 Memory Savings Analysis

In [None]:
# Compare memory usage (deep totals; both figures include the index)
mem_before = df.memory_usage(deep=True).sum()
mem_after = df_optimized.memory_usage(deep=True).sum()
savings = (1 - mem_after / mem_before) * 100  # percentage reduction

# Header emoji repaired; the needless f-prefix on the constant string is dropped
print("📊 Memory Optimization Results")
print(f"   Before: {mem_before / 1024:.2f} KB")
print(f"   After:  {mem_after / 1024:.2f} KB")
print(f"   Savings: {savings:.1f}%")

---

## 3️⃣ Statistical Analysis

### 3.1 Descriptive Statistics

In [None]:
# Comprehensive descriptive statistics for numeric columns
# np.number matches every numeric dtype, so the downcast int/float columns are included
numeric_cols = df_optimized.select_dtypes(include=[np.number]).columns.tolist()

# Create custom statistics function
def custom_describe(dataframe, columns):
    """Generate custom descriptive statistics for the given numeric columns.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Source frame. It is not modified.
    columns : list
        Labels of the numeric columns to describe.

    Returns
    -------
    pandas.DataFrame
        Statistics as rows (``mean``, ``median``, ``std``, ``min``, ``max``,
        ``range``, ``skewness``, ``kurtosis``) and the requested columns as
        columns, rounded to 3 decimals.
    """
    subset = dataframe[columns]  # select once instead of once per statistic
    lows = subset.min()
    highs = subset.max()

    # Narrow integer dtypes (e.g. int8 after downcasting) wrap around silently
    # when max - min exceeds the dtype's limit, so widen before subtracting.
    # The final table is floating point anyway, so nothing is lost.
    if pd.api.types.is_integer_dtype(highs):
        spread = highs.astype('float64') - lows.astype('float64')
    else:
        spread = highs - lows

    stats_dict = {
        'mean': subset.mean(),
        'median': subset.median(),
        'std': subset.std(),
        'min': lows,
        'max': highs,
        'range': spread,
        'skewness': subset.skew(),
        'kurtosis': subset.kurtosis(),
    }
    return pd.DataFrame(stats_dict).T.round(3)

# Last expression of the cell: the statistics table is rendered as its output
custom_describe(df_optimized, numeric_cols)

### 3.2 Correlation Analysis

In [None]:
# Calculate correlation matrix
# (DataFrame.corr() is called with its defaults; pandas documents the default method as Pearson)
correlation_matrix = df_optimized[numeric_cols].corr()

# Find strong correlations (|r| > 0.5)
def find_strong_correlations(corr_matrix, threshold=0.5):
    """Identify variable pairs whose absolute correlation exceeds a threshold.

    Parameters
    ----------
    corr_matrix : pandas.DataFrame
        Square correlation matrix, e.g. the result of ``DataFrame.corr()``.
    threshold : float, default 0.5
        A pair is reported only when ``abs(r) > threshold`` (strictly
        greater). NaN correlations never qualify.

    Returns
    -------
    pandas.DataFrame
        Columns ``var1``, ``var2`` and ``correlation`` (rounded to 3
        decimals), sorted by absolute correlation, strongest first. When no
        pair qualifies, an empty frame with those same columns is returned.
    """
    from itertools import combinations

    labels = corr_matrix.columns
    strong_corr = []
    # combinations() walks the upper triangle in row-major order: each
    # unordered pair exactly once, diagonal skipped.
    for i, j in combinations(range(len(labels)), 2):
        r = corr_matrix.iloc[i, j]
        if abs(r) > threshold:
            strong_corr.append({
                'var1': labels[i],
                'var2': labels[j],
                'correlation': round(r, 3),
            })

    # Naming the columns explicitly keeps 'correlation' present even when the
    # list is empty; sorting an empty, column-less frame by it raised KeyError.
    result = pd.DataFrame(strong_corr, columns=['var1', 'var2', 'correlation'])
    if result.empty:
        return result
    return result.sort_values('correlation', key=abs, ascending=False)

# Uses the function's default threshold (0.5), which the printed header repeats
strong_correlations = find_strong_correlations(correlation_matrix)
print("🔗 Strong Correlations (|r| > 0.5):")
strong_correlations

---

## 4️⃣ Advanced Pandas Operations

### 4.1 GroupBy Analysis

In [None]:
# Healthcare utilization by health status and gender
# observed=False is passed explicitly: the grouping keys may be categorical
# after optimization, and the implicit default for categorical groupers is
# deprecated in pandas. Pinning it keeps the table identical across versions.
utilization_analysis = df_optimized.groupby(['health', 'gender'], observed=False).agg({
    'visits': ['mean', 'sum', 'count'],
    'hospital': ['mean', 'sum'],
    'emergency': ['mean', 'sum']
}).round(2)

# Flatten the (column, statistic) MultiIndex into names such as 'visits_mean'
utilization_analysis.columns = ['_'.join(col).strip() for col in utilization_analysis.columns.values]
utilization_analysis

### 4.2 Pivot Table Analysis

In [None]:
# Multi-dimensional analysis using pivot tables:
# mean and count of visits per health status (rows) and region (columns),
# with a 'Total' margin on both axes.
pivot_table = pd.pivot_table(
    df_optimized,
    values='visits',
    index='health',
    columns='region',
    aggfunc=['mean', 'count'],
    margins=True,
    margins_name='Total',
    observed=False,  # explicit: the implicit default for categorical keys is deprecated
).round(2)

print("📊 Healthcare Visits by Health Status and Region")
pivot_table

### 4.3 Cross-Tabulation

In [None]:
# Insurance coverage analysis
# Scale to percent first and round afterwards, so the table holds clean
# one-decimal percentages rather than float artefacts left by multiplying
# already-rounded fractions by 100.
insurance_crosstab = (
    pd.crosstab(
        df_optimized['health'],
        df_optimized['insurance'],
        margins=True,
        normalize='index',  # Row percentages
    ) * 100
).round(1)

print("📊 Insurance Coverage by Health Status (Row %)")
insurance_crosstab

---

## 5️⃣ Data Visualization

### 5.1 Distribution Analysis

In [None]:
# Create a comprehensive distribution plot (2 x 2 grid of panels)
fig, axes = plt.subplots(2, 2, figsize=(12, 10))

# Age distribution
sns.histplot(data=df_optimized, x='age', kde=True, ax=axes[0, 0], color='steelblue')
axes[0, 0].set_title('Age Distribution', fontsize=12, fontweight='bold')
axes[0, 0].set_xlabel('Age (decades)')

# Income distribution
# The axis is linear and the column is plotted untransformed, so the previous
# "(log scale)" label was wrong. The unit follows the published NMES1988
# variable description (family income in USD 10,000).
sns.histplot(data=df_optimized, x='income', kde=True, ax=axes[0, 1], color='coral')
axes[0, 1].set_title('Income Distribution', fontsize=12, fontweight='bold')
axes[0, 1].set_xlabel('Income (USD 10,000)')

# Visits by health status
# NOTE(review): recent seaborn releases warn when `palette` is passed without
# `hue`; the warning is currently hidden by the notebook-wide filter.
sns.boxplot(data=df_optimized, x='health', y='visits', ax=axes[1, 0], palette='Set2')
axes[1, 0].set_title('Visits by Health Status', fontsize=12, fontweight='bold')

# Chronic conditions distribution
sns.countplot(data=df_optimized, x='chronic', ax=axes[1, 1], palette='viridis')
axes[1, 1].set_title('Chronic Conditions Count', fontsize=12, fontweight='bold')

plt.tight_layout()
# Save before show(): the figure is written to the working directory
plt.savefig('distribution_analysis.png', dpi=150, bbox_inches='tight')
plt.show()
print("✅ Figure saved: distribution_analysis.png")

### 5.2 Correlation Heatmap

In [None]:
# Create correlation heatmap
plt.figure(figsize=(12, 10))

# Mask the upper triangle (diagonal included) so each pair is drawn once
mask = np.triu(np.ones_like(correlation_matrix, dtype=bool))

sns.heatmap(
    correlation_matrix,
    mask=mask,
    annot=True,
    fmt='.2f',
    cmap='RdBu_r',
    center=0,  # centre the diverging colormap on r = 0
    square=True,
    linewidths=0.5,
    cbar_kws={'shrink': 0.8}
)

plt.title('Correlation Matrix - Healthcare Variables', fontsize=14, fontweight='bold', pad=20)
plt.tight_layout()
plt.savefig('correlation_heatmap.png', dpi=150, bbox_inches='tight')
plt.show()
print("✅ Figure saved: correlation_heatmap.png")

### 5.3 Healthcare Utilization Dashboard

In [None]:
# Create a comprehensive dashboard view (four panels on one figure)
fig = plt.figure(figsize=(14, 10))

# 1. Visits by health status and gender
# observed=False is explicit because the implicit default for categorical
# grouping keys is deprecated in pandas; this keeps the bars unchanged.
ax1 = fig.add_subplot(2, 2, 1)
health_gender = df_optimized.groupby(['health', 'gender'], observed=False)['visits'].mean().unstack()
health_gender.plot(kind='bar', ax=ax1, color=['#3498db', '#e74c3c'])
ax1.set_title('Avg Visits by Health Status & Gender', fontweight='bold')
ax1.set_xlabel('')
ax1.legend(title='Gender')
ax1.tick_params(axis='x', rotation=0)

# 2. Insurance coverage pie chart
# NOTE(review): two colours are supplied - assumes `insurance` has two levels; confirm.
ax2 = fig.add_subplot(2, 2, 2)
insurance_counts = df_optimized['insurance'].value_counts()
ax2.pie(insurance_counts, labels=insurance_counts.index, autopct='%1.1f%%', 
        colors=['#27ae60', '#e74c3c'], startangle=90)
ax2.set_title('Insurance Coverage Distribution', fontweight='bold')

# 3. Emergency visits by region
ax3 = fig.add_subplot(2, 2, 3)
region_emergency = df_optimized.groupby('region', observed=False)['emergency'].sum()
region_emergency.plot(kind='barh', ax=ax3, color='#9b59b6')
ax3.set_title('Total Emergency Visits by Region', fontweight='bold')
ax3.set_xlabel('Emergency Visits')

# 4. Age vs Visits scatter, coloured by the `chronic` column
ax4 = fig.add_subplot(2, 2, 4)
scatter = ax4.scatter(df_optimized['age'], df_optimized['visits'], 
                      c=df_optimized['chronic'], cmap='YlOrRd', alpha=0.5, s=30)
ax4.set_xlabel('Age (decades)')
ax4.set_ylabel('Number of Visits')
ax4.set_title('Age vs Visits (colored by chronic conditions)', fontweight='bold')
plt.colorbar(scatter, ax=ax4, label='Chronic Conditions')

# y > 1 places the overall title above the panels; bbox_inches='tight' keeps it in the saved file
plt.suptitle('NSMES1988 Healthcare Utilization Dashboard', fontsize=16, fontweight='bold', y=1.02)
plt.tight_layout()
plt.savefig('healthcare_dashboard.png', dpi=150, bbox_inches='tight')
plt.show()
print("✅ Figure saved: healthcare_dashboard.png")

---

## 📋 Summary

### Key Findings

1. **Data Quality**: The NSMES1988 dataset is clean with no missing values across 4,406 records

2. **Memory Optimization**: Achieved significant memory reduction through categorical conversion and numeric downcasting

3. **Healthcare Patterns**: Clear correlation between health status and healthcare utilization metrics

4. **Demographic Insights**: Age and chronic conditions are key predictors of healthcare visits

### Technical Skills Demonstrated

| Skill Area | Techniques Used |
|------------|----------------|
| Data Wrangling | Type optimization, memory management, data cleaning |
| Statistical Analysis | Descriptive stats, correlation analysis, custom functions |
| Pandas Operations | GroupBy, pivot tables, cross-tabulation, aggregations |
| Visualization | Distribution plots, heatmaps, dashboards, multi-panel figures |

---

*Part of the AURA Capstone Project | Milestone 1 - Data Analysis*