# 02 — NY Provider Analysis

Filter the dataset to New York providers and analyze billing patterns.

**Focus areas**:
- Provider distribution by facility type
- Billing volume by borough (NYC focus)
- Outlier detection with Z-scores

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import sys; sys.path.insert(0, '../backend')
from data_ingestion.transformer import normalize_state, filter_state, clean_currency

In [None]:
# Load the claims data and narrow it to New York.
# Real-data path (currently commented out; uncomment to run against the CSV):
# df = pd.read_csv('../data/medicaid_claims.csv', nrows=500000)
# df = normalize_state(df)
# ny_df = filter_state(df, 'NY')

# Stand-in sample used instead of the real file: two providers,
# five claim rows for the first NPI and three for the second.
ny_df = pd.DataFrame(dict(
    npi=5 * ['1234567890'] + 3 * ['9876543210'],
    facility_type=5 * ['Nursing Home'] + 3 * ['Adult Day Care'],
    amount=[1500, 1600, 1400, 1550, 1650] + [8000, 9000, 8500],
    billing_code=5 * ['97110'] + 3 * ['T2024'],
    city=5 * ['Brooklyn'] + 3 * ['Queens'],
))
# Last expression of the cell: preview of the first rows.
ny_df.head()

In [None]:
# Provider billing distribution: total billed amount per NPI,
# plus the mean and standard deviation across those per-provider totals.
provider_totals = ny_df.groupby('npi')['amount'].sum()
mean_val = provider_totals.mean()
std_val = provider_totals.std()

print(f'Mean: ${mean_val:,.2f}')
print(f'Std Dev: ${std_val:,.2f}')

# Z-scores: distance of each provider's total from the mean, in std-dev units.
# When the spread is not strictly positive the division is skipped and every
# provider gets 0; a NaN std also takes this branch because NaN > 0 is False.
if std_val > 0:
    z_scores = provider_totals.sub(mean_val).div(std_val)
else:
    z_scores = provider_totals * 0
print('\nZ-scores per provider:')
print(z_scores)

In [None]:
# Facility type distribution: number of claim rows per facility type.
facility_counts = ny_df['facility_type'].value_counts()

# Draw on an explicit Figure/Axes instead of the implicit pyplot state so the
# chart does not depend on whichever figure happens to be current, and label
# both axes so the figure can be read on its own.
fig, ax = plt.subplots()
facility_counts.plot(kind='bar', ax=ax, title='Claims by Facility Type')
ax.set_xlabel('Facility type')
ax.set_ylabel('Number of claims')
fig.tight_layout()
plt.show()