<a href="https://colab.research.google.com/github/joelkny97/ruhealth2024-team7/blob/develop/RUHealthHackOutlierDetect.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd


In [3]:
beneficiary_df = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/RUHealthHack/DE1_0_2008_Beneficiary_Summary_File_Sample_1.csv")
inpatient_df = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/RUHealthHack/DE1_0_2008_to_2010_Inpatient_Claims_Sample_1.csv")
outpatient_df = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/RUHealthHack/DE1_0_2008_to_2010_Outpatient_Claims_Sample_1.csv",low_memory=False,)
carrier_df = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/RUHealthHack/DE1_0_2008_to_2010_Carrier_Claims_Sample_1A.csv", low_memory=False)


chronic_conditions = [
    'SP_ALZHDMTA', 'SP_CHF', 'SP_CHRNKIDN', 'SP_CNCR', 'SP_COPD',
    'SP_DEPRESSN', 'SP_DIABETES', 'SP_ISCHMCHT', 'SP_OSTEOPRS',
    'SP_RA_OA', 'SP_STRKETIA'
]





In [4]:
# Concatenate inpatient and outpatient claims with unique IDs
claims_data = pd.concat([inpatient_df[['DESYNPUF_ID', 'CLM_ID', 'CLM_PMT_AMT', 'CLM_FROM_DT']],
                         outpatient_df[['DESYNPUF_ID', 'CLM_ID', 'CLM_PMT_AMT', 'CLM_FROM_DT']]])
claims_data['CLM_FROM_DT'] = pd.to_datetime(claims_data['CLM_FROM_DT'])

# Merge chronic conditions with claims data
claims_data = claims_data.merge(beneficiary_df[['DESYNPUF_ID'] + chronic_conditions], on='DESYNPUF_ID', how='left')


numeric_features = ['CLM_PMT_AMT']


In [5]:
# Scaling numeric features
scaler = StandardScaler()
claims_data[numeric_features] = scaler.fit_transform(claims_data[numeric_features])

# Combine binary chronic conditions with scaled numeric features
X = claims_data[chronic_conditions + numeric_features].values

# Step 2: Split Data for Training and Testing
X_train, X_test = train_test_split(X, test_size=0.2, random_state=42)

# Convert to torch tensors
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
X_test_tensor = torch.tensor(X_test, dtype=torch.float32)

In [6]:
# Step 3: Define the Autoencoder Model
class Autoencoder(nn.Module):
    def __init__(self, input_dim):
        super(Autoencoder, self).__init__()
        self.encoder = nn.Sequential(
            nn.Linear(input_dim, 64),
            nn.ReLU(),
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Linear(32, 16),
            nn.ReLU()
        )
        self.decoder = nn.Sequential(
            nn.Linear(16, 32),
            nn.ReLU(),
            nn.Linear(32, 64),
            nn.ReLU(),
            nn.Linear(64, input_dim),
            nn.Sigmoid()
        )

    def forward(self, x):
        encoded = self.encoder(x)
        decoded = self.decoder(encoded)
        return decoded

# Initialize the model, loss function, and optimizer
input_dim = X_train_tensor.shape[1]
model = Autoencoder(input_dim)
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Step 4: Train the Autoencoder
num_epochs = 50
for epoch in range(num_epochs):
    model.train()
    optimizer.zero_grad()
    outputs = model(X_train_tensor)
    loss = criterion(outputs, X_train_tensor)
    loss.backward()
    optimizer.step()

    if (epoch+1) % 10 == 0:
        print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')

# Step 5: Evaluate Model and Calculate Reconstruction Error for Outlier Detection
model.eval()
with torch.no_grad():
    # Calculate reconstruction error on test set
    reconstructions = model(X_test_tensor)
    reconstruction_errors = torch.mean((reconstructions - X_test_tensor) ** 2, dim=1)






Epoch [10/50], Loss: 1.4518
Epoch [20/50], Loss: 1.3413
Epoch [30/50], Loss: 1.0840
Epoch [40/50], Loss: 0.7707
Epoch [50/50], Loss: 0.6683
Total Test Samples: 171513
Detected Outliers: 1418

Detected Outlier Samples:
   SP_ALZHDMTA  SP_CHF  SP_CHRNKIDN  SP_CNCR  SP_COPD  SP_DEPRESSN  \
0          1.0     1.0          2.0      2.0      1.0          2.0   
1          1.0     1.0          1.0      2.0      1.0          2.0   
2          2.0     2.0          1.0      2.0      2.0          1.0   
3          2.0     1.0          1.0      2.0      1.0          1.0   
4          2.0     1.0          1.0      2.0      1.0          1.0   

   SP_DIABETES  SP_ISCHMCHT  SP_OSTEOPRS  SP_RA_OA  SP_STRKETIA  CLM_PMT_AMT  
0          1.0          2.0          2.0       2.0          2.0    10.436005  
1          1.0          1.0          2.0       2.0          2.0     4.942322  
2          2.0          1.0          2.0       2.0          2.0    12.908162  
3          1.0          1.0          1.0     

NameError: name 'data' is not defined

In [10]:
from datetime import datetime
version = "v1.0"

# Create a timestamp
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

# Model file name based on timestamp, version, and "outlier" keyword
model_filename = f"/content/drive/MyDrive/Colab Notebooks/RUHealthHack/models/outlier_model_{version}_{timestamp}.pth"
torch.save(model.state_dict(), model_filename)
print(f"Model saved to {model_filename}")

Model saved to /content/drive/MyDrive/Colab Notebooks/RUHealthHack/models/outlier_model_v1.0_20241026_232314.pth


In [11]:
# Step 6: Determine Outlier Thresholds
# Set threshold dynamically (e.g., mean + 2 standard deviations of reconstruction errors)
threshold = reconstruction_errors.mean() + 2 * reconstruction_errors.std()

# Identify outliers based on reconstruction error
outliers = reconstruction_errors > threshold

# Display outlier detection results
print(f"Total Test Samples: {len(X_test)}")
print(f"Detected Outliers: {outliers.sum().item()}")

# Step 7: Detailed Analysis of Outliers
# Extract outliers for further inspection
outlier_indices = np.where(outliers)[0]
outlier_data = pd.DataFrame(X_test[outlier_indices], columns=chronic_conditions + numeric_features)

print("\nDetected Outlier Samples:")
print(outlier_data.head())

Total Test Samples: 171513
Detected Outliers: 1418

Detected Outlier Samples:
   SP_ALZHDMTA  SP_CHF  SP_CHRNKIDN  SP_CNCR  SP_COPD  SP_DEPRESSN  \
0          1.0     1.0          2.0      2.0      1.0          2.0   
1          1.0     1.0          1.0      2.0      1.0          2.0   
2          2.0     2.0          1.0      2.0      2.0          1.0   
3          2.0     1.0          1.0      2.0      1.0          1.0   
4          2.0     1.0          1.0      2.0      1.0          1.0   

   SP_DIABETES  SP_ISCHMCHT  SP_OSTEOPRS  SP_RA_OA  SP_STRKETIA  CLM_PMT_AMT  
0          1.0          2.0          2.0       2.0          2.0    10.436005  
1          1.0          1.0          2.0       2.0          2.0     4.942322  
2          2.0          1.0          2.0       2.0          2.0    12.908162  
3          1.0          1.0          1.0       2.0          2.0    15.380320  
4          1.0          1.0          2.0       2.0          2.0    15.380320  
