In [1]:

import numpy as np
import pandas as pd 


import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))



/kaggle/input/healthcare-provider-fraud-detection-analysis/Test-1542969243754.csv
/kaggle/input/healthcare-provider-fraud-detection-analysis/Train_Beneficiarydata-1542865627584.csv
/kaggle/input/healthcare-provider-fraud-detection-analysis/Train_Inpatientdata-1542865627584.csv
/kaggle/input/healthcare-provider-fraud-detection-analysis/Test_Outpatientdata-1542969243754.csv
/kaggle/input/healthcare-provider-fraud-detection-analysis/Train-1542865627584.csv
/kaggle/input/healthcare-provider-fraud-detection-analysis/Test_Beneficiarydata-1542969243754.csv
/kaggle/input/healthcare-provider-fraud-detection-analysis/Test_Inpatientdata-1542969243754.csv
/kaggle/input/healthcare-provider-fraud-detection-analysis/Train_Outpatientdata-1542865627584.csv


In [4]:

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt


print("Loading Data...")
train_beneficiary = pd.read_csv('/kaggle/input/healthcare-provider-fraud-detection-analysis/Train_Beneficiarydata-1542865627584.csv')
train_inpatient = pd.read_csv('/kaggle/input/healthcare-provider-fraud-detection-analysis/Train_Inpatientdata-1542865627584.csv')
train_labels = pd.read_csv('/kaggle/input/healthcare-provider-fraud-detection-analysis/Train-1542865627584.csv')

print(f"Beneficiaries: {train_beneficiary.shape}")
print(f"Inpatient Claims: {train_inpatient.shape}")
print(f"Labels (Fraud Providers): {train_labels.shape}")

print(" Data Loaded Successfully")

Loading Data...
Beneficiaries: (138556, 25)
Inpatient Claims: (40474, 30)
Labels (Fraud Providers): (5410, 2)
 Data Loaded Successfully


In [5]:
merged_data = pd.merge(train_inpatient, train_beneficiary, on='BeneID', how='inner')

full_data = pd.merge(merged_data, train_labels, on='Provider', how='inner')

full_data['ClaimStartDt'] = pd.to_datetime(full_data['ClaimStartDt'])
full_data['ClaimEndDt'] = pd.to_datetime(full_data['ClaimEndDt'])
full_data['Duration_Days'] = (full_data['ClaimEndDt'] - full_data['ClaimStartDt']).dt.days

full_data['Total_Cost'] = full_data['InscClaimAmtReimbursed'] + full_data['DeductibleAmtPaid']

full_data['DeductibleAmtPaid'] = full_data['DeductibleAmtPaid'].fillna(0)

from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
full_data['Is_Fraud'] = le.fit_transform(full_data['PotentialFraud'])

features = ['Duration_Days', 'Total_Cost', 'InscClaimAmtReimbursed', 'OPAnnualReimbursementAmt', 'IPAnnualReimbursementAmt', 'Age']

features = ['Duration_Days', 'Total_Cost', 'InscClaimAmtReimbursed', 'OPAnnualReimbursementAmt', 'IPAnnualReimbursementAmt']

X = full_data[features].fillna(0) 
y = full_data['Is_Fraud']        

print(f" Data Cleaned. Ready for training with {X.shape[0]} rows.")
print(X.head())

 Data Cleaned. Ready for training with 40474 rows.
   Duration_Days  Total_Cost  InscClaimAmtReimbursed  \
0              6     27068.0                   26000   
1              2      6068.0                    5000   
2              3      6068.0                    5000   
3              8      6068.0                    5000   
4             17     11068.0                   10000   

   OPAnnualReimbursementAmt  IPAnnualReimbursementAmt  
0                        60                     36000  
1                        60                     36000  
2                        60                     36000  
3                       250                      5000  
4                       120                     21260  


In [6]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = RandomForestClassifier(n_estimators=100, random_state=42)

print("Training the model (this might take a minute)...")
model.fit(X_train, y_train)

# 4. Evaluate
predictions = model.predict(X_test)
print(" Model Trained!")
print(f"Accuracy: {accuracy_score(y_test, predictions) * 100:.2f}%")
print("--- Detailed Report ---")
print(classification_report(y_test, predictions))

Training the model (this might take a minute)...
 Model Trained!
Accuracy: 53.13%
--- Detailed Report ---
              precision    recall  f1-score   support

           0       0.43      0.37      0.40      3391
           1       0.59      0.64      0.61      4704

    accuracy                           0.53      8095
   macro avg       0.51      0.51      0.51      8095
weighted avg       0.52      0.53      0.53      8095



In [8]:

user_uploaded_data = {
    'BeneID': ['BENE1001', 'BENE1002', 'BENE1003', 'BENE1004'],
    'ClaimID': ['CLM_001', 'CLM_002', 'CLM_003', 'CLM_004'],
    'ClaimStartDt': ['2023-01-01', '2023-05-01', '2023-06-01', '2023-02-10'],
    'ClaimEndDt':   ['2023-01-04', '2023-05-25', '2023-06-05', '2023-02-12'],
    'InscClaimAmtReimbursed': [5000, 120000, 3000, 55000],
    'DeductibleAmtPaid': [0, 1068, 0, 0],
    'OPAnnualReimbursementAmt': [2000, 5000, 1500, 10000],
    'IPAnnualReimbursementAmt': [1000, 80000, 500, 40000]
}
pd.DataFrame(user_uploaded_data).to_csv('new_patient_claims.csv', index=False)
print(" User file 'new_patient_claims.csv' received.")



def generate_fraud_report(csv_filename, trained_model, training_data_stats):
    """
    Reads a new CSV, runs the AI, and generates a reason for every flag.
    """
    print(f"Processing {csv_filename}...")
    df = pd.read_csv(csv_filename)
    
    df['ClaimStartDt'] = pd.to_datetime(df['ClaimStartDt'])
    df['ClaimEndDt'] = pd.to_datetime(df['ClaimEndDt'])
    df['Duration_Days'] = (df['ClaimEndDt'] - df['ClaimStartDt']).dt.days
    
    df['DeductibleAmtPaid'] = df['DeductibleAmtPaid'].fillna(0)
    df['Total_Cost'] = df['InscClaimAmtReimbursed'] + df['DeductibleAmtPaid']
    
    features = ['Duration_Days', 'Total_Cost', 'InscClaimAmtReimbursed', 'OPAnnualReimbursementAmt', 'IPAnnualReimbursementAmt']
    X_new = df[features].fillna(0)
    
    probs = trained_model.predict_proba(X_new)[:, 1] 
    
    avg_cost = training_data_stats['Total_Cost'].mean()
    avg_duration = training_data_stats['Duration_Days'].mean()
    
    results = []
    
    for i, risk_score in enumerate(probs):
        row = df.iloc[i]
        score_pct = round(risk_score * 100, 2)
        
        if score_pct > 70:
            status = "游댮 HIGH RISK"
        elif score_pct > 40:
            status = "游리 REVIEW"
        else:
            status = "游릭 SAFE"
            
        reasons = []
        if row['Total_Cost'] > avg_cost * 2:
            reasons.append(f"Cost (${row['Total_Cost']}) is >2x Avg.")
        if row['Duration_Days'] > avg_duration * 2:
            reasons.append(f"Stay ({row['Duration_Days']} days) is unusually long.")
        if row['IPAnnualReimbursementAmt'] > 50000:
             reasons.append("High Annual Reimbursement History.")
            
        if status == "游댮 HIGH RISK" and not reasons:
            reasons.append("Complex anomaly pattern detected by AI.")
            
        results.append({
            'Claim_ID': row['ClaimID'],
            'Risk_Score': score_pct,
            'Status': status,
            'Key_Reason': " | ".join(reasons) if reasons else "Normal Range"
        })
        
    return pd.DataFrame(results)


final_report = generate_fraud_report('new_patient_claims.csv', model, X)

print("\n---  FINAL FRAUD DETECTION REPORT ---")
display(final_report.sort_values(by='Risk_Score', ascending=False))

final_report.to_csv('Audit_Result.csv', index=False)
print("Report saved to 'Audit_Result.csv'")

 User file 'new_patient_claims.csv' received.
Processing new_patient_claims.csv...

---  FINAL FRAUD DETECTION REPORT ---


Unnamed: 0,Claim_ID,Risk_Score,Status,Key_Reason
1,CLM_002,65.0,游리 REVIEW,Cost ($121068) is >2x Avg. | Stay (24 days) is...
3,CLM_004,58.0,游리 REVIEW,Cost ($55000) is >2x Avg.
0,CLM_001,51.0,游리 REVIEW,Normal Range
2,CLM_003,46.0,游리 REVIEW,Normal Range


Report saved to 'Audit_Result.csv'


In [9]:
import joblib


joblib.dump(model, 'fraud_detection_model.pkl')

print(" Success! 'fraud_detection_model.pkl' has been created.")
print("You can now download this file from the 'Output' section of your notebook.")



print("\nTesting the saved file...")

loaded_model = joblib.load('fraud_detection_model.pkl')

test_claim = [[5, 5000, 4000, 2000, 1000]]
prediction = loaded_model.predict(test_claim)
probability = loaded_model.predict_proba(test_claim)[0][1]

print(f"Test Prediction: {'Fraud' if prediction[0]==1 else 'Safe'}")
print(f"Confidence Score: {probability:.4f}")

 Success! 'fraud_detection_model.pkl' has been created.
You can now download this file from the 'Output' section of your notebook.

Testing the saved file...
Test Prediction: Fraud
Confidence Score: 0.6800




In [10]:
import pandas as pd
import joblib

loaded_model = joblib.load('fraud_detection_model.pkl')

test_data = {
    'Duration_Days': [5], 
    'Total_Cost': [5000], 
    'InscClaimAmtReimbursed': [4000], 
    'OPAnnualReimbursementAmt': [2000], 
    'IPAnnualReimbursementAmt': [1000]
}

test_df = pd.DataFrame(test_data)

prediction = loaded_model.predict(test_df)
probability = loaded_model.predict_proba(test_df)[0][1]

print(f"Test Prediction: {'Fraud' if prediction[0]==1 else 'Safe'}")
print(f"Confidence Score: {probability:.4f}")
print(" No warnings! System is stable.")

Test Prediction: Fraud
Confidence Score: 0.6800


In [13]:
import joblib


joblib.dump(model, 'Welfare Delivery.pkl')

print(" Success! 'fraud_detection_model.pkl' has been created.")
print("You can now download this file from the 'Output' section of your notebook.")



print("\nTesting the saved file...")

loaded_model = joblib.load('fraud_detection_model.pkl')

test_claim = [[5, 5000, 4000, 2000, 1000]]
prediction = loaded_model.predict(test_claim)
probability = loaded_model.predict_proba(test_claim)[0][1]

print(f"Test Prediction: {'Fraud' if prediction[0]==1 else 'Safe'}")
print(f"Confidence Score: {probability:.4f}")

 Success! 'fraud_detection_model.pkl' has been created.
You can now download this file from the 'Output' section of your notebook.

Testing the saved file...
Test Prediction: Fraud
Confidence Score: 0.6800


