In [1]:
import pandas as pd
import numpy as np

# Build a synthetic table of 50,000 claim timelines in which
# ServiceDate < SubmissionDate < ProcessedDate, then save it to CSV.
n_rows = 50000

# A locally seeded generator makes the saved CSV identical on every run and
# leaves NumPy's global random state untouched.
rng = np.random.default_rng(42)

# Delay bounds in days; BOTH ends are inclusive.
min_delay_days = 1
max_delay_days = 30

# Latest dates allowed in the two derived columns.
submission_cutoff = pd.Timestamp("2024-12-20")
processed_cutoff = pd.Timestamp("2024-12-31")

# Service dates: sampled with replacement from every calendar day in the window.
service_calendar = pd.date_range(start="2022-01-01", end="2024-11-30")
service_dates = service_calendar[rng.integers(0, len(service_calendar), size=n_rows)]

# Submission follows service by 1-30 days. `endpoint=True` keeps the upper
# bound inclusive, so a 30-day delay can actually be drawn.
submission_delays = rng.integers(
    min_delay_days, max_delay_days, size=n_rows, endpoint=True
)
# Cap at the cutoff with a vectorised clip instead of a per-row Python min().
submission_dates = pd.Series(
    service_dates + pd.to_timedelta(submission_delays, unit="D")
).clip(upper=submission_cutoff)

# Processing follows submission by another 1-30 days, capped the same way.
processing_delays = rng.integers(
    min_delay_days, max_delay_days, size=n_rows, endpoint=True
)
processed_dates = (
    submission_dates + pd.to_timedelta(processing_delays, unit="D")
).clip(upper=processed_cutoff)

# Create the DataFrame
dates_df = pd.DataFrame({
    "ServiceDate": service_dates,
    "SubmissionDate": submission_dates,
    "ProcessedDate": processed_dates
})

# Save to CSV
dates_df.to_csv("Service_Submission_Processed_Dates.csv", index=False)
print("Dataset saved as Service_Submission_Processed_Dates.csv")


Dataset saved as Service_Submission_Processed_Dates.csv


In [2]:
# Load the raw claims table into memory.
# NOTE(review): this is an absolute, machine-specific path — confirm it exists
# before running the notebook on another machine.
claims_csv_path = "C:/Users/kandu/Documents/Data Analysis Projects/Claims Denial Prediction/data/raw_new/claims.csv"
claims_df = pd.read_csv(claims_csv_path)

In [3]:
import pandas as pd

# Monthly revenue summary derived from the claims table.
# `claims_df` must already be loaded; this cell overwrites its ProcessedDate
# column with parsed datetimes and adds a MonthID column.

# Parse the processed timestamps once and reuse them for the month key.
processed_ts = pd.to_datetime(claims_df["ProcessedDate"])
claims_df["ProcessedDate"] = processed_ts

# Month key as a "yyyy-mm" string (monthly period rendered as text).
claims_df["MonthID"] = processed_ts.dt.to_period("M").astype(str)

# One output row per month: summed reimbursement, number of claim IDs,
# and number of distinct patient IDs.
monthly_groups = claims_df.groupby("MonthID")
revenue_table = monthly_groups.agg(
    TotalRevenue=("Reimbursement", "sum"),
    ClaimsProcessed=("ClaimID", "count"),
    PatientsServed=("PatientID", "nunique"),
).reset_index()

# Print the full revenue table (every month, not just the head).
print(revenue_table)

# Persist the summary alongside the notebook.
revenue_table.to_csv("Revenue_Table.csv", index=False)


    MonthID  TotalRevenue  ClaimsProcessed  PatientsServed
0   2022-01     777478.64              240             239
1   2022-02    3840660.15             1097            1045
2   2022-03    4812872.62             1471            1361
3   2022-04    5000918.17             1485            1371
4   2022-05    5278201.83             1556            1444
5   2022-06    4416068.10             1330            1247
6   2022-07    4734093.82             1386            1286
7   2022-08    5269374.17             1502            1388
8   2022-09    4889762.17             1417            1335
9   2022-10    4842180.43             1409            1320
10  2022-11    4971925.30             1421            1341
11  2022-12    4883078.49             1411            1305
12  2023-01    5191809.25             1512            1410
13  2023-02    4463510.68             1302            1220
14  2023-03    4920492.15             1431            1341
15  2023-04    4782512.08             1429            13

In [6]:
import pandas as pd
import random

def random_choice(options, probs):
    """Pick one element of ``options`` at random, weighted by ``probs``.

    ``probs`` supplies one weight per option, in the same order as
    ``options``. The draw uses the module-level ``random`` generator.
    """
    # random.choices returns a list of k items (k defaults to 1); unpack it.
    (picked,) = random.choices(options, weights=probs)
    return picked

# Synthetic denial flags for 50,000 claims: "Yes" with weight 0.25 and
# "No" with weight 0.75.
# A dedicated, seeded generator makes the column identical on every run
# without touching the global `random` state, and one `choices(..., k=...)`
# call replaces 50,000 separate Python-level calls.
denial_rng = random.Random(42)
claims_df2 = pd.DataFrame({
    "Denied_Status": denial_rng.choices(["Yes", "No"], weights=[0.25, 0.75], k=50000)
})

In [9]:
# Write the denial-status column to disk, with a header row and without the
# DataFrame index.
claims_df2.to_csv(
    "Claim_Denials.csv",
    index=False,
    header=True,
)

In [11]:
import numpy as np
import pandas as pd

# Generate a synthetic claims dataset (amount, reimbursement, denial flag)
# and save it to CSV.
# NOTE(review): this rebinds `claims_df`, replacing the claims table loaded
# from claims.csv earlier in the notebook (which has a ProcessedDate column
# this frame lacks) — confirm that is intended before re-running earlier cells.

# Set random seed for reproducibility
np.random.seed(42)

# Parameters for the dataset
num_claims = 50000
denial_ratio = 0.25
approval_ratio = 1 - denial_ratio  # derived, so the two ratios cannot drift apart
output_csv = "claims1_dataset.csv"

# Calculate number of denied and approved claims
num_denied = int(num_claims * denial_ratio)
num_approved = num_claims - num_denied

# Generate Claim Amounts in range $100 - $10000
claim_amounts = np.random.uniform(100, 10000, num_claims).round(2)

# Generate Denial Status: 1 for Denied (Yes), 0 for Approved (No).
# Exactly `num_denied` ones are created, then shuffled into random positions.
denial_status = np.repeat([1, 0], [num_denied, num_approved])
np.random.shuffle(denial_status)

# Generate Reimbursement Amounts.
# Denied claims stay at $0; each approved claim gets a uniform draw between
# $100 and its own claim amount, rounded to cents (so at most the claim amount).
approved_mask = denial_status == 0
reimbursement = np.zeros(num_claims)
reimbursement[approved_mask] = np.random.uniform(
    100, claim_amounts[approved_mask]
).round(2)

# Create the DataFrame
data = {
    "ClaimAmount": claim_amounts,
    "Reimbursement": reimbursement,
    "DenialStatus": np.where(denial_status == 1, "Yes", "No")
}
claims_df = pd.DataFrame(data)

# Display the first few rows
print(claims_df.head())

# Save to a CSV file and report the file name that was actually written
claims_df.to_csv(output_csv, index=False)
print(f"Dataset saved as '{output_csv}'")


   ClaimAmount  Reimbursement DenialStatus
0      3807.95         962.26           No
1      9512.07        7864.08           No
2      7346.74           0.00          Yes
3      6026.72        5545.49           No
4      1644.58         282.53           No
Dataset saved as 'claims_dataset.csv'
