In [94]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from ctgan import CTGAN

In [15]:
df = pd.read_csv("hospital.csv")

In [16]:
df.head()

Unnamed: 0,Country Name,Hospital Name,Total No. of Patients,Cardiovascular,Cancer,Endocrinology and metabolic diseases,Neurological diseases,Mental health and behavioral disorders,Infectious diseases,Dermatology,Gastroenterology,Hematology and coagulation,Renal disease,Rheumatology,Urologic
0,Argentina,The Hospital Italiano de Buenos Aires,41324,3955,4642,4611,3231,3631,1587,3207,3386,4286,3710,1202,3876
1,Argentina,Hospital Alemán,32542,1256,1981,2315,3531,3997,4566,2423,1024,3113,2278,3111,2947
2,Australia,Royal Children's Hospital,30337,2496,3527,2006,2386,2929,2540,3527,3847,1095,3140,1409,1435
3,Australia,Baker Heart Research Institute,33212,2855,4299,3309,1444,2085,4740,2918,1283,3031,1235,2074,3939
4,Australia,Department Health & Human Services of Tasmania...,45093,4932,4931,3526,2737,1285,4733,4811,4631,2825,3319,4580,2783


In [92]:
df1 = df.drop(columns=["Hospital Name"])
df2 = df.drop(columns=["Hospital Name"])

In [18]:
encoder = LabelEncoder()

In [19]:
df1['Country Name'] = encoder.fit_transform(df1['Country Name'])

In [20]:
# Normalization
scaler = MinMaxScaler()
numerical_cols = df1.columns[1:]
df1[numerical_cols] = scaler.fit_transform(df1[numerical_cols])

In [21]:
df1.head(5)

Unnamed: 0,Country Name,Total No. of Patients,Cardiovascular,Cancer,Endocrinology and metabolic diseases,Neurological diseases,Mental health and behavioral disorders,Infectious diseases,Dermatology,Gastroenterology,Hematology and coagulation,Renal disease,Rheumatology,Urologic
0,0,0.668721,0.725066,0.912446,0.99514,0.583883,0.642285,0.118889,0.535431,0.594214,0.818415,0.734683,0.031096,0.74121
1,0,0.284909,0.012929,0.235174,0.338765,0.666392,0.739989,0.914774,0.328133,0.0,0.517569,0.337677,0.525784,0.501034
2,1,0.188541,0.340106,0.628659,0.250429,0.351485,0.454885,0.373497,0.620042,0.710189,0.0,0.576657,0.084737,0.110134
3,1,0.314191,0.434828,0.825146,0.622927,0.092409,0.229578,0.961261,0.459016,0.065157,0.496538,0.048517,0.257061,0.757497
4,1,0.833443,0.98285,0.986002,0.684963,0.44802,0.016017,0.959391,0.959545,0.907421,0.443704,0.626282,0.906452,0.458635


In [22]:
categorical_columns = ['Country Name']
categorical_columns

['Country Name']

In [68]:
# Initialize Model
ctgan = CTGAN(epochs=1000, batch_size=12, pac=1, generator_lr=1e-4, discriminator_lr=1e-4, discriminator_steps=3)

In [69]:
# Train the CTGAN model
ctgan.fit(df1, categorical_columns)

In [70]:
# Generate synthetic data
synthetic_data = ctgan.sample(200)

In [71]:
synthetic_data.to_csv("synthetic_hospital_data.csv", index=False)
print(synthetic_data.head())

   Country Name  Total No. of Patients  Cardiovascular    Cancer  \
0            11              -0.019284        0.269432 -0.007986   
1             2               0.473216        0.542322  0.244688   
2            10               0.842837        0.683745  0.649671   
3            14               0.708590        0.673086  0.753990   
4             6              -0.067204       -0.125771 -0.022520   

   Endocrinology and metabolic diseases  Neurological diseases  \
0                              0.053290               0.548132   
1                              0.360520               0.192131   
2                              0.902380               0.179518   
3                              1.017587               0.510644   
4                              0.138922               0.145740   

   Mental health and behavioral disorders  Infectious diseases  Dermatology  \
0                                0.236452             0.483050     0.025951   
1                                0.5

## Decoding the Synthetic Data

In [95]:
# Load the encoded synthetic data
encoded_data = pd.read_csv("synthetic_hospital_data.csv")

# Load the original dataset to retrieve encoder & scaler info
original_data = df2

# 1️⃣ Reverse Label Encoding for "Country Name"
encoder = LabelEncoder()
encoder.fit(original_data["Country Name"])  # Fit on original data
encoded_data["Country Name"] = encoder.inverse_transform(encoded_data["Country Name"].astype(int))

# 2️⃣ Reverse Min-Max Scaling
scaler = MinMaxScaler()
numerical_cols = encoded_data.columns.difference(["Country Name"])  # Exclude categorical
scaler.fit(original_data[numerical_cols])  # Fit scaler on original numerical data
encoded_data[numerical_cols] = scaler.inverse_transform(encoded_data[numerical_cols])

# Save the decoded synthetic dataset
encoded_data.to_csv("decoded_synthetic_data.csv", index=False)
print("✅ Decoding successful! Decoded synthetic data saved.")

# Display first few rows
print(encoded_data.head())

✅ Decoding successful! Decoded synthetic data saved.
   Country Name  Total No. of Patients  Cardiovascular       Cancer  \
0        Mexico           25581.756428     2228.146506  1025.621410   
1    Bangladesh           36850.661533     3262.398532  2018.379894   
2         Japan           45307.963945     3798.394458  3609.557658   
3  South Africa           42236.257227     3757.995414  4019.425331   
4       Germany           24485.301636      730.328449   968.517095   

   Endocrinology and metabolic diseases  Neurological diseases  \
0                           1316.410097            3101.008773   
1                           2391.098744            1806.588229   
2                           4286.525908            1760.727797   
3                           4689.518753            2964.702209   
4                           1615.949014            1637.909327   

   Mental health and behavioral disorders  Infectious diseases  Dermatology  \
0                             2110.747951   