In [4]:
import pandas as pd
import random
!pip install faker
from faker import Faker

# Generate a synthetic patient roster (name, age, gender, blood type,
# medical condition, admission date) and write it to 'patient_data.csv'.

# Initialize Faker for generating fake names and dates
fake = Faker()

# Seed BOTH random sources so that re-running this cell reproduces the same
# rows: `random` drives the numeric/categorical columns, Faker drives names
# and admission-date offsets.
# NOTE: admission dates are drawn relative to "today", so the absolute dates
# still depend on the day the cell is executed.
random.seed(42)
Faker.seed(42)

# Possible values for each column
genders = ["Male", "Female"]
blood_types = ["A+", "A-", "B+", "B-", "O+", "O-", "AB+", "AB-"]
medical_conditions = ["Hypertension", "Diabetes", "Asthma", "COPD", "Heart Disease", 
                      "Kidney Disease", "Liver Disease", "Anemia", "Cancer", "Healthy"]

# Generate dataset: every column is sampled independently of the others,
# so there is no correlation between e.g. age and medical condition.
num_samples = 300
data = {
    "Name": [fake.name() for _ in range(num_samples)],
    # random.randint includes both endpoints: ages are 18..90
    "Age": [random.randint(18, 90) for _ in range(num_samples)],
    "Gender": [random.choice(genders) for _ in range(num_samples)],
    "Blood Type": [random.choice(blood_types) for _ in range(num_samples)],
    "Medical Condition": [random.choice(medical_conditions) for _ in range(num_samples)],
    # Admission date somewhere in the two years leading up to today
    "Date of Admission": [fake.date_between(start_date="-2y", end_date="today") for _ in range(num_samples)],
}

# Create DataFrame and save as CSV (index=False keeps the row index out of the file)
df = pd.DataFrame(data)
df.to_csv("patient_data.csv", index=False)

print("Dataset saved as 'patient_data.csv'")

Collecting faker
  Downloading Faker-36.1.1-py3-none-any.whl.metadata (15 kB)
Downloading Faker-36.1.1-py3-none-any.whl (1.9 MB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 1.9/1.9 MB 294.5 kB/s eta 0:00:00
Installing collected packages: faker
Successfully installed faker-36.1.1
Dataset saved as 'patient_data.csv'


In [1]:
import pandas as pd
import numpy as np

# Build a 300-row synthetic dataset for a pulmonary hypertension (PH)
# classification exercise and write it to 'synthetic_ph_dataset_300.csv'.

# Set random seed for reproducibility (seeds NumPy's legacy global RNG)
np.random.seed(42)

# Number of samples
n_samples = 300

# Generate synthetic data. Every feature is drawn independently of the others.
# NOTE: np.random.randint excludes its upper bound.
age = np.random.randint(18, 90, n_samples)  # Integer ages 18..89
sex = np.random.choice([0, 1], n_samples)  # 0 = female, 1 = male
bmi = np.random.normal(25, 5, n_samples)  # BMI with mean 25 and std deviation 5
systolic_bp = np.random.randint(90, 180, n_samples)  # Integer systolic BP 90..179
diabetes = np.random.choice([0, 1], n_samples)  # 0 = no, 1 = yes
heart_disease = np.random.choice([0, 1], n_samples)  # 0 = no, 1 = yes
copd = np.random.choice([0, 1], n_samples)  # 0 = no, 1 = yes
# Oxygen saturation around 95% with some noise. A normal draw has no upper
# limit, so cap it at 100: a saturation above 100% is physically impossible.
# The cap is applied after sampling, so the random stream is unchanged.
oxygen_saturation = np.minimum(np.random.normal(95, 2, n_samples), 100.0)

# Target variable: Pulmonary Hypertension diagnosis (binary: 0 = no, 1 = yes)
# Baseline draw: 30% chance of PH before the overrides below are applied.
ph_diagnosis = np.random.choice([0, 1], n_samples, p=[0.7, 0.3])

# Overrides tying the label to the features. These are hard rules, not
# probabilities: every matching row is forced to PH = 1.
# NOTE(review): because heart disease alone covers about half of the rows, the
# final PH prevalence ends up far above the 30% baseline - confirm that this
# class balance is intended.
ph_diagnosis[age > 65] = 1  # All patients older than 65 are labelled PH
ph_diagnosis[oxygen_saturation < 90] = 1  # Low oxygen saturation linked to PH
ph_diagnosis[heart_disease == 1] = 1  # Heart disease can be a comorbidity

# Create a DataFrame
data = pd.DataFrame({
    'Age': age,
    'Sex': sex,
    'BMI': bmi,
    'Systolic BP': systolic_bp,
    'Diabetes': diabetes,
    'Heart Disease': heart_disease,
    'COPD': copd,
    'Oxygen Saturation': oxygen_saturation,
    'PH Diagnosis': ph_diagnosis
})

# Save the dataset as a CSV file (index=False keeps the row index out of the file)
data.to_csv('synthetic_ph_dataset_300.csv', index=False)

print("Dataset with 300 rows saved as 'synthetic_ph_dataset_300.csv'")


Dataset with 300 rows saved as 'synthetic_ph_dataset_300.csv'
