In [1]:
import os
import pathlib

import numpy as np
import pandas as pd

In [4]:
# Build a synthetic, UK-Biobank-style cohort table.
# Every column is sampled independently from a simple distribution, so there is no
# correlation between variables. The random calls below are kept in a fixed order so the
# seeded stream (and therefore the generated values) stays reproducible.

# Seed the legacy global NumPy RNG for reproducibility.
np.random.seed(42)

# Number of observations (rows).
n = 250000

# Bounds of the simulated date range, expressed as whole days since the Unix epoch.
# They are used as randint(low, high), so the upper bound (2020-12-31) is exclusive.
first_day = int(np.datetime64('2000-01-01').astype('int64'))
last_day = int(np.datetime64('2020-12-31').astype('int64'))

# Demographics, risk factors and measurements.
data = {
    'Age': np.random.randint(25, 80, size=n),  # Integer ages 25-79 (upper bound 80 is exclusive)
    'Sex': np.random.choice(['Male', 'Female'], size=n, p=[0.48, 0.52]),  # 48% male / 52% female
    'Diabetes diagnosed by a doctor': np.random.binomial(1, 0.1, size=n),  # 0/1 flag, 10% prevalence
    'Blood pressure medication': np.random.binomial(1, 0.2, size=n),  # 0/1 flag, 20% prevalence
    'Smoking status': np.random.choice(['Never', 'Former', 'Current'], size=n, p=[0.6, 0.3, 0.1]),  # 60/30/10 split
    'Total cholesterol': np.random.normal(200, 30, size=n),  # Normal(mean=200, std=30)
    'HDL cholesterol': np.random.normal(50, 10, size=n),  # Normal(mean=50, std=10)
    'LDL cholesterol': np.random.normal(100, 20, size=n),  # Normal(mean=100, std=20)
    'Triglycerides': np.random.normal(150, 50, size=n),  # Normal(mean=150, std=50)
    'Systolic blood pressure': np.random.normal(120, 15, size=n),  # Normal(mean=120, std=15)
    'Diastolic blood pressure': np.random.normal(80, 10, size=n),  # Normal(mean=80, std=10)
    'Standing height': np.random.normal(170, 10, size=n),  # Normal(mean=170, std=10)
    'Weight': np.random.normal(70, 15, size=n),  # Normal(mean=70, std=15)
}

# Assessment date: a uniformly random day in [2000-01-01, 2020-12-31).
data['Date of attending assessment center'] = pd.to_datetime(
    np.random.randint(first_day, last_day, size=n), unit='D'
)

# Date of death: sample with replacement from a pool in which 5% of the entries are real
# dates and 95% are NaT (alive), giving roughly 5% mortality.
# The pool is a genuine datetime64 array, so "alive" is an actual NaT rather than an integer
# sentinel whose value depends on the platform's default integer width.
# NOTE(review): death dates are drawn independently of the assessment date, so a simulated
# death can precede the assessment; downstream analyses should not assume otherwise.
deceased_dates = (
    np.random.randint(first_day, last_day, size=int(n * 0.05))  # Assuming 5% mortality rate
    .astype('int64')
    .astype('datetime64[D]')
)
alive_placeholders = np.full(int(n * 0.95), np.datetime64('NaT'), dtype='datetime64[D]')
death_date_pool = np.concatenate([deceased_dates, alive_placeholders])
data['Date of death'] = pd.to_datetime(
    np.random.choice(death_date_pool, size=n).astype('datetime64[ns]')
)

# Cardiovascular outcome flags (0/1), each an independent Bernoulli draw.
data.update({
    'Myocardial infarction (ICD9)': np.random.binomial(1, 0.02, size=n),  # 2% prevalence
    'Myocardial infarction (ICD10)': np.random.binomial(1, 0.02, size=n),  # 2% prevalence
    'Ischemic stroke (ICD9)': np.random.binomial(1, 0.015, size=n),  # 1.5% prevalence
    'Ischemic stroke (ICD10)': np.random.binomial(1, 0.015, size=n),  # 1.5% prevalence
    'Self Reported Myocardial Infarction': np.random.binomial(1, 0.03, size=n),  # 3% prevalence
    'Self Reported Ischemic Stroke': np.random.binomial(1, 0.02, size=n),  # 2% prevalence
})

# Create DataFrame
df = pd.DataFrame(data)

# Preview the first rows (bare expression so the notebook renders the rich table).
df.head()


   Age     Sex  Diabetes diagnosed by a doctor  Blood pressure medication  \
0   78  Female                               0                          0   
1   68  Female                               0                          0   
2   54    Male                               0                          0   
3   47  Female                               0                          1   
4   60    Male                               1                          0   

  Smoking status  Total cholesterol  HDL cholesterol  LDL cholesterol  \
0          Never         236.446135        53.876010       112.781351   
1          Never         177.072596        56.079951       103.387032   
2          Never         210.375481        30.700744       125.180826   
3         Former         210.120334        48.690967       144.352386   
4          Never         176.637349        45.456382        53.037083   

   Triglycerides  Systolic blood pressure  ...  Standing height     Weight  \
0     168.200203    

In [6]:
# Persist the synthetic cohort as CSV, omitting the row index.
# The destination defaults to the original author's location; set the UKKB_CSV_PATH
# environment variable to redirect the export so the notebook can run on other machines.
DEFAULT_UKKB_CSV = r'C:\Users\giorg\projects_local\GenAiSleep\Q&A-and-RAG-with-SQL-and-TabularData\data\csv_xlsx\ukkb.csv'
ukkb_csv_path = pathlib.Path(os.environ.get('UKKB_CSV_PATH', DEFAULT_UKKB_CSV))
df.to_csv(ukkb_csv_path, index=False)