In [1]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import random

# Set the seed for reproducibility
np.random.seed(42)
random.seed(42)

# Parameters for the dataset
rows = 100
start_date = datetime(2024, 1, 1)
end_date = datetime(2024, 1, 31)

# Generate random dates and times (not evenly distributed, no midnight hours)
dates = [start_date + timedelta(days=random.randint(0, (end_date - start_date).days)) for _ in range(rows)]
times = [random.choice([
    f"{hour}:{minute:02d}"
    for hour in range(6, 22)  # 6 AM to 10 PM
    for minute in range(0, 60, random.choice([10, 15, 30, 45]))  # More varied intervals
]) for _ in range(rows)]

# Combine dates and times into a DataFrame
data = pd.DataFrame({"Date": dates, "Time": times})
data["Datetime"] = pd.to_datetime(data["Date"].astype(str) + " " + data["Time"])
data = data.sort_values(by="Datetime").reset_index(drop=True)

# Generate step counts based on time intervals
step_counts = []
previous_time = None

for current_time in data["Datetime"]:
    if previous_time:
        time_diff = (current_time - previous_time).seconds / 60  # Time difference in minutes

        # Assign realistic step counts based on time difference
        if time_diff <= 15:
            steps = random.randint(0, 200)  # Small steps for short intervals
        elif time_diff <= 30:
            steps = random.randint(50, 500)  # Moderate activity
        else:
            steps = random.randint(200, 1500)  # Longer duration, potentially high steps
    else:
        steps = random.randint(0, 100)  # Initial step count

    step_counts.append(steps)
    previous_time = current_time

data["Step_Counts"] = step_counts

# Save to CSV (optional)
data.to_csv("step_db.csv", index=False)

# Display the first few rows
print(data.head())

# Success message
print("INFO: Data generated successfully")

        Date   Time            Datetime  Step_Counts
0 2024-01-01   7:50 2024-01-01 07:50:00           83
1 2024-01-01  13:00 2024-01-01 13:00:00         1384
2 2024-01-01  17:00 2024-01-01 17:00:00         1382
3 2024-01-01  19:00 2024-01-01 19:00:00          666
4 2024-01-02  11:00 2024-01-02 11:00:00          305
INFO: Data generated successfully


### Heart Rate

In [2]:
# Set the seed for reproducibility
np.random.seed(42)
random.seed(42)

# Parameters for the dataset
rows = 100
start_date = datetime(2024, 1, 1)
end_date = datetime(2024, 1, 31)

# Generate random dates and times (not evenly distributed, no midnight hours)
dates = [start_date + timedelta(days=random.randint(0, (end_date - start_date).days)) for _ in range(rows)]
times = [random.choice([
    f"{hour}:{minute:02d}"
    for hour in range(6, 22)  # 6 AM to 10 PM
    for minute in range(0, 60, random.choice([10, 15, 30, 45]))  # Random intervals
]) for _ in range(rows)]

# Combine dates and times into a DataFrame
data = pd.DataFrame({"Date": dates, "Time": times})
data["Datetime"] = pd.to_datetime(data["Date"].astype(str) + " " + data["Time"])
data = data.sort_values(by="Datetime").reset_index(drop=True)

# Generate heart rate (HR) data based on realistic patterns
heart_rates = []
previous_time = None

for current_time in data["Datetime"]:
    if previous_time:
        time_diff = (current_time - previous_time).seconds / 60  # Time difference in minutes

        # Heart rate logic
        if time_diff <= 15:
            hr = random.randint(60, 80)  # Resting
        elif time_diff <= 30:
            hr = random.randint(80, 100)  # Light activity
        else:
            hr = random.randint(100, 140)  # Moderate activity
    else:
        hr = random.randint(60, 100)  # Initial HR (resting or light activity)

    heart_rates.append(hr)
    previous_time = current_time

data["Heart_Rate"] = heart_rates

# Drop the 'Datetime' column if unnecessary
data = data.drop(columns=["Datetime"])

# Save to CSV (optional)
data.to_csv("hr_db.csv", index=False)

# Display the first few rows
print(data.head())

# Success message
print("INFO: HR data generated successfully")

        Date   Time  Heart_Rate
0 2024-01-01   7:50          97
1 2024-01-01  13:00         136
2 2024-01-01  17:00         114
3 2024-01-01  19:00         103
4 2024-01-02  11:00         137
INFO: HR data generated successfully


### Sleep Data

In [3]:
# Set the seed for reproducibility
np.random.seed(42)
random.seed(42)

# Parameters for the dataset
rows = 100
start_date = datetime(2024, 1, 1)
end_date = datetime(2024, 1, 31)

# Generate random dates
dates = [start_date + timedelta(days=random.randint(0, (end_date - start_date).days)) for _ in range(rows)]

# Generate sleep and wake-up times
sleep_start_times = []
wake_up_times = []
total_sleep_durations = []

for date in dates:
    # Randomize sleep start time (between 8 PM to 1 AM)
    sleep_start_hour = random.randint(20, 24) % 24  # Ensure wrap-around for PM times
    sleep_start_minute = random.choice([0, 15, 30, 45])
    sleep_start_time = datetime.combine(date, datetime.min.time()) + timedelta(hours=sleep_start_hour, minutes=sleep_start_minute)

    # Randomize wake-up time (between 5 AM to 10 AM the next day)
    wake_up_hour = random.randint(5, 10)
    wake_up_minute = random.choice([0, 15, 30, 45])
    wake_up_time = datetime.combine(date + timedelta(days=1), datetime.min.time()) + timedelta(hours=wake_up_hour, minutes=wake_up_minute)

    # Calculate total sleep duration (in hours)
    total_sleep_duration = round((wake_up_time - sleep_start_time).total_seconds() / 3600, 2)

    sleep_start_times.append(sleep_start_time.time())
    wake_up_times.append(wake_up_time.time())
    total_sleep_durations.append(total_sleep_duration)

# Create the DataFrame
data = pd.DataFrame({
    "Date": dates,
    "Sleep_Start_Time": sleep_start_times,
    "Wake_Up_Time": wake_up_times,
    "Total_Sleep_Hours": total_sleep_durations
})

# Save to CSV (optional)
data.to_csv("sleep_db.csv", index=False)

# Display the first few rows
print(data.head())

# Success message
print("INFO: Sleep data generated successfully")

        Date Sleep_Start_Time Wake_Up_Time  Total_Sleep_Hours
0 2024-01-21         21:30:00     10:00:00              12.50
1 2024-01-04         00:15:00     09:15:00              33.00
2 2024-01-01         21:45:00     08:30:00              10.75
3 2024-01-24         00:15:00     10:30:00              34.25
4 2024-01-09         20:15:00     05:30:00               9.25
INFO: Sleep data generated successfully


### Blood Data

In [4]:
# Set the seed for reproducibility
np.random.seed(42)
random.seed(42)

# Parameters for the dataset
rows = 100
start_date = datetime(2024, 1, 1)
end_date = datetime(2024, 12, 31)

# Generate random dates for blood test dates
dates = [start_date + timedelta(days=random.randint(0, (end_date - start_date).days)) for _ in range(rows)]

# Generate random times for blood tests (between 7 AM and 5 PM)
times = [random.choice([f"{hour}:{minute:02d}" for hour in range(7, 18) for minute in range(0, 60, random.choice([15, 30]))]) for _ in range(rows)]

# Combine the date and time into a datetime column
data = pd.DataFrame({"Date": dates, "Time": times})
data["Datetime"] = pd.to_datetime(data["Date"].astype(str) + " " + data["Time"])

# Test Results (randomly generated within realistic ranges)
wbc_values = np.random.normal(6.0, 1.5, rows)  # WBC normal range: 4.5 to 11 (in 10^9/L)
rbc_values = np.random.normal(5.0, 0.5, rows)  # RBC normal range: 4.2 to 5.9 (in 10^12/L)
hemoglobin_values = np.random.normal(13.5, 1.5, rows)  # Hemoglobin normal range: 12 to 18 (g/dL)
platelet_values = np.random.normal(250, 50, rows)  # Platelets normal range: 150-450 (10^9/L)
glucose_values = np.random.normal(90, 15, rows)  # Glucose normal range: 70-100 (mg/dL)
cholesterol_values = np.random.normal(200, 30, rows)  # Cholesterol normal range: 140-220 (mg/dL)
creatinine_values = np.random.normal(1.0, 0.2, rows)  # Creatinine normal range: 0.6-1.2 (mg/dL)

# Assigning realistic test types
test_types = random.choices(["CBC", "Lipid Panel", "Glucose Test", "Kidney Function Test", "Complete Blood Test"], k=rows)

# Patient ID (randomly generated integers)
patient_ids = [f"P{random.randint(1000, 9999)}" for _ in range(rows)]

# Combine the test results into the dataframe
data["WBC"] = np.clip(wbc_values, 4.5, 11)  # Clipping to normal range
data["RBC"] = np.clip(rbc_values, 4.2, 5.9)
data["Hemoglobin"] = np.clip(hemoglobin_values, 12, 18)
data["Platelets"] = np.clip(platelet_values, 150, 450)
data["Glucose"] = np.clip(glucose_values, 70, 100)
data["Cholesterol"] = np.clip(cholesterol_values, 140, 220)
data["Creatinine"] = np.clip(creatinine_values, 0.6, 1.2)
data["Test_Type"] = test_types
#data["Patient_ID"] = patient_ids

# Drop the 'Datetime' column if unnecessary
data = data.drop(columns=["Datetime"])

# Save to CSV (optional)
data.to_csv("blood_test_db.csv", index=False)

# Display the first few rows
print(data.head())

# Success message
print("INFO: Blood test data generated successfully")

        Date   Time       WBC       RBC  Hemoglobin   Platelets    Glucose  \
0 2024-11-23  14:45  6.745071  4.292315   14.036681  208.550249  70.000000   
1 2024-02-27  17:45  5.792604  4.789677   14.341177  221.990948  81.009375   
2 2024-01-13   7:00  6.971533  4.828643   15.124577  287.364680  90.078655   
3 2024-05-20   9:00  8.284545  4.598861   15.080703  280.518513  90.704709   
4 2024-05-05  12:45  5.648770  4.919357   12.000000  248.954920  83.249018   

   Cholesterol  Creatinine             Test_Type  
0   220.000000    1.151398  Kidney Function Test  
1   220.000000    0.815567          Glucose Test  
2   158.042973    1.173921          Glucose Test  
3   216.889077    1.200000          Glucose Test  
4   180.480723    1.082687  Kidney Function Test  
INFO: Blood test data generated successfully


### Symptoms/mood Data

In [5]:
# Set the seed for reproducibility
np.random.seed(42)
random.seed(42)

# Parameters for the dataset
rows = 100
start_date = datetime(2024, 1, 1)
end_date = datetime(2024, 12, 31)

# Generate random dates for symptom/mood records
dates = [start_date + timedelta(days=random.randint(0, (end_date - start_date).days)) for _ in range(rows)]

# Generate random times (between 7 AM and 10 PM)
times = [random.choice([f"{hour}:{minute:02d}" for hour in range(7, 23) for minute in range(0, 60, random.choice([15, 30]))]) for _ in range(rows)]

# Combine the date and time into a datetime column
data = pd.DataFrame({"Date": dates, "Time": times})
data["Datetime"] = pd.to_datetime(data["Date"].astype(str) + " " + data["Time"])

# Possible symptoms
symptoms_list = ["Headache", "Fatigue", "Anxiety", "Nausea", "Dizziness", "Muscle pain", "Stress", "Insomnia"]
moods_list = ["Very Poor", "Poor", "Neutral", "Good", "Very Good"]

# Generate mood ratings (1–10)
mood_ratings = [random.randint(1, 10) for _ in range(rows)]

# Assign random symptoms and severity levels
symptoms = [random.choice(symptoms_list) for _ in range(rows)]
severity = [random.randint(1, 5) for _ in range(rows)]  # Severity 1-5 (mild to severe)

# Energy levels (1–10, reflecting energy or fatigue)
energy_levels = [random.randint(1, 10) for _ in range(rows)]

# Combine the symptom and mood data into the dataframe
data["Mood"] = [moods_list[random.randint(0, 4)] for _ in range(rows)]
data["Symptom"] = symptoms


# Drop the 'Datetime' column if unnecessary
data = data.drop(columns=["Datetime"])

# Save to CSV (optional)
data.to_csv("symptoms_db.csv", index=False)

# Display the first few rows
print(data.head())

# Success message
print("INFO: Symptoms and mood data generated successfully")

        Date   Time       Mood    Symptom
0 2024-11-23  11:30  Very Good    Anxiety
1 2024-02-27  19:00       Good  Dizziness
2 2024-01-13  21:00       Poor   Headache
3 2024-05-20  21:00  Very Good   Headache
4 2024-05-05  11:30  Very Poor  Dizziness
INFO: Symptoms and mood data generated successfully


### Cigarettes Data

In [6]:
# Set the seed for reproducibility
np.random.seed(42)

# Parameters for the dataset
rows = 100
start_date = datetime(2024, 1, 1)
end_date = datetime(2024, 12, 31)

# Generate random dates for cigarette consumption records
dates = [start_date + timedelta(days=random.randint(0, (end_date - start_date).days)) for _ in range(rows)]

# Number of cigarettes smoked (between 1 to 20 per day)
num_cigarettes = [random.randint(1, 20) for _ in range(rows)]

# Create the dataset
data = pd.DataFrame({
    "Date": dates,
    "Number of Cigarettes": num_cigarettes
})

# Save to CSV (optional)
data.to_csv("cigarette_db.csv", index=False)

# Display the first few rows
print(data.head())

# Success message
print("INFO: Cigarette consumption data generated successfully")

        Date  Number of Cigarettes
0 2024-10-15                     5
1 2024-05-03                     3
2 2024-10-17                     6
3 2024-07-18                    14
4 2024-12-12                     9
INFO: Cigarette consumption data generated successfully


### Calories Data

In [7]:
# Set the seed for reproducibility
np.random.seed(42)

# Parameters for the dataset
rows = 100
start_date = datetime(2023, 1, 1)
end_date = datetime(2023, 12, 31)

# Generate random dates for calorie consumption records
dates = [start_date + timedelta(days=random.randint(0, (end_date - start_date).days)) for _ in range(rows)]

# Meal types (Breakfast, Lunch, Dinner, Snack)
meal_types = ['Breakfast', 'Lunch', 'Dinner', 'Snack']
meal_type = [random.choice(meal_types) for _ in range(rows)]

# Calories consumed (between 200 and 1200)
calories_consumed = [random.randint(200, 1200) for _ in range(rows)]

# Exercise activity (Yes/No)
exercise = [random.choice(['Yes', 'No']) for _ in range(rows)]

# Calories burned through exercise (between 100 and 800 if exercise is "Yes")
calories_burned = [random.randint(100, 800) if ex == 'Yes' else 0 for ex in exercise]

# Create the dataset
data = pd.DataFrame({
    "Date": dates,
    "Meal Type": meal_type,
    "Calories Consumed": calories_consumed,
    "Exercise": exercise,
    "Calories Burned": calories_burned
})

# Save to CSV (optional)
data.to_csv("calorie_db.csv", index=False)

# Display the first few rows
print(data.head())

# Success message
print("INFO: Calorie consumption data generated successfully")

        Date  Meal Type  Calories Consumed Exercise  Calories Burned
0 2023-04-24      Snack                928      Yes              533
1 2023-09-19  Breakfast                529       No                0
2 2023-09-23     Dinner               1048      Yes              511
3 2023-10-13     Dinner                453       No                0
4 2023-10-25      Snack                979       No                0
INFO: Calorie consumption data generated successfully
