In [37]:
from sdv.single_table import CTGANSynthesizer
from sdv.metadata import SingleTableMetadata
import pandas as pd

In [39]:
# Tables read from the EDA folder.
slp_score_data, sleep_data, wellness_data = map(pd.read_csv, [
    "../EDA/slp_score_data.csv",
    "../EDA/sleep_data.csv",
    "../EDA/wellness_data.csv",
])

# Processed sleep tables read from the same folder.
processed_summary, processed_data, processed_shortdata = map(pd.read_csv, [
    "../EDA/sleep_processed_summary.csv",
    "../EDA/sleep_processed_data.csv",
    "../EDA/sleep_processed_shortdata.csv",
])

# Tables read from the LLM-testing folder.
sleepDay, dailyActivity = map(pd.read_csv, [
    "../LLM-testing/sleepDay_merged.csv",
    "../LLM-testing/dailyActivity_merged.csv",
])

In [41]:
def _infer_participant_id_format(labels):
    """
    Build a ``str.format`` template that mimics the existing participant labels.

    Only the first non-null label is inspected. A string made of a non-empty
    prefix followed by ASCII digits (e.g. 'p01') yields a template that keeps
    the prefix and zero-pads to the same width ('p{:02d}'). Anything else
    (numeric labels, digit-only strings, no labels at all) yields '{}', i.e.
    the plain decimal number.
    """
    existing = labels.dropna()
    if existing.empty:
        return "{}"

    first = existing.iloc[0]
    if not isinstance(first, str):
        return "{}"

    prefix = first.rstrip("0123456789")
    digits = first[len(prefix):]
    if not prefix or not digits:
        return "{}"

    # Escape braces so a literal '{' or '}' in the prefix is not parsed as a format field.
    safe_prefix = prefix.replace("{", "{{").replace("}", "}}")
    return f"{safe_prefix}{{:0{len(digits)}d}}"


def synthesiser(data, id_col, participant_col, start_participant_num, num_new_participants,
                participant_id_format=None):
    """
    Generate synthetic data for new participants using CTGAN.

    A CTGAN model is fitted on ``data``; enough rows are sampled to give every
    new participant the average number of rows per original participant
    (``len(data) // number of distinct participants``). The sampled rows are
    split into consecutive, non-overlapping chunks, one per new participant,
    so each generated row is used exactly once.

    Parameters:
    - data (pd.DataFrame): Original dataset. NOTE: if `id_col` is missing it is
      added to this frame IN PLACE (values 0..len-1); callers that concatenate
      `data` with the result rely on that column being present.
    - id_col (str): Column name to be treated as unique row ID (e.g., 'row_id').
    - participant_col (str): Column name representing participant IDs.
    - start_participant_num (int): Number to start naming new participants from (e.g., 17 for 'p17').
    - num_new_participants (int): How many new participants to synthesize.
    - participant_id_format (str, optional): ``str.format`` template applied to each
      participant number (e.g. 'p{:02d}'). When omitted, the template is inferred
      from the existing labels: 'p01'-style labels give 'p17', 'p18', ...;
      numeric labels give the plain strings '17', '18', ...

    Returns:
    - pd.DataFrame: Synthetic data for the new participants. `participant_col`
      holds the new labels and `id_col` is renumbered 1..N. These IDs are unique
      within the returned frame only and may collide with IDs already in `data`.

    Raises:
    - ValueError: if `num_new_participants` is negative or `data` contains no
      participant labels.
    - RuntimeError: if the synthesizer returns a different number of rows than requested.
    """
    if num_new_participants < 0:
        raise ValueError("num_new_participants must be non-negative")

    # Validate before the expensive fit: with no participants the per-participant
    # row count below would be a division by zero.
    original_participant_count = data[participant_col].nunique()
    if original_participant_count == 0:
        raise ValueError(f"column {participant_col!r} contains no participant labels")
    rows_per_participant = len(data) // original_participant_count

    if participant_id_format is None:
        participant_id_format = _infer_participant_id_format(data[participant_col])

    # Step 0: Assign unique row ID if not already (mutates the caller's frame, see docstring)
    if id_col not in data.columns:
        data[id_col] = range(len(data))

    # Step 1: Setup metadata
    metadata = SingleTableMetadata()
    metadata.detect_from_dataframe(data)
    metadata.update_column(column_name=id_col, sdtype='id')
    metadata.set_primary_key(column_name=id_col)
    metadata.update_column(column_name=participant_col, sdtype='categorical')

    # Step 2: Fit the synthesizer
    synthesizer = CTGANSynthesizer(metadata)
    synthesizer.fit(data)

    # Step 3: Sample exactly the rows needed for all new participants
    synthetic_sample_size = rows_per_participant * num_new_participants
    new_participant_data = synthesizer.sample(synthetic_sample_size).reset_index(drop=True)
    if len(new_participant_data) != synthetic_sample_size:
        raise RuntimeError(
            f"requested {synthetic_sample_size} synthetic rows, got {len(new_participant_data)}"
        )

    # Step 4: Generate new participant IDs
    new_ids = [
        participant_id_format.format(number)
        for number in range(start_participant_num, start_participant_num + num_new_participants)
    ]

    # Step 5: Give each new participant one consecutive chunk of the sampled rows.
    # One column assignment replaces the per-participant resample-and-concat loop,
    # which duplicated some rows, dropped others, and grew the frame quadratically.
    new_participant_data[participant_col] = [
        new_id for new_id in new_ids for _ in range(rows_per_participant)
    ]

    # Step 6: Reassign row IDs
    new_participant_data[id_col] = range(1, len(new_participant_data) + 1)

    return new_participant_data

In [43]:
# Summary statistics of the sleepDay table as loaded, for reference before it is synthesised.
sleepDay.describe()

Unnamed: 0,Id,TotalSleepRecords,TotalMinutesAsleep,TotalTimeInBed
count,413.0,413.0,413.0,413.0
mean,5000979000.0,1.118644,419.467312,458.639225
std,2060360000.0,0.345521,118.344679,127.101607
min,1503960000.0,1.0,58.0,61.0
25%,3977334000.0,1.0,361.0,403.0
50%,4702922000.0,1.0,433.0,463.0
75%,6962181000.0,1.0,490.0,526.0
max,8792010000.0,3.0,796.0,961.0


In [45]:
# Synthesise 66 additional participants (numbered from 34) from the sleepDay
# table and append them to the original rows.
# NOTE(review): synthesiser writes the new participant labels as strings,
# while the original Id values are numeric, so the combined Id column has
# mixed types - confirm downstream code expects this.
synthetic_sleepDay = synthesiser(
    data=sleepDay,
    id_col='row_id',
    participant_col='Id',
    start_participant_num=34,
    num_new_participants=66
)
sleepDay_final = pd.concat([sleepDay, synthetic_sleepDay], ignore_index=True)
sleepDay_final.to_csv("GAN_generated_data/KaggleDataset/sleepDay_synthesised.csv", index=False)

# Keep describe() as the final expression: Jupyter only renders the value of a
# cell's last expression, so calling it earlier in the cell displays nothing.
sleepDay_final.describe()



In [47]:
# Summary statistics of the dailyActivity table as loaded, for reference before it is synthesised.
dailyActivity.describe()

Unnamed: 0,Id,TotalSteps,TotalDistance,TrackerDistance,LoggedActivitiesDistance,VeryActiveDistance,ModeratelyActiveDistance,LightActiveDistance,SedentaryActiveDistance,VeryActiveMinutes,FairlyActiveMinutes,LightlyActiveMinutes,SedentaryMinutes,Calories
count,940.0,940.0,940.0,940.0,940.0,940.0,940.0,940.0,940.0,940.0,940.0,940.0,940.0,940.0
mean,4855407000.0,7637.910638,5.489702,5.475351,0.108171,1.502681,0.567543,3.340819,0.001606,21.164894,13.564894,192.812766,991.210638,2303.609574
std,2424805000.0,5087.150742,3.924606,3.907276,0.619897,2.658941,0.88358,2.040655,0.007346,32.844803,19.987404,109.1747,301.267437,718.166862
min,1503960000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,2320127000.0,3789.75,2.62,2.62,0.0,0.0,0.0,1.945,0.0,0.0,0.0,127.0,729.75,1828.5
50%,4445115000.0,7405.5,5.245,5.245,0.0,0.21,0.24,3.365,0.0,4.0,6.0,199.0,1057.5,2134.0
75%,6962181000.0,10727.0,7.7125,7.71,0.0,2.0525,0.8,4.7825,0.0,32.0,19.0,264.0,1229.5,2793.25
max,8877689000.0,36019.0,28.030001,28.030001,4.942142,21.92,6.48,10.71,0.11,210.0,143.0,518.0,1440.0,4900.0


In [49]:
# Synthesise 66 additional participants (numbered from 34) from the
# dailyActivity table and append them to the original rows.
# NOTE(review): synthesiser writes the new participant labels as strings,
# while the original Id values are numeric, so the combined Id column has
# mixed types - confirm downstream code expects this.
synthetic_dailyActivity = synthesiser(
    data=dailyActivity,
    id_col='row_id',
    participant_col='Id',
    start_participant_num=34,
    num_new_participants=66
)
dailyActivity_final = pd.concat([dailyActivity, synthetic_dailyActivity], ignore_index=True)
dailyActivity_final.to_csv("GAN_generated_data/KaggleDataset/dailyActivity_synthesised.csv", index=False)

# Keep describe() as the final expression: Jupyter only renders the value of a
# cell's last expression, so calling it earlier in the cell displays nothing.
dailyActivity_final.describe()



In [5]:
# Synthesise 84 extra participants (numbered from 17) from the wellness table,
# stack them under the original rows, and show the combined summary.
synthetic_wellness = synthesiser(
    wellness_data,
    id_col='row_id', participant_col='participant_id',
    start_participant_num=17, num_new_participants=84,
)
wellness_final = pd.concat((wellness_data, synthetic_wellness), ignore_index=True)
wellness_final.describe()



Unnamed: 0,fatigue,mood,readiness,sleep_duration_h,sleep_quality,soreness,stress,row_id
count,10903.0,10903.0,10903.0,10903.0,10903.0,10903.0,10903.0,10903.0
mean,2.55581,3.110337,4.675044,6.812804,3.043658,2.723746,2.649454,3984.763551
std,0.970288,1.009325,2.260581,1.332907,0.984992,0.888269,1.012589,2784.902112
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,2.0,3.0,3.0,6.0,2.0,2.0,2.0,1363.0
50%,3.0,3.0,5.0,7.0,3.0,3.0,3.0,3705.0
75%,3.0,4.0,6.0,7.0,4.0,3.0,3.0,6430.5
max,5.0,5.0,10.0,12.0,5.0,5.0,5.0,9156.0


In [6]:
# Save the combined wellness table (original + synthetic rows) without the DataFrame index.
wellness_final.to_csv("GAN_generated_data/wellness_synthesised_100_participants.csv", index=False)

In [7]:
# Synthesise 84 extra participants (numbered from 17) from the sleep-score table,
# using its existing 'sleep_log_entry_id' column as the row ID, then stack them
# under the original rows and show the combined summary.
synthetic_slp_score = synthesiser(
    slp_score_data,
    id_col='sleep_log_entry_id', participant_col='participant_id',
    start_participant_num=17, num_new_participants=84,
)
slp_score_final = pd.concat((slp_score_data, synthetic_slp_score), ignore_index=True)
slp_score_final.describe()



Unnamed: 0,sleep_log_entry_id,overall_score,composition_score,revitalization_score,duration_score,deep_sleep_in_minutes,resting_heart_rate,restlessness
count,11412.0,11412.0,11412.0,11412.0,11412.0,11412.0,11412.0,11412.0
mean,4098343000.0,71.449264,19.561251,18.990887,37.514108,66.824658,57.343761,0.086773
std,9363037000.0,9.38168,2.40308,3.271264,5.675605,31.710996,7.818489,0.043431
min,1.0,35.0,12.0,5.0,3.0,0.0,44.0,0.015385
25%,2853.75,66.0,17.0,18.0,34.0,45.0,51.0,0.05572
50%,5706.5,72.0,20.0,19.0,39.0,66.0,59.0,0.079653
75%,8559.25,78.0,21.0,21.0,41.0,87.0,63.0,0.109593
max,26543160000.0,94.0,25.0,25.0,47.0,183.0,76.0,0.294766


In [8]:
# Save the combined sleep-score table (original + synthetic rows) without the DataFrame index.
slp_score_final.to_csv("GAN_generated_data/sleep_score_synthesised_100_participants.csv", index=False)

In [9]:
# Remove the 'levels' column before synthesis. errors="ignore" keeps this cell
# re-runnable: sleep_data is reassigned here, so on a second run the column is
# already gone and a plain drop would raise KeyError.
sleep_data = sleep_data.drop(columns=["levels"], errors="ignore")

# Synthesise 84 extra participants (numbered from 17), stack them under the
# original rows, and show the combined summary.
synthetic_overall_sleep = synthesiser(
    data=sleep_data,
    id_col='row_id',
    participant_col='participant_id',
    start_participant_num=17,
    num_new_participants=84
)
overall_sleep_final = pd.concat([sleep_data, synthetic_overall_sleep], ignore_index=True)
overall_sleep_final.describe()



Unnamed: 0,logId,duration,minutesToFallAsleep,minutesAsleep,minutesAwake,minutesAfterWakeup,timeInBed,efficiency,infoCode,row_id
count,12900.0,12900.0,12900.0,12900.0,12900.0,12900.0,12900.0,12900.0,12900.0,12900.0
mean,25568490000.0,340.549535,0.864961,343.828295,38.307519,0.504031,350.353488,95.150465,0.45938,4716.58
std,592509900.0,165.347198,4.289623,156.114124,28.195574,1.50583,165.68929,3.926894,0.817274,3295.99718
min,24472350000.0,60.0,0.0,39.0,0.0,0.0,60.0,0.0,0.0,0.0
25%,25082310000.0,219.0,0.0,265.75,15.0,0.0,253.0,93.0,0.0,1612.75
50%,25545350000.0,360.0,0.0,384.0,38.0,0.0,385.0,96.0,0.0,4386.5
75%,26083900000.0,457.0,0.0,453.0,57.0,0.0,464.0,98.0,0.0,7611.25
max,26554190000.0,953.0,35.0,823.0,170.0,33.0,953.0,100.0,2.0,10836.0


In [10]:
# Save the combined sleep table (original + synthetic rows) without the DataFrame index.
overall_sleep_final.to_csv("GAN_generated_data/overall_sleep_synthesised_100_participants.csv", index=False)

# initial evaluations

In [25]:
# Re-read the original tables from the EDA folder.
slp_score_data, sleep_data, wellness_data = map(pd.read_csv, [
    "../EDA/slp_score_data.csv",
    "../EDA/sleep_data.csv",
    "../EDA/wellness_data.csv",
])

# Read back the combined (original + synthetic) tables written by the cells above.
wellness_synthesised, slp_score_synthesised, sleep_synthesised = map(pd.read_csv, [
    "GAN_generated_data/wellness_synthesised_100_participants.csv",
    "GAN_generated_data/sleep_score_synthesised_100_participants.csv",
    "GAN_generated_data/overall_sleep_synthesised_100_participants.csv",
])

In [49]:
# One row per table (the three originals, then the three synthesised files),
# pairing each table's name with its (rows, columns) shape.
shapes_df = pd.DataFrame(
    [
        (label, frame.shape)
        for label, frame in (
            ("slp_score_data", slp_score_data),
            ("sleep_data", sleep_data),
            ("wellness_data", wellness_data),
            ("slp_score_synthesised", slp_score_synthesised),
            ("sleep_synthesised", sleep_synthesised),
            ("wellness_synthesised", wellness_synthesised),
        )
    ],
    columns=["Dataset", "Shape"],
)

print(shapes_df)

                 Dataset        Shape
0         slp_score_data   (1836, 11)
1             sleep_data   (2064, 17)
2          wellness_data   (1747, 11)
3  slp_score_synthesised  (11412, 11)
4      sleep_synthesised  (12900, 17)
5   wellness_synthesised  (10903, 12)


In [27]:
# Summary statistics of the original sleep-score table (baseline for the comparison below).
slp_score_data.describe()

Unnamed: 0,sleep_log_entry_id,overall_score,composition_score,revitalization_score,duration_score,deep_sleep_in_minutes,resting_heart_rate,restlessness
count,1836.0,1836.0,1836.0,1836.0,1836.0,1836.0,1836.0,1836.0
mean,25473990000.0,76.460784,19.247277,18.960784,38.252723,73.25817,58.583878,0.090687
std,580274000.0,7.795554,2.390916,3.316146,5.146081,27.728065,7.08962,0.037511
min,24472390000.0,35.0,12.0,5.0,3.0,0.0,44.0,0.015385
25%,24969050000.0,72.0,17.0,17.0,35.0,55.0,53.0,0.062388
50%,25432120000.0,77.0,19.5,20.0,39.0,72.5,59.0,0.084501
75%,25963530000.0,82.0,21.0,21.0,42.0,91.0,65.0,0.110432
max,26543160000.0,94.0,25.0,25.0,47.0,183.0,76.0,0.294766


In [47]:
# Summary statistics of the sleep-score table read back from the synthesised CSV.
slp_score_synthesised.describe()

Unnamed: 0,sleep_log_entry_id,overall_score,composition_score,revitalization_score,duration_score,deep_sleep_in_minutes,resting_heart_rate,restlessness
count,11412.0,11412.0,11412.0,11412.0,11412.0,11412.0,11412.0,11412.0
mean,4098343000.0,71.449264,19.561251,18.990887,37.514108,66.824658,57.343761,0.086773
std,9363037000.0,9.38168,2.40308,3.271264,5.675605,31.710996,7.818489,0.043431
min,1.0,35.0,12.0,5.0,3.0,0.0,44.0,0.015385
25%,2853.75,66.0,17.0,18.0,34.0,45.0,51.0,0.05572
50%,5706.5,72.0,20.0,19.0,39.0,66.0,59.0,0.079653
75%,8559.25,78.0,21.0,21.0,41.0,87.0,63.0,0.109593
max,26543160000.0,94.0,25.0,25.0,47.0,183.0,76.0,0.294766


In [53]:
# Cumulative participant ranges: each label maps to participant numbers 1..N.
ranges = {
    f"p01–p{last}": range(1, last + 1)
    for last in (16, 20, 40, 60, 80, 100)
}

# describe() of 'overall_score' for each cumulative range, one column per range.
# Participant labels are 'p' + the number zero-padded to two digits.
summary = pd.DataFrame({
    label: slp_score_synthesised.loc[
        slp_score_synthesised['participant_id'].isin([f'p{i:02d}' for i in rng]),
        'overall_score',
    ].describe()
    for label, rng in ranges.items()
})

summary

Unnamed: 0,p01–p16,p01–p20,p01–p40,p01–p60,p01–p80,p01–p100
count,1836.0,2292.0,4572.0,6852.0,9132.0,11412.0
mean,76.460784,75.261344,73.027122,72.14711,71.698533,71.449264
std,7.795554,8.450602,9.119334,9.311064,9.346569,9.38168
min,35.0,35.0,35.0,35.0,35.0,35.0
25%,72.0,70.0,68.0,67.0,66.0,66.0
50%,77.0,76.0,74.0,73.0,72.0,72.0
75%,82.0,81.0,79.0,79.0,78.0,78.0
max,94.0,94.0,94.0,94.0,94.0,94.0


In [31]:
# Summary statistics of the original sleep table (baseline for the comparison below).
sleep_data.describe()

Unnamed: 0,logId,duration,minutesToFallAsleep,minutesAsleep,minutesAwake,minutesAfterWakeup,timeInBed,efficiency,infoCode
count,2064.0,2064.0,2064.0,2064.0,2064.0,2064.0,2064.0,2064.0,2064.0
mean,25475620000.0,444.038275,0.036337,387.182171,56.526647,0.744671,444.038275,94.335271,0.140504
std,586459800.0,131.210291,0.889502,111.897713,25.232016,2.085014,131.210291,4.588759,0.506503
min,24472350000.0,60.0,0.0,39.0,0.0,0.0,60.0,0.0,0.0
25%,24966290000.0,396.0,0.0,348.0,42.0,0.0,396.0,92.0,0.0
50%,25430270000.0,461.5,0.0,401.0,55.0,0.0,461.5,95.0,0.0
75%,25964420000.0,518.0,0.0,453.0,70.0,0.0,518.0,97.0,0.0
max,26554190000.0,953.0,35.0,823.0,170.0,33.0,953.0,100.0,2.0


In [33]:
# Summary statistics of the sleep table read back from the synthesised CSV.
sleep_synthesised.describe()

Unnamed: 0,logId,duration,minutesToFallAsleep,minutesAsleep,minutesAwake,minutesAfterWakeup,timeInBed,efficiency,infoCode,row_id
count,12900.0,12900.0,12900.0,12900.0,12900.0,12900.0,12900.0,12900.0,12900.0,12900.0
mean,25568490000.0,340.549535,0.864961,343.828295,38.307519,0.504031,350.353488,95.150465,0.45938,4716.58
std,592509900.0,165.347198,4.289623,156.114124,28.195574,1.50583,165.68929,3.926894,0.817274,3295.99718
min,24472350000.0,60.0,0.0,39.0,0.0,0.0,60.0,0.0,0.0,0.0
25%,25082310000.0,219.0,0.0,265.75,15.0,0.0,253.0,93.0,0.0,1612.75
50%,25545350000.0,360.0,0.0,384.0,38.0,0.0,385.0,96.0,0.0,4386.5
75%,26083900000.0,457.0,0.0,453.0,57.0,0.0,464.0,98.0,0.0,7611.25
max,26554190000.0,953.0,35.0,823.0,170.0,33.0,953.0,100.0,2.0,10836.0


In [55]:
# Cumulative participant ranges: each label maps to participant numbers 1..N.
ranges = {
    f"p01–p{last}": range(1, last + 1)
    for last in (16, 20, 40, 60, 80, 100)
}

# describe() of 'duration' for each cumulative range, one column per range.
# Participant labels are 'p' + the number zero-padded to two digits.
summary = pd.DataFrame({
    label: sleep_synthesised.loc[
        sleep_synthesised['participant_id'].isin([f'p{i:02d}' for i in rng]),
        'duration',
    ].describe()
    for label, rng in ranges.items()
})

summary

Unnamed: 0,p01–p16,p01–p20,p01–p40,p01–p60,p01–p80,p01–p100
count,2064.0,2580.0,5160.0,7740.0,10320.0,12900.0
mean,444.038275,418.395736,370.385271,353.522739,345.489244,340.549535
std,131.210291,146.012053,160.523435,164.308385,165.407477,165.347198
min,60.0,60.0,60.0,60.0,60.0,60.0
25%,396.0,361.0,283.75,249.0,228.75,219.0
50%,461.5,443.0,396.0,377.0,366.0,360.0
75%,518.0,509.0,482.0,470.0,462.0,457.0
max,953.0,953.0,953.0,953.0,953.0,953.0


In [35]:
# Summary statistics of the original wellness table (baseline for the comparison below).
wellness_data.describe()

Unnamed: 0,fatigue,mood,readiness,sleep_duration_h,sleep_quality,soreness,stress
count,1747.0,1747.0,1747.0,1747.0,1747.0,1747.0,1747.0
mean,2.709216,3.195764,4.946766,6.717802,3.013738,2.82656,2.906125
std,0.67955,0.640844,1.926788,1.338879,0.752114,0.601853,0.722968
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,2.0,3.0,4.0,6.0,3.0,3.0,3.0
50%,3.0,3.0,5.0,7.0,3.0,3.0,3.0
75%,3.0,4.0,6.0,8.0,3.0,3.0,3.0
max,5.0,5.0,10.0,12.0,5.0,5.0,5.0


In [37]:
# Summary statistics of the wellness table read back from the synthesised CSV.
wellness_synthesised.describe()

Unnamed: 0,fatigue,mood,readiness,sleep_duration_h,sleep_quality,soreness,stress,row_id
count,10903.0,10903.0,10903.0,10903.0,10903.0,10903.0,10903.0,10903.0
mean,2.55581,3.110337,4.675044,6.812804,3.043658,2.723746,2.649454,3984.763551
std,0.970288,1.009325,2.260581,1.332907,0.984992,0.888269,1.012589,2784.902112
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,2.0,3.0,3.0,6.0,2.0,2.0,2.0,1363.0
50%,3.0,3.0,5.0,7.0,3.0,3.0,3.0,3705.0
75%,3.0,4.0,6.0,7.0,4.0,3.0,3.0,6430.5
max,5.0,5.0,10.0,12.0,5.0,5.0,5.0,9156.0


In [57]:
# Cumulative participant ranges: each label maps to participant numbers 1..N.
ranges = {
    f"p01–p{last}": range(1, last + 1)
    for last in (16, 20, 40, 60, 80, 100)
}

# describe() of 'sleep_quality' for each cumulative range, one column per range.
# Participant labels are 'p' + the number zero-padded to two digits.
summary = pd.DataFrame({
    label: wellness_synthesised.loc[
        wellness_synthesised['participant_id'].isin([f'p{i:02d}' for i in rng]),
        'sleep_quality',
    ].describe()
    for label, rng in ranges.items()
})

summary

Unnamed: 0,p01–p16,p01–p20,p01–p40,p01–p60,p01–p80,p01–p100
count,1747.0,2183.0,4363.0,6543.0,8723.0,10903.0
mean,3.013738,3.022904,3.046528,3.049824,3.049639,3.043658
std,0.752114,0.809786,0.915198,0.953662,0.977123,0.984992
min,0.0,0.0,0.0,0.0,0.0,0.0
25%,3.0,3.0,2.0,2.0,2.0,2.0
50%,3.0,3.0,3.0,3.0,3.0,3.0
75%,3.0,4.0,4.0,4.0,4.0,4.0
max,5.0,5.0,5.0,5.0,5.0,5.0
