In [1]:
from sdv.single_table import CTGANSynthesizer
from sdv.metadata import SingleTableMetadata
import pandas as pd



In [2]:
slp_score_data = pd.read_csv("../EDA/slp_score_data.csv")
sleep_data = pd.read_csv("../EDA/sleep_data.csv")
wellness_data = pd.read_csv("../EDA/wellness_data.csv")

processed_summary= pd.read_csv("../EDA/sleep_processed_summary.csv")
processed_data= pd.read_csv("../EDA/sleep_processed_data.csv")
processed_shortdata= pd.read_csv("../EDA/sleep_processed_shortdata.csv")

In [3]:
wellness_data

Unnamed: 0,effective_time_frame,fatigue,mood,readiness,sleep_duration_h,sleep_quality,soreness,soreness_area,stress,participant_id,date
0,2019-11-01 08:31:40.751000+00:00,2,3,5,6,3,2,[12921003],3,p01,2019-11-01
1,2019-11-02 10:00:01.229000+00:00,2,3,6,6,3,2,[12921003],3,p01,2019-11-02
2,2019-11-03 14:28:03.263000+00:00,3,3,8,6,3,3,[],3,p01,2019-11-03
3,2019-11-04 07:05:28.429000+00:00,3,3,8,6,3,3,[],3,p01,2019-11-04
4,2019-11-05 06:13:35.998000+00:00,3,3,8,5,3,3,[],3,p01,2019-11-05
...,...,...,...,...,...,...,...,...,...,...,...
1742,2020-03-08 11:44:44.398000+00:00,3,3,4,6,2,3,[],3,p16,2020-03-08
1743,2020-03-11 11:33:15.168000+00:00,2,3,4,6,2,3,[],2,p16,2020-03-11
1744,2020-03-15 08:33:15.985000+00:00,4,4,4,10,4,4,[],4,p16,2020-03-15
1745,2020-03-17 09:54:27.989000+00:00,2,3,4,5,3,3,[],3,p16,2020-03-17


In [4]:
def synthesiser(data, id_col, participant_col, start_participant_num, num_new_participants):
    """
    Generate synthetic data for new participants using CTGAN.

    Parameters:
    - data (pd.DataFrame): Original dataset.
    - id_col (str): Column name to be treated as unique row ID (e.g., 'row_id').
    - participant_col (str): Column name representing participant IDs.
    - start_participant_num (int): Number to start naming new participants from (e.g., 17 for 'p17').
    - num_new_participants (int): How many new participants to synthesize.

    Returns:
    - pd.DataFrame: Synthetic data for the new participants.
    """
    
    # Step 0: Assign unique row ID if not already
    if id_col not in data.columns:
        data[id_col] = range(len(data))
    
    # Step 1: Setup metadata
    metadata = SingleTableMetadata()
    metadata.detect_from_dataframe(data)
    metadata.update_column(column_name=id_col, sdtype='id')
    metadata.set_primary_key(column_name=id_col)
    metadata.update_column(column_name=participant_col, sdtype='categorical')

    # Step 2: Fit the synthesizer
    synthesizer = CTGANSynthesizer(metadata)
    synthesizer.fit(data)

    # Step 3: Estimate rows per participant and sample synthetic data
    original_participant_count = data[participant_col].nunique()
    rows_per_participant = len(data) // original_participant_count
    synthetic_sample_size = rows_per_participant * num_new_participants
    synthetic_data = synthesizer.sample(synthetic_sample_size)

    # Step 4: Generate new participant IDs
    new_ids = [f"p{i}" for i in range(start_participant_num, start_participant_num + num_new_participants)]

    # Step 5: Build new synthetic dataset
    new_participant_data = pd.DataFrame()

    for new_id in new_ids:
        subset = synthetic_data.sample(rows_per_participant, replace=True).copy()
        subset[participant_col] = new_id
        new_participant_data = pd.concat([new_participant_data, subset], ignore_index=True)

    # Step 6: Reassign row IDs
    new_participant_data[id_col] = range(1, len(new_participant_data) + 1)

    return new_participant_data

In [5]:
synthetic_wellness = synthesiser(
    data=wellness_data,
    id_col='row_id',
    participant_col='participant_id',
    start_participant_num=17,
    num_new_participants=84
)
wellness_final = pd.concat([wellness_data, synthetic_wellness], ignore_index=True)
wellness_final.describe()



Unnamed: 0,fatigue,mood,readiness,sleep_duration_h,sleep_quality,soreness,stress,row_id
count,10903.0,10903.0,10903.0,10903.0,10903.0,10903.0,10903.0,10903.0
mean,2.55581,3.110337,4.675044,6.812804,3.043658,2.723746,2.649454,3984.763551
std,0.970288,1.009325,2.260581,1.332907,0.984992,0.888269,1.012589,2784.902112
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,2.0,3.0,3.0,6.0,2.0,2.0,2.0,1363.0
50%,3.0,3.0,5.0,7.0,3.0,3.0,3.0,3705.0
75%,3.0,4.0,6.0,7.0,4.0,3.0,3.0,6430.5
max,5.0,5.0,10.0,12.0,5.0,5.0,5.0,9156.0


In [6]:
wellness_final.to_csv("GAN_generated_data/wellness_synthesised_100_participants.csv", index=False)

In [7]:
synthetic_slp_score = synthesiser(
    data=slp_score_data,
    id_col='sleep_log_entry_id',
    participant_col='participant_id',
    start_participant_num=17,
    num_new_participants=84
)
slp_score_final = pd.concat([slp_score_data, synthetic_slp_score], ignore_index=True)
slp_score_final.describe()



Unnamed: 0,sleep_log_entry_id,overall_score,composition_score,revitalization_score,duration_score,deep_sleep_in_minutes,resting_heart_rate,restlessness
count,11412.0,11412.0,11412.0,11412.0,11412.0,11412.0,11412.0,11412.0
mean,4098343000.0,71.449264,19.561251,18.990887,37.514108,66.824658,57.343761,0.086773
std,9363037000.0,9.38168,2.40308,3.271264,5.675605,31.710996,7.818489,0.043431
min,1.0,35.0,12.0,5.0,3.0,0.0,44.0,0.015385
25%,2853.75,66.0,17.0,18.0,34.0,45.0,51.0,0.05572
50%,5706.5,72.0,20.0,19.0,39.0,66.0,59.0,0.079653
75%,8559.25,78.0,21.0,21.0,41.0,87.0,63.0,0.109593
max,26543160000.0,94.0,25.0,25.0,47.0,183.0,76.0,0.294766


In [8]:
slp_score_final.to_csv("GAN_generated_data/sleep_score_synthesised_100_participants.csv", index=False)

In [9]:
sleep_data = sleep_data.drop(columns=["levels"])

synthetic_overall_sleep = synthesiser(
    data=sleep_data,
    id_col='row_id',
    participant_col='participant_id',
    start_participant_num=17,
    num_new_participants=84
)
overall_sleep_final = pd.concat([sleep_data, synthetic_overall_sleep], ignore_index=True)
overall_sleep_final.describe()



Unnamed: 0,logId,duration,minutesToFallAsleep,minutesAsleep,minutesAwake,minutesAfterWakeup,timeInBed,efficiency,infoCode,row_id
count,12900.0,12900.0,12900.0,12900.0,12900.0,12900.0,12900.0,12900.0,12900.0,12900.0
mean,25568490000.0,340.549535,0.864961,343.828295,38.307519,0.504031,350.353488,95.150465,0.45938,4716.58
std,592509900.0,165.347198,4.289623,156.114124,28.195574,1.50583,165.68929,3.926894,0.817274,3295.99718
min,24472350000.0,60.0,0.0,39.0,0.0,0.0,60.0,0.0,0.0,0.0
25%,25082310000.0,219.0,0.0,265.75,15.0,0.0,253.0,93.0,0.0,1612.75
50%,25545350000.0,360.0,0.0,384.0,38.0,0.0,385.0,96.0,0.0,4386.5
75%,26083900000.0,457.0,0.0,453.0,57.0,0.0,464.0,98.0,0.0,7611.25
max,26554190000.0,953.0,35.0,823.0,170.0,33.0,953.0,100.0,2.0,10836.0


In [10]:
overall_sleep_final.to_csv("GAN_generated_data/overall_sleep_synthesised_100_participants.csv", index=False)