In [1]:
import numpy as np
import pandas as pd
import random
from datetime import datetime

In [2]:
# Set random seed for reproducibility
np.random.seed(42)

# Number of synthetic entries to generate
num_entries = 300

# Create empty lists to store data
data = []

# Generate synthetic data with realistic patterns
for i in range(num_entries):
    # Decide if this entry is a Morning Lark or Night Owl
    # Let's say 55% Morning Larks, 45% Night Owls
    is_morning_lark = random.random() < 0.55

    if is_morning_lark:
        # Morning Lark patterns
        # Q1: What time do you naturally wake up on weekends?
        q1 = np.random.choice([3, 4, 5], p=[0.2, 0.4, 0.4])

        # Q2: When do you feel most energized?
        q2 = np.random.choice([3, 4, 5], p=[0.2, 0.4, 0.4])

        # Q3: How easy is it to wake up before 7:00 AM?
        q3 = np.random.choice([3, 4, 5], p=[0.2, 0.4, 0.4])

        # Q4: When would you prefer to exercise?
        q4 = np.random.choice([2, 3, 4, 5], p=[0.1, 0.2, 0.3, 0.4])

        # Q5: When do you feel most creative and productive?
        q5 = np.random.choice([3, 4, 5], p=[0.2, 0.4, 0.4])

        # Q6: How do you feel about staying up past midnight?
        q6 = np.random.choice([3, 4, 5], p=[0.2, 0.4, 0.4])

        # Q7: Ideal work hours
        q7 = np.random.choice([3, 4, 5], p=[0.2, 0.3, 0.5])

        # Add some noise to make data more realistic
        # Occasionally, Morning Larks might show some Night Owl tendencies
        if random.random() < 0.15:
            noise_question = random.randint(1, 7)
            if noise_question == 1:
                q1 = np.random.choice([0, 1, 2])
            elif noise_question == 2:
                q2 = np.random.choice([0, 1, 2])
            elif noise_question == 3:
                q3 = np.random.choice([0, 1, 2])
            elif noise_question == 4:
                q4 = np.random.choice([0, 1, 2])
            elif noise_question == 5:
                q5 = np.random.choice([0, 1, 2])
            elif noise_question == 6:
                q6 = np.random.choice([0, 1, 2])
            elif noise_question == 7:
                q7 = np.random.choice([0, 1, 2])

        chronotype = "Morning Lark"

    else:
        # Night Owl patterns
        # Q1: What time do you naturally wake up on weekends?
        q1 = np.random.choice([0, 1, 2], p=[0.4, 0.4, 0.2])

        # Q2: When do you feel most energized?
        q2 = np.random.choice([0, 1, 2], p=[0.4, 0.4, 0.2])

        # Q3: How easy is it to wake up before 7:00 AM?
        q3 = np.random.choice([0, 1, 2], p=[0.5, 0.3, 0.2])

        # Q4: When would you prefer to exercise?
        q4 = np.random.choice([0, 1, 2, 3], p=[0.4, 0.3, 0.2, 0.1])

        # Q5: When do you feel most creative and productive?
        q5 = np.random.choice([0, 1, 2], p=[0.4, 0.4, 0.2])

        # Q6: How do you feel about staying up past midnight?
        q6 = np.random.choice([0, 1, 2], p=[0.4, 0.4, 0.2])

        # Q7: Ideal work hours
        q7 = np.random.choice([0, 1, 2], p=[0.5, 0.3, 0.2])

        # Add some noise - occasionally Night Owls might show some Morning Lark tendencies
        if random.random() < 0.15:
            noise_question = random.randint(1, 7)
            if noise_question == 1:
                q1 = np.random.choice([3, 4, 5])
            elif noise_question == 2:
                q2 = np.random.choice([3, 4, 5])
            elif noise_question == 3:
                q3 = np.random.choice([3, 4, 5])
            elif noise_question == 4:
                q4 = np.random.choice([3, 4, 5])
            elif noise_question == 5:
                q5 = np.random.choice([3, 4, 5])
            elif noise_question == 6:
                q6 = np.random.choice([3, 4, 5])
            elif noise_question == 7:
                q7 = np.random.choice([3, 4, 5])

        chronotype = "Night Owl"

    # Create a synthetic entry
    entry = {
        "q1": q1,
        "q2": q2,
        "q3": q3,
        "q4": q4,
        "q5": q5,
        "q6": q6,
        "q7": q7,
        "chronotype": chronotype
    }

    # Add to data list
    data.append(entry)



In [3]:
# Create a DataFrame
df = pd.DataFrame(data)

# Add a timestamp for when this "synthetic" data was collected
df['timestamp'] = datetime.now().strftime('%Y-%m-%d %H:%M:%S')

# Add a unique ID for each entry
df['id'] = range(1, num_entries + 1)

# Calculate a confidence score (not used for training but useful for the database)
# Higher score variance = higher confidence
df['score_variance'] = df[['q1', 'q2', 'q3',
                           'q4', 'q5', 'q6', 'q7']].var(axis=1)
df['confidence'] = (df['score_variance'] * 20).clip(60,
                                                    95)  # Scale to a percentage between 60-95%

# Save to CSV
df.to_csv('dozzy_data.csv', index=False)

# Print some statistics
print(f"Generated {num_entries} synthetic entries")
print(f"Morning Larks: {len(df[df['chronotype'] == 'Morning Lark'])}")
print(f"Night Owls: {len(df[df['chronotype'] == 'Night Owl'])}")
print(f"Dataset saved to dozzy_data.csv")

# Display the first few rows as an example
print("\nSample data:")
print(df.head())

Generated 300 synthetic entries
Morning Larks: 170
Night Owls: 130
Dataset saved to dozzy_data.csv

Sample data:
   q1  q2  q3  q4  q5  q6  q7    chronotype            timestamp  id  \
0   0   2   1   1   0   0   0     Night Owl  2025-03-14 02:08:53   1   
1   2   1   1   0   2   3   0     Night Owl  2025-03-14 02:08:53   2   
2   1   1   0   0   1   0   0     Night Owl  2025-03-14 02:08:53   3   
3   2   0   0   1   0   2   0     Night Owl  2025-03-14 02:08:53   4   
4   5   5   4   2   5   4   4  Morning Lark  2025-03-14 02:08:53   5   

   score_variance  confidence  
0        0.619048        60.0  
1        1.238095        60.0  
2        0.285714        60.0  
3        0.904762        60.0  
4        1.142857        60.0  
