In [36]:
# Let's regenerate with device as a core characteristic
np.random.seed(42)
n_users = 50000

# Realistic device distribution 
device_dist = np.random.choice(['mobile', 'desktop'], n_users, p=[0.70, 0.30])

# Device affects conversion rates (mobile converts less)
mobile_control_rate = 0.020    # 2.0% mobile baseline
mobile_treatment_rate = 0.024  # 2.4% mobile treatment
desktop_control_rate = 0.028   # 2.8% desktop baseline  
desktop_treatment_rate = 0.031 # 3.1% desktop treatment

# Now generate conversions based on BOTH group AND device
conversions = []
groups = []

for i in range(n_users):
    group = 'treatment' if i % 2 == 0 else 'control'
    device = device_dist[i]
    
    if group == 'control':
        rate = mobile_control_rate if device == 'mobile' else desktop_control_rate
    else:
        rate = mobile_treatment_rate if device == 'mobile' else desktop_treatment_rate
        
    converted = np.random.binomial(1, rate)
    conversions.append(converted)
    groups.append(group)

# Create proper dataset
data_proper = pd.DataFrame({
    'user_id': range(n_users),
    'group': groups,
    'device': device_dist,
    'converted': conversions
})

print("Properly generated dataset:")
print(data_proper.groupby(['device', 'group'])['converted'].agg(['count', 'mean']))
print(data_proper.head)

Properly generated dataset:
                   count      mean
device  group                     
desktop control     7440  0.023118
        treatment   7405  0.029440
mobile  control    17560  0.019419
        treatment  17595  0.023586
<bound method NDFrame.head of        user_id      group   device  converted
0            0  treatment   mobile          0
1            1    control  desktop          0
2            2  treatment  desktop          0
3            3    control   mobile          0
4            4  treatment   mobile          0
...        ...        ...      ...        ...
49995    49995    control   mobile          0
49996    49996  treatment   mobile          0
49997    49997    control  desktop          0
49998    49998  treatment   mobile          0
49999    49999    control  desktop          0

[50000 rows x 4 columns]>


In [38]:
data_proper.to_csv('./data/basic_ab_test.csv', index=False)

# Let's verify what we saved
data_saved = pd.read_csv('./data/basic_ab_test.csv')
print("Columns in saved file:", data_saved.columns.tolist())
print("Shape:", data_saved.shape)
print("\nFirst few rows:")
print(data_saved.head())
print(f"\nVerification - Saved dataset shape: {data_saved.shape}")
print(f"Columns: {data_saved.columns.tolist()}")
print(data_saved.head)
print(data_saved.groupby(['device', 'group']).size())

Columns in saved file: ['user_id', 'group', 'device', 'converted']
Shape: (50000, 4)

First few rows:
   user_id      group   device  converted
0        0  treatment   mobile          0
1        1    control  desktop          0
2        2  treatment  desktop          0
3        3    control   mobile          0
4        4  treatment   mobile          0

Verification - Saved dataset shape: (50000, 4)
Columns: ['user_id', 'group', 'device', 'converted']
<bound method NDFrame.head of        user_id      group   device  converted
0            0  treatment   mobile          0
1            1    control  desktop          0
2            2  treatment  desktop          0
3            3    control   mobile          0
4            4  treatment   mobile          0
...        ...        ...      ...        ...
49995    49995    control   mobile          0
49996    49996  treatment   mobile          0
49997    49997    control  desktop          0
49998    49998  treatment   mobile          0
49999    