## Import Libraries

In [1]:
import pandas as pd
import numpy as np

## Import Dataset

In [2]:
# Load the experiment data; the relative path is resolved against the current working directory.
df = pd.read_csv('ab_data.csv')

## Quick Look

In [3]:
# (number of rows, number of columns) of the loaded frame
df.shape

(294478, 5)

In [4]:
# Preview the first five rows to see the columns and typical values
df.head()

Unnamed: 0,user_id,timestamp,group,landing_page,converted
0,851104,2017-01-21 22:11:48.556739,control,old_page,0
1,804228,2017-01-12 08:01:45.159739,control,old_page,0
2,661590,2017-01-11 16:55:06.154213,treatment,new_page,0
3,853541,2017-01-08 18:28:03.143765,treatment,new_page,0
4,864975,2017-01-21 01:52:26.210827,control,old_page,1


In [5]:
# Re-check the frame's dimensions; the row count is the population size used for the sample-size calculation
df.shape

(294478, 5)

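### Sample Size using Slovin's Formula

Slovin's formula: $n = \frac{N}{1 + N e^2}$, where $N$ is the population size and $e$ is the margin of error.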

In [5]:
# Inputs for the Slovin sample-size calculation.
N = len(df)  # population size: the number of rows in df, derived rather than hard-coded
e = 0.05  # margin of error

In [6]:
# Slovin sample size computed inline. Stored under its own name so it does not
# share the identifier `slovin` with the function of that name.
slovin_n = N / (1 + N * e**2)

In [7]:
def slovin(N,e):
    """Return the Slovin sample size for a population and margin of error.

    Evaluates ``N / (1 + N * e**2)``.

    Parameters
    ----------
    N : number
        Population size.
    e : number
        Margin of error (0.05 is the value used in this notebook).

    Returns
    -------
    number
        The result of the true division above; it is not rounded.
    """
    denominator = 1 + N * e**2
    return N / denominator

In [8]:
# Sample size for the current N and e; the result is fractional and must be rounded up before sampling
slovin(N,e)

399.45740272248173

### Sample 400 Observations

In [38]:
# Draw 400 rows at random. No random_state is passed, so a different set of rows
# is selected each time this cell runs.
df_sampel1 = df.sample(n=400)

In [39]:
# Number of sampled rows in each experiment group
df_sampel1['group'].value_counts()

control      215
treatment    185
Name: group, dtype: int64

In [40]:
# user_id of the first five sampled rows; the index shows their original row labels in df
df_sampel1['user_id'].head()

229203    803517
292348    830300
24930     742453
88886     934099
244099    844823
Name: user_id, dtype: int64

### Sample with a Fixed Random Seed

In [36]:
# Passing random_state seeds the draw, so the same 400 rows are selected on every run.
sample_ = df.sample(n=400, random_state=444)
sample_.head()

Unnamed: 0,user_id,timestamp,group,landing_page,converted
73493,727997,2017-01-10 09:02:05.656060,control,old_page,0
246689,846825,2017-01-20 11:13:31.255643,control,old_page,0
238718,653894,2017-01-16 02:53:57.892203,treatment,new_page,0
275603,787169,2017-01-19 16:01:15.019885,control,old_page,0
120125,658763,2017-01-12 12:47:01.656529,treatment,new_page,0


In [37]:
# Share of each group in the seeded sample (normalize=True returns proportions instead of counts)
sample_['group'].value_counts(normalize=True)

control      0.53
treatment    0.47
Name: group, dtype: float64

### Sample 30% of the Dataset

In [26]:
# Sample by fraction instead of by count: 30% of the rows, seeded for repeatability.
# The result is only displayed here, not assigned to a variable.
df.sample(frac=0.3,random_state = 444)

Unnamed: 0,user_id,timestamp,group,landing_page,converted
73493,727997,2017-01-10 09:02:05.656060,control,old_page,0
246689,846825,2017-01-20 11:13:31.255643,control,old_page,0
238718,653894,2017-01-16 02:53:57.892203,treatment,new_page,0
275603,787169,2017-01-19 16:01:15.019885,control,old_page,0
120125,658763,2017-01-12 12:47:01.656529,treatment,new_page,0
...,...,...,...,...,...
63049,883012,2017-01-16 17:38:08.724601,control,old_page,0
53747,770615,2017-01-20 23:43:25.811417,control,old_page,0
31559,686495,2017-01-19 18:29:40.181351,treatment,new_page,0
257905,844938,2017-01-20 17:35:14.574455,control,old_page,1


# Sampling for Each Group / Stratified Random Sampling

In [32]:
# Per-group count of non-null values in every column; shows how many rows each group has
df.groupby(['group'],as_index=False).count()

Unnamed: 0,group,user_id,timestamp,landing_page,converted
0,control,147202,147202,147202,147202
1,treatment,147276,147276,147276,147276


In [33]:
# Preview the rows again before splitting the frame by group
df.head()

Unnamed: 0,user_id,timestamp,group,landing_page,converted
0,851104,2017-01-21 22:11:48.556739,control,old_page,0
1,804228,2017-01-12 08:01:45.159739,control,old_page,0
2,661590,2017-01-11 16:55:06.154213,treatment,new_page,0
3,853541,2017-01-08 18:28:03.143765,treatment,new_page,0
4,864975,2017-01-21 01:52:26.210827,control,old_page,1


In [41]:
# Split the frame into one subset per experiment group.
df_control = df.loc[df['group'].eq('control')]
df_treatment = df.loc[df['group'].eq('treatment')]

# Draw 200 rows from each subset (unseeded, so the rows differ between runs).
df_sample_control = df_control.sample(n=200)
df_sample_treatment = df_treatment.sample(n=200)

In [42]:
# Confirm the control sample has the requested 200 rows
df_sample_control.shape

(200, 5)

In [43]:
# Confirm the treatment sample has the requested 200 rows
df_sample_treatment.shape

(200, 5)

In [44]:
# Alternative approach: draw 200 rows from each group in a single step with
# groupby + apply. The result is indexed by (group, original row label).
df_sample2 = df.groupby(['group']).apply(lambda x: x.sample(n=200))

In [45]:
# Display the stratified sample
df_sample2

Unnamed: 0_level_0,Unnamed: 1_level_0,user_id,timestamp,group,landing_page,converted
group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
control,269546,834198,2017-01-12 07:31:33.155499,control,old_page,0
control,130878,634566,2017-01-18 19:41:02.330242,control,old_page,0
control,130192,771689,2017-01-09 22:51:54.433596,control,old_page,1
control,153123,647792,2017-01-18 11:05:59.412109,control,old_page,0
control,209416,732680,2017-01-06 16:09:14.341738,control,old_page,0
...,...,...,...,...,...,...
treatment,273696,667748,2017-01-09 01:04:14.080194,treatment,new_page,0
treatment,144953,886266,2017-01-07 19:20:41.910752,treatment,new_page,1
treatment,224595,812437,2017-01-16 20:04:12.283389,treatment,new_page,0
treatment,189033,918568,2017-01-20 23:43:47.510151,treatment,new_page,0


In [46]:
# Verify that each group contributed exactly 200 rows
df_sample2['group'].value_counts()

treatment    200
control      200
Name: group, dtype: int64

In [35]:
# Stratified sample by fraction: take 30% of the rows of each group.
# The same random_state (123) is passed to every group's draw, making the result repeatable.
df_sample3 = df.groupby(['group']).apply(lambda x: x.sample(frac=0.3,random_state=123))

In [36]:
# Display the 30%-per-group sample
df_sample3

Unnamed: 0_level_0,Unnamed: 1_level_0,user_id,timestamp,group,landing_page,converted
group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
control,95574,704344,2017-01-08 06:33:15.620318,control,old_page,0
control,282637,903218,2017-01-07 16:40:31.904242,control,old_page,0
control,201262,724634,2017-01-05 18:38:31.257679,control,old_page,0
control,93315,750623,2017-01-21 19:20:32.814948,control,old_page,0
control,16163,651056,2017-01-04 03:17:39.846424,control,old_page,0
...,...,...,...,...,...,...
treatment,264505,714662,2017-01-12 15:39:20.131076,treatment,new_page,1
treatment,129599,647536,2017-01-23 04:47:37.602495,treatment,new_page,0
treatment,69535,878755,2017-01-12 13:25:18.111209,treatment,new_page,0
treatment,18133,894162,2017-01-22 07:15:51.794428,treatment,new_page,0


In [37]:
# Rows drawn per group; the counts differ slightly because the groups are not exactly the same size
df_sample3['group'].value_counts()

treatment    44183
control      44161
Name: group, dtype: int64

In [44]:
# Display the user_id column of the full frame
df['user_id']

0         851104
1         804228
2         661590
3         853541
4         864975
           ...  
294473    751197
294474    945152
294475    734608
294476    697314
294477    715931
Name: user_id, Length: 294478, dtype: int64

In [56]:
# Second Slovin example with a smaller population. This rebinds N and e for all later cells.
N = 1000  # population size
e = 0.05  # margin of error

In [58]:
# NOTE(review): this rebinds `slovin`, which named the function defined earlier in the
# notebook, to a float. Calling slovin(N, e) after this cell raises TypeError until the
# function definition cell is run again.
slovin = N / (1+ N * e**2)

In [59]:
# Display the value computed in the previous cell (a float at this point, not the function)
slovin

285.71428571428567

In [60]:
# Sampling fraction: 286 (the Slovin size above, rounded up) out of a population of 1000
286/1000

0.286