In [1]:
import numpy as np
import pandas as pd
from statsmodels.stats.proportion import proportions_ztest
import random
from timeit import default_timer as timer

## Generating Users and their Characteristics

In [2]:
# Fix the global NumPy RNG state so that every np.random draw in the cells
# below produces the same simulated dataset on a fresh "Restart & Run All".
SEED = 42
np.random.seed(SEED)

# Empty frame; each later cell adds one column of simulated values to it.
df = pd.DataFrame()
# Number of simulated users (rows) generated for every column.
length = 100000

In [3]:
# Gender is a binary flag; both values are drawn with equal probability.
gender_values = [1, 0]
df["Gender"] = np.random.choice(gender_values, size=length, p=[0.5, 0.5])

In [4]:
# Age buckets 0-6 plus an 'NA' label (presumably a missing-value marker).
# The pool is built as an explicit object array: handing np.random.choice a
# plain list that mixes ints with 'NA' makes NumPy coerce every entry to a
# string ('0', '1', ...), and integer filters such as .isin([0, 1]) on this
# column would then match nothing.
age_groups = np.array([0, 1, 2, 3, 4, 5, 6, 'NA'], dtype=object)
age_group_probs = [.08, .12, .18, .22, .18, .12, .08, .02]
df["AgeGroup"] = np.random.choice(age_groups, size=length, p=age_group_probs)

In [5]:
# Program label -> sampling probability ("NA" is presumably a missing-value marker).
program_probs = {"A": .2, "B": .03, "C": .17, "D": .25, "E": .30, "NA": .05}
df["Program"] = np.random.choice(
    list(program_probs), size=length, p=list(program_probs.values())
)

In [6]:
# Uniform integer income in [50000, 500000); randint excludes the upper bound.
income_low, income_high = 50000, 500000
df["Income"] = np.random.randint(income_low, income_high, size=length)

In [7]:
# State label -> sampling probability ("NA" is presumably a missing-value marker).
state_probs = {"NJ": .2, "NY": .3, "CA": .35, "TX": .10, "NA": .05}
df["State"] = np.random.choice(
    list(state_probs), size=length, p=list(state_probs.values())
)

In [8]:
# Device type, weighted towards mobile (60%) over desktop (35%) and tablet (5%).
device_types = ["Mobile", "Desktop", "Tablet"]
device_probs = [.6, .35, .05]
df["Device"] = np.random.choice(device_types, size=length, p=device_probs)

In [9]:
# Operating system: an even split between the two options.
os_names = ["Apple", "Windows"]
df["OS"] = np.random.choice(os_names, size=length, p=[.5, .5])

In [10]:
# Browser label -> sampling probability.
browser_probs = {"Opera": .1, "Chrome": .6, "Safari": .3}
df["Browser"] = np.random.choice(
    list(browser_probs), size=length, p=list(browser_probs.values())
)

## We will give each person an equal chance of seeing options A & B

In [11]:
# Assign each user to variant "A" or "B" with a 50/50 split.
variants = ["A", "B"]
df["option"] = np.random.choice(variants, size=length, p=[0.5, 0.5])

## Set the baseline Click-Through-Rate to 10% (the segment cells below override it for specific users)

In [12]:
# Baseline outcome for every user: 1 is drawn with probability 0.1, 0 otherwise.
click_outcomes = [1, 0]
df["click"] = np.random.choice(click_outcomes, size=length, p=[0.1, 0.9])

In [13]:
# Preview the first five simulated users to sanity-check the generated columns.
df.head(n=5)

Unnamed: 0,Gender,AgeGroup,Program,Income,State,Device,OS,Browser,option,click
0,0,3,C,200447,NJ,Desktop,Apple,Chrome,A,1
1,0,3,D,473011,CA,Desktop,Windows,Chrome,A,0
2,0,4,D,337266,CA,Mobile,Apple,Chrome,A,0
3,1,1,E,167260,NJ,Desktop,Apple,Chrome,A,0
4,1,1,D,364587,CA,Tablet,Apple,Chrome,B,0


## Defining Segments with Different Response Rates

#### Users with Gender == 1 & Device == Mobile will have a 5% Click-Through-Rate when shown option B 

In [14]:
# Build the segment filter once and reuse it for both the row count and the write.
gender1_mobile_b = (df['option'] == 'B') & (df['Gender'] == 1) & (df['Device'] == 'Mobile')
n = int(gender1_mobile_b.sum())
# Redraw the click outcome for just these rows with a 5% click probability.
df.loc[gender1_mobile_b, "click"] = np.random.choice([1, 0], n, p=[0.05, 0.95])

#### Users with Gender == 0 & Income > 150000 will have a 5% Click-Through-Rate when shown option A 

In [15]:
# Segment: Gender == 0 with Income above 150000, shown option A.
gender0_high_income_a = (df['option'] == 'A') & (df['Gender'] == 0) & (df['Income'] > 150000)
n = int(gender0_high_income_a.sum())
# Redraw the click outcome for just these rows with a 5% click probability.
df.loc[gender0_high_income_a, "click"] = np.random.choice([1, 0], n, p=[0.05, 0.95])

#### Users with State in (NJ, CA), OS == Apple, and AgeGroup in (0, 1) will have a 7% Click-Through-Rate when shown option B

In [16]:
# Segment: State in NJ/CA, OS == Apple, AgeGroup 0 or 1, shown option B.
#
# AgeGroup is compared on its string form. The column is sampled from a list
# that mixes ints with 'NA', which NumPy coerces to strings, so an integer
# filter like .isin([0, 1]) would silently select zero rows and the 7% rate
# would never be applied. Casting to str first matches whether the column
# holds ints or strings.
age_is_0_or_1 = df['AgeGroup'].astype(str).isin(['0', '1'])
nj_ca_apple_young_b = (
    df['State'].isin(["NJ", "CA"])
    & (df['OS'] == 'Apple')
    & age_is_0_or_1
    & (df['option'] == 'B')
)
n = int(nj_ca_apple_young_b.sum())
# Redraw the click outcome for just these rows with a 7% click probability.
df.loc[nj_ca_apple_young_b, "click"] = np.random.choice([1, 0], n, p=[0.07, 0.93])

#### Users with AgeGroup of 5 or 6 will have a 5% Click-Through-Rate when shown option A 

In [17]:
# Segment: AgeGroup 5 or 6, shown option A.
#
# AgeGroup is compared on its string form. The column is sampled from a list
# that mixes ints with 'NA', which NumPy coerces to strings, so an integer
# filter like .isin([5, 6]) would silently select zero rows and the 5% rate
# would never be applied. Casting to str first matches whether the column
# holds ints or strings.
age_is_5_or_6 = df['AgeGroup'].astype(str).isin(['5', '6'])
older_users_a = age_is_5_or_6 & (df['option'] == 'A')
n = int(older_users_a.sum())
# Redraw the click outcome for just these rows with a 5% click probability.
df.loc[older_users_a, "click"] = np.random.choice([1, 0], n, p=[0.05, 0.95])

#### Users with Program == B & Browser == Safari & Income < 100000 will have a 5% Click-Through-Rate when shown option B 

In [18]:
# Segment: Program B on Safari with Income below 100000, shown option B.
program_b_safari_low_income_b = (
    (df['option'] == 'B')
    & (df['Program'] == 'B')
    & (df['Browser'] == 'Safari')
    & (df['Income'] < 100000)
)
n = int(program_b_safari_low_income_b.sum())
# Redraw the click outcome for just these rows with a 5% click probability.
df.loc[program_b_safari_low_income_b, "click"] = np.random.choice([1, 0], n, p=[0.05, 0.95])

#### Users with Program == C & Browser == Chrome & Income > 350000 will have a 3% Click-Through-Rate when shown option A 

In [19]:
# Segment: Program C on Chrome with Income above 350000, shown option A.
program_c_chrome_high_income_a = (
    (df['option'] == 'A')
    & (df['Program'] == 'C')
    & (df['Browser'] == 'Chrome')
    & (df['Income'] > 350000)
)
n = int(program_c_chrome_high_income_a.sum())
# Redraw the click outcome for just these rows with a 3% click probability.
df.loc[program_c_chrome_high_income_a, "click"] = np.random.choice([1, 0], n, p=[0.03, 0.97])

## Write to CSV

In [20]:
# Persist the simulated dataset; index=False leaves the row index out of the file.
output_path = 'simulation_data.csv'
df.to_csv(output_path, index=False)