In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
def process(data):
    """Split ``data`` into rows above / not above the middle ``y`` value.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a ``'y'`` column and at least one row.

    Returns
    -------
    tuple
        ``(median_y, mean_y, sucess, slen, failed, flen)`` where

        * ``median_y`` is the element at position ``len(data) // 2`` of the
          sorted ``y`` values (the upper of the two middle values when the
          length is even -- it is *not* averaged),
        * ``mean_y`` is the arithmetic mean of ``y``,
        * ``sucess`` / ``failed`` are the rows with ``y > median_y`` and the
          remaining rows respectively,
        * ``slen`` / ``flen`` are the integer row counts of those two frames.

    Raises
    ------
    ValueError
        If ``data`` has no rows.
    """
    total = len(data)
    if total == 0:
        raise ValueError("process() needs at least one row")

    # Positional lookup with an integer index: `len(data) / 2` is a float and
    # fails as a label whenever the length is odd.
    ordered_y = data['y'].sort_values(ignore_index=True)
    median_y = ordered_y.iloc[total // 2]
    mean_y = sum(data['y']) / total

    above = data['y'] > median_y
    sucess = data[above]
    failed = data[~above]

    slen = len(sucess)
    # Count of the remaining rows (an int, not `len(data) - <DataFrame>`).
    flen = total - slen
    return median_y, mean_y, sucess, slen, failed, flen

def probability(data, sucess, failed, step=5):
    """Estimate p(x | sucess) and p(x | failed) on a grid of x windows.

    Windows are ``[c - step/2, c + step/2)`` for centres
    ``c = int(min x), int(min x) + step, ...``. The grid is extended until the
    last window reaches the largest ``x``, so every row falls in exactly one
    window and each column of shares sums to 1.

    Parameters
    ----------
    data : pandas.DataFrame
        Full data set; only the range of its ``'x'`` column is used.
    sucess, failed : pandas.DataFrame
        The two groups, each with an ``'x'`` column.
    step : number, default 5
        Window width (must be positive).

    Returns
    -------
    pandas.DataFrame
        Columns ``x`` (window centre), ``sp`` (share of ``sucess`` rows in the
        window) and ``fp`` (share of ``failed`` rows in the window). A share is
        NaN when its group is empty.

    Raises
    ------
    ValueError
        If ``step`` is not positive.
    """
    if step <= 0:
        raise ValueError("step must be positive")

    half = step / 2
    minc, maxc = int(min(data['x'])), int(max(data['x']) + 1)
    # Centres are needed up to (but excluding) maxc + half so that the last
    # window's upper edge lies above the maximum x.
    n_bins = int(np.ceil((maxc + half - minc) / step))
    centers = minc + step * np.arange(n_bins)

    p = {"x": [], "sp": [], "fp": []}
    for center in centers:
        lower, upper = center - half, center + half
        shares = []
        for group in (sucess, failed):
            if len(group) == 0:
                # The conditional share is undefined for an empty group.
                shares.append(float("nan"))
                continue
            # Build the mask on the group itself rather than reusing a mask
            # indexed by `data`, which pandas would have to reindex.
            inside = (group['x'] >= lower) & (group['x'] < upper)
            shares.append(int(inside.sum()) / len(group))
        p['x'].append(center)
        p['sp'].append(shares[0])
        p['fp'].append(shares[1])
    return pd.DataFrame(p)

def bayes(data,sucess, failed, p, step=5):
    """Turn class-conditional shares into posteriors with Bayes' rule.

    For every row of ``p`` the window ``[x - step/2, x + step/2)`` is used to
    compute::

        p(sucess | x) = p(sucess) * p(x | sucess) / p(x)
        p(failed | x) = p(failed) * p(x | failed) / p(x)

    where ``p(sucess) = len(sucess) / len(data)``, ``p(x)`` is the share of
    ``data`` rows inside the window, and ``p(x | ...)`` are the ``sp`` / ``fp``
    columns of ``p``.

    Parameters
    ----------
    data : pandas.DataFrame
        Full data set with an ``'x'`` column.
    sucess, failed : pandas.DataFrame
        The two groups; only their lengths are used.
    p : pandas.DataFrame
        Columns ``x``, ``sp``, ``fp``.
    step : number, default 5
        Window width; it should be the width that produced ``p``.

    Returns
    -------
    pandas.DataFrame
        Columns ``x``, ``spc`` (posterior of sucess) and ``fpc`` (posterior of
        failed). Both posteriors are NaN for windows containing no data.

    Raises
    ------
    ValueError
        If ``data`` has no rows.
    """
    total = len(data)
    if total == 0:
        raise ValueError("data must contain at least one row")

    half = step / 2
    prior_sucess = len(sucess) / total
    prior_failed = len(failed) / total

    posterior = {"x": [], "spc": [], "fpc": []}
    for _, item in p.iterrows():
        in_window = (data['x'] < item['x'] + half) & (item['x'] - half <= data['x'])
        evidence = int(in_window.sum()) / total
        if evidence == 0:
            # No observations in this window: the posterior is undefined.
            # Return NaN explicitly instead of relying on a 0/0 warning.
            spc = fpc = float("nan")
        else:
            spc = prior_sucess * item['sp'] / evidence
            fpc = prior_failed * item['fp'] / evidence
        posterior['x'].append(item['x'])
        posterior['spc'].append(spc)
        posterior['fpc'].append(fpc)
    return pd.DataFrame(posterior)

def interval_data(df, column, group):
    """Mean and sample std of ``y`` between evenly spaced rows of sorted ``column``.

    The frame is sorted by ``column`` and every ``len(df) // (group - 1)``-th
    row supplies an interval edge. For each pair of consecutive edges
    ``[lower, upper)`` the mean and the sample standard deviation (``ddof=1``)
    of ``y`` are computed.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``column`` and ``'y'``.
    column : str
        Column whose sorted values define the interval edges.
    group : int
        Controls the edge spacing; must be at least 2.

    Returns
    -------
    pandas.DataFrame
        Columns ``interval_<column>`` (upper edge of each interval), ``y_mean``
        and ``y_std``; the index starts at 1.

    Raises
    ------
    ValueError
        If ``group`` is below 2, or ``df`` has fewer than ``group - 1`` rows
        (the edge spacing would be zero).

    Notes
    -----
    Rows whose ``column`` value is at or above the last edge belong to no
    interval and therefore do not contribute to the result.
    """
    if group < 2:
        raise ValueError("group must be at least 2")

    df = df.sort_values(column)
    step = len(df) // (group - 1)
    if step == 0:
        raise ValueError("df needs at least group - 1 rows")

    edges = list(df.iloc[::step][column])

    y_mean = []
    y_std = []
    # Pair each edge with its successor; there is no interval below the first
    # edge, so nothing is computed for it.
    for lower, upper in zip(edges[:-1], edges[1:]):
        inside = (df[column] >= lower) & (df[column] < upper)
        y_bin = df.loc[inside, "y"]
        y_mean.append(y_bin.mean())
        y_std.append(y_bin.std(ddof=1))

    return pd.DataFrame(
        {f"interval_{column}": edges[1:], "y_mean": y_mean, "y_std": y_std},
        index=range(1, len(edges)),
    )

def c_data(df, column,group):
    """Mean and sample std of ``y`` in ``group`` equal-width bins of ``column``.

    The range ``[min(column), max(column)]`` is cut into exactly ``group``
    equal-width bins. Every bin is half-open ``[lower, upper)`` except the
    last one, which also includes its upper edge so the maximum is counted.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``column`` and ``'y'`` and at least one row.
    column : str
        Column to bin on.
    group : int
        Number of bins (at least 1).

    Returns
    -------
    pandas.DataFrame
        One row per bin with columns ``interval_<column>`` (upper edge of the
        bin), ``y_mean`` and ``y_std`` (``ddof=1``). Empty bins give NaN.

    Raises
    ------
    ValueError
        If ``group`` is below 1 or ``df`` has no rows.
    """
    if group < 1:
        raise ValueError("group must be at least 1")
    if len(df) == 0:
        raise ValueError("df must contain at least one row")

    values = df[column]
    start, end = values.min(), values.max()
    # linspace yields group + 1 edges that end exactly on the maximum. A
    # floored step with arange could be 0 and always stopped short of `end`.
    edges = np.linspace(start, end, int(group) + 1)

    y_mean = []
    y_std = []
    last = len(edges) - 2
    for i in range(len(edges) - 1):
        bound, upper = edges[i], edges[i + 1]
        if i == last:
            # Close the final bin on the right so rows equal to the maximum count.
            mask = (values >= bound) & (values <= upper)
        else:
            mask = (values >= bound) & (values < upper)
        y_bin = df.loc[mask, "y"]
        y_mean.append(y_bin.mean())
        y_std.append(y_bin.std(ddof=1))

    return pd.DataFrame(
        {f"interval_{column}": edges[1:], "y_mean": y_mean, "y_std": y_std}
    )

# Existing data

In [None]:
# Load the thumbnail table, keep only the x / y columns, and split the rows
# around the middle y value.
e_data = pd.read_csv("taitra_all/first_test/表格資料_edit_thumbnails.csv")[['x', 'y']].copy()
(emedian_y, emean_y,
 esucess, eslen,
 efailed, eflen) = process(e_data)
print("中位觀看:", emedian_y)

# Scatter of both groups: rows above the cut in the default colour, the rest in red.
fig, ax = plt.subplots(facecolor="gray")
ax.scatter(esucess['x'], esucess['y'])
ax.scatter(efailed['x'], efailed['y'], color='r')
ax.set_xlim(22, 100)

In [None]:
# Class-conditional distributions of x: p(x | sucess) in the default colour
# and p(x | failed) in red.
ep = probability(e_data, esucess, efailed)

fig, ax = plt.subplots(facecolor='gray')
ax.plot(ep['x'], ep['sp'])
ax.plot(ep['x'], ep['fp'], color='r')

legend = ax.legend(['above', 'below'])
# Private matplotlib attribute; presumably tweaks spacing inside the legend box.
legend._legend_box.sep = 20
ax.set_title("p(x | sucess) & p(x | failed)")
ax.set_ylabel("probability")
ax.set_xlabel("x")
ax.set_xlim(22, 100)

In [None]:
# Posteriors from Bayes' rule: p(sucess | x) in the default colour and
# p(failed | x) in red.
ebayes = bayes(e_data, esucess, efailed, ep)

fig, ax = plt.subplots(facecolor="gray")
ax.plot(ebayes['x'], ebayes['spc'])
ax.plot(ebayes['x'], ebayes['fpc'], color='r')
legend = ax.legend(['above', 'below'])
# Private matplotlib attribute; presumably tweaks spacing inside the legend box.
legend._legend_box.sep = 20
ax.set_title("p(sucess | x) & p(filaed | x)")
ax.set_ylabel("probability")
ax.set_xlabel("x")
ax.set_xlim(22, 100)

In [None]:
# Same posteriors as bars: a full-height bar per window with the failed
# posterior drawn over it in red.
fig, ax = plt.subplots(facecolor="gray")
ax.bar(ebayes['x'], 1, width=5)
ax.bar(ebayes['x'], ebayes['fpc'], width=5, color='r', edgecolor='w')
ax.set_title("p(sucess | x) & p(filaed | x)")
ax.set_ylabel("probability")
ax.set_xlabel("x")
legend = ax.legend(['above', 'below'])

In [None]:
# Number of e_data rows inside each 5-wide window centred on the posterior's
# x values, collected as one record per window.
records = []
for c in ebayes['x']:
    mask = (e_data['x'] >= c - 2.5) & (e_data['x'] < c + 2.5)
    records.append({"x": c, "count": len(e_data[mask])})

count = pd.DataFrame(records, columns=["x", "count"])
plt.plot(count['x'], count['count'])

In [None]:
# Mean y per x interval for the existing data (5 requested groups).
e_interval = c_data(e_data, 'x', 5)

fig, ax = plt.subplots(facecolor="gray")
ax.plot(e_interval['interval_x'], e_interval['y_mean'])
ax.set_ylabel("y")
ax.set_xlabel("x")

# A/B test data

In [None]:
# Literal `y` observations for the A/B test (72 values).
y = [
    57.0, 121.0, 290.0, 5.0, 84.0, 35.0, 
    48.0, 16.0, 95.0, 3.0, 53.0, 106.0,
    76.0, 48.0, 10.0, 89.0, 16.0, 211.0,
    9.0, 1.0, 16.0, 89.0, 2.0, 0.0, 
    2.0, 43.0, 129.0, 80.0, 105.0, 52.0,
    22.0, 2.0, 11.0, 58.0, 24.0, 3.0,
    1.0, 5.0, 87.0, 21.0, 129.0, 249.0,
    2.0, 12.0, 0.0, 0, 119.0, 30.0,
    84.0, 36.0, 26.0, 7.0, 124.0, 159.0,
    66.0, 65.0, 33.0, 17.0, 66.0, 135.0,
    0.0, 27.0, 2.0, 3.0, 0, 75.0, 54.0,
    39.0, 63.0, 12.0, 52.0, 54.0
    ]

# NOTE(review): this first `x` list is never read -- the name is rebound by the
# second `x = [...]` assignment directly below. What it measures is not stated
# here; confirm whether it should be kept under a different name.
x = [
    186, 333,152, 142, 1646, 584,
    179, 294,87, 144, 897, 269,
    248, 147,93, 242, 148, 104,
    186, 169,125, 171, 149, 81,
    126, 167,127, 91, 139, 882,
    284, 108,136, 72, 361, 164,
    124, 178,88, 118, 202, 140,
    137, 283,50, 85, 138, 531,
    311, 130,139, 142, 154, 177,
    155, 106,154, 266, 696, 104,
    216, 289,127, 102, 99, 65,
    274, 223,155, 107, 631, 483,
]

# The `x` values in effect after this cell (72 values, one per `y` entry).
x = [
    38, 50, 61.6, 62, 39, 50,
    70, 70, 44, 84, 43, 57,
    61, 64, 44, 54, 44, 65,
    60, 64, 38, 55, 41, 69,
    43, 90, 46, 80, 75, 94,
    45, 90, 52, 79, 47, 93,
    58, 73, 46, 83, 53, 57,
    46, 61, 70, 71, 45, 55,
    46, 61, 39.5, 56, 45, 46,
    48, 58, 47, 43, 47, 33,
    48, 93, 44, 86, 47.5, 102,
    48, 93, 43, 86, 47.5, 102
    ]

# Assemble the A/B-test frame and split its rows around the middle y value.
AB_data = pd.DataFrame(dict(x=x, y=y))
(ABmedian_y, ABmean_y,
 ABsucess, ABslen,
 ABfailed, ABflen) = process(AB_data)

# Scatter of both groups: rows above the cut in the default colour, the rest in red.
fig, ax = plt.subplots(facecolor="gray")
ax.scatter(ABsucess['x'], ABsucess['y'])
ax.scatter(ABfailed['x'], ABfailed['y'], color='r')
ax.set_xlim(22, 100)

In [None]:
# Class-conditional distributions of x for the A/B-test data:
# p(x | sucess) in the default colour and p(x | failed) in red.
ABp = probability(AB_data, ABsucess, ABfailed)

fig, ax = plt.subplots(facecolor="gray")
ax.plot(ABp['x'], ABp['sp'])
ax.plot(ABp['x'], ABp['fp'], color='r')
legend = ax.legend(['above', 'below'])
# Private matplotlib attribute; presumably tweaks spacing inside the legend box.
legend._legend_box.sep = 20
ax.set_title("p(x | sucess) & p(x | failed)")
ax.set_ylabel("probability")
ax.set_xlabel("x")
ax.set_xlim(22, 100)

In [None]:
# Posteriors from Bayes' rule for the A/B-test data: p(sucess | x) in the
# default colour and p(failed | x) in red.
ABbayes = bayes(AB_data, ABsucess, ABfailed, ABp)

fig, ax = plt.subplots(facecolor="gray")
ax.plot(ABbayes['x'], ABbayes['spc'])
ax.plot(ABbayes['x'], ABbayes['fpc'], color='r')
legend = ax.legend(['above', 'below'])
# Private matplotlib attribute; presumably tweaks spacing inside the legend box.
legend._legend_box.sep = 20
ax.set_title("p(sucess | x) & p(filaed | x)")
ax.set_ylabel("probability")
ax.set_xlabel("x")
ax.set_xlim(22, 100)

In [None]:
# A/B-test posteriors as bars: a full-height bar per window with the failed
# posterior drawn over it in red.
fig, ax = plt.subplots(facecolor="gray")
ax.set_title("p(sucess | x) & p(filaed | x)")
ax.bar(ABbayes['x'], 1, width=5)
ax.bar(ABbayes['x'], ABbayes['fpc'], width=5, color='r', edgecolor='w')
ax.set_ylabel("probability")
ax.set_xlabel("x")
legend = ax.legend(['above', 'below'])

In [None]:
# Number of AB_data rows inside each 5-wide window centred on the posterior's
# x values, collected as one record per window.
records = []
for c in ABbayes['x']:
    mask = (AB_data['x'] >= c - 2.5) & (AB_data['x'] < c + 2.5)
    records.append({"x": c, "count": len(AB_data[mask])})

count = pd.DataFrame(records, columns=["x", "count"])
plt.plot(count['x'], count['count'])

In [None]:
# Mean y per x interval for the A/B-test data (5 requested groups).
AB_interval = c_data(AB_data, 'x', 5)

fig, ax = plt.subplots(facecolor="gray")
ax.plot(AB_interval['interval_x'], AB_interval['y_mean'])
ax.set_ylabel("y")
ax.set_xlabel("x")

# t 統計差異

In [None]:
import scipy.stats

def t_value(df, column, label, bound = 38):
    """Welch's two-sample t-test of ``label`` on either side of ``bound``.

    Rows are split into ``df[column] <= bound`` (group a) and
    ``df[column] > bound`` (group b). The statistic is::

        t = (mean_a - mean_b) / sqrt(var_a / na + var_b / nb)

    using sample variances (``ddof=1``). The two-sided p-value comes from a
    Student t distribution with Welch-Satterthwaite degrees of freedom, which
    is the distribution that matches this unpooled standard error.

    Parameters
    ----------
    df : pandas.DataFrame
        Data containing ``column`` and ``label``.
    column : str
        Column compared against ``bound`` to form the two groups.
    label : str
        Column whose means are compared.
    bound : number, default 38
        Split point; rows equal to ``bound`` go to group a.

    Returns
    -------
    tuple
        ``(t, p)``. Both are NaN when either group has fewer than two rows
        (no sample variance) or when both groups are constant with equal
        means. When both groups are constant with different means the result
        is ``(+/-inf, 0.0)``.
    """
    group_a = df.loc[df[column] <= bound, label]
    group_b = df.loc[df[column] > bound, label]
    na, nb = len(group_a), len(group_b)

    nan = float("nan")
    if na < 2 or nb < 2:
        return nan, nan

    diff = group_a.mean() - group_b.mean()
    # Squared standard error contributed by each group.
    qa = group_a.var(ddof=1) / na
    qb = group_b.var(ddof=1) / nb
    se2 = qa + qb

    if se2 == 0:
        # Both groups are constant: the statistic degenerates.
        if diff == 0:
            return nan, nan
        return float(np.sign(diff)) * float("inf"), 0.0

    t = diff / np.sqrt(se2)
    dof = se2 ** 2 / (qa ** 2 / (na - 1) + qb ** 2 / (nb - 1))
    p = scipy.stats.t.sf(abs(t), df=dof) * 2
    return t, p

In [None]:
def binary(data, median):
    """Binarise ``y`` against ``median``.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain ``'x'`` and ``'y'`` columns.
    median : number
        Threshold; ``y`` values strictly above it become 1, all others 0.

    Returns
    -------
    pandas.DataFrame
        Columns ``x`` (copied from ``data``, original dtype kept) and ``y``
        (0/1 integers), with a fresh 0..n-1 index.
    """
    # Vectorised comparison instead of a Python-level row loop.
    # `.to_numpy()` drops the source index so the result is re-indexed from 0.
    return pd.DataFrame({
        "x": data['x'].to_numpy(),
        "y": (data['y'] > median).to_numpy().astype('int64'),
    })
# Binarise each data set against its own middle y value.
AB_binary, e_binary = (
    binary(frame, cutoff)
    for frame, cutoff in ((AB_data, ABmedian_y), (e_data, emedian_y))
)

In [None]:
# Scan candidate split points on x for the A/B-test data and test whether the
# binarised y differs on either side of each one.
bounds = range(35, 60)
t_list, p_list = [], []
for bound in bounds:
    t, p = t_value(AB_binary, 'x', 'y', bound)
    t_list.append(t)
    p_list.append(p)
    # Report significant splits only. np.isfinite rejects +inf, -inf and NaN,
    # i.e. every degenerate split, not just t == +inf.
    if p <= 0.05 and abs(t) > 1 and np.isfinite(t):
        print(f"{bound}:\n","t 分數:%.5f"% t,"\n p value:%.6f"% p)

plt.figure(facecolor='gray')
plt.plot(bounds, p_list, c="r", label="p_value")
plt.plot(bounds, t_list, label="t_score")
plt.legend()
plt.title("significance")
plt.show()

In [None]:
# Scan candidate split points on x for the existing data and test whether the
# binarised y differs on either side of each one.
bounds = range(35, 60)
t_list, p_list = [], []
for bound in bounds:
    t, p = t_value(e_binary, 'x', 'y', bound)
    t_list.append(t)
    p_list.append(p)
    # Report significant splits only. np.isfinite rejects +inf, -inf and NaN,
    # i.e. every degenerate split, not just t == +inf.
    if p <= 0.05 and abs(t) > 1 and np.isfinite(t):
        print(f"{bound}:\n","t 分數:%.5f"% t,"\n p value:%.6f"% p)

plt.figure(facecolor='gray')
plt.plot(bounds, p_list, c="r", label="p_value")
plt.plot(bounds, t_list, label="t_score")
plt.legend()
plt.title("significance")
plt.show()