In [1]:
import pandas as pd
import numpy as np
import  os

In [2]:
os.chdir("taitra_all")

<h2 style="text-align:center"> Box plot interval </h2>

In [14]:
def box_xinterval(file_name, column):
    """
    Compute the five box-plot boundaries of one column of a CSV file.

    The quartiles are picked by position from the sorted, non-missing values
    (elements ``n//4``, ``n//2`` and ``3*n//4`` of ``n`` values); no
    interpolation is performed. The two outer boundaries are the fences
    ``Q1 - 1.5*IQR`` and ``Q3 + 1.5*IQR``, not the observed minimum/maximum.

    Parameters
    ----------
    file_name : str
        Path of the CSV file, passed to ``pandas.read_csv``.
    column : str
        Name of the column to summarise.

    Returns
    -------
    list
        ``[lower_fence, Q1, Q2, Q3, upper_fence]``.

    Raises
    ------
    ValueError
        If the column contains no non-missing values.
    """
    df = pd.read_csv(file_name)
    # sort_values() places NaN at the end, so counting them would shift every
    # quartile position upwards (and could make Q3 itself NaN). Drop them first.
    values = df[column].dropna().sort_values()
    count = len(values)
    if count == 0:
        raise ValueError(f"column {column!r} of {file_name!r} has no non-missing values")

    quartile1 = values.iloc[count // 4]
    quartile2 = values.iloc[count // 2]
    quartile3 = values.iloc[3 * count // 4]

    iqr = quartile3 - quartile1
    lower_fence = quartile1 - 1.5 * iqr
    upper_fence = quartile3 + 1.5 * iqr
    return [lower_fence, quartile1, quartile2, quartile3, upper_fence]


# Box-plot boundaries for every feature column, computed in the listed order
# and unpacked into one variable per feature.
(
    f1_box,
    contrast_box,
    wcag_box,
    colorfulness_box,
    textlen_box,
    likes_box,
    watch_percent_box,
) = [
    box_xinterval("表格資料_edit_thumbnails.csv", feature_column)
    for feature_column in (
        "f1-score",
        "contrast",
        "wcag_contrast",
        "colorfulness",
        "thumbnail_text_len",
        "likes",
        "watch_percent",
    )
]

In [15]:
# Show the box-plot boundaries computed for "thumbnail_text_len":
# [lower fence, Q1, Q2, Q3, upper fence].
textlen_box

[-135.0, 96, 195, 250, 481.0]

In [16]:
def interval_data(df, column, start, end, step):
    """
    Bin the rows of ``df`` on ``column`` with fixed-width bins and summarise
    CTR, views, watch time and impressions inside every bin.

    Bins are right-closed: the bin labelled ``u`` holds the rows with
    ``u - step < df[column] <= u``. The labels (upper edges) are
    ``np.arange(start + step, end + step, step)``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``column`` plus the columns "CTR", "views",
        "watchtimes(hr)" and "impressions".
    column : str
        Column whose values define the bins.
    start, end, step : number
        Lower limit, upper limit and width of the bins.

    Returns
    -------
    pandas.DataFrame
        One row per bin: the upper edge, then the mean and the sample
        standard deviation (``ddof=1``) of each metric. Empty bins yield NaN.
        The same table is written to ``interval/interval_{column}.csv``.
    """
    # Upper edge of every bin; computed once and reused for labels and masks.
    upper_edges = np.arange(start + step, end + step, step)

    # (output prefix, source column) for each summarised metric.
    metrics = (
        ("ctr", "CTR"),
        ("views", "views"),
        ("watchtimes", "watchtimes(hr)"),
        ("impressions", "impressions"),
    )
    means = {prefix: [] for prefix, _ in metrics}
    stds = {prefix: [] for prefix, _ in metrics}

    for upper in upper_edges:
        # Select the bin's rows once instead of re-filtering for every metric.
        in_bin = df[(df[column] <= upper) & (df[column] > upper - step)]
        for prefix, source in metrics:
            means[prefix].append(np.mean(in_bin[source]))
            stds[prefix].append(np.std(in_bin[source], ddof=1))

    # Column names (including the singular "view_mean") are kept exactly as
    # before so existing consumers of the CSV are unaffected.
    data = pd.DataFrame({
        f"interval_{column}": list(upper_edges),
        "ctr_mean": means["ctr"],
        "view_mean": means["views"],
        "watchtimes_mean": means["watchtimes"],
        "impressions_mean": means["impressions"],
        "ctr_std": stds["ctr"],
        "views_std": stds["views"],
        "watchtimes_std": stds["watchtimes"],
        "impressions_std": stds["impressions"],
    })

    # to_csv does not create missing directories; make sure the target exists.
    os.makedirs("interval", exist_ok=True)
    data.to_csv(f"interval/interval_{column}.csv", index=False)
    return data

def box_data(df, column, box_interval):
    """
    Bin the rows of ``df`` on ``column`` using consecutive boundaries and
    summarise CTR, views, watch time and impressions inside every bin.

    Each pair of neighbouring boundaries forms a left-closed bin
    ``lower <= df[column] < upper``; the bin is labelled by its upper
    boundary. Rows below the first boundary, or at/above the last one,
    fall into no bin.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``column`` plus the columns "CTR", "views",
        "watchtimes(hr)" and "impressions".
    column : str
        Column whose values define the bins.
    box_interval : sequence of numbers
        Bin boundaries in ascending order, e.g. the list returned by
        ``box_xinterval``.

    Returns
    -------
    pandas.DataFrame
        One row per bin: the upper boundary, then the mean and the sample
        standard deviation (``ddof=1``) of each metric. Empty bins yield NaN.
        The same table is written to ``interval/boxinterval_{column}.csv``.
    """
    # (output prefix, source column) for each summarised metric.
    metrics = (
        ("ctr", "CTR"),
        ("views", "views"),
        ("watchtimes", "watchtimes(hr)"),
        ("impressions", "impressions"),
    )
    means = {prefix: [] for prefix, _ in metrics}
    stds = {prefix: [] for prefix, _ in metrics}

    # Walk over neighbouring (lower, upper) boundary pairs.
    for lower, upper in zip(box_interval[:-1], box_interval[1:]):
        # Select the bin's rows once instead of re-filtering for every metric.
        in_bin = df[(df[column] >= lower) & (df[column] < upper)]
        for prefix, source in metrics:
            means[prefix].append(np.mean(in_bin[source]))
            stds[prefix].append(np.std(in_bin[source], ddof=1))

    # Column names (including the singular "view_mean") are kept exactly as
    # before so existing consumers of the CSV are unaffected.
    data = pd.DataFrame({
        f"interval_{column}": box_interval[1:],
        "ctr_mean": means["ctr"],
        "view_mean": means["views"],
        "watchtimes_mean": means["watchtimes"],
        "impressions_mean": means["impressions"],
        "ctr_std": stds["ctr"],
        "views_std": stds["views"],
        "watchtimes_std": stds["watchtimes"],
        "impressions_std": stds["impressions"],
    })

    # to_csv does not create missing directories; make sure the target exists.
    os.makedirs("interval", exist_ok=True)
    data.to_csv(f"interval/boxinterval_{column}.csv", index=False)
    return data

df = pd.read_csv("表格資料_edit_thumbnails.csv")

# Per-bin summary tables, one per feature, built in the listed order from the
# feature's column name and its previously computed box-plot boundaries.
(
    wcag,
    contrast,
    f1_score,
    text_len,
    colorfulness,
    likes,
    watch_percent,
) = [
    box_data(df, feature_column, feature_bounds)
    for feature_column, feature_bounds in (
        ('wcag_contrast', wcag_box),
        ('contrast', contrast_box),
        ("f1-score", f1_box),
        ("thumbnail_text_len", textlen_box),
        ("colorfulness", colorfulness_box),
        ("likes", likes_box),
        ("watch_percent", watch_percent_box),
    )
]

In [19]:
# Show the per-bin summary table built for "thumbnail_text_len".
text_len

Unnamed: 0,interval_thumbnail_text_len,ctr_mean,view_mean,watchtimes_mean,impressions_mean,ctr_std,views_std,watchtimes_std,impressions_std
0,96.0,3.170588,160.788235,1.08606,416.470588,5.285919,224.543828,4.234635,570.442906
1,195.0,2.068736,217.057471,0.772218,570.425287,2.115178,240.273166,0.739657,1233.111659
2,250.0,2.069157,229.228916,0.755558,533.722892,2.006111,249.196579,0.777705,1064.645164
3,481.0,1.822073,359.731707,1.226591,593.573171,1.901596,644.202887,1.789575,1086.655749
