In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import seaborn

In [None]:
# Load the day-7 dataset into a DataFrame and display it.
data = pd.read_csv(filepath_or_buffer="./data/nyt7.csv")
data

In [None]:
# Distinct values present in the Age column.
data["Age"].unique()

In [None]:
# Number of missing values in each column.
data.isna().sum()

In [None]:
# Print the frame summary (column dtypes, non-null counts, memory usage).
data.info()

In [None]:
# Count, per column, the rows whose Age is exactly 0.
data.loc[data["Age"] == 0].groupby("Age").count()

In [None]:
# Keep only the rows with a positive Age and plot the age distribution.
data_not_null_age = data.loc[data["Age"] > 0]
data_not_null_age["Age"].hist()

In [None]:
# Distinct Signed_In values among the rows with a positive Age.
data_not_null_age["Signed_In"].unique()

In [None]:
def group_by_age(age):
    """Return the age-bracket label for a numeric age.

    Brackets are contiguous half-open intervals, so fractional ages
    (e.g. ``24.5``) are classified instead of falling between two brackets:

    ========  =============
    label     covers
    ========  =============
    "0-18"    age < 18
    "18-24"   18 <= age < 25
    "25-34"   25 <= age < 35
    "35-44"   35 <= age < 45
    "45-54"   45 <= age < 55
    "55-64"   55 <= age < 65
    "65+"     age >= 65
    ========  =============

    For integer ages the result is the same as with the closed integer
    ranges the labels suggest. Note that the "0-18" label actually excludes
    18; the label text is kept unchanged because other cells group on it.

    Args:
        age: A number comparable with ``int``.

    Returns:
        The bracket label as a string, or ``None`` when ``age`` compares
        false against every bound (i.e. NaN).
    """
    # (exclusive upper bound, label), in ascending order of the bound.
    brackets = (
        (18, "0-18"),
        (25, "18-24"),
        (35, "25-34"),
        (45, "35-44"),
        (55, "45-54"),
        (65, "55-64"),
    )
    for upper_bound, label in brackets:
        if age < upper_bound:
            return label
    if age >= 65:
        return "65+"
    # Only reachable for NaN, which fails every comparison above.
    return None


def check_gender(gender):
    """Translate a gender code into a label.

    Returns "Female" when ``gender`` equals 0 and "Male" for every other
    value.
    """
    return "Female" if gender == 0 else "Male"


# Build the grouped frame with ``assign`` so the new column lands on a fresh
# DataFrame. Assigning ``AgeGroup`` through an alias of the filtered slice
# ``data_not_null_age`` would write onto a slice of ``data``
# (SettingWithCopyWarning) and mutate the frame an earlier cell displayed.
group_data = (
    data_not_null_age
    .assign(AgeGroup=data_not_null_age.Age.apply(group_by_age))
    .sort_values(by=['AgeGroup'])
)
group_data

In [None]:
# Bar plot of Clicks per AgeGroup, drawn on the axes created here.
fig = plt.figure()
ax = fig.add_subplot(1, 1, 1)
seaborn.barplot(data=group_data, x="AgeGroup", y="Clicks", ax=ax)

In [None]:
# Bar plot of Clicks per AgeGroup, split by the Gender code via ``hue``.
fig = plt.figure()
ax = fig.add_subplot(1, 1, 1)
seaborn.barplot(data=group_data, x="AgeGroup", y="Clicks", hue="Gender", ax=ax)

In [None]:
# Sum every column per age group, then derive the click-through rate as
# total Clicks divided by total Impressions.
data_ctr = (
    group_data
    .groupby('AgeGroup', as_index=False)
    .sum()
    .assign(CTR=lambda totals: totals["Clicks"] / totals["Impressions"])
)
data_ctr

In [None]:
# Bar plot of the aggregated CTR per AgeGroup.
fig = plt.figure()
ax = fig.add_subplot(1, 1, 1)
seaborn.barplot(data=data_ctr, x="AgeGroup", y="CTR", ax=ax)

In [None]:
# Rows with Age strictly above 18, without the Signed_In column.
lab_data = (
    group_data
    .loc[lambda frame: frame["Age"] > 18]
    .drop(columns="Signed_In")
)
lab_data

In [None]:
# Rebuild ``lab_data`` from ``group_data`` instead of overwriting its columns
# in place. Overwriting ``Gender`` in place makes the cell non-idempotent: on
# a second run the already-converted strings go through ``check_gender``
# again and every row becomes "Male". Deriving from ``group_data`` (whose
# Gender column still holds the numeric codes) gives the same result on
# every run.
lab_data = (
    group_data[group_data.Age > 18]
    .drop(columns="Signed_In")
    .assign(
        Gender=lambda frame: frame.Gender.apply(check_gender),
        # Per-row click-through rate; rows with 0 Impressions yield NaN.
        CTR=lambda frame: frame.Clicks / frame.Impressions,
    )
)
lab_data

In [None]:
# Mean per-row CTR within each age group.
lab_data.groupby('AgeGroup')["CTR"].mean()

In [None]:
# Per-age-group means of the numeric columns.
# ``numeric_only=True`` is required because ``Gender`` holds strings at this
# point; without it pandas >= 2.0 raises TypeError when averaging that column
# (older versions silently dropped it, so the result is unchanged there).
# The chain returns a new frame instead of renaming with ``inplace=True``.
result_data = (
    lab_data
    .groupby("AgeGroup")
    .mean(numeric_only=True)
    .drop(columns="Age")
    .rename(columns={"Impressions": "Impressions mean", "Clicks": "Clicks mean"})
)
result_data

In [None]:
# Bar chart of the mean CTR per age group.
result_data["CTR"].plot.bar()

In [None]:
# 95th percentile of the per-row CTR within each age group.
result_data["CTRQuantile"] = lab_data.groupby("AgeGroup")["CTR"].quantile(0.95)
result_data["CTRQuantile"].plot.bar()

In [None]:
# Largest Impressions value observed in each age group.
result_data["MaxImpressions"] = lab_data.groupby("AgeGroup")["Impressions"].max()
result_data["MaxImpressions"].plot.bar()

In [None]:
# Largest Clicks value observed in each age group.
result_data["MaxClicks"] = lab_data.groupby("AgeGroup")["Clicks"].max()
result_data["MaxClicks"].plot.bar()

In [None]:
# Standard deviation of Impressions within each age group.
# NOTE(review): the column is named "Dispersion" but holds ``std()``, not the
# variance -- confirm which one is intended.
result_data["ImpressionsDispersion"] = lab_data.groupby("AgeGroup")["Impressions"].std()
result_data["ImpressionsDispersion"].plot.bar()

In [None]:
# Standard deviation of Clicks within each age group.
# NOTE(review): the column is named "Dispersion" but holds ``std()``, not the
# variance -- confirm which one is intended.
result_data["ClicksDispersion"] = lab_data.groupby("AgeGroup")["Clicks"].std()
result_data["ClicksDispersion"].plot.bar()

In [None]:
def analyze_day(day):
    """Compute per-age-group click statistics for one day's dataset.

    Reads ``./data/nyt{day}.csv``, keeps signed-in rows with a positive Age
    that is strictly above 18, and aggregates them by age bracket.

    Args:
        day: Value interpolated into the file name ``./data/nyt{day}.csv``.

    Returns:
        A DataFrame indexed by ``age_group`` with the columns
        "Impressions mean", "Clicks mean", "CTR" (mean of the per-row
        Clicks / Impressions), "CTR quantile" (95th percentile),
        "Impressions max", "Clicks max", "Impressions dispersion" and
        "Clicks dispersion" (both computed with ``std()``).

    Raises:
        FileNotFoundError: If the CSV file for ``day`` does not exist.
    """
    data_day = pd.read_csv(f"./data/nyt{day}.csv")

    signed_in_with_age = data_day[(data_day.Age > 0) & (data_day.Signed_In == 1)]

    # NOTE(review): ``Age > 18`` drops 18-year-olds even though the "18-24"
    # bracket includes them -- confirm whether ``>= 18`` was intended.
    #
    # ``assign`` builds the derived columns on a new frame; assigning them
    # directly onto the filtered slice triggers SettingWithCopyWarning.
    lab_data_day = (
        signed_in_with_age[signed_in_with_age.Age > 18]
        .drop(columns="Signed_In")
        .assign(
            age_group=lambda frame: frame.Age.apply(group_by_age),
            Gender=lambda frame: frame.Gender.apply(check_gender),
            # Per-row click-through rate; rows with 0 Impressions yield NaN.
            CTR=lambda frame: frame.Clicks / frame.Impressions,
        )
    )

    by_age_group = lab_data_day.groupby("age_group")

    # ``numeric_only=True`` skips the string-valued Gender column, which
    # pandas >= 2.0 otherwise refuses to average (TypeError).
    result_data_day = (
        by_age_group
        .mean(numeric_only=True)
        .drop(columns="Age")
        .rename(columns={"Impressions": "Impressions mean", "Clicks": "Clicks mean"})
    )
    result_data_day["CTR quantile"] = by_age_group.CTR.quantile(q=0.95)
    result_data_day["Impressions max"] = by_age_group.Impressions.max()
    result_data_day["Clicks max"] = by_age_group.Clicks.max()
    result_data_day["Impressions dispersion"] = by_age_group.Impressions.std()
    result_data_day["Clicks dispersion"] = by_age_group.Clicks.std()

    return result_data_day

In [None]:
# Per-age-group statistics for the day-2 file.
analyze_day(day=2)

In [None]:
# Per-age-group statistics for the day-3 file.
analyze_day(day=3)

In [None]:
# Per-age-group statistics for the day-4 file.
analyze_day(day=4)

In [None]:
# Overlay the mean CTR per age group from ``result_data`` with the same
# metric computed for the day-2 file, on shared axes.
ax = result_data["CTR"].plot()
analyze_day(day=2)["CTR"].plot(ax=ax)