In [None]:
import pandas as pd
import numpy as np
from ast import literal_eval
from tqdm import tqdm
import matplotlib.pyplot as plt
plt.style.use("seaborn-v0_8")

In [10]:
users = pd.read_csv("/Users/samsanovicekaterina/dl.25/data/users.tsv", sep="\t")
history = pd.read_csv("/Users/samsanovicekaterina/dl.25/data/history.tsv", sep="\t")
validate = pd.read_csv("/Users/samsanovicekaterina/dl.25/data/validate_new.tsv", sep="\t")
answers = pd.read_csv("/Users/samsanovicekaterina/dl.25/data/validate_answers.tsv", sep="\t")


In [11]:
users['age_group'] = users['age'].replace(0, -1)
users['sex'] = users['sex'].astype('category')
users['city_id'] = users['city_id'].astype('category')


In [12]:
users

Unnamed: 0,user_id,sex,age,city_id,age_group
0,0,2,19,0,19
1,1,1,0,1,-1
2,2,2,24,2,24
3,3,1,20,3,20
4,4,2,29,4,29
...,...,...,...,...,...
27764,27764,1,38,295,38
27765,27765,2,30,79,30
27766,27766,2,21,1953,21
27767,27767,2,17,0,17


In [13]:
user_agg = history.groupby("user_id").agg(
    total_impressions=("hour", "count"),
    mean_cpm=("cpm", "mean"),          
    median_cpm=("cpm", "median"),       
    std_cpm=("cpm", "std"),         
    cpm_25percentile=("cpm", lambda x: x.quantile(0.25)),  
    cpm_75percentile=("cpm", lambda x: x.quantile(0.75)), 
    
    unique_publishers=("publisher", "nunique"),
    most_common_publisher=("publisher", lambda x: x.mode()[0] if len(x.mode()) > 0 else -1),
    
    mean_hour_of_day=("hour", lambda x: (x % 24).mean()),
    std_hour_of_day=("hour", lambda x: (x % 24).std()),
    morning_impressions=("hour", lambda x: ((x % 24) < 12).sum()),                    
    afternoon_impressions=("hour", lambda x: (((x % 24) >= 12) & ((x % 24) < 18)).sum()),  
    evening_impressions=("hour", lambda x: (((x % 24) >= 18) & ((x % 24) < 23)).sum()),  
    night_impressions=("hour", lambda x: ((x % 24) >= 23).sum())                   
).reset_index()

user_agg["user_id"] = user_agg["user_id"].astype(int)
user_agg["cpm_iqr"] = user_agg["cpm_75percentile"] - user_agg["cpm_25percentile"]

user_agg["morning_ratio"] = user_agg["morning_impressions"] / user_agg["total_impressions"]
user_agg["afternoon_ratio"] = user_agg["afternoon_impressions"] / user_agg["total_impressions"]
user_agg["evening_ratio"] = user_agg["evening_impressions"] / user_agg["total_impressions"]
user_agg["night_ratio"] = user_agg["night_impressions"] / user_agg["total_impressions"]
user_agg = user_agg.fillna(0)

In [14]:
user_agg = user_agg.drop(['cpm_25percentile','cpm_75percentile'],axis =1 )

In [15]:
user_agg = user_agg.drop(['morning_impressions','afternoon_impressions','evening_impressions'],axis =1 )

In [16]:
user_agg.head()

Unnamed: 0,user_id,total_impressions,mean_cpm,median_cpm,std_cpm,unique_publishers,most_common_publisher,mean_hour_of_day,std_hour_of_day,night_impressions,cpm_iqr,morning_ratio,afternoon_ratio,evening_ratio,night_ratio
0,0,2,2053.83,2053.83,1765.193084,1,1,13.5,4.949747,0,1248.18,0.5,0.5,0.0,0.0
1,1,82,124.068049,90.0,133.140881,1,1,13.670732,4.033976,1,71.4525,0.402439,0.402439,0.182927,0.012195
2,3,8,370.88,288.13,171.699728,2,1,16.875,5.43632,1,175.075,0.125,0.375,0.375,0.125
3,4,132,44.627955,30.16,33.331742,2,2,12.69697,5.278415,0,12.5,0.477273,0.30303,0.219697,0.0
4,5,9,91.055556,42.5,116.958979,1,7,13.111111,2.027588,0,70.0,0.111111,0.777778,0.111111,0.0


In [17]:
all_users = users[["user_id"]].copy()
all_users["user_id"] = all_users["user_id"].astype(int)

user_agg = all_users.merge(user_agg, on="user_id", how="left")
user_agg.fillna(0, inplace=True)
user_agg # для пользователей, у которых нет истории зануляем признаки

Unnamed: 0,user_id,total_impressions,mean_cpm,median_cpm,std_cpm,unique_publishers,most_common_publisher,mean_hour_of_day,std_hour_of_day,night_impressions,cpm_iqr,morning_ratio,afternoon_ratio,evening_ratio,night_ratio
0,0,2.0,2053.830000,2053.830,1765.193084,1.0,1.0,13.500000,4.949747,0.0,1248.1800,0.500000,0.500000,0.000000,0.000000
1,1,82.0,124.068049,90.000,133.140881,1.0,1.0,13.670732,4.033976,1.0,71.4525,0.402439,0.402439,0.182927,0.012195
2,2,0.0,0.000000,0.000,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.0000,0.000000,0.000000,0.000000,0.000000
3,3,8.0,370.880000,288.130,171.699728,2.0,1.0,16.875000,5.436320,1.0,175.0750,0.125000,0.375000,0.375000,0.125000
4,4,132.0,44.627955,30.160,33.331742,2.0,2.0,12.696970,5.278415,0.0,12.5000,0.477273,0.303030,0.219697,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27764,27764,43.0,210.409767,170.000,165.506452,1.0,1.0,18.581395,6.740894,16.0,112.8300,0.209302,0.023256,0.395349,0.372093
27765,27765,79.0,146.901013,110.640,126.942353,2.0,2.0,9.898734,5.592107,1.0,110.5200,0.620253,0.215190,0.151899,0.012658
27766,27766,54.0,159.045741,123.525,122.679378,1.0,1.0,13.481481,6.447877,1.0,91.3750,0.388889,0.222222,0.370370,0.018519
27767,27767,1.0,143.880000,143.880,0.000000,1.0,1.0,11.000000,0.000000,0.0,0.0000,1.000000,0.000000,0.000000,0.000000


In [18]:
history_with_city = history.merge(users[['user_id', 'city_id', 'sex', 'age']], on='user_id', how='left')

publisher_agg = history_with_city.groupby("publisher").agg(
    total_impressions_publisher = ("user_id", "count"),
    unique_users=("user_id", "nunique"),

    mean_cpm_p=("cpm", "mean"),
    median_cpm_p=("cpm", "median"),
    std_cpm_p=("cpm", "std"),
    unique_cities=("city_id", "nunique"),  
    
    avg_user_age=("age", "mean"),
    male_ratio=("sex", lambda x: (x == 1).sum() / len(x) if len(x) > 0 else 0),

    mean_hour_of_day=("hour", lambda x: (x % 24).mean()),
    peak_hour=("hour", lambda x: (x % 24).mode()[0] if len((x % 24).mode()) > 0 else -1),
    
).reset_index()

publisher_agg["publisher"] = publisher_agg["publisher"].astype(int)
publisher_agg["impressions_per_user"] = (
    publisher_agg["total_impressions_publisher"] / publisher_agg["unique_users"]
)

publisher_agg["cpm_cv_p"] = (
    publisher_agg["std_cpm_p"] / (publisher_agg["mean_cpm_p"] + 1)
)

publisher_agg["geo_diversity"] = (
    publisher_agg["unique_cities"] / publisher_agg["unique_users"]
)


publisher_agg["is_premium"] = (
    (publisher_agg["mean_cpm_p"] > publisher_agg["mean_cpm_p"].quantile(0.75)).astype(int)
)
publisher_agg["is_high_volume"] = (
    (publisher_agg["total_impressions_publisher"] > publisher_agg["total_impressions_publisher"].quantile(0.75)).astype(int)
)

publisher_agg = publisher_agg.fillna(0)
publisher_agg

Unnamed: 0,publisher,total_impressions_publisher,unique_users,mean_cpm_p,median_cpm_p,std_cpm_p,unique_cities,avg_user_age,male_ratio,mean_hour_of_day,peak_hour,impressions_per_user,cpm_cv_p,geo_diversity,is_premium,is_high_volume
0,1,692535,15426,177.567497,105.02,374.439966,1813,28.496278,0.526809,13.307508,20,44.89401,2.09691,0.117529,0,1
1,2,273037,5939,203.134623,126.52,406.371467,779,28.191465,0.554119,13.424931,21,45.973565,1.990703,0.131167,1,1
2,3,72124,3011,195.899285,100.15,1032.19973,709,31.087502,0.559287,13.363651,21,23.953504,5.242273,0.23547,1,1
3,4,2286,739,92.164563,63.125,91.68717,196,30.665354,0.55818,12.011374,15,3.093369,0.984142,0.265223,0,0
4,5,7263,580,183.900573,108.92,364.131543,182,27.006747,0.477626,12.929781,9,12.522414,1.969337,0.313793,0,0
5,6,5935,749,87.411821,45.0,103.721364,233,26.049368,0.489806,13.1754,19,7.923899,1.173162,0.311081,0,0
6,7,66134,5590,244.468823,130.0,509.083841,980,32.330344,0.456452,13.852527,13,11.830769,2.073925,0.175313,1,1
7,8,3625,621,115.163796,82.45,99.601058,252,32.352276,0.647448,12.639172,7,5.837359,0.857419,0.405797,0,0
8,9,10165,2298,98.388453,66.93,95.318107,589,34.6909,0.731038,12.406788,8,4.423412,0.959046,0.25631,0,1
9,10,4232,1317,108.06229,60.0,132.91036,399,36.258743,0.63138,12.433365,8,3.213364,1.218665,0.302961,0,0


In [19]:
validate['duration'] = validate['hour_end'] - validate['hour_start']
validate['hour_start_of_day'] = validate['hour_start'] % 24
validate['hour_end_of_day'] = validate['hour_end'] % 24

In [20]:
validate['publishers'] = validate['publishers'].apply(lambda x: [int(p) for p in str(x).split(',')])
validate['user_ids'] = validate['user_ids'].apply(lambda x: [int(p) for p in str(x).split(',')])

In [21]:
validate

Unnamed: 0,cpm,hour_start,hour_end,publishers,audience_size,user_ids,cluster_0,cluster_1,cluster_2,duration,hour_start_of_day,hour_end_of_day
0,220.0,1058,1153,"[7, 17]",1906,"[12, 44, 46, 50, 58, 71, 93, 122, 134, 143, 17...",0.680483,0.101784,0.158447,95,2,1
1,312.0,1295,1301,"[3, 18]",1380,"[29, 81, 98, 102, 165, 167, 195, 205, 218, 231...",0.436232,0.435507,0.126087,6,23,5
2,70.0,1229,1249,"[1, 2, 3, 9, 15, 21]",888,"[12, 23, 25, 29, 45, 85, 92, 124, 156, 190, 27...",0.668919,0.106982,0.146396,20,5,1
3,240.0,1295,1377,"[1, 14]",440,"[44, 122, 187, 209, 242, 255, 312, 345, 382, 4...",0.704545,0.095455,0.143182,82,23,9
4,262.0,752,990,"[1, 3, 7, 8]",1476,"[15, 24, 30, 43, 50, 53, 96, 105, 159, 168, 18...",0.625339,0.079268,0.197154,238,8,6
...,...,...,...,...,...,...,...,...,...,...,...,...
1003,127.0,1095,1154,"[2, 7, 12, 15, 16]",368,"[38, 50, 143, 237, 456, 649, 653, 698, 881, 89...",0.652174,0.089674,0.163043,59,15,2
1004,90.0,1052,1056,"[2, 21]",484,"[1, 4, 61, 212, 230, 348, 356, 372, 429, 431, ...",0.663223,0.113636,0.126033,4,20,0
1005,122.0,1058,1063,"[2, 5]",704,"[1, 23, 29, 104, 136, 176, 201, 206, 250, 269,...",0.518466,0.369318,0.110795,5,2,7
1006,138.0,932,1169,"[1, 2, 7]",1210,"[67, 73, 80, 146, 182, 202, 212, 254, 307, 377...",0.661157,0.081818,0.171074,237,20,17


теперь из добавленного мы видим, когда в каком часу кампания началась/закончилась, а также кампании и user из них теперь кортежи, а не строки.

In [22]:
validate_enriched = validate.copy()
validate_enriched = validate.drop(['duration'], axis=1)
validate_enriched['campaign_duration'] = validate['hour_end'] - validate['hour_start']

validate_enriched['num_publishers'] = validate['publishers'].apply(
    lambda x: len(str(x).split(','))
)

In [23]:
validate_enriched

Unnamed: 0,cpm,hour_start,hour_end,publishers,audience_size,user_ids,cluster_0,cluster_1,cluster_2,hour_start_of_day,hour_end_of_day,campaign_duration,num_publishers
0,220.0,1058,1153,"[7, 17]",1906,"[12, 44, 46, 50, 58, 71, 93, 122, 134, 143, 17...",0.680483,0.101784,0.158447,2,1,95,2
1,312.0,1295,1301,"[3, 18]",1380,"[29, 81, 98, 102, 165, 167, 195, 205, 218, 231...",0.436232,0.435507,0.126087,23,5,6,2
2,70.0,1229,1249,"[1, 2, 3, 9, 15, 21]",888,"[12, 23, 25, 29, 45, 85, 92, 124, 156, 190, 27...",0.668919,0.106982,0.146396,5,1,20,6
3,240.0,1295,1377,"[1, 14]",440,"[44, 122, 187, 209, 242, 255, 312, 345, 382, 4...",0.704545,0.095455,0.143182,23,9,82,2
4,262.0,752,990,"[1, 3, 7, 8]",1476,"[15, 24, 30, 43, 50, 53, 96, 105, 159, 168, 18...",0.625339,0.079268,0.197154,8,6,238,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1003,127.0,1095,1154,"[2, 7, 12, 15, 16]",368,"[38, 50, 143, 237, 456, 649, 653, 698, 881, 89...",0.652174,0.089674,0.163043,15,2,59,5
1004,90.0,1052,1056,"[2, 21]",484,"[1, 4, 61, 212, 230, 348, 356, 372, 429, 431, ...",0.663223,0.113636,0.126033,20,0,4,2
1005,122.0,1058,1063,"[2, 5]",704,"[1, 23, 29, 104, 136, 176, 201, 206, 250, 269,...",0.518466,0.369318,0.110795,2,7,5,2
1006,138.0,932,1169,"[1, 2, 7]",1210,"[67, 73, 80, 146, 182, 202, 212, 254, 307, 377...",0.661157,0.081818,0.171074,20,17,237,3


In [24]:
def calculate_time_coverage(start_hour, end_hour):
    start_h = start_hour % 24
    end_h = end_hour % 24
    duration = end_hour - start_hour
    
    if duration <= 24:
        hours = [(start_hour + i) % 24 for i in range(duration + 1)]
    else:
        full_days = duration // 24
        remaining_hours = duration % 24
        
        morning_hours = 6 * full_days 
        afternoon_hours = 6 * full_days 
        evening_hours = 5 * full_days 
        night_hours = 7 * full_days 
    
        hours = [(start_hour + i) % 24 for i in range(remaining_hours + 1)]
        
        morning_hours += sum(1 for h in hours if 6 <= h < 12)
        afternoon_hours += sum(1 for h in hours if 12 <= h < 18)
        evening_hours += sum(1 for h in hours if 18 <= h < 23)
        night_hours += sum(1 for h in hours if h >= 23 or h < 6)
        
        total = duration + 1
        return {
            'morning': morning_hours / total,
            'afternoon': afternoon_hours / total,
            'evening': evening_hours / total,
            'night': night_hours / total
        }
    
    morning = sum(1 for h in hours if 6 <= h < 12)
    afternoon = sum(1 for h in hours if 12 <= h < 18)
    evening = sum(1 for h in hours if 18 <= h < 23)
    night = sum(1 for h in hours if h >= 23 or h < 6)
    
    total = len(hours)
    return {
        'morning': morning / total,
        'afternoon': afternoon / total,
        'evening': evening / total,
        'night': night / total
    }

time_coverage = validate.apply(
    lambda row: calculate_time_coverage(row['hour_start'], row['hour_end']), 
    axis=1
)

validate_enriched['coverage_morning'] = time_coverage.apply(lambda x: x['morning'])
validate_enriched['coverage_afternoon'] = time_coverage.apply(lambda x: x['afternoon'])
validate_enriched['coverage_evening'] = time_coverage.apply(lambda x: x['evening'])
validate_enriched['coverage_night'] = time_coverage.apply(lambda x: x['night'])
validate_enriched

Unnamed: 0,cpm,hour_start,hour_end,publishers,audience_size,user_ids,cluster_0,cluster_1,cluster_2,hour_start_of_day,hour_end_of_day,campaign_duration,num_publishers,coverage_morning,coverage_afternoon,coverage_evening,coverage_night
0,220.0,1058,1153,"[7, 17]",1906,"[12, 44, 46, 50, 58, 71, 93, 122, 134, 143, 17...",0.680483,0.101784,0.158447,2,1,95,2,0.250000,0.250000,0.208333,0.291667
1,312.0,1295,1301,"[3, 18]",1380,"[29, 81, 98, 102, 165, 167, 195, 205, 218, 231...",0.436232,0.435507,0.126087,23,5,6,2,0.000000,0.000000,0.000000,1.000000
2,70.0,1229,1249,"[1, 2, 3, 9, 15, 21]",888,"[12, 23, 25, 29, 45, 85, 92, 124, 156, 190, 27...",0.668919,0.106982,0.146396,5,1,20,6,0.285714,0.285714,0.238095,0.190476
3,240.0,1295,1377,"[1, 14]",440,"[44, 122, 187, 209, 242, 255, 312, 345, 382, 4...",0.704545,0.095455,0.143182,23,9,82,2,0.265060,0.216867,0.180723,0.337349
4,262.0,752,990,"[1, 3, 7, 8]",1476,"[15, 24, 30, 43, 50, 53, 96, 105, 159, 168, 18...",0.625339,0.079268,0.197154,8,6,238,4,0.246862,0.251046,0.209205,0.292887
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1003,127.0,1095,1154,"[2, 7, 12, 15, 16]",368,"[38, 50, 143, 237, 456, 649, 653, 698, 881, 89...",0.652174,0.089674,0.163043,15,2,59,5,0.200000,0.250000,0.250000,0.300000
1004,90.0,1052,1056,"[2, 21]",484,"[1, 4, 61, 212, 230, 348, 356, 372, 429, 431, ...",0.663223,0.113636,0.126033,20,0,4,2,0.000000,0.000000,0.600000,0.400000
1005,122.0,1058,1063,"[2, 5]",704,"[1, 23, 29, 104, 136, 176, 201, 206, 250, 269,...",0.518466,0.369318,0.110795,2,7,5,2,0.333333,0.000000,0.000000,0.666667
1006,138.0,932,1169,"[1, 2, 7]",1210,"[67, 73, 80, 146, 182, 202, 212, 254, 307, 377...",0.661157,0.081818,0.171074,20,17,237,3,0.252101,0.252101,0.201681,0.294118


In [25]:
def agg_aud_features(user_list, user_agg):
    aud = user_agg[user_agg["user_id"].isin(user_list)]
    
    if len(aud) == 0:
        return pd.Series({
            "aud_size": 0,
            "aud_mean_imp": 0.0,
            "aud_median_imp": 0.0,
            "aud_std_imp": 0.0,
            "aud_mean_cpm": 0.0,
            "aud_median_cpm": 0.0,
            "aud_std_cpm": 0.0,
            "aud_mean_pub_diversity": 0.0,
            "aud_mean_hour_of_day": 12.0,
            "aud_morning_ratio": 0.0,
            "aud_afternoon_ratio": 0.0,
            "aud_evening_ratio": 0.0,
            "aud_night_ratio": 0.0,
            "aud_coverage": 0.0  
        })
    
    return pd.Series({
        "aud_size": len(aud),
    
        "aud_mean_imp": aud["total_impressions"].mean(),
        "aud_median_imp": aud["total_impressions"].median(),
        "aud_std_imp": aud["total_impressions"].std(),
        
        
        "aud_mean_cpm": aud["mean_cpm"].mean(),
        "aud_median_cpm": aud["median_cpm"].mean(),
        "aud_std_cpm": aud["std_cpm"].mean(),
        
        
        "aud_mean_pub_diversity": aud["unique_publishers"].mean(),
        
        "aud_mean_hour_of_day": aud["mean_hour_of_day"].mean(),
        "aud_morning_ratio": aud["morning_ratio"].mean(),
        "aud_afternoon_ratio": aud["afternoon_ratio"].mean(),
        "aud_evening_ratio": aud["evening_ratio"].mean(),
        "aud_night_ratio": aud["night_ratio"].mean(),
    
        "aud_coverage": len(aud) / len(user_list) if len(user_list) > 0 else 0.0
    })


aud_features = validate['user_ids'].apply(lambda x: agg_aud_features(x, user_agg))
validate_enriched = pd.concat([validate_enriched, aud_features], axis=1)
validate_enriched

Unnamed: 0,cpm,hour_start,hour_end,publishers,audience_size,user_ids,cluster_0,cluster_1,cluster_2,hour_start_of_day,...,aud_mean_cpm,aud_median_cpm,aud_std_cpm,aud_mean_pub_diversity,aud_mean_hour_of_day,aud_morning_ratio,aud_afternoon_ratio,aud_evening_ratio,aud_night_ratio,aud_coverage
0,220.0,1058,1153,"[7, 17]",1906,"[12, 44, 46, 50, 58, 71, 93, 122, 134, 143, 17...",0.680483,0.101784,0.158447,2,...,277.192969,223.064971,203.729050,1.386674,12.481497,0.372664,0.281239,0.250366,0.036445,1.0
1,312.0,1295,1301,"[3, 18]",1380,"[29, 81, 98, 102, 165, 167, 195, 205, 218, 231...",0.436232,0.435507,0.126087,23,...,179.039494,129.313120,175.241423,1.749275,13.239540,0.391982,0.296798,0.269715,0.039331,1.0
2,70.0,1229,1249,"[1, 2, 3, 9, 15, 21]",888,"[12, 23, 25, 29, 45, 85, 92, 124, 156, 190, 27...",0.668919,0.106982,0.146396,5,...,281.236039,211.638046,236.403887,1.365991,12.312116,0.361083,0.270984,0.253072,0.037159,1.0
3,240.0,1295,1377,"[1, 14]",440,"[44, 122, 187, 209, 242, 255, 312, 345, 382, 4...",0.704545,0.095455,0.143182,23,...,253.803214,204.578409,185.725720,1.384091,12.454851,0.374482,0.292482,0.242742,0.033476,1.0
4,262.0,752,990,"[1, 3, 7, 8]",1476,"[15, 24, 30, 43, 50, 53, 96, 105, 159, 168, 18...",0.625339,0.079268,0.197154,8,...,327.261172,271.450901,222.585375,1.379404,12.360859,0.335105,0.256184,0.275214,0.035259,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1003,127.0,1095,1154,"[2, 7, 12, 15, 16]",368,"[38, 50, 143, 237, 456, 649, 653, 698, 881, 89...",0.652174,0.089674,0.163043,15,...,251.821533,211.496372,176.278544,1.336957,12.330189,0.332056,0.274219,0.267045,0.031571,1.0
1004,90.0,1052,1056,"[2, 21]",484,"[1, 4, 61, 212, 230, 348, 356, 372, 429, 431, ...",0.663223,0.113636,0.126033,20,...,231.280577,180.850279,176.744981,1.301653,12.201580,0.340010,0.273330,0.251126,0.038426,1.0
1005,122.0,1058,1063,"[2, 5]",704,"[1, 23, 29, 104, 136, 176, 201, 206, 250, 269,...",0.518466,0.369318,0.110795,2,...,178.998032,130.320305,177.988827,1.677557,13.393658,0.378731,0.306166,0.274423,0.039259,1.0
1006,138.0,932,1169,"[1, 2, 7]",1210,"[67, 73, 80, 146, 182, 202, 212, 254, 307, 377...",0.661157,0.081818,0.171074,20,...,250.687855,204.051174,178.749652,1.364463,12.328043,0.337734,0.271973,0.267410,0.036932,1.0


In [26]:
validate_enriched = validate_enriched.drop(['aud_coverage', 'aud_size'], axis=1)

In [27]:
def agg_publisher_features(pub_list, publisher_agg):
    pubs = publisher_agg[publisher_agg["publisher"].isin(pub_list)]

    if len(pubs) == 0:
        return pd.Series({
            "pub_mean_impressions": 0.0,
            "pub_mean_unique_users": 0.0,
            "pub_mean_cpm": 0.0,
            "pub_median_cpm": 0.0,
            "pub_std_cpm": 0.0,
            "pub_mean_unique_cities": 0.0,
            "pub_mean_age": 0.0,
            "pub_mean_male_ratio": 0.0,
            "pub_mean_hour_of_day": 12.0,
            "pub_mean_imp_per_user": 0.0,
            "pub_premium_ratio": 0.0,
            "pub_high_volume_ratio": 0.0
        })
    
    return pd.Series({
        
        "pub_mean_impressions": pubs["total_impressions_publisher"].mean(),
        "pub_mean_unique_users": pubs["unique_users"].mean(),
        
        
        "pub_mean_cpm": pubs["mean_cpm_p"].mean(),
        "pub_median_cpm": pubs["median_cpm_p"].mean(),
        "pub_std_cpm": pubs["std_cpm_p"].mean(),
        
    
        "pub_mean_unique_cities": pubs["unique_cities"].mean(),    
        "pub_mean_age": pubs["avg_user_age"].mean(),
        "pub_mean_male_ratio": pubs["male_ratio"].mean(),
        
        
        "pub_mean_hour_of_day": pubs["mean_hour_of_day"].mean(),
        "pub_mean_imp_per_user": pubs["impressions_per_user"].mean(),
        
        "pub_premium_ratio": pubs["is_premium"].mean(),
        "pub_high_volume_ratio": pubs["is_high_volume"].mean()
    })


pub_features = validate['publishers'].apply(lambda x: agg_publisher_features(x, publisher_agg))
validate_enriched = pd.concat([validate_enriched, pub_features], axis=1)
validate_enriched

Unnamed: 0,cpm,hour_start,hour_end,publishers,audience_size,user_ids,cluster_0,cluster_1,cluster_2,hour_start_of_day,...,pub_mean_cpm,pub_median_cpm,pub_std_cpm,pub_mean_unique_cities,pub_mean_age,pub_mean_male_ratio,pub_mean_hour_of_day,pub_mean_imp_per_user,pub_premium_ratio,pub_high_volume_ratio
0,220.0,1058,1153,"[7, 17]",1906,"[12, 44, 46, 50, 58, 71, 93, 122, 134, 143, 17...",0.680483,0.101784,0.158447,2,...,209.945895,110.360000,418.305925,516.500000,30.961519,0.391240,13.949551,11.129670,0.500000,0.500000
1,312.0,1295,1301,"[3, 18]",1380,"[29, 81, 98, 102, 165, 167, 195, 205, 218, 231...",0.436232,0.435507,0.126087,23,...,150.556096,89.325000,556.491794,367.500000,29.497833,0.659745,13.436928,14.426752,0.500000,0.500000
2,70.0,1229,1249,"[1, 2, 3, 9, 15, 21]",888,"[12, 23, 25, 29, 45, 85, 92, 124, 156, 190, 27...",0.668919,0.106982,0.146396,5,...,145.883084,84.770000,377.222161,652.833333,28.810551,0.527203,12.946205,23.472567,0.333333,0.666667
3,240.0,1295,1377,"[1, 14]",440,"[44, 122, 187, 209, 242, 255, 312, 345, 382, 4...",0.704545,0.095455,0.143182,23,...,187.463175,110.295000,327.963948,941.500000,30.106433,0.610324,12.684560,26.600548,0.500000,0.500000
4,262.0,752,990,"[1, 3, 7, 8]",1476,"[15, 24, 30, 43, 50, 53, 96, 105, 159, 168, 18...",0.625339,0.079268,0.197154,8,...,183.274850,104.405000,503.831149,938.500000,31.066600,0.547499,13.290715,21.628911,0.500000,0.750000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1003,127.0,1095,1154,"[2, 7, 12, 15, 16]",368,"[38, 50, 143, 237, 456, 649, 653, 698, 881, 89...",0.652174,0.089674,0.163043,15,...,161.700159,95.257000,288.545429,416.800000,30.886136,0.623262,13.347756,16.445638,0.400000,0.400000
1004,90.0,1052,1056,"[2, 21]",484,"[1, 4, 61, 212, 230, 348, 356, 372, 429, 431, ...",0.663223,0.113636,0.126033,20,...,124.622867,78.260000,217.315588,390.500000,24.095732,0.332615,12.323577,25.236782,0.500000,0.500000
1005,122.0,1058,1063,"[2, 5]",704,"[1, 23, 29, 104, 136, 176, 201, 206, 250, 269,...",0.518466,0.369318,0.110795,2,...,193.517598,117.720000,385.251505,480.500000,27.599106,0.515873,13.177356,29.247989,0.500000,0.500000
1006,138.0,932,1169,"[1, 2, 7]",1210,"[67, 73, 80, 146, 182, 202, 212, 254, 307, 377...",0.661157,0.081818,0.171074,20,...,208.390314,120.513333,429.965091,1190.666667,29.672696,0.512460,13.528322,34.232781,0.666667,1.000000


In [28]:
validate_enriched['cpm_x_duration'] = validate_enriched['cpm'] * validate_enriched['campaign_duration']
validate_enriched['cpm_x_audience'] = validate_enriched['cpm'] * validate_enriched['audience_size']
validate_enriched['cpm_per_hour'] = validate_enriched['cpm'] / (validate_enriched['campaign_duration'] + 1)

validate_enriched['cpm_deviation'] = np.abs(
    validate_enriched['cpm'] - validate_enriched['aud_mean_cpm']
)
validate_enriched['aud_pub_quality'] = (
    validate_enriched['aud_mean_imp'] * validate_enriched['pub_mean_cpm']
)
validate_enriched['campaign_intensity'] = (
    validate_enriched['cpm'] / 
    (validate_enriched['audience_size'] * validate_enriched['campaign_duration'] + 1)
)


In [29]:
validate_enriched = validate_enriched.fillna(0)
validate_enriched = validate_enriched.replace([np.inf, -np.inf], 0)
validate_enriched

Unnamed: 0,cpm,hour_start,hour_end,publishers,audience_size,user_ids,cluster_0,cluster_1,cluster_2,hour_start_of_day,...,pub_mean_hour_of_day,pub_mean_imp_per_user,pub_premium_ratio,pub_high_volume_ratio,cpm_x_duration,cpm_x_audience,cpm_per_hour,cpm_deviation,aud_pub_quality,campaign_intensity
0,220.0,1058,1153,"[7, 17]",1906,"[12, 44, 46, 50, 58, 71, 93, 122, 134, 143, 17...",0.680483,0.101784,0.158447,2,...,13.949551,11.129670,0.500000,0.500000,20900.0,419320.0,2.291667,57.192969,8516.907959,0.001215
1,312.0,1295,1301,"[3, 18]",1380,"[29, 81, 98, 102, 165, 167, 195, 205, 218, 231...",0.436232,0.435507,0.126087,23,...,13.436928,14.426752,0.500000,0.500000,1872.0,430560.0,44.571429,132.960506,19420.318158,0.037677
2,70.0,1229,1249,"[1, 2, 3, 9, 15, 21]",888,"[12, 23, 25, 29, 45, 85, 92, 124, 156, 190, 27...",0.668919,0.106982,0.146396,5,...,12.946205,23.472567,0.333333,0.666667,1400.0,62160.0,3.333333,211.236039,6415.570042,0.003941
3,240.0,1295,1377,"[1, 14]",440,"[44, 122, 187, 209, 242, 255, 312, 345, 382, 4...",0.704545,0.095455,0.143182,23,...,12.684560,26.600548,0.500000,0.500000,19680.0,105600.0,2.891566,13.803214,7248.434091,0.006652
4,262.0,752,990,"[1, 3, 7, 8]",1476,"[15, 24, 30, 43, 50, 53, 96, 105, 159, 168, 18...",0.625339,0.079268,0.197154,8,...,13.290715,21.628911,0.500000,0.750000,62356.0,386712.0,1.096234,65.261172,6510.727298,0.000746
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1003,127.0,1095,1154,"[2, 7, 12, 15, 16]",368,"[38, 50, 143, 237, 456, 649, 653, 698, 881, 89...",0.652174,0.089674,0.163043,15,...,13.347756,16.445638,0.400000,0.400000,7493.0,46736.0,2.116667,124.821533,6015.421677,0.005849
1004,90.0,1052,1056,"[2, 21]",484,"[1, 4, 61, 212, 230, 348, 356, 372, 429, 431, ...",0.663223,0.113636,0.126033,20,...,12.323577,25.236782,0.500000,0.500000,360.0,43560.0,18.000000,141.280577,5208.411890,0.046464
1005,122.0,1058,1063,"[2, 5]",704,"[1, 23, 29, 104, 136, 176, 201, 206, 250, 269,...",0.518466,0.369318,0.110795,2,...,13.177356,29.247989,0.500000,0.500000,610.0,85888.0,20.333333,56.998032,22425.775835,0.034649
1006,138.0,932,1169,"[1, 2, 7]",1210,"[67, 73, 80, 146, 182, 202, 212, 254, 307, 377...",0.661157,0.081818,0.171074,20,...,13.528322,34.232781,0.666667,1.000000,32706.0,166980.0,0.579832,112.687855,7325.866776,0.000481


In [30]:
validate_enriched.to_parquet("/Users/samsanovicekaterina/dl.25/parquet_files/validate_agg_f.parquet", index=False)

In [31]:
def agg_aud_features(user_ids, user_agg):
    aud = user_agg[user_agg["user_id"].isin(user_ids)]

    if len(aud) == 0:
        return pd.Series({
            "aud_mean_imp": 0.0,
            "aud_median_imp": 0.0,
            "aud_mean_cpm": 0.0,
            "aud_median_cpm": 0.0,
            "aud_max_cpm": 0.0
        })

    return pd.Series({
        "aud_mean_imp": aud["total_impressions"].mean(),
        "aud_median_imp": aud["total_impressions"].median(),
        "aud_mean_cpm": aud["mean_cpm"].mean(),
        "aud_median_cpm": aud["median_cpm"].mean(),
        "aud_max_cpm": aud["max_cpm"].max()
    })


In [32]:
audience_features = validate["user_ids"].apply( lambda ids: agg_aud_features(ids, user_agg))
audience_features

KeyError: 'max_cpm'

In [None]:
validate = pd.concat([validate, audience_features], axis=1)


In [None]:
validate.isna().sum()


cpm                  0
hour_start           0
hour_end             0
publishers           0
audience_size        0
user_ids             0
duration             0
hour_start_of_day    0
hour_end_of_day      0
aud_mean_imp         0
aud_median_imp       0
aud_mean_cpm         0
aud_median_cpm       0
aud_max_cpm          0
dtype: int64

In [None]:
validate.describe()


Unnamed: 0,cpm,hour_start,hour_end,audience_size,duration,hour_start_of_day,hour_end_of_day,aud_mean_imp,aud_median_imp,aud_mean_cpm,aud_median_cpm,aud_max_cpm
count,1008.0,1008.0,1008.0,1008.0,1008.0,1008.0,1008.0,1008.0,1008.0,1008.0,1008.0,1008.0
mean,162.425595,1065.046627,1166.225198,1090.087302,101.178571,11.618056,11.296627,46.983003,23.519345,294.600148,242.394174,43537.312073
std,112.192984,192.984789,192.168496,613.557243,126.397082,6.896882,7.019077,33.569742,30.9483,173.698657,149.419223,49566.884133
min,30.0,747.0,753.0,300.0,2.0,0.0,0.0,1.248018,1.0,54.372712,42.985823,851.4
25%,79.0,906.0,1019.0,575.0,8.0,5.0,5.0,38.118972,14.0,225.531519,181.520008,14428.72
50%,130.0,1048.0,1184.0,960.0,36.5,12.0,11.0,44.282786,17.0,270.270198,220.063828,32154.56
75%,220.0,1214.25,1325.25,1464.0,169.75,17.0,17.0,48.149986,19.5,302.469016,246.422084,56579.07
max,475.0,1485.0,1488.0,2500.0,503.0,23.0,23.0,198.264444,174.0,1138.055499,978.583037,209053.98


In [None]:
validate

Unnamed: 0,cpm,hour_start,hour_end,publishers,audience_size,user_ids,duration,hour_start_of_day,hour_end_of_day,aud_mean_imp,aud_median_imp,aud_mean_cpm,aud_median_cpm,aud_max_cpm,aud_mean_imp.1,aud_median_imp.1,aud_mean_cpm.1,aud_median_cpm.1,aud_max_cpm.1
0,220.0,1058,1153,"[7, 17]",1906,"[12, 44, 46, 50, 58, 71, 93, 122, 134, 143, 17...",95,2,1,6.663694,2.0,177.394734,154.948940,56579.07,6.663694,2.0,177.394734,154.948940,56579.07
1,312.0,1295,1301,"[3, 18]",1380,"[29, 81, 98, 102, 165, 167, 195, 205, 218, 231...",6,23,5,19.973913,16.0,163.268813,123.862554,36395.60,19.973913,16.0,163.268813,123.862554,36395.60
2,70.0,1229,1249,"[1, 2, 3, 9, 15, 21]",888,"[12, 23, 25, 29, 45, 85, 92, 124, 156, 190, 27...",20,5,1,6.890766,2.0,162.120671,137.818840,21077.38,6.890766,2.0,162.120671,137.818840,21077.38
3,240.0,1295,1377,"[1, 14]",440,"[44, 122, 187, 209, 242, 255, 312, 345, 382, 4...",82,23,9,6.465909,2.0,161.117249,137.290670,56579.07,6.465909,2.0,161.117249,137.290670,56579.07
4,262.0,752,990,"[1, 3, 7, 8]",1476,"[15, 24, 30, 43, 50, 53, 96, 105, 159, 168, 18...",238,8,6,5.533875,1.0,190.072762,165.578157,38282.14,5.533875,1.0,190.072762,165.578157,38282.14
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
516,116.0,878,883,"[2, 3, 7]",656,"[4, 38, 179, 180, 222, 346, 404, 408, 430, 449...",5,14,19,19.661585,16.0,167.358729,128.366052,21077.38,19.661585,16.0,167.358729,128.366052,21077.38
517,138.0,1395,1442,"[1, 2, 5, 14, 16]",808,"[42, 44, 66, 90, 221, 325, 348, 354, 401, 404,...",47,3,2,6.370050,2.0,146.303113,124.898100,14697.84,6.370050,2.0,146.303113,124.898100,14697.84
518,143.0,1164,1184,"[2, 3, 7, 14, 16, 17]",1500,"[0, 36, 80, 85, 114, 140, 148, 153, 176, 189, ...",20,12,8,6.762667,2.0,155.806314,132.065047,38282.14,6.762667,2.0,155.806314,132.065047,38282.14
519,130.0,883,997,"[1, 7, 21]",348,"[121, 304, 402, 417, 510, 545, 548, 555, 568, ...",114,19,13,6.589080,2.0,156.621980,138.905043,21077.38,6.589080,2.0,156.621980,138.905043,21077.38


In [None]:
validate

Unnamed: 0,cpm,hour_start,hour_end,publishers,audience_size,user_ids,duration,hour_start_of_day,hour_end_of_day,aud_mean_imp,aud_median_imp,aud_mean_cpm,aud_median_cpm,aud_max_cpm
0,220.0,1058,1153,"[7, 17]",1906,"[12, 44, 46, 50, 58, 71, 93, 122, 134, 143, 17...",95,2,1,43.123815,18.0,294.662464,237.123165,56579.07
1,312.0,1295,1301,"[3, 18]",1380,"[29, 81, 98, 102, 165, 167, 195, 205, 218, 231...",6,23,5,129.271605,105.0,179.429558,129.594847,36395.60
2,70.0,1229,1249,"[1, 2, 3, 9, 15, 21]",888,"[12, 23, 25, 29, 45, 85, 92, 124, 156, 190, 27...",20,5,1,47.682540,19.0,304.929917,229.468358,209053.98
3,240.0,1295,1377,"[1, 14]",440,"[44, 122, 187, 209, 242, 255, 312, 345, 382, 4...",82,23,9,40.995181,18.0,269.092564,216.902410,56579.07
4,262.0,752,990,"[1, 3, 7, 8]",1476,"[15, 24, 30, 43, 50, 53, 96, 105, 159, 168, 18...",238,8,6,39.394440,14.0,362.913216,301.022938,38282.14
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1003,127.0,1095,1154,"[2, 7, 12, 15, 16]",368,"[38, 50, 143, 237, 456, 649, 653, 698, 881, 89...",59,15,2,41.111111,14.0,278.289261,233.725721,19488.36
1004,90.0,1052,1056,"[2, 21]",484,"[1, 4, 61, 212, 230, 348, 356, 372, 429, 431, ...",4,20,0,46.288330,19.0,256.155147,200.300995,18233.68
1005,122.0,1058,1063,"[2, 5]",704,"[1, 23, 29, 104, 136, 176, 201, 206, 250, 269,...",5,2,7,116.049787,95.0,179.252653,130.505683,36853.28
1006,138.0,932,1169,"[1, 2, 7]",1210,"[67, 73, 80, 146, 182, 202, 212, 254, 307, 377...",237,20,17,38.460217,13.0,274.260674,223.238626,8677.35


In [34]:
# validate.to_parquet("/content/validate_features.parquet", index=False)
# user_agg.to_parquet("/content/user_aggregates.parquet", index=False)
# publisher_agg.to_parquet("/content/publisher_aggregates.parquet", index=False)

validate.to_parquet("/Users/samsanovicekaterina/auc_forecast/parquet_files/validate_agg_f.parquet", index=False)
user_agg.to_parquet("/Users/samsanovicekaterina/auc_forecast/parquet_files/user_aggregates_f.parquet", index=False)
publisher_agg.to_parquet("/Users/samsanovicekaterina/auc_forecast/parquet_files/publisher_agg_f.parquet", index=False)