In [1]:
import csv
import tqdm
import numpy as np
import pandas as pd
import zipfile
import os
import bz2

In [2]:
df = pd.read_csv("final_data_season_3.csv.zip", sep=',', compression='zip')
df.head()


Unnamed: 0.1,Unnamed: 0,bid_id,region_id,city_id,ad_slot_visibility,ad_slot_format,creative_id,advertiser_id,сlick,part_of_day,weekday,weekend,os,browser,ad_slot_screen_share,is_mobile_device
0,0,9aa9b28f32adfc833634f28bb9703e83,216,219,Na,Fixed,7321,2259,False,2,3,2,windows,chrome,0.06,False
1,1,1ede4e457f5031ec1640dbe1259a4616,216,217,FirstView,Fixed,7321,2259,False,2,3,2,windows,ie,0.06,False
2,2,fca79371fc0f67ac6264f558bc664fd6,216,217,Na,Fixed,7323,2259,False,2,3,2,windows,chrome,0.07,False
3,3,7717c608f688ea3b087e292c91604ea5,183,184,Na,Fixed,10722,2821,False,2,3,2,windows,ie,0.07,False
4,4,35528d946791bc45b75ded2a2dbdabfe,3,3,FirstView,Fixed,10722,2821,False,2,3,2,windows,ie,0.07,False


In [3]:
categories = {
    10006: "Long-term interest/news",
    10024: "Long-term interest/education",
    10031: "Long-term interest/automobile",
    10048: "Long-term interest/real estate",
    10052: "Long-term interest/IT",
    10057: "Long-term interest/electronic game",
    10059: "Long-term interest/fashion",
    10063: "Long-term interest/entertainment",
    10067: "Long-term interest/luxury",
    10074: "Long-term interest/home and lifestyle",
    10075: "Long-term interest/health",
    10076: "Long-term interest/food",
    10077: "Long-term interest/divine",
    10079: "Long-term interest/motherhood&parenting",
    10083: "Long-term interest/sports",
    10093: "Long-term interest/travel&outdoors",
    10102: "Long-term interest/social",
    10684: "In-market/3c product",
    11092: "In-market/appliances",
    11278: "In-market/clothing, shoes&bags",
    11379: "In-market/Beauty& Personal Care",
    11423: "In-market/household&home improvement",
    11512: "In-market/infant&mom products",
    11576: "In-market/sports item",
    11632: "In-market/outdoor",
    11680: "In-market/health care products",
    11724: "In-market/luxury",
    11944: "In-market/real estate",
    13042: "In-market/automobile",
    13403: "In-market/finance",
    13496: "In-market/travel",
    13678: "In-market/education",
    13776: "In-market/service",
    13800: "Long-term interest/art&photography&design",
    13866: "Long-term interest/online literature",
    13874: "In-market/electronic game",
    14273: "Long-term interest/3c",
    16593: "In-market/book",
    16617: "In-market/medicine",
    16661: "In-market/food&drink",
    16706: "Long-term interest/culture",
    16751: "Long-term interest/sex",
    10110: "Demographic/gender/male",
    10111: "Demographic/gender/female"
}

In [4]:
def parse_user_profile(user_profile_ids):
    if user_profile_ids == "null":
        return pd.Series({
            'is_male': False,
            'is_long_term_interest': False,
            'is_in_market': False,
            'category_terms': ''
        })
    profile_ids = map(int, user_profile_ids.split(','))
    is_male = any(id_ == 10110 for id_ in profile_ids)
    is_long_term_interest = any("Long-term" in categories.get(id, '') for id in profile_ids)
    is_in_market = any("Long-term" in categories.get(id, '') for id in profile_ids)
    categories_set = {categories[id_].split('/')[-1] for id_ in profile_ids if id_ in categories}
    return pd.Series({
        'is_male': is_male,
        'is_long_term_interest': is_long_term_interest,
        'is_in_market': is_in_market,
        'category_terms': ', '.join(categories_set)
    })


In [5]:
def parse_clk_logs(input_dir):
    clk_logs = []
    tags_final = pd.DataFrame(columns=["bin_id", "user_profile_ids"])
    for file in tqdm.tqdm(os.listdir(input_dir)):
        if not file.startswith("imp"):
            continue
        print(file)
        with bz2.open(os.path.join(input_dir, file), "rt") as bz_file:
            for line in bz_file:
                l = line.rstrip('\n').split('\t')
                clk_logs.append([l[0], l[-1]])
        tags = pd.DataFrame(clk_logs, columns=["bin_id", "user_profile_ids"])
        tags_features = tags['user_profile_ids'].apply(parse_user_profile)
        tags_final = pd.concat([tags_final, tags.join(tags_features)])
    return tags_final

In [6]:
tags_final = parse_clk_logs("clk_and_imp_season_3")
tags_final.head()

  tags_final = pd.concat([tags_final, tags.join(tags_features)])
  tags_final = pd.concat([tags_final, tags.join(tags_features)])
  tags_final = pd.concat([tags_final, tags.join(tags_features)])
  tags_final = pd.concat([tags_final, tags.join(tags_features)])
  tags_final = pd.concat([tags_final, tags.join(tags_features)])
  tags_final = pd.concat([tags_final, tags.join(tags_features)])
  tags_final = pd.concat([tags_final, tags.join(tags_features)])
 80%|████████  | 8/10 [42:45<16:34, 497.12s/it]IOStream.flush timed out
  tags_final = pd.concat([tags_final, tags.join(tags_features)])
100%|██████████| 10/10 [56:06<00:00, 336.61s/it]


Unnamed: 0,bin_id,user_profile_ids,is_male,is_long_term_interest,is_in_market,category_terms
0,dbe06f7b386d619d3ab4dde3c93ed6d3,"10057,10059,14273,13866,10006,10110,10031,1005...",True,True,True,entertainment
1,9d41c92515ba05d44fe6d9aa5a28fd69,,False,False,False,
2,c5e7563166c1f5477478371d937e9eb0,"10057,10048,13800,16661,10079,10077,10075,1009...",True,True,False,
3,dd5c2a9324de82aa07af8ce0cf4e348,"10057,10059,10077,10075,10083,10006,10111,1012...",False,False,False,
4,709b7cffadaefd399738439386ece34d,"11278,13800,10684,13042,10006,10110,10123,1377...",True,True,True,


In [8]:
df = df.merge(tags_final, left_on='bid_id', right_on='bin_id', how='left')
df.head()

Unnamed: 0.1,Unnamed: 0,bid_id,region_id,city_id,ad_slot_visibility,ad_slot_format,creative_id,advertiser_id,сlick,part_of_day,weekday,weekend,os,browser,ad_slot_screen_share,is_mobile_device,bin_id,user_profile_ids,is_male,is_long_term_interest,is_in_market,category_terms
0,0,9aa9b28f32adfc833634f28bb9703e83,216,219,Na,Fixed,7321,2259,False,2,3,2,windows,chrome,0.06,False,,,,,,
1,1,1ede4e457f5031ec1640dbe1259a4616,216,217,FirstView,Fixed,7321,2259,False,2,3,2,windows,ie,0.06,False,,,,,,
2,2,fca79371fc0f67ac6264f558bc664fd6,216,217,Na,Fixed,7323,2259,False,2,3,2,windows,chrome,0.07,False,,,,,,
3,3,7717c608f688ea3b087e292c91604ea5,183,184,Na,Fixed,10722,2821,False,2,3,2,windows,ie,0.07,False,,,,,,
4,4,35528d946791bc45b75ded2a2dbdabfe,3,3,FirstView,Fixed,10722,2821,False,2,3,2,windows,ie,0.07,False,,,,,,


In [4]:
df = df.drop('Unnamed: 0', axis=1)
df.head()

Unnamed: 0,bid_id,region_id,city_id,ad_slot_visibility,ad_slot_format,creative_id,advertiser_id,сlick,part_of_day,weekday,weekend,os,browser,ad_slot_screen_share,is_mobile_device,bin_id,user_profile_ids,is_male,is_long_term_interest,is_in_market,category_terms
0,9aa9b28f32adfc833634f28bb9703e83,216,219,Na,Fixed,7321,2259,False,2,3,2,windows,chrome,0.06,False,,,,,,
1,1ede4e457f5031ec1640dbe1259a4616,216,217,FirstView,Fixed,7321,2259,False,2,3,2,windows,ie,0.06,False,,,,,,
2,fca79371fc0f67ac6264f558bc664fd6,216,217,Na,Fixed,7323,2259,False,2,3,2,windows,chrome,0.07,False,,,,,,
3,7717c608f688ea3b087e292c91604ea5,183,184,Na,Fixed,10722,2821,False,2,3,2,windows,ie,0.07,False,,,,,,
4,35528d946791bc45b75ded2a2dbdabfe,3,3,FirstView,Fixed,10722,2821,False,2,3,2,windows,ie,0.07,False,,,,,,


In [5]:
df = df.drop('bin_id', axis=1)

In [6]:
clusters = {
    "is_info_cat": ["news", "education", "book","online literature", "IT", "finance"],
    "is_products_cat": [ "3c product", "clothing, shoes&bags", "sports item", "appliances", "health care products"],
    "is_service_cat": ["real estate", "service", "medicine", "health"],
    "is_entertainment_cat": ["entertainment", "electronic game", "travel", "art&photography&design",
                            "outdoor", "sports", "travel&outdoors",  "automobile",
                            "food&drink", "culture", "sex", "luxury", "home and lifestyle",
                            "food", "divine", "social"],
    "is_girly_cat": ["fashion", "Beauty& Personal Care", "motherhood&parenting", "household&home improvement",
                    "infant&mom products"],

}


In [7]:
def check_cluster(row, category_list):
    if pd.isna(row):
        return False
    present_categories = set(row.split(', '))
    return any(cat in present_categories for cat in category_list)

for cluster_name, category_list in clusters.items():
    df[cluster_name] = df['category_terms'].apply(lambda x: check_cluster(x, category_list))
df.head()

Unnamed: 0,bid_id,region_id,city_id,ad_slot_visibility,ad_slot_format,creative_id,advertiser_id,сlick,part_of_day,weekday,weekend,os,browser,ad_slot_screen_share,is_mobile_device,user_profile_ids,is_male,is_long_term_interest,is_in_market,category_terms,is_info_cat,is_products_cat,is_service_cat,is_entertainment_cat,is_girly_cat
0,9aa9b28f32adfc833634f28bb9703e83,216,219,Na,Fixed,7321,2259,False,2,3,2,windows,chrome,0.06,False,,,,,,False,False,False,False,False
1,1ede4e457f5031ec1640dbe1259a4616,216,217,FirstView,Fixed,7321,2259,False,2,3,2,windows,ie,0.06,False,,,,,,False,False,False,False,False
2,fca79371fc0f67ac6264f558bc664fd6,216,217,Na,Fixed,7323,2259,False,2,3,2,windows,chrome,0.07,False,,,,,,False,False,False,False,False
3,7717c608f688ea3b087e292c91604ea5,183,184,Na,Fixed,10722,2821,False,2,3,2,windows,ie,0.07,False,,,,,,False,False,False,False,False
4,35528d946791bc45b75ded2a2dbdabfe,3,3,FirstView,Fixed,10722,2821,False,2,3,2,windows,ie,0.07,False,,,,,,False,False,False,False,False


In [8]:
pd.unique(df['user_profile_ids']), len(pd.unique(df['user_profile_ids']))

(array([nan, '13866', '10063', ...,
        '14273,10083,10102,10006,10031,10111,10114,10063',
        '10048,10057,10684,13496,10093,10102,10006,10123,10147,13403,10063,10116',
        '10048,10057,10059,10684,14273,10077,10093,10102,13866,10006,10111,10146,10052,16753,10063'],
       dtype=object),
 745661)

In [10]:
df = df.drop('user_profile_ids', axis=1).drop('category_terms', axis=1)

In [11]:
df.dtypes

bid_id                    object
region_id                  int64
city_id                    int64
ad_slot_visibility        object
ad_slot_format            object
creative_id                int64
advertiser_id              int64
сlick                       bool
part_of_day                int64
weekday                    int64
weekend                    int64
os                        object
browser                   object
ad_slot_screen_share     float64
is_mobile_device            bool
is_male                   object
is_long_term_interest     object
is_in_market              object
is_info_cat                 bool
is_products_cat             bool
is_service_cat              bool
is_entertainment_cat        bool
is_girly_cat                bool
dtype: object

In [13]:
df['click'] = df['сlick']
df = df.drop('сlick', axis=1)
df['click']

0           False
1           False
2           False
3           False
4           False
            ...  
21341590    False
21341591    False
21341592    False
21341593    False
21341594    False
Name: click, Length: 21341595, dtype: bool

In [14]:
df.to_csv("parsed_final_data_season_3.csv.zip", compression='zip')