In [1]:
import csv
import tqdm
import numpy as np
import pandas as pd
import zipfile
import os
import bz2

In [11]:
df = pd.read_csv('final_test_data_season_3.csv.zip', compression='zip')

In [12]:
df.dtypes

Unnamed: 0                int64
bid_id                   object
region_id                 int64
city_id                   int64
ad_slot_visibility       object
ad_slot_format           object
creative_id               int64
advertiser_id             int64
user_profile_ids         object
click                      bool
part_of_day               int64
weekday                   int64
weekend                   int64
os                       object
browser                  object
ad_slot_screen_share    float64
is_mobile_device           bool
dtype: object

In [13]:
df = df.drop('Unnamed: 0', axis=1)
df.head()

Unnamed: 0,bid_id,region_id,city_id,ad_slot_visibility,ad_slot_format,creative_id,advertiser_id,user_profile_ids,click,part_of_day,weekday,weekend,os,browser,ad_slot_screen_share,is_mobile_device
0,b8c557a16cdd9cea7fa61df79bdb392d,216,233,OtherView,Na,7328,2259,10684142731005210133,False,1,1,2,windows,ie,0.03,False
1,ce5a2b810ada0cad122508b4a9d1e2fd,216,219,FirstView,Na,7323,2259,10006,False,1,1,2,windows,ie,0.07,False
2,61fb95442aaa7cade3761b7f329a1e73,216,229,OtherView,Na,7319,2259,101291002410120101451014210063,False,3,1,2,windows,chrome,0.04,False
3,7f9c3a72df3e4ded04870c08fcbcf72b,216,222,Na,Na,7323,2259,10684,False,3,1,2,windows,ie,0.07,False
4,de8192350f7e0f3223c43cf0c8cfb247,216,233,OtherView,Na,7330,2259,"10059,10117,10075,10006,10110,16706,10127,1340...",False,1,1,2,windows,chrome,0.06,False


In [5]:
categories = {
    10006: "Long-term interest/news",
    10024: "Long-term interest/education",
    10031: "Long-term interest/automobile",
    10048: "Long-term interest/real estate",
    10052: "Long-term interest/IT",
    10057: "Long-term interest/electronic game",
    10059: "Long-term interest/fashion",
    10063: "Long-term interest/entertainment",
    10067: "Long-term interest/luxury",
    10074: "Long-term interest/home and lifestyle",
    10075: "Long-term interest/health",
    10076: "Long-term interest/food",
    10077: "Long-term interest/divine",
    10079: "Long-term interest/motherhood&parenting",
    10083: "Long-term interest/sports",
    10093: "Long-term interest/travel&outdoors",
    10102: "Long-term interest/social",
    10684: "In-market/3c product",
    11092: "In-market/appliances",
    11278: "In-market/clothing, shoes&bags",
    11379: "In-market/Beauty& Personal Care",
    11423: "In-market/household&home improvement",
    11512: "In-market/infant&mom products",
    11576: "In-market/sports item",
    11632: "In-market/outdoor",
    11680: "In-market/health care products",
    11724: "In-market/luxury",
    11944: "In-market/real estate",
    13042: "In-market/automobile",
    13403: "In-market/finance",
    13496: "In-market/travel",
    13678: "In-market/education",
    13776: "In-market/service",
    13800: "Long-term interest/art&photography&design",
    13866: "Long-term interest/online literature",
    13874: "In-market/electronic game",
    14273: "Long-term interest/3c",
    16593: "In-market/book",
    16617: "In-market/medicine",
    16661: "In-market/food&drink",
    16706: "Long-term interest/culture",
    16751: "Long-term interest/sex",
    10110: "Demographic/gender/male",
    10111: "Demographic/gender/female"
}

In [6]:
def parse_user_profile(user_profile_ids):
    if user_profile_ids == "null":
        return pd.Series({
            'is_male': False,
            'is_long_term_interest': False,
            'is_in_market': False,
            'category_terms': ''
        })
    profile_ids = map(int, user_profile_ids.split(','))
    is_male = any(id_ == 10110 for id_ in profile_ids)
    is_long_term_interest = any("Long-term" in categories.get(id, '') for id in profile_ids)
    is_in_market = any("Long-term" in categories.get(id, '') for id in profile_ids)
    categories_set = {categories[id_].split('/')[-1] for id_ in profile_ids if id_ in categories}
    return pd.Series({
        'is_male': is_male,
        'is_long_term_interest': is_long_term_interest,
        'is_in_market': is_in_market,
        'category_terms': ', '.join(categories_set)
    })


In [8]:
tags_final = df['user_profile_ids'].apply(parse_user_profile)
tags_final.head()

Unnamed: 0,is_male,is_long_term_interest,is_in_market,category_terms
0,False,False,False,
1,False,False,False,
2,False,False,False,
3,False,False,False,
4,True,True,True,


In [14]:
df_1 = pd.concat([df, tags_final], axis=1)
df_1

Unnamed: 0,bid_id,region_id,city_id,ad_slot_visibility,ad_slot_format,creative_id,advertiser_id,user_profile_ids,click,part_of_day,weekday,weekend,os,browser,ad_slot_screen_share,is_mobile_device,is_male,is_long_term_interest,is_in_market,category_terms
0,b8c557a16cdd9cea7fa61df79bdb392d,216,233,OtherView,Na,7328,2259,10684142731005210133,False,1,1,2,windows,ie,0.03,False,False,False,False,
1,ce5a2b810ada0cad122508b4a9d1e2fd,216,219,FirstView,Na,7323,2259,10006,False,1,1,2,windows,ie,0.07,False,False,False,False,
2,61fb95442aaa7cade3761b7f329a1e73,216,229,OtherView,Na,7319,2259,101291002410120101451014210063,False,3,1,2,windows,chrome,0.04,False,False,False,False,
3,7f9c3a72df3e4ded04870c08fcbcf72b,216,222,Na,Na,7323,2259,10684,False,3,1,2,windows,ie,0.07,False,False,False,False,
4,de8192350f7e0f3223c43cf0c8cfb247,216,233,OtherView,Na,7330,2259,"10059,10117,10075,10006,10110,16706,10127,1340...",False,1,1,2,windows,chrome,0.06,False,True,True,True,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1579081,b74bfc1f007548e5161364195364fd0,216,222,Na,Na,12632,2261,10083101021002410063,False,2,0,2,windows,chrome,0.08,False,False,False,False,
1579082,11544056b9ba1dac7f1a8546492a9a7,55,56,OtherView,Na,12616,2261,0,False,2,0,2,windows,chrome,0.04,False,False,False,False,
1579083,f7e3b0a2187f9498c3b6d8c347599b91,216,227,Na,Na,12615,2261,101171008313866100061012713403,False,2,0,2,windows,ie,0.10,False,False,False,False,
1579084,cbde18f2f6da699627cdc3a763270ef2,253,254,OtherView,Na,12623,2261,0,False,2,0,2,windows,ie,0.09,False,False,False,False,


In [15]:
df = df_1

In [16]:
clusters = {
    "is_info_cat": ["news", "education", "book","online literature", "IT", "finance"],
    "is_products_cat": [ "3c product", "clothing, shoes&bags", "sports item", "appliances", "health care products"],
    "is_service_cat": ["real estate", "service", "medicine", "health"],
    "is_entertainment_cat": ["entertainment", "electronic game", "travel", "art&photography&design",
                            "outdoor", "sports", "travel&outdoors",  "automobile",
                            "food&drink", "culture", "sex", "luxury", "home and lifestyle",
                            "food", "divine", "social"],
    "is_girly_cat": ["fashion", "Beauty& Personal Care", "motherhood&parenting", "household&home improvement",
                    "infant&mom products"],
}

In [17]:
def check_cluster(row, category_list):
    if pd.isna(row):
        return False
    present_categories = set(row.split(', '))
    return any(cat in present_categories for cat in category_list)

for cluster_name, category_list in clusters.items():
    df[cluster_name] = df['category_terms'].apply(lambda x: check_cluster(x, category_list))
df.head()

Unnamed: 0,bid_id,region_id,city_id,ad_slot_visibility,ad_slot_format,creative_id,advertiser_id,user_profile_ids,click,part_of_day,weekday,weekend,os,browser,ad_slot_screen_share,is_mobile_device,is_male,is_long_term_interest,is_in_market,category_terms,is_info_cat,is_products_cat,is_service_cat,is_entertainment_cat,is_girly_cat
0,b8c557a16cdd9cea7fa61df79bdb392d,216,233,OtherView,Na,7328,2259,10684142731005210133,False,1,1,2,windows,ie,0.03,False,False,False,False,,False,False,False,False,False
1,ce5a2b810ada0cad122508b4a9d1e2fd,216,219,FirstView,Na,7323,2259,10006,False,1,1,2,windows,ie,0.07,False,False,False,False,,False,False,False,False,False
2,61fb95442aaa7cade3761b7f329a1e73,216,229,OtherView,Na,7319,2259,101291002410120101451014210063,False,3,1,2,windows,chrome,0.04,False,False,False,False,,False,False,False,False,False
3,7f9c3a72df3e4ded04870c08fcbcf72b,216,222,Na,Na,7323,2259,10684,False,3,1,2,windows,ie,0.07,False,False,False,False,,False,False,False,False,False
4,de8192350f7e0f3223c43cf0c8cfb247,216,233,OtherView,Na,7330,2259,"10059,10117,10075,10006,10110,16706,10127,1340...",False,1,1,2,windows,chrome,0.06,False,True,True,True,,False,False,False,False,False


In [18]:
df.dtypes

bid_id                    object
region_id                  int64
city_id                    int64
ad_slot_visibility        object
ad_slot_format            object
creative_id                int64
advertiser_id              int64
user_profile_ids          object
click                       bool
part_of_day                int64
weekday                    int64
weekend                    int64
os                        object
browser                   object
ad_slot_screen_share     float64
is_mobile_device            bool
is_male                     bool
is_long_term_interest       bool
is_in_market                bool
category_terms            object
is_info_cat                 bool
is_products_cat             bool
is_service_cat              bool
is_entertainment_cat        bool
is_girly_cat                bool
dtype: object

In [19]:
pd.unique(df['user_profile_ids']), len(pd.unique(df['user_profile_ids']))

(array(['10684,14273,10052,10133', '10006',
        '10129,10024,10120,10145,10142,10063', ...,
        '10117,10083,10006,10024,10111,10127,13403',
        '10117,10083,13866,10006,10127,13403',
        '10059,10083,10006,10110,13776,10126,13403,10063,10116'],
       dtype=object),
 399768)

In [20]:
pd.unique(df['category_terms']), len(pd.unique(df['category_terms']))

(array(['', 'entertainment', 'finance, entertainment', ...,
        'online literature, health care products, fashion, education, social, home and lifestyle, food, news, art&photography&design, health, luxury',
        'online literature, health, culture, education, automobile, social, news, travel&outdoors',
        'online literature, 3c product, IT, motherhood&parenting, fashion, education, travel&outdoors, automobile, social, food, news, health, luxury'],
       dtype=object),
 6306)

In [21]:
df = df.drop('user_profile_ids', axis=1).drop('category_terms', axis=1)

In [22]:
df['click']

0          False
1          False
2          False
3          False
4          False
           ...  
1579081    False
1579082    False
1579083    False
1579084    False
1579085    False
Name: click, Length: 1579086, dtype: bool

In [23]:
df.to_csv("final_test_data_season_3.csv.zip", compression='zip')