In [1]:
import pandas as pd
import numpy as np
import pickle
import os
from collections import Counter
from sklearn.preprocessing import LabelBinarizer

In [2]:
# Relative path of the directory that `load` reads its CSV files from.
data_dir = '../../../data/raw/preliminary_contest_data/'

In [3]:
def load(filename, **kw):
    """Read a CSV file that lives under ``data_dir``.

    Parameters
    ----------
    filename : str
        Name of the file, joined onto the module-level ``data_dir``.
    **kw
        Extra keyword arguments handed to ``pd.read_csv`` unchanged.

    Returns
    -------
    The result of ``pd.read_csv`` for the joined path.
    """
    csv_path = os.path.join(data_dir, filename)
    return pd.read_csv(csv_path, **kw)

In [4]:
def counter_to_csv(counter, filepath):
    """Write a Counter to ``filepath`` as a two-column CSV, most common first.

    The file starts with the header row ``value,counts`` and then holds one
    row per distinct key, in ``counter.most_common()`` order.

    Rows are emitted through the ``csv`` module so that a key containing a
    comma, a double quote or a line break is quoted instead of corrupting the
    row. Keys without such characters are written exactly as ``str(key)``.

    Parameters
    ----------
    counter : collections.Counter
        Mapping of value -> count to export.
    filepath : str
        Destination path; an existing file is overwritten.

    Notes
    -----
    The file is always written as UTF-8 with ``\\n`` line endings, regardless
    of the platform defaults.
    """
    import csv  # local import keeps this cell runnable on its own

    # newline="" is what the csv module expects; the writer then controls the
    # line terminator itself.
    with open(filepath, "w", newline="", encoding="utf-8") as f:
        writer = csv.writer(f, lineterminator="\n")
        writer.writerow(["value", "counts"])
        for k, v in counter.most_common():
            # str(k) keeps e.g. None as "None" (csv would write it as empty).
            writer.writerow([str(k), v])

In [5]:
def save_as_pickle(obj, filepath):
    """Serialize ``obj`` with pickle into the file at ``filepath``.

    Parameters
    ----------
    obj : object
        Any picklable object.
    filepath : str
        Destination path, opened in binary write mode (existing content is
        replaced).
    """
    with open(filepath, mode="wb") as sink:
        pickle.dump(obj, sink)

In [6]:
# Read adFeature.csv (resolved against data_dir by `load`) into df_ad.
df_ad = load('adFeature.csv')

In [7]:
# For each column of df_ad, print its name with an underline, then the number
# of distinct values and the distinct values themselves.
for feat_name in df_ad.columns:
    series = df_ad[feat_name]
    print(feat_name)
    print(len(feat_name) * "-")
    n_unique = series.nunique()
    print("Unique Value Counts: {}".format(n_unique))
    distinct = series.unique()
    print("Unique Value: {}".format(distinct))
    print()

aid
---
Unique Value Counts: 173
Unique Value: [ 177 2050 1716  336  671  529  927 1714  977  450 1749  404  302 1202  838
 1842  875  846 1962 1171 1781 1622 1254  231  191  113 1057  145 1284   86
  686 1291  853  765 2031 1507 2054 1599 1338 1672 1350 1415  420 1021 1931
 1925  432 1044  117 1930 1950 1621  206 2196  121 1508 2068  894 1317 1871
  471  436  174  681 1957 1468 1242 1023 1991 1998 1966  903 2154  951 1407
 1215 1429 1017  647  205 2048 1107  966 2216 1904 1277  562  272  561  164
 1375  519 1819 1335  960 1847 1503  792 2205 1351 1728 1496  369  725  727
  699  313 2197 1027 2047 1119 1449  886 1580  975   74  136  311 1140 1910
    7 1827 1483  613 1746 1790 1230  692 1596 1671  212  543  939  932 1379
  411 2112   18 1970 1894 2201 1841    6  516 2066  624 1605 1377  914   12
 2169 1182   70   98 1201  286 2118  173 1512  748 1566 2044 1085  454  916
  688  660 1635 1918 1530  389 1940 2013]

advertiserId
------------
Unique Value Counts: 79
Unique Value: [  8203  1

In [8]:
# Output folders: one for per-feature value counts (CSV), one for per-feature
# vocabularies (pickle). Both are created if missing.
counter_dir = '../../../data/counter/preliminary_contest_data/'  # to save counter
vocab_dir = '../../../data/vocabulary/preliminary_contest_data/'  # to save vocabulary
for out_dir in (counter_dir, vocab_dir):
    os.makedirs(out_dir, exist_ok=True)

# feature name -> Counter of its values / list of its distinct values
counter_dict, vocab_dict = {}, {}
for feat_name in df_ad.columns:
    print("Processing '{}'...".format(feat_name))
    series = df_ad[feat_name]
    counter = Counter(series)
    vocab = list(counter)  # distinct values, in first-seen order

    # value counts -> CSV
    counter_file = "adFeature.[featureName='{}'].csv".format(feat_name)
    counter_path = os.path.join(counter_dir, counter_file)
    counter_to_csv(counter, counter_path)

    # vocabulary -> pickle
    vocab_file = "adFeature.[featureName='{}'].pkl".format(feat_name)
    vocab_path = os.path.join(vocab_dir, vocab_file)
    save_as_pickle(vocab, vocab_path)

    # keep both in memory for the cells that follow
    counter_dict[feat_name] = counter
    vocab_dict[feat_name] = vocab

Processing 'aid'...
Processing 'advertiserId'...
Processing 'campaignId'...
Processing 'creativeId'...
Processing 'creativeSize'...
Processing 'adCategoryId'...
Processing 'productId'...
Processing 'productType'...


In [9]:
def merge_counter_dict(counter_dict):
    """Flatten a ``{feature_name: Counter}`` mapping into a single Counter.

    Every key ``k`` of the counter stored under ``name`` becomes the string
    key ``"{name}_{k}"`` in the result.

    Counts are summed whenever several source keys map to the same string.
    This includes distinct keys of one feature that format identically (for
    example ``1`` and ``"1"``); their counts are added rather than one
    silently replacing the other.

    Parameters
    ----------
    counter_dict : dict
        Mapping of feature name -> Counter of that feature's values.

    Returns
    -------
    collections.Counter
        Merged counter with prefixed string keys. As with ``Counter +=``,
        entries whose count is not positive are dropped.
    """
    final_counter = Counter()
    for name, counter in counter_dict.items():
        prefixed = Counter()
        for k, v in counter.items():
            # accumulate instead of assign: different keys may share a label
            prefixed["{}_{}".format(name, k)] += v
        final_counter += prefixed
    return final_counter

In [10]:
# Merge the per-feature counters into one and write it next to the
# per-feature CSV files.
large_counter = merge_counter_dict(counter_dict)
counter_file = "adFeature.csv"
counter_path = os.path.join(counter_dir, counter_file)
counter_to_csv(large_counter, counter_path)

In [11]:
# The merged vocabulary is the list of keys of the merged counter; pickle it
# into the vocabulary folder.
large_vocab = list(large_counter)
vocab_file = "adFeature.pkl"
vocab_path = os.path.join(vocab_dir, vocab_file)
save_as_pickle(large_vocab, vocab_path)

In [12]:
# Folder for the binarized representation of every ad feature.
cntvec_dir = '../../../data/nlp_count/preliminary_contest_data/byAdFeatureName/'
os.makedirs(cntvec_dir, exist_ok=True)

# save aid for further analysis
aid_file = "aid.pkl"
aid_path = os.path.join(cntvec_dir, aid_file)
save_as_pickle(df_ad['aid'].values, aid_path)

# For each column: fit a LabelBinarizer, encode the column as a sparse int8
# matrix, and pickle the pair (classes_, matrix).
# NOTE(review): LabelBinarizer is documented to emit a single column for
# two-class inputs -- confirm downstream readers handle such features.
for feat_name in df_ad.columns:
    series = df_ad[feat_name]
    lb_vectorizer = LabelBinarizer(sparse_output=True)
    lb_vectorizer.fit(series)
    lb_vec = lb_vectorizer.transform(series).astype(np.int8)
    cntvec_file = "adFeature.[featureName='{}'].pkl".format(feat_name)
    cntvec_path = os.path.join(cntvec_dir, cntvec_file)
    payload = (lb_vectorizer.classes_, lb_vec)
    save_as_pickle(payload, cntvec_path)