In [10]:
import os
import json
import pandas as pd
import scipy as sp
from sklearn.preprocessing import LabelEncoder 
from matplotlib import pyplot as plt
from tqdm.auto import tqdm
import numpy as np
import datetime
import pickle

In [2]:
def temp():
    """Return the sorted paths of all files one level below ``train_data``.

    Each sub-directory of ``train_data`` (relative to the current working
    directory) is listed and every entry inside it is returned as
    ``train_data/<sub-directory>/<entry>``.

    Returns:
        list[str]: The collected paths in ascending lexicographic order.

    Raises:
        FileNotFoundError: If ``train_data`` does not exist.
    """
    root = 'train_data'
    paths = []
    for subdir in os.listdir(root):
        subdir_path = f'{root}/{subdir}'
        # Stray files directly inside train_data (e.g. .DS_Store, a README)
        # cannot be listed and would raise NotADirectoryError, so skip them.
        if not os.path.isdir(subdir_path):
            continue
        paths.extend(f'{subdir_path}/{filename}' for filename in os.listdir(subdir_path))
    return sorted(paths)
# Sorted list of all training file paths; the encoding and normalisation cells iterate over it.
data_filepaths = temp()

# String data encoding

In [None]:
# Output folders for the per-feature vocabulary CSVs and the pickled encoders.
# exist_ok=True keeps the cell safe to re-run and avoids the check-then-create race.
os.makedirs('./normalized_data', exist_ok=True)
os.makedirs('./label_encoders', exist_ok=True)

In [46]:
def encode_categoric_string_feature(feature_name):
    """Build and persist the vocabulary and label encoder for one column.

    Reads only ``feature_name`` from every file in ``data_filepaths``, merges
    the distinct values with NumPy, then writes them to
    ``./normalized_data/<feature_name>.csv`` and pickles a ``LabelEncoder``
    fitted on them to ``./label_encoders/<feature_name>.pickle``.

    Args:
        feature_name: Name of the CSV column to encode.
    """
    progress_label = feature_name.replace('_', ' ').capitalize()

    seen_values = []
    for csv_path in tqdm(data_filepaths, desc=progress_label):
        # The one-column frame is a temporary, so it is released as soon as
        # its distinct values have been extracted.
        file_values = pd.read_csv(csv_path, usecols=[feature_name])[feature_name].unique()
        # np.unique de-duplicates and sorts the merged values.
        # NOTE(review): sorting presumably fails when strings are mixed with NaN,
        # which may be why some columns use the pandas variant - confirm.
        seen_values = np.unique(np.concatenate([seen_values, file_values]))

    pd.DataFrame({feature_name: seen_values}).to_csv(
        f'./normalized_data/{feature_name}.csv', index=False
    )

    encoder = LabelEncoder().fit(seen_values)
    with open(f'./label_encoders/{feature_name}.pickle', 'wb') as handle:
        pickle.dump(encoder, handle, protocol=pickle.HIGHEST_PROTOCOL)

def encode_categoric_string_feature_pandas(feature_name):
    """Build and persist the vocabulary and label encoder for one column (pandas).

    Same outputs as ``encode_categoric_string_feature``: a vocabulary CSV under
    ``./normalized_data`` and a pickled ``LabelEncoder`` under
    ``./label_encoders``. The distinct values are merged with
    ``pandas.concat`` / ``Series.unique`` and keep first-seen order instead of
    being sorted.

    Args:
        feature_name: Name of the CSV column to encode.
    """
    progress_label = feature_name.replace('_', ' ').capitalize()

    seen_values = None
    for csv_path in tqdm(data_filepaths, desc=progress_label):
        file_values = pd.Series(
            pd.read_csv(csv_path, usecols=[feature_name])[feature_name].unique()
        )
        if seen_values is not None:
            # Merge with what earlier files contributed and drop repeats.
            file_values = pd.Series(pd.concat([seen_values, file_values]).unique())
        seen_values = file_values

    pd.DataFrame({feature_name: seen_values}).to_csv(
        f'./normalized_data/{feature_name}.csv', index=False
    )

    encoder = LabelEncoder().fit(seen_values)
    with open(f'./label_encoders/{feature_name}.pickle', 'wb') as handle:
        pickle.dump(encoder, handle, protocol=pickle.HIGHEST_PROTOCOL)

# Columns encoded with the NumPy-based helper; each one gets a vocabulary CSV
# and a pickled LabelEncoder.
features_string_categories = [
    'user_id_hash',
    'target_id_hash',
    'syndicator_id_hash',
    'campaign_id_hash',
    'target_item_taxonomy',
    'placement_id_hash',
    'publisher_id_hash',
    'source_id_hash',
    'source_item_type',
    'browser_platform',
]
for feature in features_string_categories:
    encode_categoric_string_feature(feature)

# NOTE(review): these two columns go through the pandas-based helper - presumably
# because they contain missing values that np.unique cannot sort; confirm.
encode_categoric_string_feature_pandas('country_code')
encode_categoric_string_feature_pandas('region')

Country code:   0%|          | 0/69 [00:00<?, ?it/s]

Region:   0%|          | 0/69 [00:00<?, ?it/s]

In [63]:
def normalize_file(encoders, filepath):
    """Replace the categorical columns of one CSV with encoder output and save it.

    The result is written to ``./normalized_train_data/<dir>/<name>``, where
    ``<dir>`` and ``<name>`` are the last directory component and the file name
    of ``filepath``.

    Args:
        encoders: Iterable of ``(column, encoder)`` pairs; ``encoder.transform``
            is applied to the values of ``column``.
        filepath: Path of the CSV file to read.
    """
    directory_path, filename = os.path.split(filepath)
    output_dir = os.path.join('.', 'normalized_train_data', os.path.basename(directory_path))
    # makedirs creates ./normalized_train_data and the sub-folder in one call and
    # tolerates both already existing, so no separate exists() checks are needed.
    os.makedirs(output_dir, exist_ok=True)

    df = pd.read_csv(filepath)
    for column, encoder in tqdm(encoders, desc=filepath):
        df[column] = encoder.transform(df[column].to_list())

    # NOTE(review): index=False is not passed, so the row index is written as an
    # extra unnamed column; kept as-is in case downstream readers expect it.
    df.to_csv(os.path.join(output_dir, filename))

def load_encoders(features):
    """Load the pickled encoder of every requested feature.

    Args:
        features: Iterable of feature names; each must have a file
            ``./label_encoders/<feature>.pickle``.

    Returns:
        list[tuple]: ``(feature, encoder)`` pairs in the order of ``features``.
    """
    loaded = []
    for name in features:
        pickle_path = f'./label_encoders/{name}.pickle'
        # pickle.load executes arbitrary code from the file; only use it on
        # pickles from a trusted source.
        with open(pickle_path, 'rb') as handle:
            loaded.append((name, pickle.load(handle)))
    return loaded

# Every column that has a pickled encoder, including country_code and region.
features_string_categories = [
    'user_id_hash',
    'target_id_hash',
    'syndicator_id_hash',
    'campaign_id_hash',
    'target_item_taxonomy',
    'placement_id_hash',
    'publisher_id_hash',
    'source_id_hash',
    'source_item_type',
    'browser_platform',
    'country_code',
    'region',
]
column_encoders = load_encoders(features_string_categories)
# NOTE(review): the [10:] slice skips the first ten files - presumably they were
# normalized in an earlier run. A fresh Restart & Run All would leave them
# unprocessed; confirm before relying on ./normalized_train_data being complete.
for filepaths in data_filepaths[10:]:
    normalize_file(column_encoders, filepaths)

train_data/train1/part-00010.csv:   0%|          | 0/12 [00:00<?, ?it/s]

train_data/train2/part-00011.csv:   0%|          | 0/12 [00:00<?, ?it/s]

train_data/train2/part-00012.csv:   0%|          | 0/12 [00:00<?, ?it/s]

train_data/train2/part-00013.csv:   0%|          | 0/12 [00:00<?, ?it/s]

train_data/train2/part-00014.csv:   0%|          | 0/12 [00:00<?, ?it/s]

train_data/train2/part-00015.csv:   0%|          | 0/12 [00:00<?, ?it/s]

train_data/train2/part-00016.csv:   0%|          | 0/12 [00:00<?, ?it/s]

train_data/train2/part-00017.csv:   0%|          | 0/12 [00:00<?, ?it/s]

train_data/train2/part-00018.csv:   0%|          | 0/12 [00:00<?, ?it/s]

train_data/train2/part-00019.csv:   0%|          | 0/12 [00:00<?, ?it/s]

train_data/train2/part-00020.csv:   0%|          | 0/12 [00:00<?, ?it/s]

train_data/train3/part-00021.csv:   0%|          | 0/12 [00:00<?, ?it/s]

train_data/train3/part-00022.csv:   0%|          | 0/12 [00:00<?, ?it/s]

train_data/train3/part-00023.csv:   0%|          | 0/12 [00:00<?, ?it/s]

train_data/train3/part-00024.csv:   0%|          | 0/12 [00:00<?, ?it/s]

train_data/train3/part-00025.csv:   0%|          | 0/12 [00:00<?, ?it/s]

train_data/train3/part-00026.csv:   0%|          | 0/12 [00:00<?, ?it/s]

train_data/train3/part-00027.csv:   0%|          | 0/12 [00:00<?, ?it/s]

train_data/train3/part-00028.csv:   0%|          | 0/12 [00:00<?, ?it/s]

train_data/train3/part-00029.csv:   0%|          | 0/12 [00:00<?, ?it/s]

train_data/train3/part-00030.csv:   0%|          | 0/12 [00:00<?, ?it/s]

train_data/train4/part-00031.csv:   0%|          | 0/12 [00:00<?, ?it/s]

train_data/train4/part-00032.csv:   0%|          | 0/12 [00:00<?, ?it/s]

train_data/train4/part-00033.csv:   0%|          | 0/12 [00:00<?, ?it/s]

train_data/train4/part-00034.csv:   0%|          | 0/12 [00:00<?, ?it/s]

train_data/train4/part-00035.csv:   0%|          | 0/12 [00:00<?, ?it/s]

train_data/train4/part-00036.csv:   0%|          | 0/12 [00:00<?, ?it/s]

train_data/train4/part-00037.csv:   0%|          | 0/12 [00:00<?, ?it/s]

train_data/train4/part-00038.csv:   0%|          | 0/12 [00:00<?, ?it/s]

train_data/train4/part-00039.csv:   0%|          | 0/12 [00:00<?, ?it/s]

train_data/train4/part-00040.csv:   0%|          | 0/12 [00:00<?, ?it/s]

train_data/train5/part-00041.csv:   0%|          | 0/12 [00:00<?, ?it/s]

train_data/train5/part-00042.csv:   0%|          | 0/12 [00:00<?, ?it/s]

train_data/train5/part-00043.csv:   0%|          | 0/12 [00:00<?, ?it/s]

train_data/train5/part-00044.csv:   0%|          | 0/12 [00:00<?, ?it/s]

train_data/train5/part-00045.csv:   0%|          | 0/12 [00:00<?, ?it/s]

train_data/train5/part-00046.csv:   0%|          | 0/12 [00:00<?, ?it/s]

train_data/train5/part-00047.csv:   0%|          | 0/12 [00:00<?, ?it/s]

train_data/train5/part-00048.csv:   0%|          | 0/12 [00:00<?, ?it/s]

train_data/train5/part-00049.csv:   0%|          | 0/12 [00:00<?, ?it/s]

train_data/train5/part-00050.csv:   0%|          | 0/12 [00:00<?, ?it/s]

train_data/train6/part-00051.csv:   0%|          | 0/12 [00:00<?, ?it/s]

train_data/train6/part-00052.csv:   0%|          | 0/12 [00:00<?, ?it/s]

train_data/train6/part-00053.csv:   0%|          | 0/12 [00:00<?, ?it/s]

train_data/train6/part-00054.csv:   0%|          | 0/12 [00:00<?, ?it/s]

train_data/train6/part-00055.csv:   0%|          | 0/12 [00:00<?, ?it/s]

train_data/train6/part-00056.csv:   0%|          | 0/12 [00:00<?, ?it/s]

train_data/train6/part-00057.csv:   0%|          | 0/12 [00:00<?, ?it/s]

train_data/train6/part-00058.csv:   0%|          | 0/12 [00:00<?, ?it/s]

train_data/train6/part-00059.csv:   0%|          | 0/12 [00:00<?, ?it/s]

train_data/train6/part-00060.csv:   0%|          | 0/12 [00:00<?, ?it/s]

train_data/train7/part-00061.csv:   0%|          | 0/12 [00:00<?, ?it/s]

train_data/train7/part-00062.csv:   0%|          | 0/12 [00:00<?, ?it/s]

train_data/train7/part-00063.csv:   0%|          | 0/12 [00:00<?, ?it/s]

train_data/train7/part-00064.csv:   0%|          | 0/12 [00:00<?, ?it/s]

train_data/train7/part-00065.csv:   0%|          | 0/12 [00:00<?, ?it/s]

train_data/train7/part-00066.csv:   0%|          | 0/12 [00:00<?, ?it/s]

train_data/train7/part-00067.csv:   0%|          | 0/12 [00:00<?, ?it/s]

train_data/train7/part-00068.csv:   0%|          | 0/12 [00:00<?, ?it/s]

In [54]:
# Read the submission example file and the test file from the working directory;
# df_test is compared against the training vocabularies in the next cell.
df_submission = pd.read_csv('./submission_example.csv')
df_test = pd.read_csv('./test_file.csv')

# Test set deviations from training data

In [60]:
# For each categorical feature, count the test rows whose value does not occur
# in the training vocabulary saved under ./normalized_data.
# NOTE(review): user_id_hash is not checked here - confirm that is intentional.
for feature in [
    'target_id_hash',
    'syndicator_id_hash',
    'campaign_id_hash',
    'target_item_taxonomy',
    'placement_id_hash',
    'publisher_id_hash',
    'source_id_hash',
    'source_item_type',
    'browser_platform',
    'country_code',
    'region',
]:
    train_values = pd.read_csv(f'./normalized_data/{feature}.csv')[feature]
    print(f'new {feature}', (~df_test[feature].isin(train_values)).sum())

new target_id_hash 16883
new syndicator_id_hash 1832
new campaign_id_hash 12240
new target_item_taxonomy 0
new placement_id_hash 1
new publisher_id_hash 0
new source_id_hash 137385
new source_item_type 0
new browser_platform 0
new country_code 0
new region 2


# EDA