In [None]:
import pandas as pd
import numpy as np
import json
import os
from zipfile import ZipFile
import zipfile
import ast
from joblib import Parallel, delayed

In [None]:
# function to parse json

def _collect_nested(data, column):
    """Stack the per-session record lists stored in ``data[column]`` into one table.

    Each row's records become a small DataFrame tagged with that row's
    ``session_id``; the pieces are concatenated and both join keys
    (``site_hash``, ``session_id``) are cast to ``str``.
    """
    frames = []
    # Iterate over the column once, pairing every row with its OWN records.
    for session_id, records in zip(data['session_id'], data[column]):
        frame = pd.DataFrame.from_dict(records)
        frame['session_id'] = session_id
        frames.append(frame)

    table = pd.concat(frames)
    table['site_hash'] = table['site_hash'].astype('str')
    table['session_id'] = table['session_id'].astype('str')
    return table


def prepare(df):
    """Flatten parsed session records into one row per (session, site_hash) action.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``session_id``, ``user`` and ``actions``.
        ``user`` holds one dict per row with (at least) the keys
        ``sites_for_user`` and ``theme_events``; ``actions`` holds one dict
        per row mapping a site hash to a value castable to ``int32``.
        NOTE(review): the records under ``sites_for_user`` / ``theme_events``
        are presumably lists of dicts carrying ``site_hash`` plus the
        statistics cast at the end of this function - confirm against the data.

    Returns
    -------
    pandas.DataFrame
        The exploded actions (``session_id``, ``site_hash``, ``values``)
        left-joined with the per-site tables on ``site_hash``/``session_id``
        and with the remaining user attributes on ``session_id``.

    Raises
    ------
    KeyError
        If one of the columns accessed below is missing.
    """
    data = pd.DataFrame(list(df['user']))
    # Positional assignment: `data` has a fresh RangeIndex, so index-aligned
    # assignment would produce NaN whenever `df` carries any other index.
    data['session_id'] = df['session_id'].to_numpy()

    # The previous loop always read element [0], so every session was given
    # the first session's records and the merges below missed for all others.
    sites_for_user = _collect_nested(data, 'sites_for_user')
    theme_events = _collect_nested(data, 'theme_events')

    # Scalar user attributes: drop the nested columns by name rather than by
    # position, so the result does not depend on the key order of the input.
    data_info = data.drop(columns=['sites_for_user', 'theme_events'])
    data_info['session_id'] = data_info['session_id'].astype('str')

    # Explode the {site_hash: value} dicts. Keys and values are exploded
    # separately but in the same order, so they line up positionally.
    df_actions = df[['session_id', 'actions']].copy()
    df_actions['site_hash'] = df_actions['actions'].apply(lambda x: list(x.keys()))
    df_values = df_actions['actions'].apply(lambda x: list(x.values())).explode()
    df_actions = (
        df_actions
        .drop(columns=['actions'])
        .explode('site_hash')
        .reset_index(drop=True)
    )

    df_actions['site_hash'] = df_actions['site_hash'].astype('str')
    df_actions['session_id'] = df_actions['session_id'].astype('str')
    df_actions['values'] = df_values.values
    df_actions['values'] = df_actions['values'].astype('int32')

    # merge all

    df_ready = df_actions.merge(sites_for_user, on=['site_hash', 'session_id'], how='left')
    df_ready = df_ready.merge(theme_events, on=['site_hash', 'session_id'], how='left')
    df_ready = df_ready.merge(data_info, on=['session_id'], how='left')

    # Compact dtypes; errors='ignore' leaves a column unchanged when the cast
    # is impossible (e.g. NaN in a column targeted at int32).
    final_dtypes = {
        'visits': 'float32',
        'uniq_urls': 'float32',
        'last_ts': 'float32',
        'clicks': 'float32',
        'shows': 'float32',
        'last_click_ts': 'float32',
        'vid': 'category',
        'vk_id': 'category',
        'ok_id': 'category',
        'email': 'category',
        'age': 'int32',
        'gender': 'category',
        'geo': 'int32',
    }
    for column, dtype in final_dtypes.items():
        df_ready[column] = df_ready[column].astype(dtype, errors='ignore')

    return df_ready

In [None]:
def make_dataframe(archive, name, k):
    """Parse one JSON-lines archive member, flatten it and save it as CSV.

    Parameters
    ----------
    archive : zipfile.ZipFile
        Open archive; only ``archive.open(name)`` is used.
    name : str
        Member to read. Every non-blank line is parsed with ``json.loads``.
    k : int
        Part number: appended to ``'df_test'`` to form the output file name
        and printed as a progress marker.

    Returns
    -------
    pandas.DataFrame
        The result of ``prepare`` for this member (also written to disk).
    """
    with archive.open(name) as f:
        # Stream the member line by line instead of materialising it with
        # readlines(); blank lines are skipped because json.loads rejects them.
        records = [json.loads(line) for line in f if line.strip()]

    # One frame built from all records at once (one row per line, RangeIndex)
    # replaces the per-line single-row DataFrames that were concatenated before.
    df_test = pd.DataFrame(records)

    df_ = prepare(df_test)
    df_.to_csv('df_test' + str(k), index=False)
    print(k)
    return df_

In [None]:
# Read the first N_PARTS members of the archive and convert each one in a
# separate joblib worker.

TEST_ZIP = 'test.zip'
N_PARTS = 100  # number of archive members to convert


def _process_member(zip_path, name, k):
    """Open the archive inside the worker and convert a single member.

    An open ZipFile wraps a file handle and cannot be pickled to a worker
    process, so every task receives the path and opens its own handle.
    """
    with ZipFile(zip_path, 'r') as member_archive:
        return make_dataframe(member_archive, name, k)


# Only the member names are needed here; the handle is closed right after.
with ZipFile(TEST_ZIP, 'r') as archive:
    names = archive.namelist()

# delayed() must wrap the function itself and is then called with the
# arguments. The previous `delayed(make_dataframe(...))` executed every
# conversion eagerly in this process and handed joblib the resulting frame.
data_test_list = Parallel(n_jobs=4)(
    delayed(_process_member)(TEST_ZIP, names[k], k) for k in range(N_PARTS)
)

In [None]:
# Reload the per-part CSV files and stack them into a single frame.

def _load_part(part_no):
    """Read the CSV of one part and print every tenth part number as progress."""
    part = pd.read_csv('df_test' + str(part_no))
    if not part_no % 10:
        print(part_no)
    return part


test_list = [_load_part(part_no) for part_no in range(100)]
data_test = pd.concat(test_list)

In [None]:
# add site_hash_types

site_hash_types = pd.read_csv('site_hash_types.tsv', sep='\t', header=None, names=['site_hash', 'type'])

# Integer-encode the type labels with pandas. The earlier call to
# `preprocessing.LabelEncoder()` raised NameError because `preprocessing` is
# never imported in this notebook. sort=True numbers the labels in sorted
# order (the numbering a label encoder assigns); missing labels become -1.
# `type_classes[code]` maps a code back to its original label.
type_codes, type_classes = pd.factorize(site_hash_types['type'], sort=True)
site_hash_types['type'] = type_codes

# NOTE(review): unlike the weights table, this table is not de-duplicated on
# site_hash; a hash listed with several types multiplies rows in the left join
# below - confirm that is intended.
data_test = data_test.merge(site_hash_types, on=['site_hash'], how='left')

In [None]:
# Attach a weight to every row via a left join on site_hash.

site_hash_weights = pd.read_csv(
    'site_hash_weights.tsv',
    sep='\t',
    header=None,
    names=['site_hash', 'weight'],
)
# Keep the first weight listed for each site_hash so the join cannot add rows.
site_hash_weights = site_hash_weights.drop_duplicates(subset=['site_hash'])
data_test = pd.merge(data_test, site_hash_weights, how='left', on=['site_hash'])

In [None]:
# Write the merged frame to disk as CSV, without the index column.
# NOTE(review): every input in this notebook is test data ('test.zip',
# 'df_test<k>' parts), yet the output file is named 'data_train' - confirm the
# name is intended and does not overwrite a training set produced elsewhere.
data_test.to_csv('data_train', index=False)