In [None]:
# Mount Google Drive at /content/drive (Colab-only helper) so the paths
# defined below under /content/drive/MyDrive/ are readable.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import re
import json

import pandas as pd
from scipy.sparse import save_npz

In [None]:
# Root of the project workspace on the mounted Google Drive.
# All paths are plain strings that end with '/' for directories.
WORKSPACE_PATH = '/content/drive/MyDrive/AnnaPadalko/'
# Presumably a scratch directory for intermediate artifacts -- confirm against usage.
WORKSPACE_TMP_PATH = f'{WORKSPACE_PATH}tmp/'

# Input directory and the individual JSON files stored in it.
INPUT_PATH = f'{WORKSPACE_PATH}input/'
TRAIN_INPUT_PATH = f'{INPUT_PATH}train.json'
VAL_INPUT_PATH = f'{INPUT_PATH}val.json'
TEST_INPUT_PATH = f'{INPUT_PATH}test.json'
GEN_CAT_MAPPING_INPUT_PATH = f'{INPUT_PATH}general_categories_mapping.json'

In [None]:
# Load the raw training set. The encoding is pinned to UTF-8 so that decoding
# does not depend on the locale default of whatever runtime executes the cell.
with open(TRAIN_INPUT_PATH, encoding='utf-8') as file:
    train_file = file.read()

train_json = json.loads(train_file)

# Load the general-category mapping. It contains non-ASCII (Cyrillic) text,
# which is exactly the case where a wrong default encoding corrupts the data.
with open(GEN_CAT_MAPPING_INPUT_PATH, encoding='utf-8') as file:
    gen_cat_file = file.read()

gen_cat_json = json.loads(gen_cat_file)
# One row per top-level entry of the mapping: the JSON key goes to 'mapping',
# its value to 'category'.
gen_cat_df = pd.DataFrame(list(gen_cat_json.items()), columns=['mapping', 'category'])

In [None]:
# Lower-cased category names, computed once and shared by both filters.
category_lower = gen_cat_df['category'].str.lower()

# Regular expression for female categories: an alternation of word stems.
female_pattern = '|'.join(("женщин", "девуш", "девоч", "женск", "дам", "барышн", "государы"))
# Regular expression for male categories.
male_pattern = '|'.join(("мужч", "мальч", "мужск", "юнош", "господар"))

# Row masks: True where the category name contains any stem (missing names -> False).
is_female_category = category_lower.str.contains(female_pattern, na=False)
is_male_category = category_lower.str.contains(male_pattern, na=False)

# Filtered views of the mapping for female and male categories.
gen_cat_women_df = gen_cat_df.loc[is_female_category]
gen_cat_men_df = gen_cat_df.loc[is_male_category]

In [None]:
def collect_keys(d, parent_key='', sep='.'):
    """Collect the flattened paths of all leaf values in a nested dict.

    Nested dicts are descended into, and dicts found inside lists are
    flattened under the list's own key (list positions are not part of the
    path). A key is reported as a leaf when its value is neither a dict nor a
    list, or when it is a list holding at least one non-dict item (for
    example a list of ids). Non-string keys are skipped. Duplicates are not
    removed.

    Args:
        d: Dictionary to walk.
        parent_key: Path prefix accumulated so far ('' at the top level).
        sep: Separator placed between path components.

    Returns:
        List of path strings in traversal order.
    """
    keys = []
    for k, v in d.items():
        if not isinstance(k, str):
            continue

        new_key = f"{parent_key}{sep}{k}" if parent_key else k
        if isinstance(v, dict):
            keys.extend(collect_keys(v, new_key, sep=sep))
        elif isinstance(v, list):
            has_plain_item = False
            for item in v:
                if isinstance(item, dict):
                    keys.extend(collect_keys(item, new_key, sep=sep))
                else:
                    has_plain_item = True
            # A list of scalars is itself a leaf; without this its key would
            # vanish from the result even though the data contains it.
            if has_plain_item:
                keys.append(new_key)
        else:
            keys.append(new_key)
    return keys

def remove_user_prefix(keys, pattern, sep='.'):
    """Strip the leading path component from keys when it matches `pattern`.

    Args:
        keys: Iterable of keys. Strings are split on `sep`; tuples are
            processed element-wise and returned as tuples; anything else is
            passed through unchanged.
        pattern: Regular expression tested with `re.match` against the first
            path component of each string key.
        sep: Separator between path components.

    Returns:
        A new list with the same length and order as `keys`. A string whose
        only component matches `pattern` becomes ''.
    """
    processed_keys = []
    for key in keys:
        if isinstance(key, str):
            parts = key.split(sep)
            if re.match(pattern, parts[0]):
                processed_keys.append(sep.join(parts[1:]))
            else:
                processed_keys.append(key)
        elif isinstance(key, tuple):
            # Forward `sep` so tuple members are split with the caller's
            # separator rather than silently falling back to the default.
            processed_keys.append(tuple(remove_user_prefix(list(key), pattern, sep=sep)))
        else:
            processed_keys.append(key)
    return processed_keys

In [None]:
# Top-level keys of the training JSON that look like 'user_<number>' are
# stripped so that records sharing a schema collapse onto the same paths.
pattern = r'^user_\d+'
# Distinct flattened paths found in the training data, in sorted order.
keys = sorted(set(remove_user_prefix(collect_keys(train_json), pattern)))
keys

['features.exchange-sessions.accepted-at',
 'features.exchange-sessions.accepted-site-id',
 'features.exchange-sessions.clicks.clicked-at',
 'features.exchange-sessions.clicks.site-id',
 'features.exchange-sessions.landed-at',
 'features.last-visits-in-categories.category',
 'features.last-visits-in-categories.last-visit-at',
 'features.orders.orders.created-at',
 'features.orders.orders.items.brand-id',
 'features.orders.orders.items.count',
 'features.orders.orders.items.id',
 'features.orders.site-id',
 'features.site-meta.frequency',
 'features.site-meta.monetary',
 'features.site-meta.recency',
 'features.site-meta.site-id',
 'features.visits.first-seen',
 'features.visits.last-seen',
 'features.visits.site-id',
 'features.visits.visits.pages-count',
 'features.visits.visits.session-duration',
 'features.visits.visits.visited-at',
 'target']

In [None]:
# Peek at one raw record to inspect the nested schema. The slice [5:6] selects
# only the record at position 5 (the sixth entry), so the variable is named
# for what it actually holds rather than "first five".
sample_records = list(train_json.items())[5:6]

for user_id, record in sample_records:
    print(f"{user_id}: {record}\n\n")

user_6: {'target': 'female', 'features': {'orders': [{'site-id': 123, 'orders': [{'created-at': 1630352618, 'items': [{'id': 'item_75', 'count': 1, 'general-category-path': [7812065, 7812006, 7811879, 7877999], 'brand-id': 2458}, {'id': 'item_76', 'count': 1, 'general-category-path': [7811945, 7811896, 7811873, 7877999], 'brand-id': 2458}, {'id': 'item_77', 'count': 1, 'general-category-path': [7811945, 7811896, 7811873, 7877999], 'brand-id': 2458}]}, {'created-at': 1648999970, 'items': [{'id': 'item_78', 'count': 1, 'general-category-path': [7811945, 7811896, 7811873, 7877999], 'brand-id': 2458}, {'id': 'item_79', 'count': 1, 'general-category-path': [7811903, 7811873, 7877999], 'brand-id': 2458}]}, {'created-at': 1653315536, 'items': [{'id': 'item_80', 'count': 1, 'general-category-path': [7811945, 7811896, 7811873, 7877999], 'brand-id': 2458}, {'id': 'item_81', 'count': 1, 'general-category-path': [7812156, 7811877, 7877999], 'brand-id': 2458}]}]}, {'site-id': 50, 'orders': [{'creat

```
      feat1               feat2  
user1 agg(sessions_feat1) agg(sessions_feat2)  
user2 agg(sessions_feat1) agg(sessions_feat2)  
```

In [None]:
import numpy as np

In [None]:
def _summarize_exchange_sessions(exchange_sessions):
    """Aggregate one user's exchange sessions into a flat feature dict.

    Args:
        exchange_sessions: List of session dicts. Each session must provide
            'sites' (a sized collection of hashable values, presumably site
            ids -- confirm against the data). A session that contains
            'accepted-site-id' is counted as a conversion; a session that
            contains 'accepted-at' must also contain 'landed-at'.

    Returns:
        Dict with:
            total_unique_sites: number of distinct values across all 'sites'.
            conversion_rate: share of sessions with 'accepted-site-id'
                (0 when there are no sessions).
            total_sessions: number of sessions.
            avg_time_to_action: mean of 'accepted-at' - 'landed-at' over the
                sessions that have 'accepted-at', or None when there are none.
            unique_sites_fraction: total_unique_sites divided by the summed
                length of all 'sites', or None when that sum is zero.
    """
    unique_sites = set()
    total_sites = 0
    for session in exchange_sessions:
        total_sites += len(session['sites'])
        unique_sites.update(session['sites'])
    total_unique_sites = len(unique_sites)
    # No sessions, or only empty 'sites' lists, would otherwise divide by
    # zero and abort the whole aggregation; the ratio is undefined there.
    unique_sites_fraction = total_unique_sites / total_sites if total_sites else None

    total_sessions = len(exchange_sessions)
    total_conversions = sum(1 for session in exchange_sessions if 'accepted-site-id' in session)
    conversion_rate = total_conversions / total_sessions if total_sessions else 0

    time_to_action = [
        session['accepted-at'] - session['landed-at']
        for session in exchange_sessions
        if 'accepted-at' in session
    ]
    avg_time_to_action = np.mean(time_to_action) if time_to_action else None

    # Key order is kept stable because it determines the column order of the
    # DataFrame built from these dicts.
    return {
        'total_unique_sites': total_unique_sites,
        'conversion_rate': conversion_rate,
        'total_sessions': total_sessions,
        'avg_time_to_action': avg_time_to_action,
        'unique_sites_fraction': unique_sites_fraction,
    }


# user id -> aggregated exchange-session features. Users without a 'features'
# entry or without 'exchange-sessions' are left out entirely.
aggregated_features = {}

for user_id, user_data in train_json.items():
    if 'features' not in user_data:
        continue

    if 'exchange-sessions' not in user_data['features']:
        continue

    aggregated_features[user_id] = _summarize_exchange_sessions(
        user_data['features']['exchange-sessions']
    )

In [None]:
# One row per user, one column per aggregated feature. The frame is built
# row-wise instead of transposing a feature-by-user frame: a transpose forces
# a common dtype onto every column, which turns the integer counts
# (total_unique_sites, total_sessions) into floats.
df_exchange_sessions = pd.DataFrame.from_dict(aggregated_features, orient='index')
df_exchange_sessions.head()

Unnamed: 0,total_unique_sites,conversion_rate,total_sessions,avg_time_to_action,unique_sites_fraction
user_6,300.0,0.625,8.0,168.4,0.391134
user_11,119.0,1.0,1.0,24.0,1.0
user_15,66.0,1.0,1.0,55.0,1.0
user_20,185.0,1.0,1.0,97.0,1.0
user_21,256.0,0.888889,9.0,63.875,0.218803


In [None]:
# Persist the per-user features as CSV; the index (user ids) is written as the
# first column.
# NOTE(review): the path is relative, so the file goes to the current working
# directory rather than under WORKSPACE_PATH -- confirm this is intended.
df_exchange_sessions.to_csv('exchange_sessions.csv')