# Импорты

In [1]:
import pandas as pd
import json
from collections import Counter

# Обработка тренировочного датасета

In [2]:
# Path to the training dataset file
file_path_cut = 'train.json'

In [3]:
# Load the training dataset
df_train = pd.read_json(file_path_cut)

In [4]:
# Peek at the top rows: users are laid out as columns, which is awkward to work with
df_train.head()

Unnamed: 0,user_1,user_2,user_3,user_4,user_5,user_6,user_7,user_8,user_9,user_10,...,user_127746,user_127747,user_127748,user_127749,user_127750,user_127751,user_127752,user_127753,user_127754,user_127755
target,female,female,male,male,female,female,female,female,female,male,...,male,female,male,female,male,male,male,female,female,female
features,"{'orders': [{'site-id': 1, 'orders': [{'create...","{'visits': [{'site-id': 3, 'first-seen': 16962...","{'orders': [{'site-id': 21, 'orders': [{'creat...","{'orders': [{'site-id': 2, 'orders': [{'create...","{'orders': [{'site-id': 39, 'orders': [{'creat...","{'orders': [{'site-id': 123, 'orders': [{'crea...","{'visits': [{'site-id': 225, 'first-seen': 169...","{'orders': [{'site-id': 196, 'orders': [{'crea...","{'orders': [{'site-id': 307, 'orders': [{'crea...","{'orders': [{'site-id': 49, 'orders': [{'creat...",...,"{'orders': [{'site-id': 173, 'orders': [{'crea...","{'visits': [{'site-id': 3, 'first-seen': 16897...","{'orders': [{'site-id': 348, 'orders': [{'crea...","{'orders': [{'site-id': 39, 'orders': [{'creat...","{'orders': [{'site-id': 41, 'orders': [{'creat...","{'orders': [{'site-id': 307, 'orders': [{'crea...","{'orders': [{'site-id': 147, 'orders': [{'crea...","{'orders': [{'site-id': 20, 'orders': [{'creat...","{'orders': [{'site-id': 407, 'orders': [{'crea...","{'orders': [{'site-id': 726, 'orders': [{'crea..."


In [5]:
# Users arrive as columns, so transpose the frame to get one row per user,
# move the user ids out of the index and give the columns readable names.
# NOTE: the test set will not have a 'target' column.
df_transposed = (
    df_train.T
    .reset_index()
    .rename(columns={'index': 'user_id', 0: 'target', 1: 'features'})
)

# Inspect the result
df_transposed.head()


Unnamed: 0,user_id,target,features
0,user_1,female,"{'orders': [{'site-id': 1, 'orders': [{'create..."
1,user_2,female,"{'visits': [{'site-id': 3, 'first-seen': 16962..."
2,user_3,male,"{'orders': [{'site-id': 21, 'orders': [{'creat..."
3,user_4,male,"{'orders': [{'site-id': 2, 'orders': [{'create..."
4,user_5,female,"{'orders': [{'site-id': 39, 'orders': [{'creat..."


# Создадим правило определения пола: пол определяется сайтами и покупками человека

In [6]:
# Collect every key that occurs in any user's 'features' dict
unique_keys = set()
for features in df_transposed['features']:
    unique_keys.update(features.keys())

# Turn each key into its own column (None where a user lacks the key).
# Iterate in sorted order: the iteration order of a set of strings can differ
# between interpreter runs, which would shuffle the column order on restart.
for key in sorted(unique_keys):
    df_transposed[key] = df_transposed['features'].apply(lambda x: x.get(key))

# The 'features' column is no longer needed
df_transposed.drop('features', axis=1, inplace=True)

# Inspect the result
df_transposed.head()


Unnamed: 0,user_id,target,exchange-sessions,orders,visits,site-meta,last-visits-in-categories
0,user_1,female,,"[{'site-id': 1, 'orders': [{'created-at': 1634...",,[{'site-id': 2}],
1,user_2,female,,,"[{'site-id': 3, 'first-seen': 1696277805, 'las...","[{'site-id': 4}, {'site-id': 5}, {'site-id': 6...","[{'category': 'other', 'last-visit-at': 169627..."
2,user_3,male,,"[{'site-id': 21, 'orders': [{'created-at': 168...","[{'site-id': 3, 'first-seen': 1696320204, 'las...","[{'site-id': 24}, {'site-id': 25}, {'site-id':...","[{'category': 'electronics', 'last-visit-at': ..."
3,user_4,male,,"[{'site-id': 2, 'orders': [{'created-at': 1600...","[{'site-id': 3, 'first-seen': 1693468067, 'las...","[{'site-id': 33}, {'site-id': 2}, {'site-id': ...","[{'category': 'hypermarket', 'last-visit-at': ..."
4,user_5,female,,"[{'site-id': 39, 'orders': [{'created-at': 164...","[{'site-id': 3, 'first-seen': 1693796766, 'las...","[{'site-id': 42}, {'site-id': 43}, {'site-id':...","[{'category': 'insurance', 'last-visit-at': 16..."


In [7]:
# Counts how many orders a user placed on each site
def count_site_ids_in_orders_modified(orders):
    """Count a user's orders per site.

    Parameters
    ----------
    orders : list of dict or None
        Entries of the ``orders`` feature. Only entries that carry both a
        ``'site-id'`` and a nested ``'orders'`` collection are counted.

    Returns
    -------
    collections.Counter
        Mapping ``site-id -> number of nested orders``. Empty when ``orders``
        is missing.
    """
    site_id_counter = Counter()
    # A missing cell is None here (dict.get default); pandas may also hand
    # over a float NaN for a missing value, which is not iterable.
    if orders is None or isinstance(orders, float):
        return site_id_counter
    for order_dict in orders:
        if 'orders' in order_dict and 'site-id' in order_dict:
            n_orders = len(order_dict['orders'])
            if n_orders:  # never create zero-count keys
                site_id_counter[order_dict['site-id']] += n_orders
    return site_id_counter

In [8]:

# Add a column holding, for every user, the number of orders per site
df_transposed['orders_count'] = df_transposed['orders'].apply(count_site_ids_in_orders_modified)

# Inspect the result
df_transposed[['user_id', 'orders_count']].head()


Unnamed: 0,user_id,orders_count
0,user_1,"{1: 1, 2: 2}"
1,user_2,{}
2,user_3,"{21: 1, 22: 1}"
3,user_4,"{2: 1, 29: 2, 30: 4, 21: 1}"
4,user_5,"{39: 1, 40: 1, 20: 1}"


In [9]:
# Same idea for the visits column: which sites a user visits and how often
def count_site_ids_in_visits_modified(visits):
    """Count a user's visits per site.

    Parameters
    ----------
    visits : list of dict or None
        Entries of the ``visits`` feature. Only entries that carry both a
        ``'site-id'`` and a nested ``'visits'`` collection are counted.

    Returns
    -------
    collections.Counter
        Mapping ``site-id -> number of nested visits``. Empty when ``visits``
        is missing.
    """
    site_id_counter = Counter()
    # A missing cell is None here (dict.get default); pandas may also hand
    # over a float NaN for a missing value, which is not iterable.
    if visits is None or isinstance(visits, float):
        return site_id_counter
    for visit_group in visits:
        if 'visits' in visit_group and 'site-id' in visit_group:
            n_visits = len(visit_group['visits'])
            if n_visits:  # never create zero-count keys
                site_id_counter[visit_group['site-id']] += n_visits
    return site_id_counter

In [10]:

# Apply the function and store the per-site visit counts in a new column
df_transposed['visits_count'] = df_transposed['visits'].apply(count_site_ids_in_visits_modified)

# Inspect the result
df_transposed[['user_id', 'visits_count']].head()


Unnamed: 0,user_id,visits_count
0,user_1,{}
1,user_2,{3: 2}
2,user_3,"{3: 1, 23: 3, 16: 10}"
3,user_4,"{3: 2, 21: 1, 20: 4, 31: 1, 30: 6, 32: 3}"
4,user_5,"{3: 3, 23: 1, 13: 7, 41: 2, 20: 10}"


In [11]:
# For every user, pull out the categories of their last visits
def extract_categories(last_visits):
    """Return the category names found in a user's last-visits entries.

    Parameters
    ----------
    last_visits : list of dict or None
        Entries of the ``last-visits-in-categories`` feature.

    Returns
    -------
    list
        The ``'category'`` value of each entry, in input order. Entries
        without a ``'category'`` key are skipped. Empty when ``last_visits``
        is missing.
    """
    # A missing cell is None here (dict.get default); pandas may also hand
    # over a float NaN for a missing value, which is not iterable.
    if last_visits is None or isinstance(last_visits, float):
        return []
    return [visit['category'] for visit in last_visits if 'category' in visit]

In [12]:
# Apply the function and store the resulting category list in a new column
df_transposed['last_visited_categories'] = df_transposed['last-visits-in-categories'].apply(extract_categories)

In [13]:
# Check the extracted categories for the first few users
df_transposed[['user_id', 'last_visited_categories']].head()

Unnamed: 0,user_id,last_visited_categories
0,user_1,[]
1,user_2,[other]
2,user_3,"[electronics, other]"
3,user_4,"[hypermarket, other, education]"
4,user_5,"[insurance, fashion, other, bank, pets, electr..."


In [14]:

# Look at the result of the feature engineering
df_transposed[['user_id', 'orders_count', 'visits_count', 'last_visited_categories']].head()


Unnamed: 0,user_id,orders_count,visits_count,last_visited_categories
0,user_1,"{1: 1, 2: 2}",{},[]
1,user_2,{},{3: 2},[other]
2,user_3,"{21: 1, 22: 1}","{3: 1, 23: 3, 16: 10}","[electronics, other]"
3,user_4,"{2: 1, 29: 2, 30: 4, 21: 1}","{3: 2, 21: 1, 20: 4, 31: 1, 30: 6, 32: 3}","[hypermarket, other, education]"
4,user_5,"{39: 1, 40: 1, 20: 1}","{3: 3, 23: 1, 13: 7, 41: 2, 20: 10}","[insurance, fashion, other, bank, pets, electr..."


In [16]:
# Initialise the dictionaries that accumulate male/female counts per site and per category
site_stats_simplified = {}
category_stats = {}

In [17]:
# Weight applied to order counts (if an order should matter more than a visit, increase this multiplier)
k = 1

In [18]:

# Accumulate, per site and per category, how many orders/visits come from
# male and from female users.
#
# Start from empty accumulators so that re-running this cell does not add the
# same rows to the statistics a second time.
site_stats_simplified = {}
category_stats = {}

# Walk the needed columns in lockstep instead of building a Series per row
# with iterrows().
user_rows = zip(
    df_transposed['target'],
    df_transposed['orders_count'],
    df_transposed['visits_count'],
    df_transposed['last_visited_categories'],
)
for gender, orders_count, visits_count, categories in user_rows:
    # Orders, weighted by k
    for site_id, count in orders_count.items():
        site_stats_simplified.setdefault(site_id, {'male': 0, 'female': 0})[gender] += count * k

    # Visits, unweighted
    for site_id, count in visits_count.items():
        site_stats_simplified.setdefault(site_id, {'male': 0, 'female': 0})[gender] += count

    # Last visited categories: one point per occurrence
    for category in categories:
        category_stats.setdefault(category, {'male': 0, 'female': 0})[gender] += 1

# site_stats_simplified and category_stats now hold the male/female totals
# for every site and every category.


In [19]:
# Show the accumulated male/female counts per category
category_stats

{'other': {'male': 62416, 'female': 63176},
 'electronics': {'male': 35223, 'female': 30532},
 'hypermarket': {'male': 30217, 'female': 28210},
 'education': {'male': 13687, 'female': 16964},
 'insurance': {'male': 6779, 'female': 5571},
 'fashion': {'male': 16852, 'female': 24432},
 'bank': {'male': 28305, 'female': 22006},
 'pets': {'male': 4366, 'female': 8980},
 'cosmetics': {'male': 13621, 'female': 25971},
 'entertainment_services': {'male': 21604, 'female': 26139},
 'health': {'male': 9191, 'female': 15316},
 'media': {'male': 20004, 'female': 19843},
 'furniture': {'male': 12599, 'female': 17579},
 'shoes': {'male': 4811, 'female': 8511},
 'travel': {'male': 9240, 'female': 12407},
 'household_appliances': {'male': 9481, 'female': 6840},
 'food': {'male': 15524, 'female': 17601},
 'kids': {'male': 6976, 'female': 13786},
 'luxury': {'male': 6532, 'female': 9509},
 'gifts': {'male': 3764, 'female': 6847},
 'dealoftheday': {'male': 2696, 'female': 3614},
 'sport': {'male': 3085, 

'"0" => "f", "1" => "a", "2" => "s", "3" => "h", "4" => "i", "5" => "o", "6" => "n"': {'male': 0,
  'female': 3} — TODO: эта категория непонятна, нужно будет посмотреть на неё отдельно

In [20]:
# Show the accumulated male/female counts per site
site_stats_simplified

{1: {'male': 2489, 'female': 1633},
 2: {'male': 82366, 'female': 54713},
 3: {'male': 181589, 'female': 195597},
 21: {'male': 35072, 'female': 34783},
 22: {'male': 10033, 'female': 8784},
 23: {'male': 44805, 'female': 17430},
 16: {'male': 92326, 'female': 70431},
 29: {'male': 77863, 'female': 62013},
 30: {'male': 25389, 'female': 7149},
 20: {'male': 146294, 'female': 112170},
 31: {'male': 5783, 'female': 11915},
 32: {'male': 1581, 'female': 2080},
 39: {'male': 20577, 'female': 17668},
 40: {'male': 26084, 'female': 16446},
 13: {'male': 33934, 'female': 41378},
 41: {'male': 48128, 'female': 34268},
 123: {'male': 786, 'female': 3940},
 50: {'male': 6834, 'female': 15153},
 150: {'male': 1175, 'female': 2260},
 146: {'male': 1227, 'female': 14673},
 151: {'male': 12338, 'female': 3308},
 205: {'male': 3294, 'female': 7947},
 206: {'male': 21885, 'female': 14768},
 63: {'male': 2012, 'female': 3016},
 207: {'male': 4927, 'female': 4228},
 208: {'male': 8424, 'female': 17880},

In [None]:
# Dictionaries naming the prevailing gender of every site and every category
# (a tie goes to 'female').
gender_dominant_sites = {}
for site_id, stats in site_stats_simplified.items():
    if stats['female'] >= stats['male']:
        gender_dominant_sites[site_id] = 'female'
    else:
        gender_dominant_sites[site_id] = 'male'

gender_dominant_categories = {}
for category, stats in category_stats.items():
    if stats['female'] >= stats['male']:
        gender_dominant_categories[category] = 'female'
    else:
        gender_dominant_categories[category] = 'male'

**TODO:** эти словари я ещё не пробовал использовать для прогноза; надо будет тоже попробовать — возможно, чем проще, тем лучше.

In [21]:
# Dictionaries holding the male and female share of every site and category.
# NOTE(review): shares are rounded to 2 decimals, so near-even splits collapse
# to 0.5/0.5 — confirm this loss of precision is intended.
gender_ratio_sites = {}
for site_id, stats in site_stats_simplified.items():
    total = stats['male'] + stats['female']
    if not total > 0:
        continue  # nothing observed, no ratio to compute
    male_share = round(stats['male'] / total, 2)
    female_share = round(stats['female'] / total, 2)
    gender_ratio_sites[site_id] = {'male': male_share, 'female': female_share}

gender_ratio_categories = {}
for category, stats in category_stats.items():
    total = stats['male'] + stats['female']
    if not total > 0:
        continue  # nothing observed, no ratio to compute
    male_share = round(stats['male'] / total, 2)
    female_share = round(stats['female'] / total, 2)
    gender_ratio_categories[category] = {'male': male_share, 'female': female_share}

In [41]:
# Results: prevailing gender per site
gender_dominant_sites


{1: 'male',
 2: 'male',
 3: 'female',
 21: 'male',
 22: 'male',
 23: 'male',
 16: 'male',
 29: 'male',
 30: 'male',
 20: 'male',
 31: 'female',
 32: 'female',
 39: 'male',
 40: 'male',
 13: 'female',
 41: 'male',
 123: 'female',
 50: 'female',
 150: 'female',
 146: 'female',
 151: 'male',
 205: 'female',
 206: 'male',
 63: 'female',
 207: 'male',
 208: 'female',
 209: 'female',
 225: 'female',
 247: 'female',
 35: 'male',
 155: 'male',
 147: 'male',
 263: 'female',
 10: 'female',
 241: 'male',
 196: 'female',
 62: 'female',
 307: 'female',
 236: 'female',
 175: 'female',
 46: 'female',
 246: 'female',
 348: 'male',
 12: 'female',
 65: 'female',
 49: 'female',
 171: 'female',
 129: 'female',
 477: 'female',
 389: 'male',
 191: 'female',
 320: 'female',
 217: 'male',
 478: 'male',
 149: 'female',
 160: 'male',
 179: 'female',
 25: 'male',
 74: 'male',
 325: 'female',
 71: 'female',
 58: 'female',
 158: 'female',
 34: 'female',
 351: 'female',
 26: 'male',
 112: 'male',
 98: 'female',
 11

In [24]:
# Prevailing gender per category
gender_dominant_categories

{'other': 'female',
 'electronics': 'male',
 'hypermarket': 'male',
 'education': 'female',
 'insurance': 'male',
 'fashion': 'female',
 'bank': 'male',
 'pets': 'female',
 'cosmetics': 'female',
 'entertainment_services': 'female',
 'health': 'female',
 'media': 'male',
 'furniture': 'female',
 'shoes': 'female',
 'travel': 'female',
 'household_appliances': 'male',
 'food': 'female',
 'kids': 'female',
 'luxury': 'female',
 'gifts': 'female',
 'dealoftheday': 'female',
 'sport': 'female',
 'software': 'male',
 '"0" => "f", "1" => "a", "2" => "s", "3" => "h", "4" => "i", "5" => "o", "6" => "n"': 'female'}

In [25]:
# Male/female share per site
gender_ratio_sites

{1: {'male': 0.6, 'female': 0.4},
 2: {'male': 0.6, 'female': 0.4},
 3: {'male': 0.48, 'female': 0.52},
 21: {'male': 0.5, 'female': 0.5},
 22: {'male': 0.53, 'female': 0.47},
 23: {'male': 0.72, 'female': 0.28},
 16: {'male': 0.57, 'female': 0.43},
 29: {'male': 0.56, 'female': 0.44},
 30: {'male': 0.78, 'female': 0.22},
 20: {'male': 0.57, 'female': 0.43},
 31: {'male': 0.33, 'female': 0.67},
 32: {'male': 0.43, 'female': 0.57},
 39: {'male': 0.54, 'female': 0.46},
 40: {'male': 0.61, 'female': 0.39},
 13: {'male': 0.45, 'female': 0.55},
 41: {'male': 0.58, 'female': 0.42},
 123: {'male': 0.17, 'female': 0.83},
 50: {'male': 0.31, 'female': 0.69},
 150: {'male': 0.34, 'female': 0.66},
 146: {'male': 0.08, 'female': 0.92},
 151: {'male': 0.79, 'female': 0.21},
 205: {'male': 0.29, 'female': 0.71},
 206: {'male': 0.6, 'female': 0.4},
 63: {'male': 0.4, 'female': 0.6},
 207: {'male': 0.54, 'female': 0.46},
 208: {'male': 0.32, 'female': 0.68},
 209: {'male': 0.28, 'female': 0.72},
 225:

In [26]:
# Male/female share per category
gender_ratio_categories

{'other': {'male': 0.5, 'female': 0.5},
 'electronics': {'male': 0.54, 'female': 0.46},
 'hypermarket': {'male': 0.52, 'female': 0.48},
 'education': {'male': 0.45, 'female': 0.55},
 'insurance': {'male': 0.55, 'female': 0.45},
 'fashion': {'male': 0.41, 'female': 0.59},
 'bank': {'male': 0.56, 'female': 0.44},
 'pets': {'male': 0.33, 'female': 0.67},
 'cosmetics': {'male': 0.34, 'female': 0.66},
 'entertainment_services': {'male': 0.45, 'female': 0.55},
 'health': {'male': 0.38, 'female': 0.62},
 'media': {'male': 0.5, 'female': 0.5},
 'furniture': {'male': 0.42, 'female': 0.58},
 'shoes': {'male': 0.36, 'female': 0.64},
 'travel': {'male': 0.43, 'female': 0.57},
 'household_appliances': {'male': 0.58, 'female': 0.42},
 'food': {'male': 0.47, 'female': 0.53},
 'kids': {'male': 0.34, 'female': 0.66},
 'luxury': {'male': 0.41, 'female': 0.59},
 'gifts': {'male': 0.35, 'female': 0.65},
 'dealoftheday': {'male': 0.43, 'female': 0.57},
 'sport': {'male': 0.43, 'female': 0.57},
 'software':

В целом, на первый взгляд, в этих правилах есть вполне понятная логика.

## Приступаем к работе над валидационным набором данных

In [27]:
# Load the validation dataset
df_val = pd.read_json('val.json')

Здесь ячейки просто скопированы из обработки тренировочного набора с заменой имени датафрейма. TODO: обновить описания и обернуть всю обработку в функции.

In [28]:
# Users arrive as columns, so transpose the frame to get one row per user,
# move the user ids out of the index and give the columns readable names.
# NOTE: the test set will not have a 'target' column.
df_transposed_val = (
    df_val.T
    .reset_index()
    .rename(columns={'index': 'user_id', 0: 'target', 1: 'features'})
)

# Inspect the result
df_transposed_val.head()

Unnamed: 0,user_id,target,features
0,user_127756,female,"{'orders': [{'site-id': 407, 'orders': [{'crea..."
1,user_127757,male,"{'orders': [{'site-id': 16, 'orders': [{'creat..."
2,user_127758,female,"{'orders': [{'site-id': 149, 'orders': [{'crea..."
3,user_127759,female,"{'orders': [{'site-id': 93, 'orders': [{'creat..."
4,user_127760,female,"{'visits': [{'site-id': 3, 'first-seen': 16836..."


In [29]:
# Collect every key that occurs in any user's 'features' dict
unique_keys = set()
for features in df_transposed_val['features']:
    unique_keys.update(features.keys())

# Turn each key into its own column (None where a user lacks the key).
# Iterate in sorted order: the iteration order of a set of strings can differ
# between interpreter runs, which would shuffle the column order on restart.
for key in sorted(unique_keys):
    df_transposed_val[key] = df_transposed_val['features'].apply(lambda x: x.get(key))

# The 'features' column is no longer needed
df_transposed_val.drop('features', axis=1, inplace=True)

# Inspect the result
df_transposed_val.head()

Unnamed: 0,user_id,target,exchange-sessions,orders,visits,site-meta,last-visits-in-categories
0,user_127756,female,"[{'landed-at': 1697132982, 'sites': [169, 214,...","[{'site-id': 407, 'orders': [{'created-at': 16...","[{'site-id': 3, 'first-seen': 1697125590, 'las...",[{'site-id': 407}],"[{'category': 'other', 'last-visit-at': 169713..."
1,user_127757,male,"[{'landed-at': 1693508972, 'sites': [2, 169, 7...","[{'site-id': 16, 'orders': [{'created-at': 163...","[{'site-id': 3, 'first-seen': 1698423484, 'las...","[{'site-id': 16, 'recency': 1, 'frequency': 5,...","[{'category': 'electronics', 'last-visit-at': ..."
2,user_127758,female,,"[{'site-id': 149, 'orders': [{'created-at': 16...","[{'site-id': 391, 'first-seen': 1685615371, 'l...","[{'site-id': 391}, {'site-id': 42}, {'site-id'...","[{'category': 'sport', 'last-visit-at': 165060..."
3,user_127759,female,"[{'landed-at': 1697791749, 'sites': [373, 169,...","[{'site-id': 93, 'orders': [{'created-at': 162...","[{'site-id': 495, 'first-seen': 1687325784, 'l...","[{'site-id': 93, 'recency': 5, 'frequency': 2,...","[{'category': 'furniture', 'last-visit-at': 16..."
4,user_127760,female,,,"[{'site-id': 3, 'first-seen': 1683626530, 'las...","[{'site-id': 46}, {'site-id': 29}, {'site-id':...","[{'category': 'other', 'last-visit-at': 169765..."


In [30]:
# Add a column holding, for every user, the number of orders per site
df_transposed_val['orders_count'] = df_transposed_val['orders'].apply(count_site_ids_in_orders_modified)

# Inspect the result
df_transposed_val[['user_id', 'orders_count']].head()

Unnamed: 0,user_id,orders_count
0,user_127756,{407: 1}
1,user_127757,"{16: 10, 40: 1, 288: 1, 21: 2}"
2,user_127758,"{149: 2, 25: 1, 21: 2, 100: 1, 474: 1}"
3,user_127759,"{93: 8, 134: 1, 103: 1}"
4,user_127760,{}


In [31]:
# Apply the function and store the per-site visit counts in a new column
df_transposed_val['visits_count'] = df_transposed_val['visits'].apply(count_site_ids_in_visits_modified)

# Inspect the result
df_transposed_val[['user_id', 'visits_count']].head()

Unnamed: 0,user_id,visits_count
0,user_127756,"{3: 2, 407: 10}"
1,user_127757,"{3: 1, 308: 1, 16: 1}"
2,user_127758,"{391: 1, 42: 7, 485: 1, 820: 1, 97: 5, 39: 1, ..."
3,user_127759,"{495: 1, 34: 9, 484: 1, 334: 1, 205: 1, 31: 1,..."
4,user_127760,"{3: 2, 34: 1}"


In [33]:
# Apply the function and store the resulting category list in a new column
df_transposed_val['last_visited_categories'] = df_transposed_val['last-visits-in-categories'].apply(extract_categories)
# Check the extracted categories for the first few users
df_transposed_val[['user_id', 'last_visited_categories']].head()

Unnamed: 0,user_id,last_visited_categories
0,user_127756,"[other, education]"
1,user_127757,"[electronics, other, shoes]"
2,user_127758,"[sport, furniture, fashion, other, bank, house..."
3,user_127759,"[furniture, fashion, other, food, electronics,..."
4,user_127760,"[other, cosmetics]"


In [34]:
# Look at the result of the feature engineering
df_transposed_val[['user_id', 'orders_count', 'visits_count', 'last_visited_categories']].head()

Unnamed: 0,user_id,orders_count,visits_count,last_visited_categories
0,user_127756,{407: 1},"{3: 2, 407: 10}","[other, education]"
1,user_127757,"{16: 10, 40: 1, 288: 1, 21: 2}","{3: 1, 308: 1, 16: 1}","[electronics, other, shoes]"
2,user_127758,"{149: 2, 25: 1, 21: 2, 100: 1, 474: 1}","{391: 1, 42: 7, 485: 1, 820: 1, 97: 5, 39: 1, ...","[sport, furniture, fashion, other, bank, house..."
3,user_127759,"{93: 8, 134: 1, 103: 1}","{495: 1, 34: 9, 484: 1, 334: 1, 205: 1, 31: 1,...","[furniture, fashion, other, food, electronics,..."
4,user_127760,{},"{3: 2, 34: 1}","[other, cosmetics]"


# Переходим к предсказаниям

С этой функцией можно поэкспериментировать: что-то отключить, а может быть, что-то добавить.

In [35]:
# Builds the value for the predict_gender column
def predict_gender(row):
    """Predict 'male' or 'female' for one user row.

    Every order and visit on a site listed in ``gender_ratio_sites`` adds that
    site's male and female shares, multiplied by the count, to the two scores.
    Every last-visited category listed in ``gender_ratio_categories`` adds its
    shares once. Unknown sites and categories contribute nothing.

    Parameters
    ----------
    row : mapping
        Must provide ``'orders_count'`` and ``'visits_count'`` (mappings of
        site id to count) and ``'last_visited_categories'`` (an iterable of
        category names).

    Returns
    -------
    str
        ``'female'`` when the female score is at least the male score,
        otherwise ``'male'``.
    """
    male_score = 0
    female_score = 0

    # Orders first, then visits: both are scored with the same site shares.
    for site_counts in (row['orders_count'], row['visits_count']):
        for site_id, count in site_counts.items():
            if site_id not in gender_ratio_sites:
                continue
            site_ratio = gender_ratio_sites[site_id]
            male_score += site_ratio['male'] * count
            female_score += site_ratio['female'] * count

    # Categories count once each.
    for category in row['last_visited_categories']:
        if category not in gender_ratio_categories:
            continue
        category_ratio = gender_ratio_categories[category]
        male_score += category_ratio['male']
        female_score += category_ratio['female']

    if female_score >= male_score:
        return 'female'
    return 'male'

In [None]:
# Apply the prediction function to every row of the validation DataFrame
df_transposed_val['predict_gender'] = df_transposed_val.apply(predict_gender, axis=1)

In [38]:
# Results: true vs. predicted gender per user
df_transposed_val[['user_id', 'target','predict_gender']]

Unnamed: 0,user_id,target,predict_gender
0,user_127756,female,female
1,user_127757,male,male
2,user_127758,female,female
3,user_127759,female,female
4,user_127760,female,female
...,...,...,...
27442,user_155198,female,female
27443,user_155199,male,female
27444,user_155200,female,female
27445,user_155201,male,female


Нужно будет внимательно посмотреть на пользователей по которым мы ошиблись

In [39]:
def calculate_accuracy(df):
    """Return the share of rows whose prediction matches the target.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``'target'`` and ``'predict_gender'``.

    Returns
    -------
    float
        ``correct / total``. For an empty frame accuracy is undefined and
        ``nan`` is returned instead of raising ``ZeroDivisionError``.
    """
    total_predictions = len(df)
    if total_predictions == 0:
        return float('nan')
    correct_predictions = int((df['target'] == df['predict_gender']).sum())
    return correct_predictions / total_predictions


In [40]:
# Compute the accuracy on the validation set
accuracy = calculate_accuracy(df_transposed_val)
accuracy

0.7164717455459613

# Анализ ошибок

In [43]:
# Keep only the validation rows where the prediction disagrees with the target
df_errors = df_transposed_val.loc[
    df_transposed_val['target'].ne(df_transposed_val['predict_gender'])
]

In [44]:
# Show the first few misclassified rows
df_errors.head()

Unnamed: 0,user_id,target,exchange-sessions,orders,visits,site-meta,last-visits-in-categories,orders_count,visits_count,last_visited_categories,predict_gender
8,user_127764,male,,"[{'site-id': 179, 'orders': [{'created-at': 13...","[{'site-id': 3, 'first-seen': 1690804257, 'las...","[{'site-id': 179, 'recency': 1, 'frequency': 2...","[{'category': 'fashion', 'last-visit-at': 1685...","{179: 2, 40: 1}","{3: 2, 13: 1, 2: 1, 208: 2, 85: 2, 431: 6, 7: 1}","[fashion, other, bank, household_appliances, f...",female
9,user_127765,female,,"[{'site-id': 16, 'orders': [{'created-at': 156...","[{'site-id': 3, 'first-seen': 1684519666, 'las...","[{'site-id': 2}, {'site-id': 16, 'recency': 1,...","[{'category': 'other', 'last-visit-at': 169651...","{16: 1, 2: 6, 691: 2}",{3: 4},[other],male
14,user_127770,female,,"[{'site-id': 39, 'orders': [{'created-at': 167...","[{'site-id': 13, 'first-seen': 1684836326, 'la...","[{'site-id': 2}, {'site-id': 39, 'recency': 1,...","[{'category': 'bank', 'last-visit-at': 1674136...","{39: 1, 244: 1, 1: 1}","{13: 1, 3: 1}","[bank, electronics, other, media, cosmetics, f...",male
15,user_127771,male,"[{'landed-at': 1698678911, 'sites': [169, 2, 2...","[{'site-id': 62, 'orders': [{'created-at': 168...","[{'site-id': 34, 'first-seen': 1685510491, 'la...","[{'site-id': 42}, {'site-id': 33}, {'site-id':...","[{'category': 'fashion', 'last-visit-at': 1698...","{62: 1, 40: 1, 209: 2, 29: 5, 74: 1}","{34: 2, 135: 2, 149: 1, 26: 4, 110: 2, 713: 1,...","[fashion, other, bank, electronics, cosmetics,...",female
20,user_127776,male,"[{'landed-at': 1697833017, 'sites': [2, 169, 7...","[{'site-id': 49, 'orders': [{'created-at': 150...","[{'site-id': 912, 'first-seen': 1688017479, 'l...","[{'site-id': 495}, {'site-id': 49, 'recency': ...","[{'category': 'other', 'last-visit-at': 169783...","{49: 10, 191: 1, 34: 1, 112: 6, 21: 1, 39: 1, ...","{912: 2, 495: 5, 66: 1, 178: 1, 357: 1, 51: 1,...","[other, bank, travel, pets, food, electronics,...",female
