In [None]:
"""
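- Convert every categorical column, including customer_id and article_id, to 0-based sequential indices
  (customer_id -> `user`, article_id -> `item`; every other category gets a new `<column>_idx` column).
- Convert categories that contain only None / 1 to 0 / 1 (the column is overwritten).
- Convert categories that contain only 1 / 2 to 0 / 1 (the column is overwritten).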
"""
import shutil
from pathlib import Path
from typing import Any
import pandas as pd
from logzero import logger
from tqdm.auto import tqdm

# Column dtypes handed to pd.read_csv for the raw CSV files.
# articles.csv: article_id, index_code, prod_name, detail_desc and every *_name
# column are read as object; the remaining numeric codes are read as int64.
ARTICLES_ORIGINAL = {
    "article_id": "object",
    "product_code": "int64",
    "prod_name": "object",
    "product_type_no": "int64",
    "product_type_name": "object",
    "product_group_name": "object",
    "graphical_appearance_no": "int64",
    "graphical_appearance_name": "object",
    "colour_group_code": "int64",
    "colour_group_name": "object",
    "perceived_colour_value_id": "int64",
    "perceived_colour_value_name": "object",
    "perceived_colour_master_id": "int64",
    "perceived_colour_master_name": "object",
    "department_no": "int64",
    "department_name": "object",
    "index_code": "object",
    "index_name": "object",
    "index_group_no": "int64",
    "index_group_name": "object",
    "section_no": "int64",
    "section_name": "object",
    "garment_group_no": "int64",
    "garment_group_name": "object",
    "detail_desc": "object",
}

# customers.csv: FN, Active and age are read as float64, everything else as object.
CUSTOMERS_ORIGINAL = {
    "customer_id": "object",
    "FN": "float64",
    "Active": "float64",
    "club_member_status": "object",
    "fashion_news_frequency": "object",
    "age": "float64",
    "postal_code": "object",
}

# transactions_train.csv: only these four columns get an explicit dtype here;
# the t_dat column has no entry in this map.
TRANSACTIONS_ORIGINAL = {
    "customer_id": "object",
    "article_id": "object",
    "price": "float64",
    "sales_channel_id": "int64",
}

# Directory the raw CSV files are read from and the generated .pkl files are written to.
# NOTE(review): machine-specific absolute path; adjust before running on another machine.
data_dir = '/home/xuming/workspace/h-and-m-personalized-fashion-recommendations'

def _count_encoding_dict(df, col_name):
    """Build a frequency-ranked label-encoding map for one column.

    The distinct values of ``df[col_name]`` are ordered by the number of rows
    that contain them, most frequent first, and each value is paired with its
    position in that ordering.

    Args:
        df: DataFrame that contains ``col_name``.
        col_name: Name of the column whose values are ranked.

    Returns:
        dict of the form ``{most_frequent: 0, second_most_frequent: 1, ...}``,
        meant to be used for label encoding afterwards.

    Note:
        With pandas' default ``groupby`` settings, missing values (NaN) form
        no group and therefore receive no entry; fill them beforehand if they
        need an index.
    """
    # One row per distinct value, with its row count in the 'size' column.
    counts = df.groupby(col_name).size().reset_index(name='size')
    # Largest groups first.
    ranked = counts.sort_values(by='size', ascending=False)
    ordered_values = ranked[col_name].tolist()
    return dict(zip(ordered_values, range(len(ordered_values))))

def _dict_to_dataframe(mp):
    """Turn a mapping into a two-column DataFrame.

    Args:
        mp: dict whose items become the rows of the result.

    Returns:
        DataFrame with one row per item: the key in column ``val`` and the
        value in column ``idx``.
    """
    pairs = list(mp.items())
    return pd.DataFrame(pairs, columns=['val', 'idx'])

def _add_idx_column(df, col_name_from, col_name_to, mp):
    """Label-encode a column in place using a precomputed mapping.

    Writes ``df[col_name_to]`` (creating or overwriting it) with the integer
    that ``mp`` assigns to each value of ``df[col_name_from]``.

    Args:
        df: DataFrame to modify in place.
        col_name_from: Name of the column that holds the raw values.
        col_name_to: Name of the int64 column to write.
        mp: dict mapping every raw value to its integer index.

    Raises:
        KeyError: If the source column contains a value (including a missing
            value) that ``mp`` does not map. ``df`` is left untouched then.
    """
    source = df[col_name_from]
    # Series.map with a dict performs the lookup in bulk instead of invoking a
    # Python lambda once per row, which matters for the large transactions table.
    encoded = source.map(mp)

    # map() yields NaN for values without an entry; raise KeyError, as a plain
    # dict lookup would, instead of a less specific cast error from astype.
    unmapped = encoded.isna()
    if unmapped.any():
        examples = source[unmapped].unique()[:5].tolist()
        raise KeyError(
            f'{col_name_from!r} contains values missing from the mapping, e.g. {examples}'
        )

    df[col_name_to] = encoded.astype('int64')

In [None]:
logger.info("start reading dataframes")

# Load the three raw tables, each with its explicit column-dtype map.
articles = pd.read_csv(
    f"{data_dir}/articles.csv",
    dtype=ARTICLES_ORIGINAL,
)
customers = pd.read_csv(
    f"{data_dir}/customers.csv",
    dtype=CUSTOMERS_ORIGINAL,
)
# t_dat is parsed as a date column while reading.
transactions = pd.read_csv(
    f"{data_dir}/transactions_train.csv",
    dtype=TRANSACTIONS_ORIGINAL,
    parse_dates=["t_dat"],
)

# customer_id -> 0-based index, numbered in the row order of the customers table
logger.info("start processing customer_id")
customer_ids = customers["customer_id"].values
mp_customer_id = dict(zip(customer_ids, range(len(customer_ids))))
# Persist the mapping as a two-column DataFrame
_dict_to_dataframe(mp_customer_id).to_pickle(f"{data_dir}/mp_customer_id.pkl")

# article_id -> 0-based index, numbered in the row order of the articles table
logger.info("start processing article_id")
article_ids = articles["article_id"].values
mp_article_id = dict(zip(article_ids, range(len(article_ids))))
# Persist the mapping as a two-column DataFrame
_dict_to_dataframe(mp_article_id).to_pickle(f"{data_dir}/mp_article_id.pkl")

In [None]:
# --------------------------------------------
# Build and save the customers pickle
# --------------------------------------------
logger.info("start processing customers")
# Label-encode customer_id into the 'user' column
_add_idx_column(customers, "customer_id", "user", mp_customer_id)

# (None, 1) -> (0, 1): missing flags become 0, then the column is cast to int64
for col_name in ("FN", "Active"):
    customers[col_name] = customers[col_name].fillna(0).astype("int64")

# Missing categories become the literal 'NULL', then each column gets a
# frequency-ranked label encoding in a new '<column>_idx' column
for col_name in ("club_member_status", "fashion_news_frequency"):
    customers[col_name] = customers[col_name].fillna("NULL")
    mp = _count_encoding_dict(customers, col_name)
    _add_idx_column(customers, col_name, f"{col_name}_idx", mp)

# Save the processed table
customers.to_pickle(f"{data_dir}/users.pkl")


# --------------------------------------------
# Build and save the articles pickle
# --------------------------------------------
logger.info("start processing articles")
# Label-encode article_id into the 'item' column
_add_idx_column(articles, "article_id", "item", mp_article_id)

# Columns that each receive a frequency-ranked '<column>_idx' companion column
count_encoding_columns = [
    "product_type_no",
    "product_group_name",
    "graphical_appearance_no",
    "colour_group_code",
    "perceived_colour_value_id",
    "perceived_colour_master_id",
    "department_no",
    "index_code",
    "index_group_no",
    "section_no",
    "garment_group_no",
]
for col_name in count_encoding_columns:
    # Rank the values by frequency, then write the resulting indices
    mp = _count_encoding_dict(articles, col_name)
    _add_idx_column(articles, col_name, f"{col_name}_idx", mp)

# Save the processed table
articles.to_pickle(f"{data_dir}/items.pkl")


# --------------------------------------------
# Build and save the transactions pickle
# --------------------------------------------
logger.info("start processing transactions")
# Label-encode customer_id and article_id into the 'user' and 'item' columns
_add_idx_column(transactions, "customer_id", "user", mp_customer_id)
_add_idx_column(transactions, "article_id", "item", mp_article_id)

# (1, 2) -> (0, 1)
# NOTE(review): subtracts 1 on every execution, so re-running this cell shifts the values again.
transactions["sales_channel_id"] = transactions["sales_channel_id"].sub(1)

# Whole days between each row and the latest t_dat in the table, computed once
days_before_latest = (transactions["t_dat"].max() - transactions["t_dat"]).dt.days
transactions["week"] = days_before_latest // 7  # whole weeks before the latest date
transactions["day"] = days_before_latest  # days before the latest date
del days_before_latest  # temporary only; keep the namespace as it was

# Save the processed table
transactions.to_pickle(f"{data_dir}/transactions_train.pkl")