In [None]:
# Import libraries
import numpy as np
import pandas as pd
from abc import ABC, abstractmethod
from pathlib import Path
from tqdm import tqdm
import pickle
from collections import defaultdict
from typing import List, Dict, Any, Union
from datetime import datetime
from datetime import timedelta

In [None]:
# Load the raw Kaggle tables: articles, customers and the transaction log.
articles_df = pd.read_csv('/content/articles.csv')
customers_df = pd.read_csv('/content/customers.csv')
transactions_df = pd.read_csv('/content/transactions_train.csv')
# Parse the transaction date column into pandas datetimes.
transactions_df['t_dat'] = pd.to_datetime(transactions_df['t_dat'])

Mounted at /content/gdrive/


In [None]:
# Data cleaning: fill missing values in the article and customer tables.
articles_df['detail_desc'] = articles_df['detail_desc'].fillna("None")
# Any frequency other than 'Regularly' / 'Monthly' (including missing values) is collapsed to 'None'.
customers_df.loc[~customers_df['fashion_news_frequency'].isin(['Regularly', 'Monthly']), 'fashion_news_frequency'] = 'None'
# FN is filled from its own column; filling it from 'Active' would overwrite
# the FN flag with the Active flag.
customers_df['FN'] = customers_df['FN'].fillna(0.0)
customers_df['Active'] = customers_df['Active'].fillna(0.0)
customers_df['club_member_status'] = customers_df['club_member_status'].fillna("LEFT CLUB")
# Missing ages are replaced by the median age.
customers_df['age'] = customers_df['age'].fillna(customers_df['age'].median())


# Feature-extraction classes

In [None]:
class UserFeatures(ABC):
    """Abstract base class for per-customer feature extractors."""

    @abstractmethod
    def get(self) -> pd.DataFrame:
        """Return a DataFrame mapping customer_id -> features."""
        ...

In [None]:
class AggrFeatures(UserFeatures):
    """
    Per-customer summary statistics of the transaction ``price`` column.

    Parameters
    ----------
    transactions_df : pd.DataFrame
        Transactions with at least ``customer_id`` and ``price`` columns.
    """
    def __init__(self, transactions_df):
        # Group with customer_id as the index so the aggregated frame comes
        # back already keyed by customer_id.
        self.groupby_df = transactions_df.groupby('customer_id')

    def get(self):
        """
        Returns
        -------
        pd.DataFrame
            Indexed by ``customer_id`` with float32 columns
            mean/max/min/median/sum ``_transactions`` and
            ``max_minus_min_transactions``.
        """
        # Named aggregation replaces the {new_name: func} dict, which pandas
        # rejects on a single column ("nested renamer is not supported").
        output_df = self.groupby_df['price'].agg(
            mean_transactions='mean',
            max_transactions='max',
            min_transactions='min',
            median_transactions='median',
            sum_transactions='sum',
        )
        # Price range per customer, derived from the aggregated columns
        # instead of a per-group Python lambda.
        output_df['max_minus_min_transactions'] = (
            output_df['max_transactions'] - output_df['min_transactions']
        )
        return output_df.astype('float32')

In [None]:
class CountFeatures(UserFeatures):
    """
    Per-customer transaction counts.

    Parameters
    ----------
    transactions_df : pd.DataFrame
        Transactions with ``customer_id``, ``article_id``, ``price`` and
        ``sales_channel_id`` columns.
    topk : int, default 10
        Number of most frequent articles (over all of ``transactions_df``)
        that get their own ``top_article_{i}`` count column.
    """
    def __init__(self, transactions_df, topk = 10):
        self.transactions_df = transactions_df
        self.topk = topk

    def get(self):
        """
        Returns
        -------
        pd.DataFrame
            Indexed by ``customer_id`` with int32 columns, in this order:
            ``n_transactions``, ``n_transactions_bigger_mean``,
            ``n_online_articles``, ``n_unique_articles``,
            ``n_store_articles`` and one ``top_article_{i}`` per top article.
        """
        tx = self.transactions_df

        basic_counts = tx.groupby('customer_id').agg(
            n_transactions=('article_id', 'count'),
            # Number of purchases priced above the customer's own mean price.
            n_transactions_bigger_mean=('price', lambda x: (x > x.mean()).sum()),
            n_unique_articles=('article_id', 'nunique'),
        )

        topk_articles = tx['article_id'].value_counts().index[:self.topk]
        top_columns = [f'top_article_{i}' for i in range(len(topk_articles))]

        # One boolean indicator column per "equals value" count; each article
        # id is bound when its indicator is built, so every top_article_{i}
        # column counts its own article (lambdas created in a comprehension
        # would all see the last article id).
        indicators = tx[['customer_id']].assign(
            n_online_articles=tx['sales_channel_id'].eq(2),
            n_store_articles=tx['sales_channel_id'].eq(1),
            **{
                column: tx['article_id'].eq(article)
                for column, article in zip(top_columns, topk_articles)
            },
        )
        indicator_counts = indicators.groupby('customer_id').sum()

        column_order = [
            'n_transactions',
            'n_transactions_bigger_mean',
            'n_online_articles',
            'n_unique_articles',
            'n_store_articles',
        ] + top_columns
        output_df = basic_counts.join(indicator_counts)[column_order]
        # int32 rather than int8: int8 wraps around above 127 transactions.
        return output_df.astype('int32')

In [None]:
class CustomerFeatures(UserFeatures):
    """
    Takes customers_df and applies a few transformations before the customer
    data is used as features.

    Parameters
    ----------
    customers_df : pd.DataFrame
        Customer table with ``customer_id``, ``FN``, ``Active``,
        ``club_member_status``, ``age`` and ``fashion_news_frequency`` columns.
    """
    def __init__(self, customers_df):
        self.customers_df = self._prepare_customers(customers_df)

    def _prepare_customers(self, customers_df):
        """Return a cleaned copy of ``customers_df`` (the input is not modified)."""
        # Work on a copy so the caller's frame keeps its original values/dtypes.
        customers_df = customers_df.copy()
        customers_df['FN'] = customers_df['FN'].fillna(0).astype('int8')
        customers_df['Active'] = customers_df['Active'].fillna(0).astype('int8')
        customers_df['club_member_status'] = customers_df['club_member_status'].fillna('UNKNOWN')
        # Missing ages get the mean age; the int8 cast truncates the fraction.
        customers_df['age'] = customers_df['age'].fillna(customers_df['age'].mean()).astype('int8')
        # Normalise both the 'None' label and missing values to 'NONE'.
        customers_df['fashion_news_frequency'] = (
            customers_df['fashion_news_frequency']
            .replace('None', 'NONE')
            .fillna('NONE')
        )
        return customers_df

    def get(self):
        """
        Implementation of ``get``.

        Returns
        -------
        pd.DataFrame
            All customer features (except the ``postal_code`` column),
            indexed by ``customer_id``.
        """
        # Use this instance's own frame, not the notebook-level customers_df.
        output = (
            self.customers_df
            .drop(columns=['postal_code'], errors='ignore')
            .set_index('customer_id')
        )
        return output

In [None]:
class ArticlesFeatures(UserFeatures):
    """
    Per-customer counts of purchases falling into the most frequent values of
    every article column whose name contains ``'name'``.

    Parameters
    ----------
    transactions_df : pd.DataFrame
        Transactions with ``customer_id`` and ``article_id`` columns.
    articles : pd.DataFrame
        Article table, merged onto the transactions via ``article_id``.
    topk : int, default 10
        Number of most frequent values kept per article column.
    """
    def __init__(self, transactions_df, articles, topk = 10):
        self.merged_df = transactions_df.merge(articles, on = ('article_id'))
        self.articles = articles
        self.topk = topk

    def get(self):
        """
        Returns
        -------
        pd.DataFrame or None
            Indexed by ``customer_id`` with int16 columns
            ``top_{column}_{i}``; ``None`` when no article column name
            contains ``'name'``.
        """
        output_df = None
        for col in tqdm(self.articles.columns, desc = 'extracting features'):
            if 'name' not in col:
                continue
            col_features = self.aggregate_topk(self.merged_df, col, self.topk)
            if output_df is None:
                output_df = col_features
            else:
                output_df = output_df.merge(col_features, on = ('customer_id'))
        return output_df

    def return_value_counts(self, df, column_name, k):
        """
        Return the ``k`` most frequent values of ``column_name`` as a list.

        ``df`` is normally a DataFrame. A GroupBy object is still accepted and
        is ranked on its underlying frame: ranking per group would return the
        first (customer, value) pairs instead of the overall top values.
        """
        frame = df if isinstance(df, pd.DataFrame) else df.obj
        return list(frame[column_name].value_counts().index[:k])

    def aggregate_topk(self, merged_df, column_name, k):
        """
        Count, per customer, the rows whose ``column_name`` equals each of the
        ``k`` most frequent values of that column.

        Returns a DataFrame indexed by ``customer_id`` with one int16 column
        ``top_{column_name}_{i}`` per top value.
        """
        topk_values = self.return_value_counts(merged_df, column_name, k)
        # Each indicator is built with its own value, so column i really
        # counts top value i (lambdas created in a comprehension would all
        # compare against the last value).
        indicators = merged_df[['customer_id']].assign(**{
            f'top_{column_name}_{i}': merged_df[column_name].eq(value)
            for i, value in enumerate(topk_values)
        })
        n_top_k = indicators.groupby('customer_id').sum().astype('int16')
        return n_top_k

In [None]:
class UserFeaturesCollector:
    """
    Collect all features and merge them into a single frame.
    """
    @staticmethod
    def collect(features: Union[List[UserFeatures], List[str]], **kwargs) -> pd.DataFrame:
        """
        Merge the given feature sources on ``customer_id`` (inner join).

        Parameters
        ----------
        features : list
            ``UserFeatures`` instances (their ``get(**kwargs)`` is called) or
            paths to CSV / parquet files holding precomputed features.
        **kwargs
            Forwarded to every ``UserFeatures.get`` call.

        Returns
        -------
        pd.DataFrame or None
            The merged features, or ``None`` when ``features`` is empty.

        Raises
        ------
        TypeError
            If an item is neither a ``UserFeatures`` instance nor a path.
        """
        output_df = None
        for feature in tqdm(features):
            if isinstance(feature, UserFeatures):
                feature_out = feature.get(**kwargs)
            elif isinstance(feature, (str, Path)):
                feature_out = UserFeaturesCollector._read_feature_file(feature)
            else:
                # Fail loudly instead of silently re-merging the previous
                # item's output (or hitting an unbound local on the first one).
                raise TypeError(
                    f"Unsupported feature source of type {type(feature).__name__}; "
                    "expected a UserFeatures instance or a file path"
                )
            if output_df is None:
                output_df = feature_out
            else:
                output_df = output_df.merge(feature_out, on = ('customer_id'))
        return output_df

    @staticmethod
    def _read_feature_file(path):
        """Read a precomputed feature file, choosing the reader by extension."""
        if Path(path).suffix.lower() in ('.parquet', '.pq'):
            return pd.read_parquet(path)
        try:
            return pd.read_csv(path)
        except Exception:
            # No recognised parquet extension but not parseable as CSV:
            # fall back to parquet. Interrupts are not swallowed.
            return pd.read_parquet(path)

In [None]:
### Segment customers by age bin and transactions by season
def _add_season_column(transaction_df):
    conditions = [
        (
            transaction_df["t_dat"].between(
                datetime(2019, 3, 1), datetime(2019, 5, 31)
            )
        )
        | (
            transaction_df["t_dat"].between(
                datetime(2020, 3, 1), datetime(2020, 5, 31)
            )
        ),
        (
            transaction_df["t_dat"].between(
                datetime(2019, 6, 1), datetime(2019, 8, 31)
            )
        )
        | (
            transaction_df["t_dat"].between(
                datetime(2020, 6, 1), datetime(2020, 8, 31)
            )
        ),
        (
            transaction_df["t_dat"].between(
                datetime(2018, 9, 1), datetime(2018, 11, 30)
            )
        )
        | (
            transaction_df["t_dat"].between(
                datetime(2019, 9, 1), datetime(2019, 11, 30)
            )
        )
        | (
            transaction_df["t_dat"].between(
                datetime(2020, 9, 1), datetime(2020, 11, 30)
            )
        ),
        (
            transaction_df["t_dat"].between(
                datetime(2018, 12, 1), datetime(2019, 2, 28)
            )
        )
        | (
            transaction_df["t_dat"].between(
                datetime(2019, 12, 1), datetime(2020, 2, 29)
            )
        ),
    ]
    choices = ["spring", "summer", "fall", "winter"]
    transaction_df["season"] = np.select(conditions, choices)
    return transaction_df

In [None]:
# Bucket customers into age ranges; the interval labels are stored as strings.
customers_df_with_age_bins = customers_df.assign(
    age_bins=pd.cut(customers_df['age'], [-1, 19, 29, 39, 49, 69, 119]).astype(str)
)
# Tag every transaction with its season.
transactions_df_with_season = _add_season_column(transactions_df)

In [None]:
# Feature extraction: one parquet file per (season, age bin) segment.
seasons = list(transactions_df_with_season['season'].unique())
age_bins = list(customers_df_with_age_bins['age_bins'].unique())
for season in tqdm(seasons):
    # Everything derived from the transactions depends only on the season,
    # so it is computed once per season rather than once per age bin.
    tmp_trans = transactions_df_with_season[transactions_df_with_season['season'] == season].copy()
    tmp_trans = tmp_trans.drop(columns=['season'])
    transaction_features = UserFeaturesCollector.collect([
        AggrFeatures(tmp_trans),
        CountFeatures(tmp_trans, 3),
    ])
    article_features = ArticlesFeatures(tmp_trans, articles_df, 3).get()

    for age_bin in tqdm(age_bins):
        tmp_cus = customers_df_with_age_bins[customers_df_with_age_bins['age_bins'] == age_bin].copy()
        tmp_cus = tmp_cus.drop(columns=['age_bins'])
        customer_features = CustomerFeatures(tmp_cus).get()
        # Same merge order as collecting all four extractors in one call:
        # aggregates, counts, customer attributes, then article features.
        user_features = (
            transaction_features
            .merge(customer_features, on = ('customer_id'))
            .merge(article_features, on = ('customer_id'))
        )
        user_features.to_parquet(f"/content/user_{season}_{age_bin}_features.parquet")


  0%|          | 0/2 [00:00<?, ?it/s]
  0%|          | 0/6 [00:00<?, ?it/s][A

  0%|          | 0/4 [00:00<?, ?it/s][A[A

 25%|██▌       | 1/4 [00:47<02:21, 47.17s/it][A[A

 50%|█████     | 2/4 [05:17<05:57, 178.73s/it][A[A

 75%|███████▌  | 3/4 [05:18<01:37, 97.36s/it] [A[A


extracting features:   0%|          | 0/25 [00:00<?, ?it/s][A[A[A


extracting features:  12%|█▏        | 3/25 [02:16<16:44, 45.64s/it][A[A[A


extracting features:  20%|██        | 5/25 [04:38<19:16, 57.82s/it][A[A[A


extracting features:  24%|██▍       | 6/25 [06:46<24:05, 76.09s/it][A[A[A


extracting features:  32%|███▏      | 8/25 [08:57<20:15, 71.53s/it][A[A[A


extracting features:  40%|████      | 10/25 [11:06<17:14, 68.99s/it][A[A[A


extracting features:  48%|████▊     | 12/25 [13:15<14:35, 67.36s/it][A[A[A


extracting features:  56%|█████▌    | 14/25 [15:27<12:16, 66.94s/it][A[A[A


extracting features:  64%|██████▍   | 16/25 [17:37<09:55, 66.19s/it][A[A[A


extrac