# H&M: Apply minimum sequence length on split files

In [1]:
import pandas as pd

In [2]:
# Load the three tab-separated split files, in the order train / validation / test.
train, validation, test = (
    pd.read_csv(split_path, sep="\t")
    for split_path in (
        "../hm_dataset/preprocessed_data/split/hm.train.csv",
        "../hm_dataset/preprocessed_data/split/hm.validation.csv",
        "../hm_dataset/preprocessed_data/split/hm.test.csv",
    )
)

---
### Minimum sequence length: 2

In [3]:
# Summary statistics of the number of rows per customer, one Series per split
# (train, validation, test), shown as a tuple.
tuple(
    split.groupby(['customer_id']).size().describe()
    for split in (train, validation, test)
)

(count    1.151249e+06
 mean     1.932434e+01
 std      3.069876e+01
 min      1.000000e+00
 25%      3.000000e+00
 50%      9.000000e+00
 75%      2.300000e+01
 max      1.287000e+03
 dtype: float64,
 count    574270.000000
 mean          8.298562
 std          10.252820
 min           1.000000
 25%           2.000000
 50%           5.000000
 75%          10.000000
 max         333.000000
 dtype: float64,
 count    576162.000000
 mean          8.239691
 std          10.197086
 min           1.000000
 25%           2.000000
 50%           5.000000
 75%          10.000000
 max         380.000000
 dtype: float64)

In [4]:
def apply_min_sequence_length(dataset, min_length=2):
    """Keep only the rows of customers that have at least ``min_length`` rows.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame containing a ``customer_id`` column. Every row counts as one
        element of that customer's sequence.
    min_length : int, default 2
        Minimum number of rows a customer must have in ``dataset`` to be kept.
        The default reproduces the previously hard-coded threshold.

    Returns
    -------
    pandas.DataFrame
        A copy of the qualifying rows, with the original index and row order
        preserved. ``dataset`` itself is not modified.
    """
    # Rows per customer. groupby drops missing keys by default, so rows whose
    # customer_id is missing never qualify (same as before).
    sequence_lengths = dataset.groupby('customer_id').size()
    # Vectorised comparison on the counts instead of a per-element lambda.
    kept_ids = sequence_lengths.index[sequence_lengths >= min_length]
    return dataset[dataset['customer_id'].isin(kept_ids)].copy()

In [5]:
# Filter each split on its own: a customer stays in a split only if that same
# split holds enough rows for them. Each name is rebound to the filtered copy
# returned by apply_min_sequence_length, one split at a time.
train = apply_min_sequence_length(train)
validation = apply_min_sequence_length(validation)
test = apply_min_sequence_length(test)

In [6]:
# Re-check the rows-per-customer statistics of each split
# (train, validation, test), shown as a tuple.
tuple(
    split.groupby(['customer_id']).size().describe()
    for split in (train, validation, test)
)

(count    1.033962e+06
 mean     2.140295e+01
 std      3.173179e+01
 min      2.000000e+00
 25%      4.000000e+00
 50%      1.000000e+01
 75%      2.500000e+01
 max      1.287000e+03
 dtype: float64,
 count    502743.000000
 mean          9.336953
 std          10.555527
 min           2.000000
 25%           3.000000
 50%           6.000000
 75%          11.000000
 max         333.000000
 dtype: float64,
 count    507480.000000
 mean          9.219506
 std          10.488072
 min           2.000000
 25%           3.000000
 50%           6.000000
 75%          11.000000
 max         380.000000
 dtype: float64)

---

In [7]:
# Order every split by customer and, within a customer, by t_dat.
# Each name is rebound to the sorted frame instead of sorting in place.
train = train.sort_values(['customer_id', 't_dat'])
validation = validation.sort_values(['customer_id', 't_dat'])
test = test.sort_values(['customer_id', 't_dat'])

In [8]:
# Show the three splits as a tuple to eyeball the result; pandas abbreviates
# long frames in the repr, so only the first and last rows are printed.
train, validation, test

(                 t_dat                                        customer_id  \
 0         1.545869e+09  00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...   
 1         1.545869e+09  00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...   
 2         1.545869e+09  00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...   
 3         1.556755e+09  00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...   
 4         1.558742e+09  00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...   
 ...                ...                                                ...   
 22247119  1.569629e+09  ffffcf35913a0bee60e8741cb2b4e78b8a98ee5ff2e6a1...   
 22247120  1.570320e+09  ffffcf35913a0bee60e8741cb2b4e78b8a98ee5ff2e6a1...   
 22247121  1.570320e+09  ffffcf35913a0bee60e8741cb2b4e78b8a98ee5ff2e6a1...   
 22247122  1.570320e+09  ffffcf35913a0bee60e8741cb2b4e78b8a98ee5ff2e6a1...   
 22247123  1.579651e+09  ffffcf35913a0bee60e8741cb2b4e78b8a98ee5ff2e6a1...   
 
           article_id     price  sales_channel_id   FN  Active

In [9]:
# Write each split back as a tab-separated file without the index column.
# NOTE: these are the same paths the splits were read from, so the input
# files are overwritten.
for split, destination in (
    (train, "../hm_dataset/preprocessed_data/split/hm.train.csv"),
    (validation, "../hm_dataset/preprocessed_data/split/hm.validation.csv"),
    (test, "../hm_dataset/preprocessed_data/split/hm.test.csv"),
):
    split.to_csv(path_or_buf=destination, sep="\t", index=False)