# Coveo: Apply minimum sequence length on split files

In [1]:
import pandas as pd

### With Pageviews

In [2]:
# Load the train / validation / test splits of the "with_pageviews" variant.
# The files are tab-separated. NOTE: a later cell in this notebook writes the
# filtered frames back to these same three paths.
train = pd.read_csv("../coveo_dataset/preprocessed_data/split/with_pageviews/coveo.train.csv", sep="\t")
validation = pd.read_csv("../coveo_dataset/preprocessed_data/split/with_pageviews/coveo.validation.csv", sep="\t")
test = pd.read_csv("../coveo_dataset/preprocessed_data/split/with_pageviews/coveo.test.csv", sep="\t")

---
### Min sequence length 2

In [3]:
# Summary statistics of the number of rows per session_id_hash, one Series
# per split, in the order (train, validation, test).
tuple(
    split.groupby(['session_id_hash']).size().describe()
    for split in (train, validation, test)
)

(count    3.356069e+06
 mean     5.479107e+00
 std      9.314184e+00
 min      1.000000e+00
 25%      1.000000e+00
 50%      2.000000e+00
 75%      6.000000e+00
 max      1.990000e+02
 dtype: float64,
 count    774041.000000
 mean          5.090611
 std           8.765886
 min           1.000000
 25%           1.000000
 50%           2.000000
 75%           5.000000
 max         198.000000
 dtype: float64,
 count    804302.000000
 mean          4.899083
 std           8.549456
 min           1.000000
 25%           1.000000
 50%           2.000000
 75%           5.000000
 max         199.000000
 dtype: float64)

In [4]:
def apply_min_sequence_length(dataset, min_length=2):
    """Drop every session that has fewer than ``min_length`` rows.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame with a ``session_id_hash`` column. Each row counts as one
        element of that session's sequence.
    min_length : int, default 2
        Minimum number of rows a session needs in order to be kept.

    Returns
    -------
    pandas.DataFrame
        A copy of the rows whose session meets the threshold. Row order and
        index labels of the input are preserved; the input is not modified.
    """
    # Rows per session. groupby leaves out rows whose session id is missing,
    # so such rows are never selected below.
    session_sizes = dataset.groupby('session_id_hash').size()
    kept_sessions = session_sizes[session_sizes >= min_length].index
    # .copy() detaches the result from `dataset`, so callers can modify the
    # returned frame without affecting the source frame.
    return dataset[dataset['session_id_hash'].isin(kept_sessions)].copy()

In [5]:
# Replace each split with its filtered version, one split at a time.
train = train.pipe(apply_min_sequence_length)
validation = validation.pipe(apply_min_sequence_length)
test = test.pipe(apply_min_sequence_length)

In [6]:
# Summary statistics of the number of rows per session_id_hash, one Series
# per split, in the order (train, validation, test).
tuple(
    split.groupby(['session_id_hash']).size().describe()
    for split in (train, validation, test)
)

(count    1.940428e+06
 mean     8.746843e+00
 std      1.116830e+01
 min      2.000000e+00
 25%      3.000000e+00
 50%      5.000000e+00
 75%      1.000000e+01
 max      1.990000e+02
 dtype: float64,
 count    423732.000000
 mean          8.472414
 std          10.728311
 min           2.000000
 25%           3.000000
 50%           5.000000
 75%          10.000000
 max         198.000000
 dtype: float64,
 count    427486.000000
 mean          8.336006
 std          10.597615
 min           2.000000
 25%           3.000000
 50%           5.000000
 75%           9.000000
 max         199.000000
 dtype: float64)

---

In [7]:
# Sort rows by session id, then by server timestamp within each session.
train = train.sort_values(by=['session_id_hash', 'server_timestamp_epoch_ms'])
validation = validation.sort_values(by=['session_id_hash', 'server_timestamp_epoch_ms'])
test = test.sort_values(by=['session_id_hash', 'server_timestamp_epoch_ms'])

In [8]:
# Show the three splits as one tuple; the output is the tuple's text repr,
# in which pandas truncates long frames with "..." rows.
train, validation, test

(                                            session_id_hash     event_type  \
 0         00000277639fc5c6f816654b78bf3654ece7fd53a7338f...       pageview   
 1         00000277639fc5c6f816654b78bf3654ece7fd53a7338f...       pageview   
 2         00000277639fc5c6f816654b78bf3654ece7fd53a7338f...       pageview   
 3         00000277639fc5c6f816654b78bf3654ece7fd53a7338f...       pageview   
 4         00000277639fc5c6f816654b78bf3654ece7fd53a7338f...       pageview   
 ...                                                     ...            ...   
 18388255  fffffc128ba14ec4d4b2a230b4352453843b3bb59becf5...       pageview   
 18388256  fffffc128ba14ec4d4b2a230b4352453843b3bb59becf5...  event_product   
 18388257  fffffc128ba14ec4d4b2a230b4352453843b3bb59becf5...  event_product   
 18388258  fffffc128ba14ec4d4b2a230b4352453843b3bb59becf5...  event_product   
 18388259  fffffc128ba14ec4d4b2a230b4352453843b3bb59becf5...       pageview   
 
          product_action                          

In [9]:
# Write the splits as tab-separated files without the index column.
# NOTE: these are the same paths the splits were read from at the top of this
# section, so the original split files are overwritten in place.
train.to_csv(path_or_buf="../coveo_dataset/preprocessed_data/split/with_pageviews/coveo.train.csv", sep="\t", index=False)
validation.to_csv(path_or_buf="../coveo_dataset/preprocessed_data/split/with_pageviews/coveo.validation.csv", sep="\t", index=False)
test.to_csv(path_or_buf="../coveo_dataset/preprocessed_data/split/with_pageviews/coveo.test.csv", sep="\t", index=False)

---
### Without Pageviews

In [11]:
# Load the train / validation / test splits of the "without_pageviews" variant.
# The files are tab-separated. This rebinds train / validation / test, which
# previously held the "with_pageviews" frames. NOTE: a later cell in this
# notebook writes the filtered frames back to these same three paths.
train = pd.read_csv("../coveo_dataset/preprocessed_data/split/without_pageviews/coveo.train.csv", sep="\t")
validation = pd.read_csv("../coveo_dataset/preprocessed_data/split/without_pageviews/coveo.validation.csv", sep="\t")
test = pd.read_csv("../coveo_dataset/preprocessed_data/split/without_pageviews/coveo.test.csv", sep="\t")

---
### Min sequence length 2

In [12]:
# Summary statistics of the number of rows per session_id_hash, one Series
# per split, in the order (train, validation, test).
tuple(
    split.groupby(['session_id_hash']).size().describe()
    for split in (train, validation, test)
)

(count    2.216858e+06
 mean     3.279428e+00
 std      4.970237e+00
 min      1.000000e+00
 25%      1.000000e+00
 50%      2.000000e+00
 75%      3.000000e+00
 max      1.870000e+02
 dtype: float64,
 count    523412.000000
 mean          2.976361
 std           4.499824
 min           1.000000
 25%           1.000000
 50%           1.000000
 75%           3.000000
 max         157.000000
 dtype: float64,
 count    538790.000000
 mean          2.891410
 std           4.238758
 min           1.000000
 25%           1.000000
 50%           1.000000
 75%           3.000000
 max         113.000000
 dtype: float64)

In [13]:
# Replace each split with its filtered version, one split at a time.
train = train.pipe(apply_min_sequence_length)
validation = validation.pipe(apply_min_sequence_length)
test = test.pipe(apply_min_sequence_length)

In [14]:
# Summary statistics of the number of rows per session_id_hash, one Series
# per split, in the order (train, validation, test).
tuple(
    split.groupby(['session_id_hash']).size().describe()
    for split in (train, validation, test)
)

(count    1.115034e+06
 mean     5.531852e+00
 std      6.237484e+00
 min      2.000000e+00
 25%      2.000000e+00
 50%      3.000000e+00
 75%      6.000000e+00
 max      1.870000e+02
 dtype: float64,
 count    243158.000000
 mean          5.254234
 std           5.821973
 min           2.000000
 25%           2.000000
 50%           3.000000
 75%           6.000000
 max         157.000000
 dtype: float64,
 count    247383.000000
 mean          5.119414
 std           5.472980
 min           2.000000
 25%           2.000000
 50%           3.000000
 75%           6.000000
 max         113.000000
 dtype: float64)

---

In [15]:
# Sort rows by session id, then by server timestamp within each session.
train = train.sort_values(by=['session_id_hash', 'server_timestamp_epoch_ms'])
validation = validation.sort_values(by=['session_id_hash', 'server_timestamp_epoch_ms'])
test = test.sort_values(by=['session_id_hash', 'server_timestamp_epoch_ms'])

In [16]:
# Show the three splits as one tuple; the output is the tuple's text repr,
# in which pandas truncates long frames with "..." rows.
train, validation, test

(                                           session_id_hash     event_type  \
 0        000010504025397b03290c7457e0e7ef7ae93529f21eae...  event_product   
 1        000010504025397b03290c7457e0e7ef7ae93529f21eae...  event_product   
 2        000010504025397b03290c7457e0e7ef7ae93529f21eae...  event_product   
 3        000010504025397b03290c7457e0e7ef7ae93529f21eae...  event_product   
 4        000010504025397b03290c7457e0e7ef7ae93529f21eae...  event_product   
 ...                                                    ...            ...   
 7270022  fffffc128ba14ec4d4b2a230b4352453843b3bb59becf5...  event_product   
 7270023  fffffc128ba14ec4d4b2a230b4352453843b3bb59becf5...  event_product   
 7270024  fffffc128ba14ec4d4b2a230b4352453843b3bb59becf5...  event_product   
 7270025  fffffc128ba14ec4d4b2a230b4352453843b3bb59becf5...  event_product   
 7270026  fffffc128ba14ec4d4b2a230b4352453843b3bb59becf5...  event_product   
 
         product_action                                   prod

In [17]:
# Write the splits as tab-separated files without the index column.
# NOTE: these are the same paths the splits were read from at the top of this
# section, so the original split files are overwritten in place.
train.to_csv(path_or_buf="../coveo_dataset/preprocessed_data/split/without_pageviews/coveo.train.csv", sep="\t", index=False)
validation.to_csv(path_or_buf="../coveo_dataset/preprocessed_data/split/without_pageviews/coveo.validation.csv", sep="\t", index=False)
test.to_csv(path_or_buf="../coveo_dataset/preprocessed_data/split/without_pageviews/coveo.test.csv", sep="\t", index=False)