## Preprocessing - Extract, Transform, Load

In [1]:
%load_ext autoreload
%autoreload all

In [2]:
# dependencies
import pandas as pd
import numpy as np

In [3]:
# local modules
import sys
sys.path.append("..")

from _helpers import constants
from _helpers import functions as hf
from _helpers.preprocess import preprocess

## Load the data

In [4]:
df_train = pd.read_parquet(constants.DROPPED_TRAIN)

We will be loading the data in chunks to save memory. Grouping by user - session pairs so we don't cut off any session in half. 

In [5]:
chunk_size = round(1e5)

In [6]:
unique_sessions = df_train[['user_id', 'session_id']].drop_duplicates()['session_id']
chunks =  [unique_sessions[i:i + chunk_size] for i in range(0, len(unique_sessions), chunk_size)]

From now on we will be working with a single chunk and then preprocess the rest at once.

In [7]:
df_chunk = df_train[df_train['session_id'].isin(chunks[0])]

## Extract features

### Select only columns with reference item_id

Select only those action_type rows, which contain item_id in reference column

In [8]:
item_reference_action_type_cols = [
    'clickout item', 'interaction item deals', 'interaction item image',
    'interaction item info', 'interaction item rating', 'search for item'
]

In [9]:
df_chunk = df_chunk[df_chunk.action_type.isin(item_reference_action_type_cols)].rename(columns={'reference': 'referenced_item'})
df_chunk

Unnamed: 0,user_id,session_id,timestamp,step,action_type,referenced_item,platform,city,device,current_filters,impressions,prices
1,00RL8Z82B2Z1,aff3928535f48,1541037522,2,interaction item image,666856,AU,"Sydney, Australia",mobile,,,
2,00RL8Z82B2Z1,aff3928535f48,1541037522,3,interaction item image,666856,AU,"Sydney, Australia",mobile,,,
3,00RL8Z82B2Z1,aff3928535f48,1541037532,4,interaction item image,666856,AU,"Sydney, Australia",mobile,,,
4,00RL8Z82B2Z1,aff3928535f48,1541037532,5,interaction item image,109038,AU,"Sydney, Australia",mobile,,,
5,00RL8Z82B2Z1,aff3928535f48,1541037532,6,interaction item image,666856,AU,"Sydney, Australia",mobile,,,
...,...,...,...,...,...,...,...,...,...,...,...,...
2103317,0O42Q0CC10C0,782b7db1fdbe5,1541486883,4,interaction item image,3205604,PH,"Vigan City, Philippines",desktop,,,
2103318,0O42Q0CC10C0,782b7db1fdbe5,1541486883,5,interaction item image,3205604,PH,"Vigan City, Philippines",desktop,,,
2103319,0O42Q0CC10C0,782b7db1fdbe5,1541487048,6,interaction item image,3760746,PH,"Vigan City, Philippines",desktop,,,
2103320,0O42Q0CC10C0,782b7db1fdbe5,1541487048,7,interaction item image,3760746,PH,"Vigan City, Philippines",desktop,,,


### Add previous_item column

Get item id of previous interaction of a user in a session

In [10]:
df_chunk.insert(
    loc=df_chunk.columns.get_loc("referenced_item") + 1,  # Insert previous item after reference column
    column='previous_item',
    value=df_chunk.sort_values(
        by=["user_id", "session_id", "timestamp", "step"],
        ascending=[True, True, True, True]
    )
    .groupby(["user_id"])["referenced_item"]
    .shift(1)
)
df_chunk.head(10)

Unnamed: 0,user_id,session_id,timestamp,step,action_type,referenced_item,previous_item,platform,city,device,current_filters,impressions,prices
1,00RL8Z82B2Z1,aff3928535f48,1541037522,2,interaction item image,666856,,AU,"Sydney, Australia",mobile,,,
2,00RL8Z82B2Z1,aff3928535f48,1541037522,3,interaction item image,666856,666856.0,AU,"Sydney, Australia",mobile,,,
3,00RL8Z82B2Z1,aff3928535f48,1541037532,4,interaction item image,666856,666856.0,AU,"Sydney, Australia",mobile,,,
4,00RL8Z82B2Z1,aff3928535f48,1541037532,5,interaction item image,109038,666856.0,AU,"Sydney, Australia",mobile,,,
5,00RL8Z82B2Z1,aff3928535f48,1541037532,6,interaction item image,666856,109038.0,AU,"Sydney, Australia",mobile,,,
6,00RL8Z82B2Z1,aff3928535f48,1541037532,7,interaction item image,109038,666856.0,AU,"Sydney, Australia",mobile,,,
7,00RL8Z82B2Z1,aff3928535f48,1541037532,8,interaction item image,666856,109038.0,AU,"Sydney, Australia",mobile,,,
8,00RL8Z82B2Z1,aff3928535f48,1541037542,9,interaction item image,109038,666856.0,AU,"Sydney, Australia",mobile,,,
9,00RL8Z82B2Z1,aff3928535f48,1541037542,10,interaction item image,109038,109038.0,AU,"Sydney, Australia",mobile,,,
10,00RL8Z82B2Z1,aff3928535f48,1541037542,11,interaction item image,109038,109038.0,AU,"Sydney, Australia",mobile,,,


### Explode impressions and its prices into separate rows

As impressions are only present during clickouts, and we have multiple action_types, we will create a new column, interacted_item which will contain either impressions, or referenced item (item subject of action_type)

In [11]:
df_chunk.insert(
    loc=df_chunk.columns.get_loc("referenced_item") + 1,  # Insert previous item after reference column
    column='interacted_item',
    value=np.where(
        df_chunk.impressions.isna(),
        df_chunk.referenced_item,
        df_chunk.impressions
    )
)

In [12]:
df_chunk.loc[:, "prices"] = np.where(
        df_chunk.prices.isna(),
        "",
        df_chunk.prices
    )
    
df_chunk = hf.explode(df_chunk, ['interacted_item', 'prices'])

In [13]:
# Rename prices since it represents single value now
df_chunk = df_chunk.rename(columns={'prices': 'price'})
    
# Don't need this column anymore as it
df_chunk = df_chunk.drop(columns="impressions")

In [14]:
df_chunk = hf.reorder_column(df_chunk, df_chunk.columns.get_loc("interacted_item"), df_chunk.columns.get_loc("referenced_item") + 1)
df_chunk = hf.reorder_column(df_chunk, df_chunk.columns.get_loc("price"), df_chunk.columns.get_loc("interacted_item") + 1)
df_chunk

Unnamed: 0,user_id,session_id,timestamp,step,action_type,referenced_item,interacted_item,price,previous_item,platform,city,device,current_filters
0,00RL8Z82B2Z1,aff3928535f48,1541037522,2,interaction item image,666856,666856,,,AU,"Sydney, Australia",mobile,
1,00RL8Z82B2Z1,aff3928535f48,1541037522,3,interaction item image,666856,666856,,666856,AU,"Sydney, Australia",mobile,
2,00RL8Z82B2Z1,aff3928535f48,1541037532,4,interaction item image,666856,666856,,666856,AU,"Sydney, Australia",mobile,
3,00RL8Z82B2Z1,aff3928535f48,1541037532,5,interaction item image,109038,109038,,666856,AU,"Sydney, Australia",mobile,
4,00RL8Z82B2Z1,aff3928535f48,1541037532,6,interaction item image,666856,666856,,109038,AU,"Sydney, Australia",mobile,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
6252783,0O42Q0CC10C0,782b7db1fdbe5,1541486883,4,interaction item image,3205604,3205604,,3205604,PH,"Vigan City, Philippines",desktop,
6252784,0O42Q0CC10C0,782b7db1fdbe5,1541486883,5,interaction item image,3205604,3205604,,3205604,PH,"Vigan City, Philippines",desktop,
6252785,0O42Q0CC10C0,782b7db1fdbe5,1541487048,6,interaction item image,3760746,3760746,,3205604,PH,"Vigan City, Philippines",desktop,
6252786,0O42Q0CC10C0,782b7db1fdbe5,1541487048,7,interaction item image,3760746,3760746,,3760746,PH,"Vigan City, Philippines",desktop,


The size of our dataset in rows just went up, but now we have single action and single information about item.

The `referenced_item` and `interacted_item` seems to be same, but it isn't. For example during clickouts, in `interacted_item` value represents impression, but `referenced_item` is the item user has clicked out.

### User modelling - add information about how many times did user interact with the item

In [15]:
df_chunk.insert(
    loc=df_chunk.columns.get_loc("interacted_item") + 1,  # Insert previous item after reference column
    column='user_interacted_item_interaction_count',
    value=(
        df_chunk
        .groupby(["user_id", "interacted_item"])
        .cumcount()
    )
)
df_chunk

Unnamed: 0,user_id,session_id,timestamp,step,action_type,referenced_item,interacted_item,user_interacted_item_interaction_count,price,previous_item,platform,city,device,current_filters
0,00RL8Z82B2Z1,aff3928535f48,1541037522,2,interaction item image,666856,666856,0,,,AU,"Sydney, Australia",mobile,
1,00RL8Z82B2Z1,aff3928535f48,1541037522,3,interaction item image,666856,666856,1,,666856,AU,"Sydney, Australia",mobile,
2,00RL8Z82B2Z1,aff3928535f48,1541037532,4,interaction item image,666856,666856,2,,666856,AU,"Sydney, Australia",mobile,
3,00RL8Z82B2Z1,aff3928535f48,1541037532,5,interaction item image,109038,109038,0,,666856,AU,"Sydney, Australia",mobile,
4,00RL8Z82B2Z1,aff3928535f48,1541037532,6,interaction item image,666856,666856,3,,109038,AU,"Sydney, Australia",mobile,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6252783,0O42Q0CC10C0,782b7db1fdbe5,1541486883,4,interaction item image,3205604,3205604,3,,3205604,PH,"Vigan City, Philippines",desktop,
6252784,0O42Q0CC10C0,782b7db1fdbe5,1541486883,5,interaction item image,3205604,3205604,4,,3205604,PH,"Vigan City, Philippines",desktop,
6252785,0O42Q0CC10C0,782b7db1fdbe5,1541487048,6,interaction item image,3760746,3760746,2,,3205604,PH,"Vigan City, Philippines",desktop,
6252786,0O42Q0CC10C0,782b7db1fdbe5,1541487048,7,interaction item image,3760746,3760746,3,,3760746,PH,"Vigan City, Philippines",desktop,


### Narrow to clickouts

Since we already aquired all information we needed from different action types, we can drop them now.

In [16]:
df_chunk = df_chunk[df_chunk.action_type == "clickout item"] \
    .drop(columns="action_type") \
    .rename(columns={"interacted_item": "impressed_item"}) \
    .rename(columns={"user_interacted_item_interaction_count": "user_impressed_item_interaction_count"})
df_chunk

Unnamed: 0,user_id,session_id,timestamp,step,referenced_item,impressed_item,user_impressed_item_interaction_count,price,previous_item,platform,city,device,current_filters
12,00RL8Z82B2Z1,aff3928535f48,1541037543,14,109038,3400638,0,95,109038,AU,"Sydney, Australia",mobile,
13,00RL8Z82B2Z1,aff3928535f48,1541037543,14,109038,1253714,0,66,109038,AU,"Sydney, Australia",mobile,
14,00RL8Z82B2Z1,aff3928535f48,1541037543,14,109038,3367857,0,501,109038,AU,"Sydney, Australia",mobile,
15,00RL8Z82B2Z1,aff3928535f48,1541037543,14,109038,5100540,0,112,109038,AU,"Sydney, Australia",mobile,
16,00RL8Z82B2Z1,aff3928535f48,1541037543,14,109038,1088584,0,95,109038,AU,"Sydney, Australia",mobile,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
6252777,0O42Q0CC10C0,782b7db1fdbe5,1541485321,2,3760746,2506344,1,49,3760746,PH,"Vigan City, Philippines",desktop,
6252778,0O42Q0CC10C0,782b7db1fdbe5,1541485321,2,3760746,1449677,1,68,3760746,PH,"Vigan City, Philippines",desktop,
6252779,0O42Q0CC10C0,782b7db1fdbe5,1541485321,2,3760746,6410668,1,38,3760746,PH,"Vigan City, Philippines",desktop,
6252780,0O42Q0CC10C0,782b7db1fdbe5,1541485321,2,3760746,10037154,1,14,3760746,PH,"Vigan City, Philippines",desktop,


### Add information about price

We want to find out whether user might be interested in the impression because of the price, if it's below or above mean of all impressions

In [17]:
df_chunk.loc[:, 'price'] = df_chunk['price'].astype(int)

df_mean_price = df_chunk \
    .groupby(['user_id', 'session_id', 'timestamp', 'step', 'impressed_item']) \
    .agg(mean_price=('price', 'mean')) \
    .reset_index()
    
df_chunk = df_mean_price.merge(
    df_chunk,
    on=['user_id', 'session_id', 'step', 'timestamp', 'impressed_item'],
)
    
df_chunk['price_above_impression_mean'] = (df_chunk['price'] > df_chunk['mean_price']).astype(int)
df_chunk = hf.reorder_column(df_chunk, df_chunk.columns.get_loc("price_above_impression_mean"), df_chunk.columns.get_loc("price") + 1)
df_chunk

Unnamed: 0,user_id,session_id,timestamp,step,impressed_item,mean_price,referenced_item,user_impressed_item_interaction_count,price,price_above_impression_mean,previous_item,platform,city,device,current_filters
0,0004IOZI7CKF,0146f7cb014ba,1541266796,3,110976,70.0,2627602,0,70,0,3381482,DE,"Valencia, Spain",tablet,
1,0004IOZI7CKF,0146f7cb014ba,1541266796,3,110979,78.0,2627602,0,78,0,3381482,DE,"Valencia, Spain",tablet,
2,0004IOZI7CKF,0146f7cb014ba,1541266796,3,110981,96.0,2627602,0,96,0,3381482,DE,"Valencia, Spain",tablet,
3,0004IOZI7CKF,0146f7cb014ba,1541266796,3,150904,150.0,2627602,0,150,0,3381482,DE,"Valencia, Spain",tablet,
4,0004IOZI7CKF,0146f7cb014ba,1541266796,3,159015,117.0,2627602,0,117,0,3381482,DE,"Valencia, Spain",tablet,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4581350,ZZZHCOZVA3IM,9555befa755ea,1541351984,3,478871,92.0,3148916,0,92,0,3148916,BR,"Rio de Janeiro, Brazil",mobile,
4581351,ZZZHCOZVA3IM,9555befa755ea,1541351984,3,5188226,45.0,3148916,0,45,0,3148916,BR,"Rio de Janeiro, Brazil",mobile,
4581352,ZZZHCOZVA3IM,9555befa755ea,1541351984,3,5659850,95.0,3148916,0,95,0,3148916,BR,"Rio de Janeiro, Brazil",mobile,
4581353,ZZZHCOZVA3IM,9555befa755ea,1541351984,3,5790750,324.0,3148916,0,324,0,3148916,BR,"Rio de Janeiro, Brazil",mobile,


### Impression position

Users might me more likely to click on items which are at top of the page than to scroll on the page, therefore we add a column which says impression position could be helpful

In [18]:
df_chunk.insert(
    loc=df_chunk.columns.get_loc("impressed_item") + 1,  # Insert previous item after reference column
    column='impressed_item_position',
    value=(
        df_chunk
        .groupby(["user_id", "session_id", "timestamp", "step"])
        .cumcount() + 1
    )
)
df_chunk

Unnamed: 0,user_id,session_id,timestamp,step,impressed_item,impressed_item_position,mean_price,referenced_item,user_impressed_item_interaction_count,price,price_above_impression_mean,previous_item,platform,city,device,current_filters
0,0004IOZI7CKF,0146f7cb014ba,1541266796,3,110976,1,70.0,2627602,0,70,0,3381482,DE,"Valencia, Spain",tablet,
1,0004IOZI7CKF,0146f7cb014ba,1541266796,3,110979,2,78.0,2627602,0,78,0,3381482,DE,"Valencia, Spain",tablet,
2,0004IOZI7CKF,0146f7cb014ba,1541266796,3,110981,3,96.0,2627602,0,96,0,3381482,DE,"Valencia, Spain",tablet,
3,0004IOZI7CKF,0146f7cb014ba,1541266796,3,150904,4,150.0,2627602,0,150,0,3381482,DE,"Valencia, Spain",tablet,
4,0004IOZI7CKF,0146f7cb014ba,1541266796,3,159015,5,117.0,2627602,0,117,0,3381482,DE,"Valencia, Spain",tablet,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4581350,ZZZHCOZVA3IM,9555befa755ea,1541351984,3,478871,21,92.0,3148916,0,92,0,3148916,BR,"Rio de Janeiro, Brazil",mobile,
4581351,ZZZHCOZVA3IM,9555befa755ea,1541351984,3,5188226,22,45.0,3148916,0,45,0,3148916,BR,"Rio de Janeiro, Brazil",mobile,
4581352,ZZZHCOZVA3IM,9555befa755ea,1541351984,3,5659850,23,95.0,3148916,0,95,0,3148916,BR,"Rio de Janeiro, Brazil",mobile,
4581353,ZZZHCOZVA3IM,9555befa755ea,1541351984,3,5790750,24,324.0,3148916,0,324,0,3148916,BR,"Rio de Janeiro, Brazil",mobile,


### User modelling - Add last_interacted column

Add a column which presents whether impressed_item the last interacted_item. User is more likely to clickout the item (therefore have it in the referenced_item), if they interacted with it just in the previous session step

E.g. they interacted with the image of the item, therefore if impressed_item equals previous_item, there is a higher chance to be in referenced_item column (therefore our prediction target)

ite

In [19]:
df_chunk.insert(
    loc=df_chunk.columns.get_loc("user_impressed_item_interaction_count") + 1,
    column='is_last_interacted',
    value=(df_chunk["previous_item"] == df_chunk["impressed_item"]).astype(int)
)
df_chunk

Unnamed: 0,user_id,session_id,timestamp,step,impressed_item,impressed_item_position,mean_price,referenced_item,user_impressed_item_interaction_count,is_last_interacted,price,price_above_impression_mean,previous_item,platform,city,device,current_filters
0,0004IOZI7CKF,0146f7cb014ba,1541266796,3,110976,1,70.0,2627602,0,0,70,0,3381482,DE,"Valencia, Spain",tablet,
1,0004IOZI7CKF,0146f7cb014ba,1541266796,3,110979,2,78.0,2627602,0,0,78,0,3381482,DE,"Valencia, Spain",tablet,
2,0004IOZI7CKF,0146f7cb014ba,1541266796,3,110981,3,96.0,2627602,0,0,96,0,3381482,DE,"Valencia, Spain",tablet,
3,0004IOZI7CKF,0146f7cb014ba,1541266796,3,150904,4,150.0,2627602,0,0,150,0,3381482,DE,"Valencia, Spain",tablet,
4,0004IOZI7CKF,0146f7cb014ba,1541266796,3,159015,5,117.0,2627602,0,0,117,0,3381482,DE,"Valencia, Spain",tablet,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4581350,ZZZHCOZVA3IM,9555befa755ea,1541351984,3,478871,21,92.0,3148916,0,0,92,0,3148916,BR,"Rio de Janeiro, Brazil",mobile,
4581351,ZZZHCOZVA3IM,9555befa755ea,1541351984,3,5188226,22,45.0,3148916,0,0,45,0,3148916,BR,"Rio de Janeiro, Brazil",mobile,
4581352,ZZZHCOZVA3IM,9555befa755ea,1541351984,3,5659850,23,95.0,3148916,0,0,95,0,3148916,BR,"Rio de Janeiro, Brazil",mobile,
4581353,ZZZHCOZVA3IM,9555befa755ea,1541351984,3,5790750,24,324.0,3148916,0,0,324,0,3148916,BR,"Rio de Janeiro, Brazil",mobile,


## Encoding

We don't need string information about device platform or city, we can transform those categorical values into codes

In [20]:
df_chunk = df_chunk.copy()
categorical_attributes = ['device', 'platform', 'city']

for cat in categorical_attributes:
    df_chunk[cat] = df_chunk[cat].astype('category').cat.codes
    
df_chunk

Unnamed: 0,user_id,session_id,timestamp,step,impressed_item,impressed_item_position,mean_price,referenced_item,user_impressed_item_interaction_count,is_last_interacted,price,price_above_impression_mean,previous_item,platform,city,device,current_filters
0,0004IOZI7CKF,0146f7cb014ba,1541266796,3,110976,1,70.0,2627602,0,0,70,0,3381482,14,11474,2,
1,0004IOZI7CKF,0146f7cb014ba,1541266796,3,110979,2,78.0,2627602,0,0,78,0,3381482,14,11474,2,
2,0004IOZI7CKF,0146f7cb014ba,1541266796,3,110981,3,96.0,2627602,0,0,96,0,3381482,14,11474,2,
3,0004IOZI7CKF,0146f7cb014ba,1541266796,3,150904,4,150.0,2627602,0,0,150,0,3381482,14,11474,2,
4,0004IOZI7CKF,0146f7cb014ba,1541266796,3,159015,5,117.0,2627602,0,0,117,0,3381482,14,11474,2,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4581350,ZZZHCOZVA3IM,9555befa755ea,1541351984,3,478871,21,92.0,3148916,0,0,92,0,3148916,7,9075,1,
4581351,ZZZHCOZVA3IM,9555befa755ea,1541351984,3,5188226,22,45.0,3148916,0,0,45,0,3148916,7,9075,1,
4581352,ZZZHCOZVA3IM,9555befa755ea,1541351984,3,5659850,23,95.0,3148916,0,0,95,0,3148916,7,9075,1,
4581353,ZZZHCOZVA3IM,9555befa755ea,1541351984,3,5790750,24,324.0,3148916,0,0,324,0,3148916,7,9075,1,


## Using item metadata

In [21]:
df_meta = pd.read_csv(constants.METADATA, dtype={'item_id': str})
df_meta

Unnamed: 0,item_id,properties
0,5101,Satellite TV|Golf Course|Airport Shuttle|Cosme...
1,5416,Satellite TV|Cosmetic Mirror|Safe (Hotel)|Tele...
2,5834,Satellite TV|Cosmetic Mirror|Safe (Hotel)|Tele...
3,5910,Satellite TV|Sailing|Cosmetic Mirror|Telephone...
4,6066,Satellite TV|Sailing|Diving|Cosmetic Mirror|Sa...
...,...,...
927137,11199990,House / Apartment
927138,11236100,House / Apartment
927139,11260526,House / Apartment
927140,11263712,House / Apartment


Separate properties into arrays

In [22]:
df_meta.loc[:, 'properties'] = df_meta.loc[:, 'properties'].str.split("|")
df_meta

Unnamed: 0,item_id,properties
0,5101,"[Satellite TV, Golf Course, Airport Shuttle, C..."
1,5416,"[Satellite TV, Cosmetic Mirror, Safe (Hotel), ..."
2,5834,"[Satellite TV, Cosmetic Mirror, Safe (Hotel), ..."
3,5910,"[Satellite TV, Sailing, Cosmetic Mirror, Telep..."
4,6066,"[Satellite TV, Sailing, Diving, Cosmetic Mirro..."
...,...,...
927137,11199990,[House / Apartment]
927138,11236100,[House / Apartment]
927139,11260526,[House / Apartment]
927140,11263712,[House / Apartment]


### Parsing rating from item metadata

Rozdelenie "|" separovaných hodnôt na list.

In [23]:
 rating_map = {
    'Satisfactory Rating': 7.0,
    'Good Rating': 7.5,
    'Very Good Rating': 8.0,
    'Excellent Rating': 8.5
}
        
# Properties may contain multiple ratings, all of those which apply, we need to find a maximum
df_meta['impressed_item_rating'] = df_meta['properties']\
    .apply(lambda x: max([rating_map[key] for key in x if key in rating_map], default=None))

df_meta

Unnamed: 0,item_id,properties,impressed_item_rating
0,5101,"[Satellite TV, Golf Course, Airport Shuttle, C...",7.50
1,5416,"[Satellite TV, Cosmetic Mirror, Safe (Hotel), ...",8.50
2,5834,"[Satellite TV, Cosmetic Mirror, Safe (Hotel), ...",8.00
3,5910,"[Satellite TV, Sailing, Cosmetic Mirror, Telep...",7.50
4,6066,"[Satellite TV, Sailing, Diving, Cosmetic Mirro...",7.50
...,...,...,...
927137,11199990,[House / Apartment],7.75
927138,11236100,[House / Apartment],7.75
927139,11260526,[House / Apartment],7.75
927140,11263712,[House / Apartment],7.75


### Merging metadata

Now using this metadata about item rating, we may merge the values with dataset

In [24]:
df_chunk = df_chunk.merge(
    df_meta[['item_id','impressed_item_rating']],
    left_on='impressed_item',
    right_on='item_id',
    how='left'
)

df_chunk = df_chunk.drop(columns='item_id')

# Fill NaN with mean
df_chunk['impressed_item_rating'] = df_chunk['impressed_item_rating'].fillna(df_chunk['impressed_item_rating'].mean())

df_chunk = hf.reorder_column(df_chunk, df_chunk.columns.get_loc("impressed_item_rating"), df_chunk.columns.get_loc("impressed_item_position") + 1)
df_chunk

Unnamed: 0,user_id,session_id,timestamp,step,impressed_item,impressed_item_position,impressed_item_rating,mean_price,referenced_item,user_impressed_item_interaction_count,is_last_interacted,price,price_above_impression_mean,previous_item,platform,city,device,current_filters
0,0004IOZI7CKF,0146f7cb014ba,1541266796,3,110976,1,7.50,70.0,2627602,0,0,70,0,3381482,14,11474,2,
1,0004IOZI7CKF,0146f7cb014ba,1541266796,3,110979,2,8.00,78.0,2627602,0,0,78,0,3381482,14,11474,2,
2,0004IOZI7CKF,0146f7cb014ba,1541266796,3,110981,3,8.00,96.0,2627602,0,0,96,0,3381482,14,11474,2,
3,0004IOZI7CKF,0146f7cb014ba,1541266796,3,150904,4,8.50,150.0,2627602,0,0,150,0,3381482,14,11474,2,
4,0004IOZI7CKF,0146f7cb014ba,1541266796,3,159015,5,8.00,117.0,2627602,0,0,117,0,3381482,14,11474,2,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4581350,ZZZHCOZVA3IM,9555befa755ea,1541351984,3,478871,21,8.00,92.0,3148916,0,0,92,0,3148916,7,9075,1,
4581351,ZZZHCOZVA3IM,9555befa755ea,1541351984,3,5188226,22,8.00,45.0,3148916,0,0,45,0,3148916,7,9075,1,
4581352,ZZZHCOZVA3IM,9555befa755ea,1541351984,3,5659850,23,8.50,95.0,3148916,0,0,95,0,3148916,7,9075,1,
4581353,ZZZHCOZVA3IM,9555befa755ea,1541351984,3,5790750,24,8.50,324.0,3148916,0,0,324,0,3148916,7,9075,1,


## Conclusion

This was an example of ETL on one chunk out of training data. We also need to preprocess all training and testing data.

In [25]:
# Iterate over the chunks of training data
for i, chunk in enumerate(chunks):
    df_chunk = df_train[df_train['session_id'].isin(chunk)]
    processed_chunk = preprocess(df_chunk, df_meta)

    processed_chunk.to_parquet(constants.PREPROCESSED(i, 'train'), index=False)
    print(f"Chunk {i} saved to {constants.PREPROCESSED(i, 'train')}.")

Chunk 0 saved to /home/data/preprocessed/train_0.parquet.
Chunk 1 saved to /home/data/preprocessed/train_1.parquet.
Chunk 2 saved to /home/data/preprocessed/train_2.parquet.
Chunk 3 saved to /home/data/preprocessed/train_3.parquet.
Chunk 4 saved to /home/data/preprocessed/train_4.parquet.
Chunk 5 saved to /home/data/preprocessed/train_5.parquet.
Chunk 6 saved to /home/data/preprocessed/train_6.parquet.


And do the same with test data

In [28]:
# load data
df_test = pd.read_parquet(constants.DROPPED_TEST)

# Get N chunks of unique user-sessions pairs (N = chunk_size)
unique_sessions = df_test[['user_id', 'session_id']].drop_duplicates()['session_id']
chunks = [unique_sessions[i:i + chunk_size] for i in range(0, len(unique_sessions), chunk_size)]

# Iterate over the chunks
for i, chunk in enumerate(chunks):
    df_chunk = df_test[df_test['session_id'].isin(chunk)]
    processed_chunk = preprocess(df_chunk, df_meta)

    processed_chunk.to_parquet(constants.PREPROCESSED(i, 'test'), index=False)
    print(f"Chunk {i} saved to {constants.PREPROCESSED(i, 'test')}.")

Chunk 0 saved to /home/data/preprocessed/test_0.parquet.
Chunk 1 saved to /home/data/preprocessed/test_1.parquet.
Chunk 2 saved to /home/data/preprocessed/test_2.parquet.
