# How to Handle the List Columns

In [39]:
import pandas as pd
import numpy as np

In [40]:
# Locations of the session CSV files; adjust these to your local setup.
train_sessions_path = "./sessions_train.csv"
test_sessions_path = "./sessions_test_task1.csv"

# Load both files into DataFrames.
train_df = pd.read_csv(train_sessions_path)
test_df = pd.read_csv(test_sessions_path)

# Inspect the column dtypes. DataFrame.info() writes its report to stdout
# itself and returns None, so it is called directly: wrapping it in print()
# would add a stray "None" line after each report.
train_df.info()
print("---------------")
test_df.info()

# Show the first rows of each frame.
display(train_df.head())
display(test_df.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3606249 entries, 0 to 3606248
Data columns (total 3 columns):
 #   Column      Dtype 
---  ------      ----- 
 0   prev_items  object
 1   next_item   object
 2   locale      object
dtypes: object(3)
memory usage: 82.5+ MB
None
---------------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 316971 entries, 0 to 316970
Data columns (total 2 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   prev_items  316971 non-null  object
 1   locale      316971 non-null  object
dtypes: object(2)
memory usage: 4.8+ MB
None


Unnamed: 0,prev_items,next_item,locale
0,['B09W9FND7K' 'B09JSPLN1M'],B09M7GY217,DE
1,['B076THCGSG' 'B007MO8IME' 'B08MF65MLV' 'B001B...,B001B4THSA,DE
2,['B0B1LGXWDS' 'B00AZYORS2' 'B0B1LGXWDS' 'B00AZ...,B0767DTG2Q,DE
3,['B09XMTWDVT' 'B0B4MZZ8MB' 'B0B7HZ2GWX' 'B09XM...,B0B4R9NN4B,DE
4,['B09Y5CSL3T' 'B09Y5DPTXN' 'B09FKD61R8'],B0BGVBKWGZ,DE


Unnamed: 0,prev_items,locale
0,['B08V12CT4C' 'B08V1KXBQD' 'B01BVG1XJS' 'B09VC...,DE
1,['B00R9R5ND6' 'B00R9RZ9ZS' 'B00R9RZ9ZS'],DE
2,['B07YSRXJD3' 'B07G7Q5N6G' 'B08C9Q7QVK' 'B07G7...,DE
3,['B08KQBYV43' '3955350843' '3955350843' '39553...,DE
4,['B09FPTCWMC' 'B09FPTQP68' 'B08HMRY8NG' 'B08TB...,DE


In [41]:
# Raw value stored in the first row, first column of the test frame.
test_df.iat[0, 0]

"['B08V12CT4C' 'B08V1KXBQD' 'B01BVG1XJS' 'B09VC5PKN5' 'B09V7KG931'\n 'B09PY75FWM' 'B09PXYT6BT' 'B08V12CT4C' 'B08V1KXBQD' 'B08496TCCQ'\n 'B01BVG1XJS' 'B099NQFMG7']"

In [42]:
# Look at the first "prev_items" entry of the training frame and then of the
# test frame: show the value itself, followed by its Python type.
for sessions in (train_df, test_df):
    first_entry = sessions.iat[0, 0]
    display(first_entry)
    print(type(first_entry))

"['B09W9FND7K' 'B09JSPLN1M']"

<class 'str'>


"['B08V12CT4C' 'B08V1KXBQD' 'B01BVG1XJS' 'B09VC5PKN5' 'B09V7KG931'\n 'B09PY75FWM' 'B09PXYT6BT' 'B08V12CT4C' 'B08V1KXBQD' 'B08496TCCQ'\n 'B01BVG1XJS' 'B099NQFMG7']"

<class 'str'>


In [43]:
# Each "prev_items" entry is stored as one single string, and that string can
# also contain line breaks (\n).
# The steps below walk through turning such a string into a list of strings
# that holds nothing but the product ids.

def _show(value):
    """Display a value, then print its type."""
    display(value)
    print(type(value))


# Step 0: the untouched string taken from the first test session.
test_example = test_df.iat[0, 0]
_show(test_example)

# Step 1: strip the [ and ] characters plus the very first and very last '.
test_example = test_example.strip("[']")
_show(test_example)

# Step 2: drop the new line characters (\n).
test_example = test_example.replace("\n", "")
_show(test_example)

# Step 3: cut the string at every "' '" separator; the result is a list.
test_example = test_example.split("' '")
_show(test_example)

"['B08V12CT4C' 'B08V1KXBQD' 'B01BVG1XJS' 'B09VC5PKN5' 'B09V7KG931'\n 'B09PY75FWM' 'B09PXYT6BT' 'B08V12CT4C' 'B08V1KXBQD' 'B08496TCCQ'\n 'B01BVG1XJS' 'B099NQFMG7']"

<class 'str'>


"B08V12CT4C' 'B08V1KXBQD' 'B01BVG1XJS' 'B09VC5PKN5' 'B09V7KG931'\n 'B09PY75FWM' 'B09PXYT6BT' 'B08V12CT4C' 'B08V1KXBQD' 'B08496TCCQ'\n 'B01BVG1XJS' 'B099NQFMG7"

<class 'str'>


"B08V12CT4C' 'B08V1KXBQD' 'B01BVG1XJS' 'B09VC5PKN5' 'B09V7KG931' 'B09PY75FWM' 'B09PXYT6BT' 'B08V12CT4C' 'B08V1KXBQD' 'B08496TCCQ' 'B01BVG1XJS' 'B099NQFMG7"

<class 'str'>


['B08V12CT4C',
 'B08V1KXBQD',
 'B01BVG1XJS',
 'B09VC5PKN5',
 'B09V7KG931',
 'B09PY75FWM',
 'B09PXYT6BT',
 'B08V12CT4C',
 'B08V1KXBQD',
 'B08496TCCQ',
 'B01BVG1XJS',
 'B099NQFMG7']

<class 'list'>


In [44]:
def parse_prev_items(column: pd.Series) -> pd.Series:
    """Convert a column of stringified item lists into real Python lists.

    Each raw entry is a single string such as "['A' 'B']" that may also
    contain line breaks. The surrounding brackets and outer quotes are
    stripped, line breaks are removed, and the remainder is split at every
    "' '" separator.

    Parameters
    ----------
    column : pd.Series
        The raw "prev_items" column as read from the CSV file.

    Returns
    -------
    pd.Series
        A Series holding one list of product-id strings per session. If the
        column does not consist of strings (for example because it was
        already converted), it is returned unchanged.
    """
    # Guard against re-running the cell: on entries that are already lists the
    # .str methods return NaN, which would silently wipe out the whole column.
    if pd.api.types.infer_dtype(column, skipna=True) != "string":
        return column

    # The python string methods are available on pandas columns through the
    # .str accessor. This keeps the operation vectorized, which is faster than
    # using apply with a custom function.
    # regex=False makes the "\n" replacement a literal one regardless of the
    # installed pandas version's default.
    return (
        column
        .str.strip("[']")
        .str.replace("\n", "", regex=False)
        .str.split("' '")
    )


train_df['prev_items'] = parse_prev_items(train_df['prev_items'])
test_df['prev_items'] = parse_prev_items(test_df['prev_items'])

In [45]:
# Check the data types once more after the conversion.
# For the training frame and then the test frame: show the first "prev_items"
# entry, followed by its Python type.
for sessions in (train_df, test_df):
    first_entry = sessions.iat[0, 0]
    display(first_entry)
    print(type(first_entry))

['B09W9FND7K', 'B09JSPLN1M']

<class 'list'>


['B08V12CT4C',
 'B08V1KXBQD',
 'B01BVG1XJS',
 'B09VC5PKN5',
 'B09V7KG931',
 'B09PY75FWM',
 'B09PXYT6BT',
 'B08V12CT4C',
 'B08V1KXBQD',
 'B08496TCCQ',
 'B01BVG1XJS',
 'B099NQFMG7']

<class 'list'>


In [54]:
# Even as real lists, the entries can still be awkward to work with in some use cases.
test_df.head(n=5)

Unnamed: 0,prev_items,locale
0,"[B08V12CT4C, B08V1KXBQD, B01BVG1XJS, B09VC5PKN...",DE
1,"[B00R9R5ND6, B00R9RZ9ZS, B00R9RZ9ZS]",DE
2,"[B07YSRXJD3, B07G7Q5N6G, B08C9Q7QVK, B07G7Q5N6G]",DE
3,"[B08KQBYV43, 3955350843, 3955350843, 395535086...",DE
4,"[B09FPTCWMC, B09FPTQP68, B08HMRY8NG, B08TBBQ4B...",DE


In [53]:
# DataFrame.explode gives every list item in "prev_items" a row of its own;
# the values of the other columns are repeated on each of those rows.
# The original index label is repeated as well, so a session can still be
# looked up through the index.
test_df.explode(column='prev_items')

Unnamed: 0,prev_items,locale
0,B08V12CT4C,DE
0,B08V1KXBQD,DE
0,B01BVG1XJS,DE
0,B09VC5PKN5,DE
0,B09V7KG931,DE
...,...,...
316969,B01MCQMORK,UK
316969,B09JYZ325W,UK
316970,B0B8JX92YJ,UK
316970,B09TN4MP6V,UK


In [55]:
# Same expansion for the training frame: one row per item in "prev_items".
train_df.explode(column='prev_items')

Unnamed: 0,prev_items,next_item,locale
0,B09W9FND7K,B09M7GY217,DE
0,B09JSPLN1M,B09M7GY217,DE
1,B076THCGSG,B001B4THSA,DE
1,B007MO8IME,B001B4THSA,DE
1,B08MF65MLV,B001B4THSA,DE
...,...,...,...
3606247,B00B0UING2,B00D3HYEZ4,IT
3606248,B092S9D1SD,B0B7RX65YP,IT
3606248,B09XQQ1S72,B0B7RX65YP,IT
3606248,B0852MS7QC,B0B7RX65YP,IT


In [56]:
# Example: selecting index label 0 from the exploded frame returns the rows of session 0.
test_df.explode(column='prev_items').loc[0]

Unnamed: 0,prev_items,locale
0,B08V12CT4C,DE
0,B08V1KXBQD,DE
0,B01BVG1XJS,DE
0,B09VC5PKN5,DE
0,B09V7KG931,DE
0,B09PY75FWM,DE
0,B09PXYT6BT,DE
0,B08V12CT4C,DE
0,B08V1KXBQD,DE
0,B08496TCCQ,DE
