# Convert to Parquet  
This notebook converts all data, except for image encodings, into Parquet files and organizes them. The organization includes consolidating all orders into a single file and creating a separate file for tags to facilitate data exploration.  

I chose Parquet because it allows for faster data loading and preserves data types, unlike CSV files, which require type specification each time they are loaded.

In [None]:
# Mount Google Drive into the Colab runtime so the project files are reachable
# under /content/drive.
from google.colab import drive
drive.mount('/content/drive')
# Root folder of the project on the mounted drive.
path='/content/drive/MyDrive/RecSys_206894495'

In [None]:
# Folder that holds the raw dataset; the CSV reads and Parquet writes in this
# notebook build their paths from it.
# NOTE(review): the variable is spelled "arcive_path" (not "archive_path") and
# is referenced under that exact name by the other cells.
arcive_path=path+'/archive'

In [None]:
!pip install pyarrow

import warnings
warnings.filterwarnings('ignore')

#Import libraries
import ast
import os
import re

import numpy as np
import pandas as pd

#Visualization
import seaborn as sns
import matplotlib.pyplot as plt

#Garbage collector
import gc

#Random
import random
#Set random seed
# Seed both Python's `random` module and NumPy's global generator: pandas
# sampling (e.g. DataFrame.sample) draws from NumPy, not from `random`, so
# seeding `random` alone does not make the notebook reproducible.
RSEED = 10
random.seed(RSEED)
np.random.seed(RSEED)



In [None]:
# Read the semicolon-separated outfits CSV and preview its first rows.
outfits = pd.read_csv(arcive_path + '/outfits.csv', sep=';')
outfits.head()

Unnamed: 0,id,name,description,group,owner,timeCreated,retailPrice,pricePerWeek,pricePerMonth,outfit_tags,tag_categories
0,outfit.fffdaa715c3646f8b1c0f04d549ff07e,Out of stock - Asymmetric Frilled Dress,"This fun, short dress features and asymmetric ...",group.50a586c78eb7626e294ba3bd07d12c79,o_00053,2017-12-30 11:28:01.000,4000.0,600.0,1200.0,"['Synthetic', 'Statement', 'Dresses', 'Metalli...","['Material', 'Occasion', 'Category', 'Details'..."
1,outfit.fffa1b9a3db6415d806f3c48f8ab58d9,Yellow Shell Mellomholmene Blouse,This beautiful blouse features an adjustable n...,group.61ad2fcabb3e9197e3836376e6b67f2c,o_00577,2021-06-07 12:07:22.921,1300.0,590.0,1180.0,"['ILAG', 'Tops', 'Spring', 'Summer', 'M', 'Pat...","['Brand', 'Category', 'Seasons', 'Seasons', 'S..."
2,outfit.fff175b13ceb453f9928625491412ede,Kaula Dress Black,Kaula from Rodebjer is a fitted dress made in ...,group.37c2b59d63d3a9c2d58e07f532f71f7f,o_00336,2023-06-05 09:17:59.004,3100.0,930.0,1860.0,"['Black', 'Mini', 'M', 'Everyday', 'Multi Seas...","['Color', 'Length', 'Size', 'Occasion', 'Seaso..."
3,outfit.ffef9d7c292a48b69076d2df2e32352f,For sale - Jarvis Blouse,This wrap blouse has mid length sleeves and a ...,group.dfcaa57546b0b7a5e9eb204449b6cc1c,o_00030,2021-05-18 14:02:28.690,1500.0,590.0,1180.0,"['XS', 'Multi Season', 'Stylein', 'Tops', 'Cot...","['Size', 'Seasons', 'Brand', 'Category', 'Mate..."
4,outfit.ffeef842238f4dbdabc6c730a75aa2bd,Black Amber Pants,"Feel slack and nice dressed with this pant, ma...",group.ee297c977905eb21a123a4aea5fbb6d2,o_00602,2021-07-16 14:02:30.643,1200.0,590.0,1180.0,"['Cotton', 'Black', 'Everyday', 'Knitwear', 'L...","['Material', 'Color', 'Occasion', 'Category', ..."


In [None]:
# Inspect the dtypes pandas inferred from the CSV. When run right after the
# load, timeCreated is still a generic object column.
outfits.dtypes

id                 object
name               object
description        object
group              object
owner              object
timeCreated        object
retailPrice       float64
pricePerWeek      float64
pricePerMonth     float64
outfit_tags        object
tag_categories     object
dtype: object

In [None]:
# Parse the creation timestamps into a datetime column so the type is carried
# into the Parquet file instead of being stored as plain strings.
outfits = outfits.assign(timeCreated=pd.to_datetime(outfits['timeCreated']))

In [None]:
# Persist the typed outfits table as Parquet.
# The output folder is created first so the write also succeeds when
# <archive>/data does not exist yet (e.g. on a fresh copy of the dataset).
data_dir = arcive_path + '/data'
os.makedirs(data_dir, exist_ok=True)
outfits.to_parquet(data_dir + '/outfits.parquet', engine='pyarrow')

In [None]:
# Build a long-format table with one row per (outfit id, tag) pair.
# The outfit_tags column stores each tag list as its string representation.
# ast.literal_eval parses literals only, so unlike eval() it cannot execute
# code that happens to be embedded in the data file.
new_rows = [
    {'id': outfit_id, 'tag': tag}
    for outfit_id, raw_tags in zip(outfits['id'], outfits['outfit_tags'])
    for tag in ast.literal_eval(raw_tags)
]

outfit_tags = pd.DataFrame(new_rows)

# Display the new DataFrame
display(outfit_tags.head())

Unnamed: 0,id,tag
0,outfit.fffdaa715c3646f8b1c0f04d549ff07e,Synthetic
1,outfit.fffdaa715c3646f8b1c0f04d549ff07e,Statement
2,outfit.fffdaa715c3646f8b1c0f04d549ff07e,Dresses
3,outfit.fffdaa715c3646f8b1c0f04d549ff07e,Metallic
4,outfit.fffdaa715c3646f8b1c0f04d549ff07e,Mini


In [None]:
# Build a long-format table with one row per (outfit id, tag, category).
# outfit_tags and tag_categories store lists as their string representation
# and are paired position by position. ast.literal_eval parses literals only,
# so unlike eval() it cannot execute code embedded in the data file.
new_rows = [
    {'id': outfit_id, 'tag': tag, 'category': category}
    for outfit_id, raw_tags, raw_categories in zip(
        outfits['id'], outfits['outfit_tags'], outfits['tag_categories']
    )
    # NOTE(review): zip stops at the shorter list, so an outfit whose two
    # lists differ in length is truncated silently - confirm this cannot occur.
    for tag, category in zip(
        ast.literal_eval(raw_tags), ast.literal_eval(raw_categories)
    )
]

outfit_tags = pd.DataFrame(new_rows)

# Display the new DataFrame
display(outfit_tags.head())

Unnamed: 0,id,tag,category
0,outfit.fffdaa715c3646f8b1c0f04d549ff07e,Synthetic,Material
1,outfit.fffdaa715c3646f8b1c0f04d549ff07e,Statement,Occasion
2,outfit.fffdaa715c3646f8b1c0f04d549ff07e,Dresses,Category
3,outfit.fffdaa715c3646f8b1c0f04d549ff07e,Metallic,Details
4,outfit.fffdaa715c3646f8b1c0f04d549ff07e,Mini,Length


In [None]:
# Write the exploded outfit_tags table to its own Parquet file.
outfit_tags.to_parquet(
    path=arcive_path + '/data/outfit_tags.parquet',
    engine='pyarrow',
)

In [None]:
# Build a long-format table with one row per (outfit id, tag category) pair.
# The column is named 'tag' even though it holds the category names.
# The tag_categories column stores each list as its string representation.
# ast.literal_eval parses literals only, so unlike eval() it cannot execute
# code that happens to be embedded in the data file.
new_rows = [
    {'id': outfit_id, 'tag': category}
    for outfit_id, raw_categories in zip(outfits['id'], outfits['tag_categories'])
    for category in ast.literal_eval(raw_categories)
]

tag_categories = pd.DataFrame(new_rows)

# Display the new DataFrame
display(tag_categories.head())

Unnamed: 0,id,tag
0,outfit.fffdaa715c3646f8b1c0f04d549ff07e,Material
1,outfit.fffdaa715c3646f8b1c0f04d549ff07e,Occasion
2,outfit.fffdaa715c3646f8b1c0f04d549ff07e,Category
3,outfit.fffdaa715c3646f8b1c0f04d549ff07e,Details
4,outfit.fffdaa715c3646f8b1c0f04d549ff07e,Length


In [None]:
# Read the semicolon-separated picture/outfit mapping and preview its first rows.
picture_triplets = pd.read_csv(arcive_path + '/picture_triplets.csv', sep=';')
picture_triplets.head()

Unnamed: 0,picture.id,outfit.id,displayOrder,file_name
0,picture.0000cdba64314d84a49ed1c266589cc0,outfit.794483397da8425a813301eecf9828c6,0,0000cdba64314d84a49ed1c266589cc0.jpg
1,picture.00058abb53434872ae9bb4270ae21f8e,outfit.98f32aaf08bc4ff09c44e6e11e9199bc,2,00058abb53434872ae9bb4270ae21f8e.jpg
2,picture.00063f52c36d43ada95da45f819b30b4,outfit.9fd1c42c3db543c5b6e53b0db1ee8c0f,3,00063f52c36d43ada95da45f819b30b4.jpg
3,picture.0008443461814f5c988f123718bbd20e,outfit.a7539783b6e94591bdf4e10339afc1d7,3,0008443461814f5c988f123718bbd20e.jpg
4,picture.000a5db3362049aebcc1eb2bf7bde95f,outfit.745fa2bc8156478bac6c0f7d46dadbda,1,000a5db3362049aebcc1eb2bf7bde95f.jpg


In [None]:
# Inspect the dtypes pandas inferred for the picture table.
picture_triplets.dtypes

picture.id      object
outfit.id       object
displayOrder     int64
file_name       object
dtype: object

In [None]:
# Write the picture table to Parquet unchanged.
picture_triplets.to_parquet(
    path=arcive_path + '/data/picture_triplets.parquet',
    engine='pyarrow',
)

In [None]:
# The records of the transactions.
user_activity_triplets = pd.read_csv(arcive_path + '/user_activity_triplets.csv', sep=';')

# Convert both ends of the rental period from strings to datetimes.
for date_col in ('rentalPeriod.start', 'rentalPeriod.end'):
    user_activity_triplets[date_col] = pd.to_datetime(user_activity_triplets[date_col])

# Show a preview first, then the resulting column dtypes.
display(user_activity_triplets.head(), user_activity_triplets.dtypes)

Unnamed: 0,customer.id,outfit.id,rentalPeriod.start,rentalPeriod.end
0,3448,outfit.5c081909537b42239e465d2d615c705f,2023-03-26,2023-04-25
1,2924,outfit.c34969dd8b334064aa90bfb60c8ec308,2023-03-27,2023-04-26
2,2924,outfit.aef4cc93eebf40ca8820790deb7a8323,2023-01-27,2023-02-26
3,1128,outfit.3de5df48a14b4a9aba6d8e41d11e9351,2021-11-16,2021-12-15
4,1128,outfit.0eaa358af14e469894062591bd42f38b,2021-10-11,2021-11-11


customer.id                    int64
outfit.id                     object
rentalPeriod.start    datetime64[ns]
rentalPeriod.end      datetime64[ns]
dtype: object

In [None]:
# (rows, columns) of the user-activity table.
user_activity_triplets.shape

(64419, 4)

In [None]:
# View the rentals ordered by customer; the sorted copy is only displayed,
# the table itself is left untouched.
user_activity_triplets.sort_values(by='customer.id')

Unnamed: 0,customer.id,outfit.id,rentalPeriod.start,rentalPeriod.end
2632,0,outfit.9f5058295098471abdfaf0a7c74ddbfe,2023-12-06,2024-01-05
30,0,outfit.b77ec4404eef405aae1833e224314586,2023-11-24,2023-12-23
12427,0,outfit.d4b6896b1ae74cdabebfdcf948fe64e2,2023-11-24,2023-12-23
40176,0,outfit.85f26909d8334ab78f30c2fc9c73faf7,2023-11-22,2023-12-21
35619,3,outfit.b344d62ead214135a38ff1f67ecede48,2021-10-15,2021-11-01
...,...,...,...,...
30660,7413,outfit.78778f72774d4269abd8410a9511fdf6,2020-09-01,2020-09-30
30659,7413,outfit.2208707dcf134558b4d335f1e3534655,2020-09-01,2020-09-30
26127,7413,outfit.74b8a62f6ecf484d854c4360dcb8a761,2023-10-29,2023-11-28
3914,7413,outfit.cc2a4ea6b82044d2804ee26e593fbc00,2020-10-01,2020-10-30


In [None]:
# Time span covered by the user-activity table: earliest rental start and
# latest rental end.
first_date, last_date = (
    user_activity_triplets['rentalPeriod.start'].min(),
    user_activity_triplets['rentalPeriod.end'].max(),
)

print(f"The first date in the dataframe is {first_date.date()}")
print(f"The last date in the dataframe is {last_date.date()}")

The first date in the dataframe is 2019-10-07
The last date in the dataframe is 2024-08-17


In [None]:
# The records of the transactions.
original_orders = pd.read_csv(arcive_path + '/additional_tabular_data/original_orders.csv', sep=';')

# Convert both ends of the rental period from strings to datetimes.
for date_col in ('rentalPeriod.start', 'rentalPeriod.end'):
    original_orders[date_col] = pd.to_datetime(original_orders[date_col])

# Show a preview first, then the resulting column dtypes.
display(original_orders.head(), original_orders.dtypes)

Unnamed: 0,customer.id,outfit.id,rentalPeriod.start,rentalPeriod.end
0,3945,outfit.923f3fd476b5450b9582d1f525604546,2018-05-25,2018-05-28
1,4088,outfit.8c8e922e228ba03f,2019-08-29,2019-09-02
2,4360,outfit.96f152543e7668ae,2018-08-10,2018-08-13
3,4697,outfit.ddba05a5ced34fa1ab3a0722c05bb11a,2018-06-14,2018-06-19
4,3890,outfit.5ef01d4dc15243fb854ca797716fd663,2019-08-24,2019-08-27


customer.id                    int64
outfit.id                     object
rentalPeriod.start    datetime64[ns]
rentalPeriod.end      datetime64[ns]
dtype: object

In [None]:
# Time span covered by the original orders: earliest rental start and latest
# rental end.
first_date, last_date = (
    original_orders['rentalPeriod.start'].min(),
    original_orders['rentalPeriod.end'].max(),
)

print(f"The first date in the dataframe is {first_date.date()}")
print(f"The last date in the dataframe is {last_date.date()}")

The first date in the dataframe is 2016-03-11
The last date in the dataframe is 2021-04-19


In [None]:
# Concatenate the two dataframes.
# ignore_index=True gives the result a fresh 0..n-1 index; without it each
# input keeps its own row labels, so the combined frame (and the Parquet file
# written from it) carries repeated index values that no longer identify a row.
orders = pd.concat([original_orders, user_activity_triplets], ignore_index=True)

# Remove exact duplicate rows and renumber the rows that remain
orders = orders.drop_duplicates().reset_index(drop=True)

# Display a random sample of the result; seeded so a re-run shows the same rows
orders.sample(10, random_state=RSEED)

Unnamed: 0,customer.id,outfit.id,rentalPeriod.start,rentalPeriod.end
3876,4252,outfit.71e72caf9fa944108c3f62dc488eccd2,2018-05-28,2018-05-31
4635,2272,outfit.7e99984aa9604071bc49a4f148394fbc,2019-08-09,2019-08-12
47592,4725,outfit.5c9a7d3893564b12aa565d03de86ca39,2022-11-29,2022-12-28
33149,6482,outfit.cc0ef285bf6549bdaa1eec00c3574a4a,2023-05-09,2023-06-08
33713,7379,outfit.c5d0ed62000241acb220236b8c01dc44,2023-05-11,2023-06-10
10669,5572,outfit.efb3a9ca730340fb9a20064b034c225b,2023-02-21,2023-03-20
37998,3123,outfit.1b22d74ba7694fba9d7e8bbfd9aa6f29,2024-01-18,2024-02-17
20,6310,outfit.abf8682bf7784e73af89ebfd6d542df0,2022-06-07,2022-07-06
55049,3970,outfit.f7826f04ea8643d3b68964a5ab27b0a2,2023-05-14,2023-06-13
53254,55,outfit.a388427605b34a2ab0e19304e54facc6,2022-09-06,2022-10-05


In [None]:
# Check the column dtypes of the combined orders table before it is written.
orders.dtypes

customer.id                    int64
outfit.id                     object
rentalPeriod.start    datetime64[ns]
rentalPeriod.end      datetime64[ns]
dtype: object

In [None]:
# (rows, columns) of the combined orders table after duplicate removal.
orders.shape

(75733, 4)

In [None]:
# Write the consolidated orders table to a single Parquet file.
orders.to_parquet(
    path=arcive_path + '/data/orders.parquet',
    engine='pyarrow',
)