In [None]:
import pandas as pd
import numpy as np
import datetime
import re
from collections import Counter

## Load Data
- session
  - drop : expired_at=NaN (currently disabled — the `dropna` line in the session cell is commented out)
  - add 9 hours : created_at, expired_at, login_at
- click
- view
- like
  - drop : is_deleted=True
  - add 9 hours : timestamp

In [None]:
def process_timestamp(df: pd.DataFrame, col: str) -> None:
    """Shift the timestamps in ``df[col]`` by 9 hours and store them as text.

    Non-null entries are parsed with ``pd.to_datetime``, moved forward by nine
    hours and written back as ``"%y-%m-%d %H:%M:%S"`` strings (two-digit
    year). Null entries are left untouched.

    The frame is modified in place and nothing is returned. Call it once per
    column: a second call would re-parse the already formatted text and apply
    the offset again.

    Args:
        df: Frame holding the column to convert; mutated in place.
        col: Name of the column containing parseable timestamps.
    """
    mask = df[col].notna()
    if not mask.any():
        # Empty or all-null column: nothing to convert.
        return

    shifted = pd.to_datetime(df.loc[mask, col]) + datetime.timedelta(hours=9)
    # Format before writing back so the column only ever receives strings,
    # never an intermediate mix of Timestamp objects and raw text.
    df.loc[mask, col] = shifted.dt.strftime("%y-%m-%d %H:%M:%S")

In [None]:
# Load the session table.
session = pd.read_csv("../data/session.csv")
# Disabled: dropping rows whose expired_at is missing. While this line stays
# commented out, such rows are kept in `session`.
# session = session.dropna(subset=['expired_at'], axis=0).reset_index(drop=True)

# Shift each timestamp column by 9 hours and reformat it as text (in place).
for col in ['created_at', 'expired_at', 'login_at']:
    process_timestamp(session, col)

session

In [None]:
# Read the click-image logs of the three days and stack them into a single
# frame with a fresh 0..n-1 index.
click_0719 = pd.read_csv("../data/2023-07-19/click_image_log.txt")
click_0720 = pd.read_csv("../data/2023-07-20/click_image_log.txt")
click_0721 = pd.read_csv("../data/2023-07-21/click_image_log.txt")
click = pd.concat([click_0719, click_0720, click_0721], axis=0, ignore_index=True)
click

In [None]:
# Read the view-image logs of the three days and stack them into a single
# frame with a fresh 0..n-1 index.
view_0719 = pd.read_csv("../data/2023-07-19/view_image_log.txt")
view_0720 = pd.read_csv("../data/2023-07-20/view_image_log.txt")
view_0721 = pd.read_csv("../data/2023-07-21/view_image_log.txt")
view = pd.concat([view_0719, view_0720, view_0721], axis=0, ignore_index=True)
view

In [None]:
# Load the likes, keep only the ones that were not deleted, and render the
# timestamp shifted by 9 hours as "%y-%m-%d %H:%M:%S" text.
like = pd.read_csv("../data/like.csv")
# .copy() makes the filtered frame independent, so the column assignment
# below does not write into a slice of the raw frame (SettingWithCopyWarning).
like = like[like['is_deleted'] == False].copy()
# Parse once, shift, and format with the vectorised .dt accessor; missing
# timestamps stay missing instead of raising inside a per-row strftime call.
like['timestamp'] = (
    pd.to_datetime(like['timestamp']) + datetime.timedelta(hours=9)
).dt.strftime("%y-%m-%d %H:%M:%S")
like = like.drop(columns=["like_id"]).reset_index(drop=True)
like

In [None]:
# Stack likes and clicks into one interaction frame; the like rows come
# first, followed by the click rows, under a fresh 0..n-1 index.
like_click = pd.concat([like, click], axis=0, ignore_index=True)
like_click

In [None]:
# Keep a single row per (session_id, outfit_id) pair. When a pair occurs more
# than once, only its last occurrence in the frame survives.
like_click = like_click[
    ~like_click.duplicated(subset=['session_id', 'outfit_id'], keep='last')
]
like_click

In [None]:
# Histogram (15 bins) of the number of like_click rows per session.
like_click['session_id'].value_counts().hist(bins=15)

In [None]:
# Number of sessions that have fewer than 2 rows in like_click.
like_click['session_id'].value_counts().lt(2).sum()

In [None]:
# Number of view-log rows per session, most frequent first.
view['session_id'].value_counts()

## Load Metadata

In [None]:
# Load the outfit metadata without the URL and reporter columns.
outfit = (
    pd.read_csv("../data/outfit.csv")
    .drop(columns=['img_url', 'origin_url', 'reporter'])
)
outfit

In [None]:
# Frequency of each style value (missing values are not counted).
outfit['style'].value_counts()

In [None]:
# def check_string_inclusion(df: pd.DataFrame, col: str, pattern: str):
#     return df[col].str.contains(pattern).astype(int)

# season_map = {'봄': 'spring', '여름': 'summer', '가을': 'fall', '겨울': 'winter'}
# for 계절, season in season_map.items():
#     outfit[season] = check_string_inclusion(outfit, 'tags', 계절)
# outfit

In [None]:
season_map = {'봄': 'spring', '여름': 'summer', '가을': 'fall', '겨울': 'winter'}
# Use an explicit string placeholder for outfits whose tags name no season.
# Mixing a float NaN column with string labels inside np.where can coerce the
# NaN into the literal text 'nan', and the season value is later used in
# string replacement, so it has to be a real string for every row.
outfit['season'] = 'unknown'
for season_kr in season_map:
    # na=False: rows with missing tags never match. Without it the NaN result
    # of str.contains would be treated as truthy and the row mislabelled.
    has_season = outfit['tags'].str.contains(season_kr, na=False)
    # Later seasons overwrite earlier ones when several appear in the tags.
    outfit.loc[has_season, 'season'] = season_kr
outfit

In [None]:
# Number of outfits per assigned season label.
outfit['season'].value_counts()

In [None]:
# The year is taken as the first four characters of the date string.
outfit['year'] = outfit['date'].str.slice(stop=4)
outfit['year'].value_counts()

In [None]:
# Remove the curly braces that wrap the brands and tags values.
outfit = outfit.assign(
    brands=outfit['brands'].str.replace(r"[{}]", "", regex=True),
    tags=outfit['tags'].str.replace(r"[{}]", "", regex=True),
)
outfit

In [None]:
# Translate the gender codes to Korean labels; any value other than 'F'/'M'
# has no entry in the mapping and becomes NaN.
outfit['gender'] = outfit['gender'].map({'F': '여성', 'M': '남성'})
outfit['gender'].value_counts()

In [None]:
# Assign the filled columns back instead of calling fillna(inplace=True) on
# outfit[...]: the in-place call acts on a temporary column object, which is
# not guaranteed to update `outfit` (it does not under pandas Copy-on-Write).
outfit['brands'] = outfit['brands'].fillna('unknown')
outfit['style'] = outfit['style'].fillna('unknown')
# Remaining missing values per column.
outfit.isnull().sum()

In [None]:
def process_tags(row):
    """Return the row's tags with brand, gender, style and season text removed.

    Every comma-separated brand in ``row['brands']`` and the values of
    ``row['gender']``, ``row['style']`` and ``row['season']`` are deleted from
    ``row['tags']`` by plain substring replacement, in that order. Separators
    are not touched, so removed entries leave their commas behind. Any string
    value is removed, including placeholder text stored in those columns.

    Missing or non-string values (e.g. NaN) are skipped instead of raising;
    if ``row['tags']`` itself is not a string it is returned unchanged.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) with the keys
            'tags', 'gender', 'brands', 'style' and 'season'.

    Returns:
        The cleaned tag string, or the original value when tags is missing.
    """
    tags = row['tags']
    if not isinstance(tags, str):
        return tags

    brands = row['brands']
    removable = brands.split(",") if isinstance(brands, str) else []
    removable += [row['gender'], row['style'], row['season']]

    for value in removable:
        # Skip NaN/None (str.replace would raise) and empty strings (no-op).
        if isinstance(value, str) and value:
            tags = tags.replace(value, "")

    return tags


# Row-wise: store the tags with brand/gender/style/season text removed in a
# new column, leaving the original 'tags' column untouched.
outfit['tags_'] = outfit.apply(process_tags, axis=1)

In [None]:
# Inspect the frame, including the derived 'tags_' column.
outfit

In [None]:
# Frequency of each distinct brands string (a multi-brand value counts as
# its own category because the column is not split here).
outfit['brands'].value_counts()

In [None]:
# Share of each style over all outfits (proportions sum to 1).
outfit['style'].value_counts(normalize=True)

In [None]:
# Share of each style among the outfits whose brands value is 'unknown'.
outfit.loc[outfit['brands'] == 'unknown', 'style'].value_counts(normalize=True)