In [1]:
import pandas as pd
import numpy as np
import datetime
import os

## Load Data
- session
  - drop : expired_at=NaN
  - add 9 hours : created_at, expired_at, login_at
- click
- view
- like
  - drop : is_deleted=True
  - add 9 hours : timestamp

In [2]:
def process_timestamp(df: pd.DataFrame, col: str):
    mask = df[col].notna()
    df.loc[mask, col] = pd.to_datetime(df.loc[mask, col]) + datetime.timedelta(hours=9)
    df.loc[mask, col] = df.loc[mask, col].apply(lambda x: x.strftime("%y-%m-%d %H:%M:%S"))

In [3]:
session = pd.read_csv("../data/session.csv")
session = session.dropna(subset=['expired_at'], axis=0).reset_index(drop=True)

for col in ['created_at', 'expired_at', 'login_at']:
    process_timestamp(session, col)

session

Unnamed: 0,session_id,user_id,created_at,login_at,expired_at
0,2774f33f-683c-4afb-ac8f-c62b0e201ae2,3.0,23-07-19 08:54:00,23-07-19 10:09:45,23-07-19 10:35:23
1,66178898-dfb6-4de4-8f2a-cb09e3ccbe11,,23-07-19 08:44:35,,23-07-19 08:53:30
2,04a49069-ce63-4144-9eba-4c6a097281a0,,23-07-19 13:10:53,,23-07-19 13:16:21
3,7221dea7-71a9-4387-9a0e-96325662b63b,,23-07-19 08:44:17,,23-07-19 08:53:57
4,0fd3b801-76f1-4f4e-bafb-5884fbcfcf7f,,23-07-19 09:00:02,,23-07-22 13:11:17
...,...,...,...,...,...
346,84ac6b46-231c-4b30-8755-daa2e19f9661,,23-07-24 10:00:49,,23-07-24 10:03:04
347,4135551c-bc9f-4042-a25e-df4c8ccb9edb,,23-07-24 09:57:18,,23-07-24 09:58:19
348,1d7d47c0-1bef-448c-988c-dd28bece3289,63.0,23-07-22 15:28:37,23-07-22 15:28:47,23-07-24 11:18:23
349,c3c5ee77-ca84-4449-b691-3248e9bb1988,,23-07-24 09:53:58,,23-07-24 09:55:22


In [4]:
def load_log(log_file: str, log_dir: str="../data/logging"):
    log_dfs = []
    for folder in os.scandir(log_dir):
        if folder.is_dir():
            file_path = os.path.join(folder.path, log_file)
            
            if os.path.isfile(file_path):
                df = pd.read_csv(file_path)
                log_dfs.append(df)

    log = pd.concat(log_dfs, axis=0, ignore_index=True)
    
    return log

In [5]:
click = load_log("click_image_log.txt")
click

Unnamed: 0,session_id,user_id,outfit_id,timestamp,click_type
0,2774f33f-683c-4afb-ac8f-c62b0e201ae2,0,89735,23-07-19 08:54:09,journey
1,2774f33f-683c-4afb-ac8f-c62b0e201ae2,0,91725,23-07-19 08:55:09,journey
2,13bd900d-d267-402f-9e10-5061351524c3,0,79345,23-07-19 08:56:10,journey
3,13bd900d-d267-402f-9e10-5061351524c3,0,79382,23-07-19 08:56:18,similar
4,13bd900d-d267-402f-9e10-5061351524c3,0,91043,23-07-19 08:56:22,similar
...,...,...,...,...,...
1596,84ac6b46-231c-4b30-8755-daa2e19f9661,0,89652,23-07-24 10:01:20,similar
1597,84ac6b46-231c-4b30-8755-daa2e19f9661,0,81795,23-07-24 10:01:23,similar
1598,84ac6b46-231c-4b30-8755-daa2e19f9661,0,81315,23-07-24 10:01:25,similar
1599,84ac6b46-231c-4b30-8755-daa2e19f9661,0,85199,23-07-24 10:02:59,journey


In [6]:
view = load_log("view_image_log.txt")
view

Unnamed: 0,session_id,user_id,outfit_id,timestamp,view_type
0,66178898-dfb6-4de4-8f2a-cb09e3ccbe11,0,83585,23-07-19 08:53:09,journey
1,66178898-dfb6-4de4-8f2a-cb09e3ccbe11,0,84360,23-07-19 08:53:09,journey
2,66178898-dfb6-4de4-8f2a-cb09e3ccbe11,0,79741,23-07-19 08:53:09,journey
3,66178898-dfb6-4de4-8f2a-cb09e3ccbe11,0,78489,23-07-19 08:53:09,journey
4,66178898-dfb6-4de4-8f2a-cb09e3ccbe11,0,84574,23-07-19 08:53:09,journey
...,...,...,...,...,...
38468,1d7d47c0-1bef-448c-988c-dd28bece3289,63,86099,23-07-24 11:18:23,journey
38469,1d7d47c0-1bef-448c-988c-dd28bece3289,63,90682,23-07-24 11:18:23,journey
38470,1d7d47c0-1bef-448c-988c-dd28bece3289,63,86717,23-07-24 11:18:23,journey
38471,1d7d47c0-1bef-448c-988c-dd28bece3289,63,73863,23-07-24 11:18:23,journey


In [14]:
like = pd.read_csv("../data/like.csv")
like = like[like['is_deleted']==False]
like['timestamp'] = pd.to_datetime(like['timestamp']) + datetime.timedelta(hours=9)
like['timestamp'] = pd.to_datetime(like['timestamp']).apply(lambda x: x.strftime("%y-%m-%d %H:%M:%S"))
like.drop(columns=["like_id", "like_type", 'is_deleted', "as_login"], inplace=True)
like = like.sort_values(by=['timestamp', 'session_id']).reset_index(drop=True)
like

Unnamed: 0,session_id,user_id,outfit_id,timestamp
0,66178898-dfb6-4de4-8f2a-cb09e3ccbe11,,73016,23-07-19 08:53:25
1,66178898-dfb6-4de4-8f2a-cb09e3ccbe11,,79507,23-07-19 08:53:25
2,7221dea7-71a9-4387-9a0e-96325662b63b,,88572,23-07-19 08:53:53
3,7221dea7-71a9-4387-9a0e-96325662b63b,,88731,23-07-19 08:53:54
4,7221dea7-71a9-4387-9a0e-96325662b63b,,78346,23-07-19 08:53:55
...,...,...,...,...
1396,5faf94be-dc77-4133-aeff-20ec7c1b7511,,80814,23-07-24 10:00:13
1397,84ac6b46-231c-4b30-8755-daa2e19f9661,,80891,23-07-24 10:01:18
1398,84ac6b46-231c-4b30-8755-daa2e19f9661,,89652,23-07-24 10:01:21
1399,84ac6b46-231c-4b30-8755-daa2e19f9661,,81795,23-07-24 10:01:24


In [15]:
like['session_id'].value_counts()

session_id
2f8a5b9a-a409-45c6-b23b-24dc60c18542    75
8551f4b3-5754-47a5-8e0c-99919a5c8a74    47
fe3b84c2-5dba-4788-bf69-cb7b1ea170f0    46
748a4d06-89a2-49b6-b2ee-8dfd4109d579    30
052efac5-ad8c-4acd-bc41-1be5f1e83d1f    30
                                        ..
2889e5d1-7eb3-4ba3-b8eb-fab71e2687e0     1
530ac744-4f30-4412-b1f9-69341dbb21c7     1
87eb32e8-eb9a-453e-a58d-e70b14978c8d     1
ab95ebf6-1fa5-4aca-8c30-3c9dbb32d217     1
31ec8034-a87e-4a66-95bd-e3bf6d4a5abc     1
Name: count, Length: 211, dtype: int64

In [16]:
sum(like['session_id'].value_counts() < 3)

57

In [17]:
sum(like['session_id'].value_counts() >= 3)

154

In [18]:
like['outfit_id'].value_counts()

outfit_id
83513    3
82192    3
91240    3
73942    3
84765    3
        ..
78352    1
88504    1
82120    1
87595    1
81795    1
Name: count, Length: 1275, dtype: int64

## Process Outfit Data

In [19]:
outfit = pd.read_csv("../data/outfit.csv")[['outfit_id', 'gender', 'tags', 'style', 'date']]
outfit['date'] = outfit['date'].apply(lambda x: x[:10])
outfit

Unnamed: 0,outfit_id,gender,tags,style,date
0,64453,F,"{반팔 티셔츠,가을,스트릿}",스트릿,2020-09-17
1,68758,F,"{스니커즈,봄,캐주얼,나이키}",캐주얼,2021-02-24
2,71478,F,"{여성,셔츠,레더 스커트,워커,여름,캐주얼}",캐주얼,2021-06-01
3,71480,F,"{여성,슬리브리스,트랙 팬츠,스니커즈,숄더 백,나이키,여름,로맨틱}",로맨틱,2021-06-01
4,71482,M,"{남성,비니,베스트,크로스 백,데님 팬츠,스니커즈,나이키,여름,스트릿}",스트릿,2021-06-01
...,...,...,...,...,...
10424,92177,M,"{여름,스트릿}",스트릿,2023-06-29
10425,92178,F,"{여름,걸리시}",걸리시,2023-06-29
10426,92179,F,"{여름,스트릿}",스트릿,2023-06-29
10427,92181,F,"{여름,스트릿}",스트릿,2023-06-29


In [20]:
outfit['style'].value_counts()

style
캐주얼         4835
스트릿         2665
걸리시         1379
시크           831
댄디           204
로맨틱          136
아메리칸 캐주얼     134
레트로          129
포멀            79
스포츠           19
고프코어           2
골프             1
Name: count, dtype: int64

In [22]:
season_map = {'봄': 'spring', '여름': 'summer', '가을': 'fall', '겨울': 'winter'}
outfit['season'] = np.NaN
for 계절 in season_map.keys():
    outfit['season'] = np.where(outfit['tags'].str.contains(계절),
                                계절, outfit['season'])
    
outfit['season'].value_counts()

season
봄      3605
여름     3168
겨울     2178
가을     1474
nan       4
Name: count, dtype: int64

In [23]:
outfit[outfit['season']=="nan"]

Unnamed: 0,outfit_id,gender,tags,style,date,season
6078,85922,M,"{남성,무스탕}",캐주얼,2022-10-27,
6137,86018,F,"{올블랙,코팅,블레이저,워커화}",스트릿,2022-11-01,
6138,86019,F,"{올화이트,원피스,부츠}",걸리시,2022-11-01,
9786,91266,F,"{스트릿,여성}",스트릿,2023-05-24,


In [24]:
outfit.loc[6078, 'season'] = '가을'
outfit.loc[6137, 'season'] = '가을'
outfit.loc[6138, 'season'] = '가을'
outfit.loc[9786, 'season'] = '봄'

In [25]:
outfit

Unnamed: 0,outfit_id,gender,tags,style,date,season
0,64453,F,"{반팔 티셔츠,가을,스트릿}",스트릿,2020-09-17,가을
1,68758,F,"{스니커즈,봄,캐주얼,나이키}",캐주얼,2021-02-24,봄
2,71478,F,"{여성,셔츠,레더 스커트,워커,여름,캐주얼}",캐주얼,2021-06-01,여름
3,71480,F,"{여성,슬리브리스,트랙 팬츠,스니커즈,숄더 백,나이키,여름,로맨틱}",로맨틱,2021-06-01,여름
4,71482,M,"{남성,비니,베스트,크로스 백,데님 팬츠,스니커즈,나이키,여름,스트릿}",스트릿,2021-06-01,여름
...,...,...,...,...,...,...
10424,92177,M,"{여름,스트릿}",스트릿,2023-06-29,여름
10425,92178,F,"{여름,걸리시}",걸리시,2023-06-29,여름
10426,92179,F,"{여름,스트릿}",스트릿,2023-06-29,여름
10427,92181,F,"{여름,스트릿}",스트릿,2023-06-29,여름


In [26]:
outfit['tags'] = outfit['tags'].str.replace(r"[{}]", "", regex=True)
outfit

Unnamed: 0,outfit_id,gender,tags,style,date,season
0,64453,F,"반팔 티셔츠,가을,스트릿",스트릿,2020-09-17,가을
1,68758,F,"스니커즈,봄,캐주얼,나이키",캐주얼,2021-02-24,봄
2,71478,F,"여성,셔츠,레더 스커트,워커,여름,캐주얼",캐주얼,2021-06-01,여름
3,71480,F,"여성,슬리브리스,트랙 팬츠,스니커즈,숄더 백,나이키,여름,로맨틱",로맨틱,2021-06-01,여름
4,71482,M,"남성,비니,베스트,크로스 백,데님 팬츠,스니커즈,나이키,여름,스트릿",스트릿,2021-06-01,여름
...,...,...,...,...,...,...
10424,92177,M,"여름,스트릿",스트릿,2023-06-29,여름
10425,92178,F,"여름,걸리시",걸리시,2023-06-29,여름
10426,92179,F,"여름,스트릿",스트릿,2023-06-29,여름
10427,92181,F,"여름,스트릿",스트릿,2023-06-29,여름


In [29]:
mappings = {
    'ㄴ마성': '남성',
    '긴팔티셔츠': '긴소매티셔츠',
    '농구저지': '농구져지',
    '남자': '남성',
    '데님.숏팬츠': '데님숏팬츠',
    '디그낙12': '디그낙',
    '맥스1': '맥스',
    '무톤재킷': '무통재킷',
    '밀란패션위크': '밀라노패션위크',
    '블래이저': '블래이져',
    '반팔티셔츠숏팬츠': '반팔티셔츠,숏팬츠',
    '스니커즈. 에코 백': '스니커즈,에코백',
    '스니컺': '스니커즈',
    '스니키진': '스키니진',
    '스웨트': '스웨터',
    '스웻팬츠': '스웨트팬츠',
    '슬랙스팬츠': '슬랙스',
    '여자': '여성',
    '여자원피스': '여성,원피스',
    '저지': '져지',
    '집시.ma-1': '집시,ma-1',
    '캡프캡': '캠프캡',
    'converse': '컨버스',
    'crocs': '크록스',
    'dolce&gabbana': '돌체앤가바나',
    'freitag': '프라이탁',
    'levi': '리바이스',
    'nike': '나이키',
    'prada': '프라다',
    'vans': '반스',
    'wildbricks': '와일드브릭스',
    'switfish': '스윗피쉬',
    'solelab': '솔랩',
    'taw&toe': '토앤토',
}

def process_tags(tags: str, mappings: dict) -> list:
    tags = tags.lower().replace(' ', '')
    tags = tags.split(',')
    tags = [mappings.get(tag, tag) for tag in tags]
    
    return tags

outfit['new_tags'] = outfit['tags'].apply(lambda x: process_tags(x, mappings))
outfit

Unnamed: 0,outfit_id,gender,tags,style,date,season,new_tags
0,64453,F,"반팔 티셔츠,가을,스트릿",스트릿,2020-09-17,가을,"[반팔티셔츠, 가을, 스트릿]"
1,68758,F,"스니커즈,봄,캐주얼,나이키",캐주얼,2021-02-24,봄,"[스니커즈, 봄, 캐주얼, 나이키]"
2,71478,F,"여성,셔츠,레더 스커트,워커,여름,캐주얼",캐주얼,2021-06-01,여름,"[여성, 셔츠, 레더스커트, 워커, 여름, 캐주얼]"
3,71480,F,"여성,슬리브리스,트랙 팬츠,스니커즈,숄더 백,나이키,여름,로맨틱",로맨틱,2021-06-01,여름,"[여성, 슬리브리스, 트랙팬츠, 스니커즈, 숄더백, 나이키, 여름, 로맨틱]"
4,71482,M,"남성,비니,베스트,크로스 백,데님 팬츠,스니커즈,나이키,여름,스트릿",스트릿,2021-06-01,여름,"[남성, 비니, 베스트, 크로스백, 데님팬츠, 스니커즈, 나이키, 여름, 스트릿]"
...,...,...,...,...,...,...,...
10424,92177,M,"여름,스트릿",스트릿,2023-06-29,여름,"[여름, 스트릿]"
10425,92178,F,"여름,걸리시",걸리시,2023-06-29,여름,"[여름, 걸리시]"
10426,92179,F,"여름,스트릿",스트릿,2023-06-29,여름,"[여름, 스트릿]"
10427,92181,F,"여름,스트릿",스트릿,2023-06-29,여름,"[여름, 스트릿]"


In [30]:
outfit.isnull().sum()

outfit_id     0
gender        0
tags          0
style        15
date          0
season        0
new_tags      0
dtype: int64

In [31]:
outfit['style'] = outfit['style'].fillna('기타') 
outfit.isnull().sum()

outfit_id    0
gender       0
tags         0
style        0
date         0
season       0
new_tags     0
dtype: int64

In [41]:
# style이 태그에 안 들어가 있으면 포함시켜줌
outfit['style'] = outfit['style'].apply(lambda x: x.replace(' ', ''))
def style_in_tags(row) -> list:
    new_tags = row['new_tags'].copy()
    if row['style'] not in row['new_tags']:
        new_tags.append(row['style'])

    return new_tags

outfit['style_in_tags'] = outfit.apply(style_in_tags, axis=1)
outfit

Unnamed: 0,outfit_id,gender,tags,style,date,season,new_tags,style_in_tags
0,64453,F,"반팔 티셔츠,가을,스트릿",스트릿,2020-09-17,가을,"[반팔티셔츠, 가을, 스트릿]","[반팔티셔츠, 가을, 스트릿]"
1,68758,F,"스니커즈,봄,캐주얼,나이키",캐주얼,2021-02-24,봄,"[스니커즈, 봄, 캐주얼, 나이키]","[스니커즈, 봄, 캐주얼, 나이키]"
2,71478,F,"여성,셔츠,레더 스커트,워커,여름,캐주얼",캐주얼,2021-06-01,여름,"[여성, 셔츠, 레더스커트, 워커, 여름, 캐주얼]","[여성, 셔츠, 레더스커트, 워커, 여름, 캐주얼]"
3,71480,F,"여성,슬리브리스,트랙 팬츠,스니커즈,숄더 백,나이키,여름,로맨틱",로맨틱,2021-06-01,여름,"[여성, 슬리브리스, 트랙팬츠, 스니커즈, 숄더백, 나이키, 여름, 로맨틱]","[여성, 슬리브리스, 트랙팬츠, 스니커즈, 숄더백, 나이키, 여름, 로맨틱]"
4,71482,M,"남성,비니,베스트,크로스 백,데님 팬츠,스니커즈,나이키,여름,스트릿",스트릿,2021-06-01,여름,"[남성, 비니, 베스트, 크로스백, 데님팬츠, 스니커즈, 나이키, 여름, 스트릿]","[남성, 비니, 베스트, 크로스백, 데님팬츠, 스니커즈, 나이키, 여름, 스트릿]"
...,...,...,...,...,...,...,...,...
10424,92177,M,"여름,스트릿",스트릿,2023-06-29,여름,"[여름, 스트릿]","[여름, 스트릿]"
10425,92178,F,"여름,걸리시",걸리시,2023-06-29,여름,"[여름, 걸리시]","[여름, 걸리시]"
10426,92179,F,"여름,스트릿",스트릿,2023-06-29,여름,"[여름, 스트릿]","[여름, 스트릿]"
10427,92181,F,"여름,스트릿",스트릿,2023-06-29,여름,"[여름, 스트릿]","[여름, 스트릿]"


In [None]:
# 태그 확인
set().union(*outfit['style_in_tags'].values)

In [42]:
outfit['style'].value_counts()

style
캐주얼        4835
스트릿        2665
걸리시        1379
시크          831
댄디          204
로맨틱         136
아메리칸캐주얼     134
레트로         129
포멀           79
스포츠          19
기타           15
고프코어          2
골프            1
Name: count, dtype: int64

In [46]:
# 어차피 성별 구분해서 cluster하므로 태그에서 성별 제거
def remove_gender(row):
    gender = {'여성', '남성'}
    new_tags = set(row['style_in_tags'].copy())
    return ",".join(new_tags - gender)

outfit['no_gender'] = outfit.apply(remove_gender, axis=1)
outfit

Unnamed: 0,outfit_id,gender,tags,style,date,season,new_tags,style_in_tags,no_gender
0,64453,F,"반팔 티셔츠,가을,스트릿",스트릿,2020-09-17,가을,"[반팔티셔츠, 가을, 스트릿]","[반팔티셔츠, 가을, 스트릿]","가을,반팔티셔츠,스트릿"
1,68758,F,"스니커즈,봄,캐주얼,나이키",캐주얼,2021-02-24,봄,"[스니커즈, 봄, 캐주얼, 나이키]","[스니커즈, 봄, 캐주얼, 나이키]","스니커즈,캐주얼,나이키,봄"
2,71478,F,"여성,셔츠,레더 스커트,워커,여름,캐주얼",캐주얼,2021-06-01,여름,"[여성, 셔츠, 레더스커트, 워커, 여름, 캐주얼]","[여성, 셔츠, 레더스커트, 워커, 여름, 캐주얼]","여름,워커,셔츠,레더스커트,캐주얼"
3,71480,F,"여성,슬리브리스,트랙 팬츠,스니커즈,숄더 백,나이키,여름,로맨틱",로맨틱,2021-06-01,여름,"[여성, 슬리브리스, 트랙팬츠, 스니커즈, 숄더백, 나이키, 여름, 로맨틱]","[여성, 슬리브리스, 트랙팬츠, 스니커즈, 숄더백, 나이키, 여름, 로맨틱]","로맨틱,여름,슬리브리스,스니커즈,숄더백,나이키,트랙팬츠"
4,71482,M,"남성,비니,베스트,크로스 백,데님 팬츠,스니커즈,나이키,여름,스트릿",스트릿,2021-06-01,여름,"[남성, 비니, 베스트, 크로스백, 데님팬츠, 스니커즈, 나이키, 여름, 스트릿]","[남성, 비니, 베스트, 크로스백, 데님팬츠, 스니커즈, 나이키, 여름, 스트릿]","여름,스트릿,비니,데님팬츠,크로스백,스니커즈,나이키,베스트"
...,...,...,...,...,...,...,...,...,...
10424,92177,M,"여름,스트릿",스트릿,2023-06-29,여름,"[여름, 스트릿]","[여름, 스트릿]","여름,스트릿"
10425,92178,F,"여름,걸리시",걸리시,2023-06-29,여름,"[여름, 걸리시]","[여름, 걸리시]","여름,걸리시"
10426,92179,F,"여름,스트릿",스트릿,2023-06-29,여름,"[여름, 스트릿]","[여름, 스트릿]","여름,스트릿"
10427,92181,F,"여름,스트릿",스트릿,2023-06-29,여름,"[여름, 스트릿]","[여름, 스트릿]","여름,스트릿"


In [49]:
outfit_2 = outfit.drop(columns=['tags', 'date', 'new_tags', 'style_in_tags'])
outfit_2.rename(columns={'no_gender': 'tags'}, inplace=True)
outfit_2

Unnamed: 0,outfit_id,gender,style,season,tags
0,64453,F,스트릿,가을,"가을,반팔티셔츠,스트릿"
1,68758,F,캐주얼,봄,"스니커즈,캐주얼,나이키,봄"
2,71478,F,캐주얼,여름,"여름,워커,셔츠,레더스커트,캐주얼"
3,71480,F,로맨틱,여름,"로맨틱,여름,슬리브리스,스니커즈,숄더백,나이키,트랙팬츠"
4,71482,M,스트릿,여름,"여름,스트릿,비니,데님팬츠,크로스백,스니커즈,나이키,베스트"
...,...,...,...,...,...
10424,92177,M,스트릿,여름,"여름,스트릿"
10425,92178,F,걸리시,여름,"여름,걸리시"
10426,92179,F,스트릿,여름,"여름,스트릿"
10427,92181,F,스트릿,여름,"여름,스트릿"


In [50]:
# 고프코어, 골프 기타로 합침
outfit_2['style'] = outfit_2['style'].apply(lambda x: '기타' if x in ['고프코어', '골프'] else x)
outfit_2['style'].value_counts()

style
캐주얼        4835
스트릿        2665
걸리시        1379
시크          831
댄디          204
로맨틱         136
아메리칸캐주얼     134
레트로         129
포멀           79
스포츠          19
기타           18
Name: count, dtype: int64

In [52]:
outfit_2.groupby('style')['gender'].value_counts().sort_index()

style    gender
걸리시      F         1377
         M            2
기타       F           10
         M            8
댄디       F            3
         M          201
레트로      F          111
         M           18
로맨틱      F          136
스트릿      F         1665
         M         1000
스포츠      F           12
         M            7
시크       F          817
         M           14
아메리칸캐주얼  F           33
         M          101
캐주얼      F         3165
         M         1670
포멀       F           52
         M           27
Name: count, dtype: int64

In [53]:
# 성별+스타일 합쳐서 새 스타일 카테고리 생성
def new_style(row):
    style = row['style']
    gender = row['gender']
    new = ""
    # 남자 걸리시와 여자 댄디는 수가 적으므로 기타로 취급
    if style == '걸리시' and gender == 'M':
        new = 'M_기타'
    elif style == '댄디' and gender == 'F':
        new = 'F_기타'
    else:
        new = gender + "_" + style
    return new

outfit_2['new_style'] = outfit_2.apply(new_style, axis=1)
outfit_2

Unnamed: 0,outfit_id,gender,style,season,tags,new_style
0,64453,F,스트릿,가을,"가을,반팔티셔츠,스트릿",F_스트릿
1,68758,F,캐주얼,봄,"스니커즈,캐주얼,나이키,봄",F_캐주얼
2,71478,F,캐주얼,여름,"여름,워커,셔츠,레더스커트,캐주얼",F_캐주얼
3,71480,F,로맨틱,여름,"로맨틱,여름,슬리브리스,스니커즈,숄더백,나이키,트랙팬츠",F_로맨틱
4,71482,M,스트릿,여름,"여름,스트릿,비니,데님팬츠,크로스백,스니커즈,나이키,베스트",M_스트릿
...,...,...,...,...,...,...
10424,92177,M,스트릿,여름,"여름,스트릿",M_스트릿
10425,92178,F,걸리시,여름,"여름,걸리시",F_걸리시
10426,92179,F,스트릿,여름,"여름,스트릿",F_스트릿
10427,92181,F,스트릿,여름,"여름,스트릿",F_스트릿


In [54]:
outfit_2['new_style'].value_counts()

new_style
F_캐주얼        3165
M_캐주얼        1670
F_스트릿        1665
F_걸리시        1377
M_스트릿        1000
F_시크          817
M_댄디          201
F_로맨틱         136
F_레트로         111
M_아메리칸캐주얼     101
F_포멀           52
F_아메리칸캐주얼      33
M_포멀           27
M_레트로          18
M_시크           14
F_기타           13
F_스포츠          12
M_기타           10
M_스포츠           7
Name: count, dtype: int64

In [55]:
style_map = {v: i for i, v in enumerate(outfit_2['new_style'].unique())}
outfit_2['style_id'] = outfit_2['new_style'].map(style_map)
# outfit_2['style_id'].value_counts().sort_index()
outfit_2

Unnamed: 0,outfit_id,gender,style,season,tags,new_style,style_id
0,64453,F,스트릿,가을,"가을,반팔티셔츠,스트릿",F_스트릿,0
1,68758,F,캐주얼,봄,"스니커즈,캐주얼,나이키,봄",F_캐주얼,1
2,71478,F,캐주얼,여름,"여름,워커,셔츠,레더스커트,캐주얼",F_캐주얼,1
3,71480,F,로맨틱,여름,"로맨틱,여름,슬리브리스,스니커즈,숄더백,나이키,트랙팬츠",F_로맨틱,2
4,71482,M,스트릿,여름,"여름,스트릿,비니,데님팬츠,크로스백,스니커즈,나이키,베스트",M_스트릿,3
...,...,...,...,...,...,...,...
10424,92177,M,스트릿,여름,"여름,스트릿",M_스트릿,3
10425,92178,F,걸리시,여름,"여름,걸리시",F_걸리시,4
10426,92179,F,스트릿,여름,"여름,스트릿",F_스트릿,0
10427,92181,F,스트릿,여름,"여름,스트릿",F_스트릿,0


In [56]:
outfit_2.to_csv("./outfit_2.csv", index=False)

## Like에 적용