In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import datetime
import os

# 1. **브랜드** 정보 추출하기 (`product_df`)


In [4]:
# Load the product table from product.csv in the current working directory.
base_path = os.getcwd()
# os.path.join picks the platform's separator instead of hard-coding '/'.
product_df = pd.read_csv(os.path.join(base_path, 'product.csv'))
product_df.head()

Unnamed: 0,id,gender,masterCategory,subCategory,articleType,baseColour,season,year,usage,productDisplayName
0,15970,Men,Apparel,Topwear,Shirts,Navy Blue,Fall,2011.0,Casual,Turtle Check Men Navy Blue Shirt
1,39386,Men,Apparel,Bottomwear,Jeans,Blue,Summer,2012.0,Casual,Peter England Men Party Blue Jeans
2,59263,Women,Accessories,Watches,Watches,Silver,Winter,2016.0,Casual,Titan Women Silver Watch
3,21379,Men,Apparel,Bottomwear,Track Pants,Black,Fall,2011.0,Casual,Manchester United Men Solid Black Track Pants
4,53759,Men,Apparel,Topwear,Tshirts,Grey,Summer,2012.0,Casual,Puma Men Grey T-shirt


#### a. 전처리 및 결측치 처리

In [5]:
# Inspect column dtypes and non-null counts of the loaded product table.
product_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 44446 entries, 0 to 44445
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   id                  44446 non-null  int64  
 1   gender              44446 non-null  object 
 2   masterCategory      44446 non-null  object 
 3   subCategory         44446 non-null  object 
 4   articleType         44446 non-null  object 
 5   baseColour          44431 non-null  object 
 6   season              44425 non-null  object 
 7   year                44445 non-null  float64
 8   usage               44129 non-null  object 
 9   productDisplayName  44439 non-null  object 
dtypes: float64(1), int64(1), object(8)
memory usage: 3.4+ MB


In [8]:
# Number of missing values in each column.
product_df.isna().sum()

id                      0
gender                  0
masterCategory          0
subCategory             0
articleType             0
baseColour             15
season                 21
year                    1
usage                 317
productDisplayName      7
dtype: int64

In [9]:
# Count rows that are exact duplicates (across every column) of an earlier row.
product_df.duplicated().sum()

0

In [11]:
# Remove every row that has a missing value in any column.
# Rebinding the name (rather than inplace=True) yields the same frame.
product_df = product_df.dropna()

In [12]:
# Re-check dtypes and non-null counts after the rows with missing values were dropped.
product_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 44099 entries, 0 to 44445
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   id                  44099 non-null  int64  
 1   gender              44099 non-null  object 
 2   masterCategory      44099 non-null  object 
 3   subCategory         44099 non-null  object 
 4   articleType         44099 non-null  object 
 5   baseColour          44099 non-null  object 
 6   season              44099 non-null  object 
 7   year                44099 non-null  float64
 8   usage               44099 non-null  object 
 9   productDisplayName  44099 non-null  object 
dtypes: float64(1), int64(1), object(8)
memory usage: 3.7+ MB


#### b. productDisplayName 활용하기

##### productDisplayName에서 gender를 기준으로 하여 brand를 뽑아낼 수 있을 것이라 예상

In [25]:
# Check which distinct gender values occur in the table.
product_df['gender'].unique()

array(['Men', 'Women', 'Boys', 'Girls', 'Unisex'], dtype=object)

In [13]:
# Preview the raw display names that the brand will be extracted from.
product_df['productDisplayName']

0                       Turtle Check Men Navy Blue Shirt
1                     Peter England Men Party Blue Jeans
2                               Titan Women Silver Watch
3          Manchester United Men Solid Black Track Pants
4                                  Puma Men Grey T-shirt
                              ...                       
44441                          Gas Men Caddy Casual Shoe
44442                 Lotto Men's Soccer Track Flip Flop
44443               Puma Men Graphic Stellar Blue Tshirt
44444                     Rasasi Women Blue Lady Perfume
44445    Fossil Women Pink Dial Chronograph Watch ES3050
Name: productDisplayName, Length: 44099, dtype: object

In [59]:
# Lift the row limit so pandas prints every row of subsequent outputs.
# This is a global setting and stays in effect for the rest of the session.
pd.options.display.max_rows = None

In [67]:
# After inspecting the data, the split criteria were refined as far as possible,
# but roughly 3,000 rows still had no usable split point -> decided to simply use
# the first word of the display name as the brand.
# NOTE(review): str.contains performs an unanchored regex search, so these
# alternatives also match inside longer words (e.g. 'Man' in 'Manchester').
gender_pattern = r'Men|Man|Women|Woman|Boys|Boy\'s|Boy|Girls|Girl|Girl\'s|Kids|Unisex'
has_gender_word = product_df['productDisplayName'].str.contains(gender_pattern)
# First whitespace-separated token of each display name that matched the pattern.
brand_1 = product_df.loc[has_gender_word, 'productDisplayName'].str.split().str[0]
len(brand_1)

41091

##### productDisplayName에서 첫번째 단어를 브랜드명으로 간주

In [69]:
# Take the first whitespace-separated token of every display name as the brand.
brand = product_df['productDisplayName'].str.split().str.get(0)
brand.head()

0        Turtle
1         Peter
2         Titan
3    Manchester
4          Puma
Name: productDisplayName, dtype: object

In [70]:
# Frequency of each first-token brand, most frequent first.
brand.value_counts()

Nike                                            2289
Puma                                            2087
ADIDAS                                          2071
United                                          1400
Fabindia                                         751
Catwalk                                          743
Gini                                             721
Arrow                                            697
Fastrack                                         689
Wrangler                                         678
Lino                                             655
Baggit                                           623
Red                                              547
Lee                                              521
Myntra                                           476
Scullers                                         455
French                                           430
Fila                                             423
Jealous                                       

In [99]:
# Lowercase the brand tokens so later lookups can use lowercase brand names.
brand = brand.str.lower()

#### 2021년 인도네시아 18세 ~ 55세 설문조사 결과를 토대로 인기있는 상위 10개 브랜드가 이커머스에 입점되어 있는지 확인 

In [103]:
# The e-commerce company's target customers are aged 18 to 35, which differs
# somewhat from the survey population, so these counts are for reference only.
# Source: https://welcon.kocca.kr/cmm/fms/CrawlingFileDown.do?atchFileId=FILE_f016d5ef-6951-4e09-9cf0-8f513a360671&fileSn=1
FAMOUS_BRANDS = ['adidas', 'h&m', 'converse', 'uniqlo', 'nike', 'eiger', '3second', 'nevada', 'vans', 'cardinal']

for famous_brand in FAMOUS_BRANDS:
    # regex=False: match the brand name as literal text, not as a regex pattern.
    # na=False: a missing brand counts as "no match" instead of producing NaN
    # in the mask, which would otherwise make boolean indexing raise.
    is_match = brand.str.contains(famous_brand, regex=False, na=False)
    # Summing the boolean mask gives the number of matching products.
    print(f'{famous_brand} :', int(is_match.sum()))

adidas : 2080
h&m : 0
converse : 174
uniqlo : 0
nike : 2289
eiger : 0
3second : 0
nevada : 0
vans : 196
cardinal : 0


In [None]:
# Problem: apart from adidas and nike, the brands popular in Indonesia are either not stocked or have very few products.