In [2]:
import pandas as pd
import numpy as np
import os
import math


from pyspark.sql.session import SparkSession
# Spark session instance (created, or reused if one already exists) under the app name 'master'
spark = SparkSession.builder.appName('master').getOrCreate()

from pyspark.sql.types import *
from pyspark.sql.functions import desc, asc
from pyspark.sql import functions as f
from pyspark.sql.functions import col

In [3]:
from pywebhdfs.webhdfs import PyWebHdfsClient
from pprint import pprint
 

def get_dir(path):
    """Return the names of the files and folders directly under an HDFS path.

    Args:
        path: HDFS path without any scheme or host prefix, e.g.
            'user/hadoop/data_analysis/coupang_data/'.

    Returns:
        list[str]: the 'pathSuffix' of every entry reported by WebHDFS, in the
        order WebHDFS returned them.
    """
    hdfs = PyWebHdfsClient(host='localhost', port='50070', user_name='master')  # your Namenode IP & username here

    # Query the listing exactly once. Re-issuing list_dir() for every entry costs
    # one HTTP round-trip per file and can return a different listing mid-loop.
    statuses = hdfs.list_dir(path)['FileStatuses']['FileStatus']

    return [status['pathSuffix'] for status in statuses]

def open_pd_csv(path, hdfs_root='hdfs://192.168.0.9:9000/'):
    """Read a CSV from HDFS through Spark and return it as a pandas DataFrame.

    Args:
        path: location relative to ``hdfs_root``; expected to start with 'user/'.
        hdfs_root: HDFS URL prefix prepended to ``path``. Defaults to the
            cluster address this notebook was written against.

    Returns:
        pandas.DataFrame: the file contents, with the first line used as the
        header. No schema is passed to the reader, so column typing is left to
        Spark's CSV defaults.
    """
    file_path = hdfs_root + path
    return spark.read.csv(file_path, header=True).toPandas()

def save_pd_csv(df, path, hdfs_root='hdfs://192.168.0.9:9000/'):
    """Write a pandas DataFrame to HDFS as CSV with a header row.

    Args:
        df: pandas DataFrame to save.
        path: target relative to ``hdfs_root``; expected to start with 'user/'
            and to include the output name, e.g. 'user/hadoop/test.csv'.
        hdfs_root: HDFS URL prefix prepended to ``path``. Defaults to the
            cluster address this notebook was written against.

    Any existing output at the target is overwritten.
    """
    file_path = hdfs_root + path
    data = spark.createDataFrame(df)
    # coalesce(1) funnels the data into a single partition before writing.
    # NOTE(review): Spark's CSV writer creates a directory at file_path holding
    # the part file(s), not a plain file with that name - confirm downstream
    # readers expect that.
    data.coalesce(1).write.mode("overwrite").option("header", "true").csv(file_path)


    
def save_pd_parquet(df, path, hdfs_root='hdfs://192.168.0.9:9000/'):
    """Write a pandas DataFrame to HDFS in Parquet format.

    Args:
        df: pandas DataFrame to save.
        path: target relative to ``hdfs_root``, including the output name,
            e.g. 'user/hadoop/transformed.parquet'.
        hdfs_root: HDFS URL prefix prepended to ``path``. Defaults to the
            cluster address this notebook was written against.

    Any existing output at the target is overwritten.
    """
    file_path = hdfs_root + path
    data = spark.createDataFrame(df)
    data.write.mode("overwrite").format("parquet").save(file_path)

    
def ad_schema(df):
    """Cast every column of a Spark DataFrame to its intended type.

    Listed columns become integer, double or boolean; every other column is
    cast to string. Afterwards ``fillna(0)`` is applied to the whole frame.

    Args:
        df: Spark DataFrame whose columns should be re-typed.

    Returns:
        The re-typed Spark DataFrame.
    """
    # Column name -> Spark type class. Columns absent from this table fall back
    # to StringType.
    type_by_column = {}
    type_by_column.update(dict.fromkeys(
        ['rank', 'category', 'price',
         'discount_percentage', 'rating_total_count',
         'reviews_for_last1year', 'sales'],
        IntegerType,
    ))
    type_by_column.update(dict.fromkeys(['rating'], DoubleType))
    type_by_column.update(dict.fromkeys(
        ['rocket_delivery', 'is_out_of_stock'],
        BooleanType,
    ))

    for name in df.columns:
        target_type = type_by_column.get(name, StringType)()
        df = df.withColumn(name, df[name].cast(target_type))

    # NOTE(review): per the PySpark docs a scalar fill value only applies to
    # columns of a matching type, so string/boolean nulls are presumably left
    # as-is - confirm this is intended.
    return df.fillna(0)

In [None]:
# merge_csv
path_dir = 'user/hadoop/data_analysis/coupang_data/'
dir_list = get_dir(path_dir)

# Outputs this cell (or an earlier version of it) writes into every category
# directory. They are excluded by name, so a re-run never reads its own results
# back in and a first run never loses a genuine review file.
GENERATED_NAMES = ('result.csv', 'result_review.csv', 'transformed.parquet')

# Positional layout of each filtered listing: the first nine entries are read as
# product-info CSVs and every later entry as a per-product review CSV.
N_PRODUCT_FILES = 9

# Only reviews whose reg_date compares greater than this string are counted.
# NOTE(review): assumes reg_date is zero-padded 'YYYY.MM.DD' text so that string
# order equals date order - TODO confirm against the scraped data.
REVIEW_CUTOFF = '2020.06.01'

for d in dir_list:
    base = path_dir + d + '/'

    # Drop scraper scripts ('.py') and previously generated outputs. Using a
    # comprehension keeps the loop variable local, so the module-level alias
    # `f` (pyspark.sql.functions) is no longer overwritten by a file name.
    file_list = [
        name for name in get_dir(base)
        if '.py' not in name and name not in GENERATED_NAMES
    ]
    product_files = file_list[:N_PRODUCT_FILES]
    review_files = file_list[N_PRODUCT_FILES:]

    # Skip this category instead of failing with an IndexError when it holds
    # no product or no review files.
    if not product_files or not review_files:
        print('skip (missing product or review files):', base)
        continue

    # Combine product info. Concatenate once with a fresh 0..n-1 index instead
    # of concat + reset_index on every iteration, which is quadratic.
    product_df = pd.concat(
        [open_pd_csv(base + name) for name in product_files],
        ignore_index=True,
    )

    # Combine review info, logging progress with the file's position in the listing.
    review_frames = []
    for k, name in enumerate(review_files, start=N_PRODUCT_FILES):
        print(base, name, k)
        review_frames.append(open_pd_csv(base + name))
    review_df = pd.concat(review_frames, ignore_index=True)
    del review_frames  # release the per-file frames as early as possible

    # Count recent reviews per product; the result is indexed by product_id and
    # has a single column named 'count'.
    recent_reviews = review_df[review_df['reg_date'] > REVIEW_CUTOFF]
    review_counts = recent_reviews.groupby('product_id')['ratings'].agg(['count'])

    # Attach the counts to the product rows; products without recent reviews
    # keep a missing count because of the left join.
    product_df = pd.merge(left=product_df, right=review_counts, how='left', on='product_id')

    # Remove duplicated products.
    product_df = product_df.drop_duplicates(['product_id'])

    # Save the merged per-category product table as Parquet ...
    save_pd_parquet(product_df, base + 'transformed.parquet')

    # ... and as CSV.
    save_pd_csv(product_df, base + 'result.csv')

    # Saving the combined reviews to result_review.csv did not work reliably,
    # so it stays disabled:
    # save_pd_csv(review_df, base + 'result_review.csv')

    # Free the large frames before the next category is loaded.
    del product_df, review_df, recent_reviews, review_counts

user/hadoop/data_analysis/coupang_data/bed (category = 184562)/ output_review_102407096.csv 10
user/hadoop/data_analysis/coupang_data/bed (category = 184562)/ output_review_102580500.csv 11
user/hadoop/data_analysis/coupang_data/bed (category = 184562)/ output_review_104390448.csv 12
user/hadoop/data_analysis/coupang_data/bed (category = 184562)/ output_review_1058415715.csv 13
user/hadoop/data_analysis/coupang_data/bed (category = 184562)/ output_review_1058913221.csv 14
user/hadoop/data_analysis/coupang_data/bed (category = 184562)/ output_review_1060193441.csv 15
user/hadoop/data_analysis/coupang_data/bed (category = 184562)/ output_review_10607696.csv 16
user/hadoop/data_analysis/coupang_data/bed (category = 184562)/ output_review_1062838712.csv 17
user/hadoop/data_analysis/coupang_data/bed (category = 184562)/ output_review_1063372010.csv 18
user/hadoop/data_analysis/coupang_data/bed (category = 184562)/ output_review_1065945065.csv 19
user/hadoop/data_analysis/coupang_data/bed (c

user/hadoop/data_analysis/coupang_data/bed (category = 184562)/ output_review_128301263.csv 96
user/hadoop/data_analysis/coupang_data/bed (category = 184562)/ output_review_128659939.csv 97
user/hadoop/data_analysis/coupang_data/bed (category = 184562)/ output_review_1291494178.csv 98
user/hadoop/data_analysis/coupang_data/bed (category = 184562)/ output_review_1291494197.csv 99
user/hadoop/data_analysis/coupang_data/bed (category = 184562)/ output_review_129481818.csv 100
user/hadoop/data_analysis/coupang_data/bed (category = 184562)/ output_review_129944613.csv 101
user/hadoop/data_analysis/coupang_data/bed (category = 184562)/ output_review_130006662.csv 102
user/hadoop/data_analysis/coupang_data/bed (category = 184562)/ output_review_1301183620.csv 103
user/hadoop/data_analysis/coupang_data/bed (category = 184562)/ output_review_1306926674.csv 104
user/hadoop/data_analysis/coupang_data/bed (category = 184562)/ output_review_1310197751.csv 105
user/hadoop/data_analysis/coupang_data/

user/hadoop/data_analysis/coupang_data/bed (category = 184562)/ output_review_1588938873.csv 181
user/hadoop/data_analysis/coupang_data/bed (category = 184562)/ output_review_1597728939.csv 182
user/hadoop/data_analysis/coupang_data/bed (category = 184562)/ output_review_1598404584.csv 183
user/hadoop/data_analysis/coupang_data/bed (category = 184562)/ output_review_160136871.csv 184
user/hadoop/data_analysis/coupang_data/bed (category = 184562)/ output_review_1604677955.csv 185
user/hadoop/data_analysis/coupang_data/bed (category = 184562)/ output_review_1606666689.csv 186
user/hadoop/data_analysis/coupang_data/bed (category = 184562)/ output_review_1610964169.csv 187
user/hadoop/data_analysis/coupang_data/bed (category = 184562)/ output_review_161265796.csv 188
user/hadoop/data_analysis/coupang_data/bed (category = 184562)/ output_review_161349239.csv 189
user/hadoop/data_analysis/coupang_data/bed (category = 184562)/ output_review_1614715960.csv 190
user/hadoop/data_analysis/coupang

user/hadoop/data_analysis/coupang_data/bed (category = 184562)/ output_review_1826430249.csv 267
user/hadoop/data_analysis/coupang_data/bed (category = 184562)/ output_review_182789092.csv 268
user/hadoop/data_analysis/coupang_data/bed (category = 184562)/ output_review_1829493113.csv 269
user/hadoop/data_analysis/coupang_data/bed (category = 184562)/ output_review_1829493909.csv 270
user/hadoop/data_analysis/coupang_data/bed (category = 184562)/ output_review_1831465812.csv 271
user/hadoop/data_analysis/coupang_data/bed (category = 184562)/ output_review_1833845262.csv 272
user/hadoop/data_analysis/coupang_data/bed (category = 184562)/ output_review_18342378.csv 273
user/hadoop/data_analysis/coupang_data/bed (category = 184562)/ output_review_1837047118.csv 274
user/hadoop/data_analysis/coupang_data/bed (category = 184562)/ output_review_184631206.csv 275
user/hadoop/data_analysis/coupang_data/bed (category = 184562)/ output_review_184631209.csv 276
user/hadoop/data_analysis/coupang_d

user/hadoop/data_analysis/coupang_data/bed (category = 184562)/ output_review_2082412064.csv 352
user/hadoop/data_analysis/coupang_data/bed (category = 184562)/ output_review_2083440667.csv 353
user/hadoop/data_analysis/coupang_data/bed (category = 184562)/ output_review_2086361746.csv 354
user/hadoop/data_analysis/coupang_data/bed (category = 184562)/ output_review_2086389955.csv 355
user/hadoop/data_analysis/coupang_data/bed (category = 184562)/ output_review_2087297107.csv 356
user/hadoop/data_analysis/coupang_data/bed (category = 184562)/ output_review_2087915123.csv 357
user/hadoop/data_analysis/coupang_data/bed (category = 184562)/ output_review_2091558146.csv 358
user/hadoop/data_analysis/coupang_data/bed (category = 184562)/ output_review_2093548879.csv 359
user/hadoop/data_analysis/coupang_data/bed (category = 184562)/ output_review_2104336801.csv 360
user/hadoop/data_analysis/coupang_data/bed (category = 184562)/ output_review_2112099563.csv 361
user/hadoop/data_analysis/coup

user/hadoop/data_analysis/coupang_data/bed (category = 184562)/ output_review_245352117.csv 438
user/hadoop/data_analysis/coupang_data/bed (category = 184562)/ output_review_245874278.csv 439
user/hadoop/data_analysis/coupang_data/bed (category = 184562)/ output_review_247086161.csv 440
user/hadoop/data_analysis/coupang_data/bed (category = 184562)/ output_review_24718252.csv 441
user/hadoop/data_analysis/coupang_data/bed (category = 184562)/ output_review_247289441.csv 442
user/hadoop/data_analysis/coupang_data/bed (category = 184562)/ output_review_247648836.csv 443
user/hadoop/data_analysis/coupang_data/bed (category = 184562)/ output_review_252308539.csv 444
user/hadoop/data_analysis/coupang_data/bed (category = 184562)/ output_review_253085041.csv 445
user/hadoop/data_analysis/coupang_data/bed (category = 184562)/ output_review_255068544.csv 446
user/hadoop/data_analysis/coupang_data/bed (category = 184562)/ output_review_256005383.csv 447
user/hadoop/data_analysis/coupang_data/be

user/hadoop/data_analysis/coupang_data/bed (category = 184562)/ output_review_335188174.csv 525
user/hadoop/data_analysis/coupang_data/bed (category = 184562)/ output_review_336756699.csv 526
user/hadoop/data_analysis/coupang_data/bed (category = 184562)/ output_review_338415121.csv 527
user/hadoop/data_analysis/coupang_data/bed (category = 184562)/ output_review_338427421.csv 528
user/hadoop/data_analysis/coupang_data/bed (category = 184562)/ output_review_338461327.csv 529
user/hadoop/data_analysis/coupang_data/bed (category = 184562)/ output_review_338474141.csv 530
user/hadoop/data_analysis/coupang_data/bed (category = 184562)/ output_review_338530660.csv 531
user/hadoop/data_analysis/coupang_data/bed (category = 184562)/ output_review_338544702.csv 532
user/hadoop/data_analysis/coupang_data/bed (category = 184562)/ output_review_339149191.csv 533
user/hadoop/data_analysis/coupang_data/bed (category = 184562)/ output_review_3393767.csv 534
user/hadoop/data_analysis/coupang_data/bed

user/hadoop/data_analysis/coupang_data/bed (category = 184562)/ output_review_4528995185.csv 612
user/hadoop/data_analysis/coupang_data/bed (category = 184562)/ output_review_4528995234.csv 613
user/hadoop/data_analysis/coupang_data/bed (category = 184562)/ output_review_4528995534.csv 614
user/hadoop/data_analysis/coupang_data/bed (category = 184562)/ output_review_4528995630.csv 615
user/hadoop/data_analysis/coupang_data/bed (category = 184562)/ output_review_4528995773.csv 616
user/hadoop/data_analysis/coupang_data/bed (category = 184562)/ output_review_4528995926.csv 617
user/hadoop/data_analysis/coupang_data/bed (category = 184562)/ output_review_4529393727.csv 618
user/hadoop/data_analysis/coupang_data/bed (category = 184562)/ output_review_4533466302.csv 619
user/hadoop/data_analysis/coupang_data/bed (category = 184562)/ output_review_4533466334.csv 620
user/hadoop/data_analysis/coupang_data/bed (category = 184562)/ output_review_4533466448.csv 621
user/hadoop/data_analysis/coup

user/hadoop/data_analysis/coupang_data/bed (category = 184562)/ output_review_4924275304.csv 697
user/hadoop/data_analysis/coupang_data/bed (category = 184562)/ output_review_4925678102.csv 698
user/hadoop/data_analysis/coupang_data/bed (category = 184562)/ output_review_49300701.csv 699
user/hadoop/data_analysis/coupang_data/bed (category = 184562)/ output_review_4947697756.csv 700
user/hadoop/data_analysis/coupang_data/bed (category = 184562)/ output_review_496052.csv 701
user/hadoop/data_analysis/coupang_data/bed (category = 184562)/ output_review_4978094332.csv 702
user/hadoop/data_analysis/coupang_data/bed (category = 184562)/ output_review_5002577869.csv 703
user/hadoop/data_analysis/coupang_data/bed (category = 184562)/ output_review_5008105531.csv 704
user/hadoop/data_analysis/coupang_data/bed (category = 184562)/ output_review_5008197391.csv 705
user/hadoop/data_analysis/coupang_data/bed (category = 184562)/ output_review_5008203767.csv 706
user/hadoop/data_analysis/coupang_da

user/hadoop/data_analysis/coupang_data/bed (category = 184562)/ output_review_5464607.csv 782
user/hadoop/data_analysis/coupang_data/bed (category = 184562)/ output_review_5465766552.csv 783
user/hadoop/data_analysis/coupang_data/bed (category = 184562)/ output_review_5471851597.csv 784
user/hadoop/data_analysis/coupang_data/bed (category = 184562)/ output_review_5471866788.csv 785
user/hadoop/data_analysis/coupang_data/bed (category = 184562)/ output_review_5473018039.csv 786
user/hadoop/data_analysis/coupang_data/bed (category = 184562)/ output_review_5486280911.csv 787
user/hadoop/data_analysis/coupang_data/bed (category = 184562)/ output_review_54912201.csv 788
user/hadoop/data_analysis/coupang_data/bed (category = 184562)/ output_review_54968438.csv 789
user/hadoop/data_analysis/coupang_data/bed (category = 184562)/ output_review_5497116419.csv 790
user/hadoop/data_analysis/coupang_data/bed (category = 184562)/ output_review_5500831.csv 791
user/hadoop/data_analysis/coupang_data/b

user/hadoop/data_analysis/coupang_data/bedtray (category = 503219)/ output_review_1464083772.csv 29
user/hadoop/data_analysis/coupang_data/bedtray (category = 503219)/ output_review_1487124069.csv 30
user/hadoop/data_analysis/coupang_data/bedtray (category = 503219)/ output_review_148721141.csv 31
user/hadoop/data_analysis/coupang_data/bedtray (category = 503219)/ output_review_150583557.csv 32
user/hadoop/data_analysis/coupang_data/bedtray (category = 503219)/ output_review_1506065644.csv 33
user/hadoop/data_analysis/coupang_data/bedtray (category = 503219)/ output_review_152155285.csv 34
user/hadoop/data_analysis/coupang_data/bedtray (category = 503219)/ output_review_1523563718.csv 35
user/hadoop/data_analysis/coupang_data/bedtray (category = 503219)/ output_review_1523563729.csv 36
user/hadoop/data_analysis/coupang_data/bedtray (category = 503219)/ output_review_1547791319.csv 37
user/hadoop/data_analysis/coupang_data/bedtray (category = 503219)/ output_review_1552973920.csv 38
use

user/hadoop/data_analysis/coupang_data/bedtray (category = 503219)/ output_review_2149933789.csv 112
user/hadoop/data_analysis/coupang_data/bedtray (category = 503219)/ output_review_2176438761.csv 113
user/hadoop/data_analysis/coupang_data/bedtray (category = 503219)/ output_review_2176439102.csv 114
user/hadoop/data_analysis/coupang_data/bedtray (category = 503219)/ output_review_2176439195.csv 115
user/hadoop/data_analysis/coupang_data/bedtray (category = 503219)/ output_review_2206982954.csv 116
user/hadoop/data_analysis/coupang_data/bedtray (category = 503219)/ output_review_2217295763.csv 117
user/hadoop/data_analysis/coupang_data/bedtray (category = 503219)/ output_review_2235102988.csv 118
user/hadoop/data_analysis/coupang_data/bedtray (category = 503219)/ output_review_2236878306.csv 119
user/hadoop/data_analysis/coupang_data/bedtray (category = 503219)/ output_review_2243313148.csv 120
user/hadoop/data_analysis/coupang_data/bedtray (category = 503219)/ output_review_224407855

user/hadoop/data_analysis/coupang_data/bedtray (category = 503219)/ output_review_4690006156.csv 195
user/hadoop/data_analysis/coupang_data/bedtray (category = 503219)/ output_review_4705096055.csv 196
user/hadoop/data_analysis/coupang_data/bedtray (category = 503219)/ output_review_4705099742.csv 197
user/hadoop/data_analysis/coupang_data/bedtray (category = 503219)/ output_review_4709010221.csv 198
user/hadoop/data_analysis/coupang_data/bedtray (category = 503219)/ output_review_4717725821.csv 199
user/hadoop/data_analysis/coupang_data/bedtray (category = 503219)/ output_review_4717941061.csv 200
user/hadoop/data_analysis/coupang_data/bedtray (category = 503219)/ output_review_4723920619.csv 201
user/hadoop/data_analysis/coupang_data/bedtray (category = 503219)/ output_review_4732761288.csv 202
user/hadoop/data_analysis/coupang_data/bedtray (category = 503219)/ output_review_4735757744.csv 203
user/hadoop/data_analysis/coupang_data/bedtray (category = 503219)/ output_review_473740036

user/hadoop/data_analysis/coupang_data/bedtray (category = 503219)/ output_review_4980112655.csv 277
user/hadoop/data_analysis/coupang_data/bedtray (category = 503219)/ output_review_4983237823.csv 278
user/hadoop/data_analysis/coupang_data/bedtray (category = 503219)/ output_review_4984789441.csv 279
user/hadoop/data_analysis/coupang_data/bedtray (category = 503219)/ output_review_4988450550.csv 280
user/hadoop/data_analysis/coupang_data/bedtray (category = 503219)/ output_review_4989136263.csv 281
user/hadoop/data_analysis/coupang_data/bedtray (category = 503219)/ output_review_4989885163.csv 282
user/hadoop/data_analysis/coupang_data/bedtray (category = 503219)/ output_review_4992495410.csv 283
user/hadoop/data_analysis/coupang_data/bedtray (category = 503219)/ output_review_4993186579.csv 284
user/hadoop/data_analysis/coupang_data/bedtray (category = 503219)/ output_review_4997044004.csv 285
user/hadoop/data_analysis/coupang_data/bedtray (category = 503219)/ output_review_499818748

user/hadoop/data_analysis/coupang_data/bedtray (category = 503219)/ output_review_5216464674.csv 359
user/hadoop/data_analysis/coupang_data/bedtray (category = 503219)/ output_review_5219066914.csv 360
user/hadoop/data_analysis/coupang_data/bedtray (category = 503219)/ output_review_5219430151.csv 361
user/hadoop/data_analysis/coupang_data/bedtray (category = 503219)/ output_review_5219892315.csv 362
user/hadoop/data_analysis/coupang_data/bedtray (category = 503219)/ output_review_5221253858.csv 363
user/hadoop/data_analysis/coupang_data/bedtray (category = 503219)/ output_review_5222029910.csv 364
user/hadoop/data_analysis/coupang_data/bedtray (category = 503219)/ output_review_5223092620.csv 365
user/hadoop/data_analysis/coupang_data/bedtray (category = 503219)/ output_review_5225764736.csv 366
user/hadoop/data_analysis/coupang_data/bedtray (category = 503219)/ output_review_5226616714.csv 367
user/hadoop/data_analysis/coupang_data/bedtray (category = 503219)/ output_review_522661967

user/hadoop/data_analysis/coupang_data/bedtray (category = 503219)/ output_review_5353492166.csv 442
user/hadoop/data_analysis/coupang_data/bedtray (category = 503219)/ output_review_5353620013.csv 443
user/hadoop/data_analysis/coupang_data/bedtray (category = 503219)/ output_review_5361810924.csv 444
user/hadoop/data_analysis/coupang_data/bedtray (category = 503219)/ output_review_5362450559.csv 445
user/hadoop/data_analysis/coupang_data/bedtray (category = 503219)/ output_review_5362657705.csv 446
user/hadoop/data_analysis/coupang_data/bedtray (category = 503219)/ output_review_5366420364.csv 447
user/hadoop/data_analysis/coupang_data/bedtray (category = 503219)/ output_review_5366420773.csv 448
user/hadoop/data_analysis/coupang_data/bedtray (category = 503219)/ output_review_5370763183.csv 449
user/hadoop/data_analysis/coupang_data/bedtray (category = 503219)/ output_review_5383646199.csv 450
user/hadoop/data_analysis/coupang_data/bedtray (category = 503219)/ output_review_538670763

user/hadoop/data_analysis/coupang_data/bedtray (category = 503219)/ output_review_5484328616.csv 524
user/hadoop/data_analysis/coupang_data/bedtray (category = 503219)/ output_review_5484565447.csv 525
user/hadoop/data_analysis/coupang_data/bedtray (category = 503219)/ output_review_5485165247.csv 526
user/hadoop/data_analysis/coupang_data/bedtray (category = 503219)/ output_review_5485922488.csv 527
user/hadoop/data_analysis/coupang_data/bedtray (category = 503219)/ output_review_5486435577.csv 528
user/hadoop/data_analysis/coupang_data/bedtray (category = 503219)/ output_review_5486594120.csv 529
user/hadoop/data_analysis/coupang_data/bedtray (category = 503219)/ output_review_5488917539.csv 530
user/hadoop/data_analysis/coupang_data/bedtray (category = 503219)/ output_review_5489065708.csv 531
user/hadoop/data_analysis/coupang_data/bedtray (category = 503219)/ output_review_5489281499.csv 532
user/hadoop/data_analysis/coupang_data/bedtray (category = 503219)/ output_review_548929748

user/hadoop/data_analysis/coupang_data/bedtray (category = 503219)/ output_review_5526706816.csv 606
user/hadoop/data_analysis/coupang_data/bedtray (category = 503219)/ output_review_5526706982.csv 607
user/hadoop/data_analysis/coupang_data/bedtray (category = 503219)/ output_review_5526707319.csv 608
user/hadoop/data_analysis/coupang_data/bedtray (category = 503219)/ output_review_5526708398.csv 609
user/hadoop/data_analysis/coupang_data/bedtray (category = 503219)/ output_review_5527728570.csv 610
user/hadoop/data_analysis/coupang_data/bedtray (category = 503219)/ output_review_5528392763.csv 611
user/hadoop/data_analysis/coupang_data/bedtray (category = 503219)/ output_review_5528459913.csv 612
user/hadoop/data_analysis/coupang_data/bedtray (category = 503219)/ output_review_5528459923.csv 613
user/hadoop/data_analysis/coupang_data/bedtray (category = 503219)/ output_review_5528460329.csv 614
user/hadoop/data_analysis/coupang_data/bedtray (category = 503219)/ output_review_552846042

user/hadoop/data_analysis/coupang_data/bedtray (category = 503219)/ output_review_5544972061.csv 689
user/hadoop/data_analysis/coupang_data/bedtray (category = 503219)/ output_review_5545245929.csv 690
user/hadoop/data_analysis/coupang_data/bedtray (category = 503219)/ output_review_5545273871.csv 691
user/hadoop/data_analysis/coupang_data/bedtray (category = 503219)/ output_review_5547404046.csv 692
user/hadoop/data_analysis/coupang_data/bedtray (category = 503219)/ output_review_5547807124.csv 693
user/hadoop/data_analysis/coupang_data/bedtray (category = 503219)/ output_review_5548333196.csv 694
user/hadoop/data_analysis/coupang_data/bedtray (category = 503219)/ output_review_5548336727.csv 695
user/hadoop/data_analysis/coupang_data/bedtray (category = 503219)/ output_review_5548340662.csv 696
user/hadoop/data_analysis/coupang_data/bedtray (category = 503219)/ output_review_5550000919.csv 697
user/hadoop/data_analysis/coupang_data/bedtray (category = 503219)/ output_review_555043096

user/hadoop/data_analysis/coupang_data/bedtray (category = 503219)/ output_review_5571132495.csv 771
user/hadoop/data_analysis/coupang_data/bedtray (category = 503219)/ output_review_5571313330.csv 772
user/hadoop/data_analysis/coupang_data/bedtray (category = 503219)/ output_review_5571412157.csv 773
user/hadoop/data_analysis/coupang_data/bedtray (category = 503219)/ output_review_5571566727.csv 774
user/hadoop/data_analysis/coupang_data/bedtray (category = 503219)/ output_review_5571650700.csv 775
user/hadoop/data_analysis/coupang_data/bedtray (category = 503219)/ output_review_5571690002.csv 776
user/hadoop/data_analysis/coupang_data/bedtray (category = 503219)/ output_review_5571765295.csv 777
user/hadoop/data_analysis/coupang_data/bedtray (category = 503219)/ output_review_5571838628.csv 778
user/hadoop/data_analysis/coupang_data/bedtray (category = 503219)/ output_review_5572014673.csv 779
user/hadoop/data_analysis/coupang_data/bedtray (category = 503219)/ output_review_557209715

user/hadoop/data_analysis/coupang_data/bedtray (category = 503219)/ output_review_5580686309.csv 853
user/hadoop/data_analysis/coupang_data/bedtray (category = 503219)/ output_review_5581206830.csv 854
user/hadoop/data_analysis/coupang_data/bedtray (category = 503219)/ output_review_5581234616.csv 855
user/hadoop/data_analysis/coupang_data/bedtray (category = 503219)/ output_review_5581544364.csv 856
user/hadoop/data_analysis/coupang_data/bedtray (category = 503219)/ output_review_5582063903.csv 857
user/hadoop/data_analysis/coupang_data/bedtray (category = 503219)/ output_review_5582449988.csv 858
user/hadoop/data_analysis/coupang_data/bedtray (category = 503219)/ output_review_5583963813.csv 859
user/hadoop/data_analysis/coupang_data/bedtray (category = 503219)/ output_review_5584878290.csv 860
user/hadoop/data_analysis/coupang_data/bedtray (category = 503219)/ output_review_5584918180.csv 861
user/hadoop/data_analysis/coupang_data/bedtray (category = 503219)/ output_review_558499923

user/hadoop/data_analysis/coupang_data/bedtray (category = 503219)/ output_review_5595762043.csv 936
user/hadoop/data_analysis/coupang_data/bedtray (category = 503219)/ output_review_56916369.csv 937
user/hadoop/data_analysis/coupang_data/bedtray (category = 503219)/ output_review_59417739.csv 938
user/hadoop/data_analysis/coupang_data/bedtray (category = 503219)/ output_review_62351060.csv 939
user/hadoop/data_analysis/coupang_data/bedtray (category = 503219)/ output_review_62730960.csv 940
user/hadoop/data_analysis/coupang_data/bedtray (category = 503219)/ output_review_7057753.csv 941
user/hadoop/data_analysis/coupang_data/bedtray (category = 503219)/ output_review_85886731.csv 942
user/hadoop/data_analysis/coupang_data/bedtray (category = 503219)/ output_review_86160914.csv 943
user/hadoop/data_analysis/coupang_data/bedtray (category = 503219)/ output_review_92173073.csv 944
user/hadoop/data_analysis/coupang_data/chair (category = 184634)/ output_review_101848533.csv 10
user/hadoop

user/hadoop/data_analysis/coupang_data/chair (category = 184634)/ output_review_1338687103.csv 86
user/hadoop/data_analysis/coupang_data/chair (category = 184634)/ output_review_1338931966.csv 87
user/hadoop/data_analysis/coupang_data/chair (category = 184634)/ output_review_1349109493.csv 88
user/hadoop/data_analysis/coupang_data/chair (category = 184634)/ output_review_135145900.csv 89
user/hadoop/data_analysis/coupang_data/chair (category = 184634)/ output_review_135145920.csv 90
user/hadoop/data_analysis/coupang_data/chair (category = 184634)/ output_review_135145945.csv 91
user/hadoop/data_analysis/coupang_data/chair (category = 184634)/ output_review_135145955.csv 92
user/hadoop/data_analysis/coupang_data/chair (category = 184634)/ output_review_1353237056.csv 93
user/hadoop/data_analysis/coupang_data/chair (category = 184634)/ output_review_1353237308.csv 94
user/hadoop/data_analysis/coupang_data/chair (category = 184634)/ output_review_1353238090.csv 95
user/hadoop/data_analysi

user/hadoop/data_analysis/coupang_data/chair (category = 184634)/ output_review_157781447.csv 171
user/hadoop/data_analysis/coupang_data/chair (category = 184634)/ output_review_1583888110.csv 172
user/hadoop/data_analysis/coupang_data/chair (category = 184634)/ output_review_159014920.csv 173
user/hadoop/data_analysis/coupang_data/chair (category = 184634)/ output_review_159015020.csv 174
user/hadoop/data_analysis/coupang_data/chair (category = 184634)/ output_review_159285263.csv 175
user/hadoop/data_analysis/coupang_data/chair (category = 184634)/ output_review_1597558688.csv 176
user/hadoop/data_analysis/coupang_data/chair (category = 184634)/ output_review_160049060.csv 177
user/hadoop/data_analysis/coupang_data/chair (category = 184634)/ output_review_1601932551.csv 178
user/hadoop/data_analysis/coupang_data/chair (category = 184634)/ output_review_1606702.csv 179
user/hadoop/data_analysis/coupang_data/chair (category = 184634)/ output_review_1611554.csv 180
user/hadoop/data_anal

user/hadoop/data_analysis/coupang_data/chair (category = 184634)/ output_review_179229615.csv 256
user/hadoop/data_analysis/coupang_data/chair (category = 184634)/ output_review_179229620.csv 257
user/hadoop/data_analysis/coupang_data/chair (category = 184634)/ output_review_179229628.csv 258
user/hadoop/data_analysis/coupang_data/chair (category = 184634)/ output_review_179229645.csv 259
user/hadoop/data_analysis/coupang_data/chair (category = 184634)/ output_review_179229655.csv 260
user/hadoop/data_analysis/coupang_data/chair (category = 184634)/ output_review_1794715.csv 261
user/hadoop/data_analysis/coupang_data/chair (category = 184634)/ output_review_179506063.csv 262
user/hadoop/data_analysis/coupang_data/chair (category = 184634)/ output_review_1800635228.csv 263
user/hadoop/data_analysis/coupang_data/chair (category = 184634)/ output_review_1800690628.csv 264
user/hadoop/data_analysis/coupang_data/chair (category = 184634)/ output_review_1805169593.csv 265
user/hadoop/data_an

user/hadoop/data_analysis/coupang_data/chair (category = 184634)/ output_review_2022424919.csv 340
user/hadoop/data_analysis/coupang_data/chair (category = 184634)/ output_review_2022424921.csv 341
user/hadoop/data_analysis/coupang_data/chair (category = 184634)/ output_review_2022424989.csv 342
user/hadoop/data_analysis/coupang_data/chair (category = 184634)/ output_review_202312696.csv 343
user/hadoop/data_analysis/coupang_data/chair (category = 184634)/ output_review_202312712.csv 344
user/hadoop/data_analysis/coupang_data/chair (category = 184634)/ output_review_2029752273.csv 345
user/hadoop/data_analysis/coupang_data/chair (category = 184634)/ output_review_203130575.csv 346
user/hadoop/data_analysis/coupang_data/chair (category = 184634)/ output_review_2034128053.csv 347
user/hadoop/data_analysis/coupang_data/chair (category = 184634)/ output_review_2034836684.csv 348
user/hadoop/data_analysis/coupang_data/chair (category = 184634)/ output_review_2044227453.csv 349
user/hadoop/d

user/hadoop/data_analysis/coupang_data/chair (category = 184634)/ output_review_2341046751.csv 424
user/hadoop/data_analysis/coupang_data/chair (category = 184634)/ output_review_234170079.csv 425
user/hadoop/data_analysis/coupang_data/chair (category = 184634)/ output_review_2353946276.csv 426
user/hadoop/data_analysis/coupang_data/chair (category = 184634)/ output_review_235559353.csv 427
user/hadoop/data_analysis/coupang_data/chair (category = 184634)/ output_review_236997139.csv 428
user/hadoop/data_analysis/coupang_data/chair (category = 184634)/ output_review_237076176.csv 429
user/hadoop/data_analysis/coupang_data/chair (category = 184634)/ output_review_237088176.csv 430
user/hadoop/data_analysis/coupang_data/chair (category = 184634)/ output_review_2371467905.csv 431
user/hadoop/data_analysis/coupang_data/chair (category = 184634)/ output_review_2371470154.csv 432
user/hadoop/data_analysis/coupang_data/chair (category = 184634)/ output_review_23755350.csv 433
user/hadoop/data_

user/hadoop/data_analysis/coupang_data/chair (category = 184634)/ output_review_311700143.csv 510
user/hadoop/data_analysis/coupang_data/chair (category = 184634)/ output_review_316002787.csv 511
user/hadoop/data_analysis/coupang_data/chair (category = 184634)/ output_review_319683974.csv 512
user/hadoop/data_analysis/coupang_data/chair (category = 184634)/ output_review_319808928.csv 513
user/hadoop/data_analysis/coupang_data/chair (category = 184634)/ output_review_324650917.csv 514
user/hadoop/data_analysis/coupang_data/chair (category = 184634)/ output_review_327265591.csv 515
user/hadoop/data_analysis/coupang_data/chair (category = 184634)/ output_review_33340928.csv 516
user/hadoop/data_analysis/coupang_data/chair (category = 184634)/ output_review_334937108.csv 517
user/hadoop/data_analysis/coupang_data/chair (category = 184634)/ output_review_335065423.csv 518
user/hadoop/data_analysis/coupang_data/chair (category = 184634)/ output_review_335659855.csv 519
user/hadoop/data_anal

user/hadoop/data_analysis/coupang_data/chair (category = 184634)/ output_review_4656244394.csv 594
user/hadoop/data_analysis/coupang_data/chair (category = 184634)/ output_review_4660692110.csv 595
user/hadoop/data_analysis/coupang_data/chair (category = 184634)/ output_review_4703447.csv 596
user/hadoop/data_analysis/coupang_data/chair (category = 184634)/ output_review_47059911.csv 597
user/hadoop/data_analysis/coupang_data/chair (category = 184634)/ output_review_4711881539.csv 598
user/hadoop/data_analysis/coupang_data/chair (category = 184634)/ output_review_4711881578.csv 599
user/hadoop/data_analysis/coupang_data/chair (category = 184634)/ output_review_4729832494.csv 600
user/hadoop/data_analysis/coupang_data/chair (category = 184634)/ output_review_4735553384.csv 601
user/hadoop/data_analysis/coupang_data/chair (category = 184634)/ output_review_4735553433.csv 602
user/hadoop/data_analysis/coupang_data/chair (category = 184634)/ output_review_4735554097.csv 603
user/hadoop/dat

user/hadoop/data_analysis/coupang_data/chair (category = 184634)/ output_review_5233682564.csv 679
user/hadoop/data_analysis/coupang_data/chair (category = 184634)/ output_review_5237338348.csv 680
user/hadoop/data_analysis/coupang_data/chair (category = 184634)/ output_review_5237731531.csv 681
user/hadoop/data_analysis/coupang_data/chair (category = 184634)/ output_review_5239656959.csv 682
user/hadoop/data_analysis/coupang_data/chair (category = 184634)/ output_review_5253495070.csv 683
user/hadoop/data_analysis/coupang_data/chair (category = 184634)/ output_review_5253495253.csv 684
user/hadoop/data_analysis/coupang_data/chair (category = 184634)/ output_review_5256842286.csv 685
user/hadoop/data_analysis/coupang_data/chair (category = 184634)/ output_review_5263662343.csv 686
user/hadoop/data_analysis/coupang_data/chair (category = 184634)/ output_review_52695115.csv 687
user/hadoop/data_analysis/coupang_data/chair (category = 184634)/ output_review_5278809072.csv 688
user/hadoop/

user/hadoop/data_analysis/coupang_data/chair (category = 184634)/ output_review_88514434.csv 764
user/hadoop/data_analysis/coupang_data/chair (category = 184634)/ output_review_88514461.csv 765
user/hadoop/data_analysis/coupang_data/chair (category = 184634)/ output_review_88514467.csv 766
user/hadoop/data_analysis/coupang_data/chair (category = 184634)/ output_review_8929398.csv 767
user/hadoop/data_analysis/coupang_data/chair (category = 184634)/ output_review_895935.csv 768
user/hadoop/data_analysis/coupang_data/chair (category = 184634)/ output_review_89672089.csv 769
user/hadoop/data_analysis/coupang_data/chair (category = 184634)/ output_review_89695338.csv 770
user/hadoop/data_analysis/coupang_data/chair (category = 184634)/ output_review_90006875.csv 771
user/hadoop/data_analysis/coupang_data/chair (category = 184634)/ output_review_9003684.csv 772
user/hadoop/data_analysis/coupang_data/chair (category = 184634)/ output_review_9049682.csv 773
user/hadoop/data_analysis/coupang_d

In [3]:
# split_inner_attribute
def innerTocol(df, attr):
    """Extract one intrinsic attribute from ``attribute_list`` into a new column.

    The parser looks for ``attr`` inside each ``attribute_list`` cell and takes
    the text between the following ``": "`` and the next single quote, i.e. it
    assumes entries shaped like ``'<name>: <value>'``.

    Args:
        df: pandas DataFrame that has an ``attribute_list`` column. It is
            modified in place.
        attr: Attribute name to search for; also used as the new column name.

    Returns:
        None. A column named ``attr`` is added to ``df``. Cells where the
        attribute (or its value) cannot be found hold the string ``'NULL'``.
    """

    def _extract(cell):
        # Missing values (None / NaN) are not strings and carry no attribute.
        if not isinstance(cell, str):
            return 'NULL'

        name_at = cell.find(attr)
        if name_at == -1:
            return 'NULL'

        colon_at = cell.find(":", name_at)
        quote_at = cell.find("'", name_at)

        # No closing quote: the value runs to the end of the string instead
        # of silently losing its last character to a -1 slice bound.
        if quote_at == -1:
            quote_at = len(cell)

        # No colon at all, or the first colon belongs to a later entry:
        # the attribute is present without a value.
        if colon_at == -1 or colon_at > quote_at:
            return 'NULL'

        # +2 skips the colon and the single space that follows it.
        return cell[colon_at + 2:quote_at]

    # Add the extracted values as a new column.
    df[attr] = [_extract(cell) for cell in df['attribute_list']]

# Intrinsic attributes to extract for each product category.
# One mapping keeps every category next to its attribute names, so the pairing
# no longer depends on two lists staying index-aligned or on a hard-coded length.
inner_attr = {
    'bed': ['인테리어', '재질', '사이즈', '설치', '색상', '매트리스포함', '프레임'],
    'bedtray': ['사이즈', '색상', '형태', '인테리어', '재질', '접이'],
    'chair': ['사이즈', '색상', '소재', '재질', '팔걸이', '등받이', '종류', '인테리어'],
    'closet': ['사이즈', '색상', '재질', '인테리어'],
    'drawers': ['사이즈', '색상', '재질', '인테리어'],
    'dressingtable,console': ['사이즈', '색상', '재질', '인테리어', '스툴', '거울'],
    'hanger': ['사이즈', '색상', '재질', '인테리어', '종류', '타입', '거울'],
    'hanger,doorhook': ['색상', '재질', '인테리어', '논슬립', '묶음', '종류', '회전'],
    'livingbox': ['용량', '개폐방식', '재질', '투명', '접이', '바퀴', '손잡이'],
    'livingroom table': ['사이즈', '색상', '재질', '형태', '바퀴'],
    'mattress': ['사이즈', '색상', '발통', '매트/토퍼', '두께', '겉감재질', '충전재'],
    'outdoor furniture': ['색상', '재질', '접이', '바퀴', '사용인원'],
    'partition': ['색상', '재질', '인테리어'],
    'recliner': ['사이즈', '색상', '재질', '소재', '인테리어', '방식', '각도조절'],
    'safe': ['사이즈', '잠금방식', '중량', '경보기', '지문인식'],
    'shoes shelf': ['단', '재질', '형태'],
    'smalltable': ['사이즈', '색상', '재질', '인테리어'],
    'sofa': ['사이즈', '색상', '재질', '소재', '스툴', '사용인원', '설치', '인테리어'],
    'storage': ['사이즈', '색상', '재질', '인테리어', '바퀴', '단'],
    'table': ['사이즈', '색상', '재질', '인테리어', '가로길이', '형태'],
}

# Parallel views of the mapping, kept so existing references to these names
# keep working. Order follows dict insertion order (Python 3.7+).
item_list = list(inner_attr)
attr_list = list(inner_attr.values())
    
# Add each category's intrinsic attributes as columns and save the merged result.
path_dir = 'user/hadoop/data_analysis/coupang_data/'
dir_list = get_dir(path_dir)

for d in dir_list:
    base = path_dir + d + '/'
    file_name = 'result.csv'

    # Directory names look like 'chair (category = 184634)'; the item name is
    # the text before '('. split/strip also copes with a missing '(' or a
    # missing space before it, where index arithmetic would cut the name short.
    item = d.split('(')[0].strip()

    data = open_pd_csv(base + file_name)

    # Raises KeyError for a directory whose item has no configured attributes.
    attrs = inner_attr[item]
    for a in attrs:
        innerTocol(data, a)

    # Drop the columns that are no longer needed.
    data = data.drop(['attribute_list', 'baby_product_link', 'recommends_list'], axis=1)

    # Strip the literal suffix "계열" from colour values. regex=False makes the
    # literal match explicit instead of relying on the pandas-version default.
    if '색상' in data.columns:
        data["색상"] = data["색상"].str.replace("계열", "", regex=False)
    data = data.rename(columns={"count": "reviews_for_last1year"})

    save_pd_csv(data, 'user/hadoop/data_analysis/merged_data/' + item + '_result.csv')

In [4]:
# Estimate sales for every merged category file and store the result as parquet.
basic_folder = 'user/hadoop/data_analysis/merged_data/'
file_list = get_dir(basic_folder)

# Schema definition: columns not listed here stay strings.
# Loop-invariant, so defined once rather than per file.
intCols = ['rank','category','price', 'discount_percentage', 'rating_total_count',
              'reviews_for_last1year']
doubleCols = ['rating']
# NOTE(review): the table printed by this cell shows a column named 'isRocket',
# not 'rocket_delivery', so that column appears to stay a string - confirm.
boolCols = ['rocket_delivery', 'is_out_of_stock']

# Parameters of the sales estimate:
#   sales = (RANK_OFFSET - rank) * std + reviews_for_last1year * REVIEW_WEIGHT
# where std is the mean of the BASELINE_SAMPLE_SIZE smallest positive review counts.
RANK_OFFSET = 1080
REVIEW_WEIGHT = 3
BASELINE_SAMPLE_SIZE = 10

for file in file_list:
    file_name = basic_folder + file

    # Create the Spark DataFrame by reading the CSV.
    df = spark.read.csv('hdfs://192.168.0.9:9000/' + file_name, header=True)
    df.show()

    # Cast every column to its configured type.
    for c in df.columns:
        type_col = StringType()
        if c in intCols:
            type_col = IntegerType()
        elif c in doubleCols:
            type_col = DoubleType()
        elif c in boolCols:
            type_col = BooleanType()

        df = df.withColumn(c, df[c].cast(type_col))
    # A numeric fill value only applies to numeric columns.
    df = df.fillna(0)

    # Print the category; file names look like '<category>_result.csv'.
    category = file.split('_')[0]
    print('카테고리 : ',category)

    # Baseline: the smallest positive review counts. take() fetches only the
    # rows needed instead of collecting the whole column to the driver.
    lowest = (df.select('reviews_for_last1year')
                .filter(df['reviews_for_last1year'] > 0)
                .sort(asc('reviews_for_last1year'))
                .take(BASELINE_SAMPLE_SIZE))

    # Average over the rows actually found, so a category with fewer than
    # BASELINE_SAMPLE_SIZE reviewed products is not under-weighted.
    if lowest:
        std = sum(r['reviews_for_last1year'] for r in lowest) / len(lowest)
    else:
        std = 0.0

    df = df.withColumn(
        'sales',
        (RANK_OFFSET - df['rank']) * std + df['reviews_for_last1year'] * REVIEW_WEIGHT)

    # Save the result as parquet. The CSV-only "header" option is dropped
    # because it has no effect on a parquet write.
    file_path = 'hdfs://192.168.0.9:9000/' + 'user/hadoop/data_analysis/result2/' + category + '.parquet'
    df.coalesce(1).write.mode("overwrite").format("parquet").save(file_path)

+----+--------+--------------------+------+----------+--------+-------------------+------+------------------+---------------+----------+------------+---------------------+------+----+--------------------+----+----+------+----+
|rank|category|                name| price|product_id|isRocket|discount_percentage|rating|rating_total_count|is_out_of_stock|brand_name|shopping_fee|reviews_for_last1year|  인테리어|  재질|                 사이즈|  설치|  색상|매트리스포함| 프레임|
+----+--------+--------------------+------+----------+--------+-------------------+------+------------------+---------------+----------+------------+---------------------+------+----+--------------------+----+----+------+----+
|   1|  184562|라꾸라꾸 수납침대 4탄 접이식침...|149800|  10755024|    true|                  0|   4.5|              1436|          False|      라꾸라꾸|           0|                402.0|  NULL|NULL|                  싱글|NULL|NULL|  NULL|NULL|
|   2|  184562|북유럽 소나무 침대 받침대 프레...| 39800|4899755280|   false|                 33|   4.5|  

In [5]:
# Extract the best-selling value of every attribute, per category.
basic_folder = 'user/hadoop/data_analysis/result2/'
file_list = get_dir(basic_folder)

# The first 13 columns are the base product fields; attribute columns follow.
FIRST_ATTR_COLUMN = 13
# A value must appear on more than this many products to be considered.
MIN_PRODUCT_COUNT = 9

for file in file_list:

    if '.csv' in file:
        continue

    # Load the DataFrame of this category.
    file_name = basic_folder + file
    df = spark.read.parquet('hdfs://192.168.0.9:9000/' + file_name)
    df = ad_schema(df)

    # Print the category; entries look like '<category>.parquet'.
    item = file.split('.')[0]
    print('카테고리 : ',item)

    # Popular values found for this category.
    keywords = []

    # 'sales' is appended after the attribute columns; it is the metric being
    # summed, not an attribute, so it is skipped.
    attr_cols = [c for c in df.columns[FIRST_ATTR_COLUMN:] if c != 'sales']

    # The loop variable is deliberately not called `col`: that name would
    # shadow pyspark.sql.functions.col imported at the top of the notebook.
    for attr_name in attr_cols:
        result = (df.groupby(attr_name)
                    .agg(f.count(attr_name).alias("개수"),
                         f.sum('sales').alias('판매량'))
                    .filter((f.col('개수') > MIN_PRODUCT_COUNT) & (f.col(attr_name) != 'NULL'))
                    .withColumn('상품당 판매량', f.col('판매량') / f.col('개수'))
                    .sort(desc('상품당 판매량')))

        # first() returns None for an empty result, so one Spark job replaces
        # the count() + take(1) pair. The value is read from the Row directly
        # rather than sliced out of its repr, which broke on ')' or '='.
        top = result.first()
        if top is not None:
            keywords.append(attr_name + ' : ' + str(top[attr_name]))

    print("인기 요소 :", keywords)
    print('-------------------------------------------------------------------------------------------------------------------')

카테고리 :  bed
인기 요소 : ['인테리어 : 모던/심플', '재질 : 합성목재', '사이즈 : 싱글', '설치 : 방문설치', '색상 : 그레이', '매트리스포함 : 매트리스포함', '프레임 : 밀폐형']
-------------------------------------------------------------------------------------------------------------------
카테고리 :  bedtray
인기 요소 : ['색상 : 브라운', '형태 : 직사각형', '인테리어 : 모던/심플', '재질 : 원목', '접이 : 접이식가능']
-------------------------------------------------------------------------------------------------------------------
카테고리 :  chair
인기 요소 : ['색상 : 베이지', '소재 : 플라스틱', '재질 : 철제', '팔걸이 : 팔걸이없음', '등받이 : 등받이없음', '종류 : 접이식의자', '인테리어 : 모던/심플']
-------------------------------------------------------------------------------------------------------------------
카테고리 :  closet
인기 요소 : ['색상 : 그레이', '재질 : PB + MDF', '인테리어 : 내추럴']
-------------------------------------------------------------------------------------------------------------------
카테고리 :  drawers
인기 요소 : ['색상 : 화이트', '재질 : 철제', '인테리어 : 내추럴']
------------------------------------------------------------------------------