In [2]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql import SQLContext
from pyspark.conf import SparkConf
from pyspark.sql.types import StructType, StringType, ArrayType, IntegerType, StructField, LongType
from pyspark.sql.functions import concat, split, col, when, lit, count,collect_list, filter
from pyspark.sql.functions import regexp_replace,regexp_extract, udf, explode, struct, slice
import datetime

sc = SparkSession.builder.appName("oliiveSpark").getOrCreate()

In [3]:
schema = StructType().add("id", StringType())\
.add("comment", StringType())

In [4]:
comment = sc.read.schema(schema).csv("review.csv")

In [5]:
comment = comment.withColumn("revised_comment", regexp_replace("comment", "[^A-Za-z0-9가-힣]", ' ')).drop('comment')

In [6]:
sq = SQLContext(sc)

In [7]:
comment.createOrReplaceTempView("comment")

merged = sq.sql("SELECT id, concat_ws('', collect_list(revised_comment)) as comment FROM comment group by id")

In [8]:
tokenized = merged.withColumn("comment", split("comment", " "))

In [9]:
tokenized.printSchema()

root
 |-- id: string (nullable = true)
 |-- comment: array (nullable = false)
 |    |-- element: string (containsNull = true)



In [10]:
token_count = tokenized.select("id", explode("comment").alias("token"))\
.groupBy("id", "token")\
.count()\
.groupBy("id")\
.agg(collect_list(struct(col("token"), col("count"))).alias("text"))

In [11]:
sorted_token = token_count.sort(token_count.text.count.desc())

In [10]:
sorted_token.show()

                                                                                

+-------------+-------------------------+
|           id|                     text|
+-------------+-------------------------+
|A000000166486|   [{, 15903}, {독도, ...|
|A000000163730|   [{, 9765}, {핵심, 3...|
|A000000149135| [{, 9698}, {라운드랩,...|
|A000000166105|   [{, 9395}, {한달, 4...|
|A000000149838|  [{, 9289}, {피부가, ...|
|A000000145861|   [{, 8999}, {타입, 1...|
|A000000163729|[{, 8642}, {디렉터파이...|
|A000000166617|   [{, 8457}, {N통째, ...|
|A000000156871|  [{, 8345}, {없으면, ...|
|A000000167188|    [{, 8345}, {AHC의,...|
|A000000156842| [{, 8161}, {속건조를,...|
|A000000146635|   [{, 7045}, {본인, 8...|
|A000000165945| [{, 6921}, {수분크림,...|
|A000000167164| [{, 6875}, {피부타입,...|
|A000000167141| [{, 6783}, {솔직후기,...|
|A000000138762|  [{, 6229}, {코시국, ...|
|A000000164212| [{, 6211}, {피지오겔,...|
|A000000166586|[{, 5983}, {수분감진짜...|
|A000000149268|   [{, 5951}, {구매, 1...|
|A000000158696|   [{, 5947}, {본인, 6...|
+-------------+-------------------------+
only showing top 20 rows



In [11]:
token_count.printSchema()

root
 |-- id: string (nullable = true)
 |-- text: array (nullable = false)
 |    |-- element: struct (containsNull = false)
 |    |    |-- token: string (nullable = true)
 |    |    |-- count: long (nullable = false)



In [16]:
olive = sc.read.json("olive_20220612.json")

                                                                                

In [31]:
def def_sort(x) : 
    return sorted(x, key=lambda x :x[1], reverse = True)

udf_sort = udf(def_sort, ArrayType(StructType([StructField("token", StringType()),StructField("count",LongType())])))

sorted_t = token_count.select("id", udf_sort(col("text")).alias("text"))

In [30]:
sorted_t.show()

[Stage 14:>                                                         (0 + 1) / 1]

+----------------------------+-------------+
|                        text|           id|
+----------------------------+-------------+
|[{남자친구에게, 1}, {선물...|A000000163048|
|       [{, 1697}, {잘, 14...|A000000150323|
|       [{, 2966}, {잘, 26...|A000000151914|
|       [{, 2832}, {잘, 24...|A000000148170|
|       [{, 3144}, {잘, 29...|A000000166499|
|       [{, 1655}, {잘, 15...|A000000010476|
|       [{, 3237}, {잘, 26...|A000000148023|
|      [{, 245}, {너무, 13...|A000000163762|
|       [{, 9063}, {잘, 60...|A000000166296|
|     [{, 5778}, {선크림, ...|A000000165474|
|       [{, 4084}, {잘, 27...|A000000162571|
|       [{, 12321}, {잘, 7...|A000000164916|
|       [{, 14}, {것, 4}, ...|A000000165476|
|       [{, 6783}, {잘, 39...|A000000167141|
|     [{, 2246}, {아벤느, ...|A000000104007|
|     [{, 2364}, {아벤느, ...|A000000160792|
|      [{, 4771}, {너무, 2...|A000000129008|
|       [{, 1158}, {잘, 92...|A000000156683|
|       [{, 3378}, {이, 19...|A000000002839|
|      [{, 2952}, {너무, 2...|A000000112805|
+

                                                                                

In [43]:
option = ["저는","제품", "선물을", "이건", "저의", "저희","요즘", "본인", "원래", "했습니다", "이다", "사용", "후기", "이거", "좀","잘", "너무", "것", "같아요", "이", "가", "은", "는", "더", "많이", "쓰고"]
stop_words = olive.rdd.flatMap(lambda x : [x.brand, x.big_category, x.small_category]).collect()
stop_words.extend(option)

is_none = lambda x : x != ''
is_stop = lambda x : ~(x.isin(stop_words))


res = sorted_t\
.withColumn("removed", filter(col("text.token"), is_none))\
.withColumn("token", filter(col("removed"), is_stop))\
.withColumn("token", slice(col("token"), 1, 10))\
.drop("text")\
.drop("removed")

In [44]:
joined = res.join(olive, res.id == olive.id)
joined = joined.dropDuplicates(['id'])
final = joined.select(['name','comment.id', 'token'])

In [45]:
final.show()



+------------------------------------+-------------+------------------------------+
|                                name|           id|                         token|
+------------------------------------+-------------+------------------------------+
| 에비앙 브뤼미자뙤르 페이셜미스트...|A000000001588|[미스트, 좋아요, 작아서, 좋...|
|      바이오더마 센시비오 토너 250ml|A000000002839|[피부가, 토너, 좋아요, 좋은...|
|   바이오더마 하이드라비오 토너 2...|A000000004434|[좋아요, 토너, 정말, 피부가...|
|    유세린 더모 퓨리파이어 토너 2...|A000000005766|[피부에, 좋아요, 사용하고, ...|
| 유세린 더모퓨리파이어 오일컨트롤...|A000000005767|[좋아요, 피부에, 피부, 피부...|
|  라로슈포제 유비데아 XL 멜트인 크림|A000000010462|[선크림, 좋아요, 다른, 정말...|
|   라로슈포제 에빠끌라 H 리밸런싱...|A000000010474|[좋아요, 피부가, 토너, 피부...|
|라로슈포제 에빠끌라 퓨리파잉 포밍 젤|A000000010476|[좋아요, 피부가, 거품이, 다...|
|  라로슈포제 똘러리앙 퓨리파잉 포...|A000000010482|[좀, 좋아요, 피부가, 거품이...|
|     아벤느 오 떼르말 더블 기획 (...|A000000010507|[미스트, 좋아요, 피부가, 때...|
|   아벤느 클리낭스 매트 매티파잉 ...|A000000011268|[피부에, 좋아요, 토너, 피부...|
|     피지오겔 DMT 데일리보습 로션...|A000000012809|[로션, 피부가, 좋아요, 정말...|
|    유리아쥬 오 떼르말 더블



In [46]:
final.write.json("analysls")

                                                                                