# 과제 - 2주차

## 1. Dataframe, RDD API사용해보기  (Week2 Jupyter Notebook)

### (1) RDD

#### SparkSession

In [1]:
from pyspark.sql import SparkSession

# SparkSession
spark = (
    SparkSession.builder
        .appName("rdd-dataframe")
        .master("local")
        .getOrCreate()
)

# SparkContext를 SparkSession에서 빼두기
sc = spark.sparkContext

# 하단 메시지는 Jupyter 공식이미지에서 나오는 메시지로, 무시하기

24/08/23 02:45:16 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [None]:
# SparkContext 멈추기
sc.stop()

#### Range(0,10) & Collect()

In [10]:
# collect()를 통해 파이썬 데이터셋으로 변경해야 사용할 수 있음
rdd2 = sc.range(0, 10)

print(rdd2)

PythonRDD[15] at RDD at PythonRDD.scala:53


In [11]:
print(rdd2.collect())

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]


#### parallelize로 변환[Python list → RDD] & take() [head()와 동일]

In [21]:
# parallelize로 RDD로 만든 후 take로 위에서부터 3개 가져옴 (head/top/first와 같음)
data = ["one", "two", "three"]
rdd3 = sc.parallelize(data)

rdd3.take(3)

['one', 'two', 'three']

#### textFile()
* sc.textFile("../data/movie.csv")만 단독 실행시 완료로 떠서 작업이 된 줄 알았는데, 다른 블럭에서 .count()를 실행하니 그제서야 그런 파일은 없다고 뜸. `action이 있어야 데이터 처리가 이루어지는 Spark의 lazy한 특성을 확인함`

In [23]:
rdd1 = sc.textFile("../data/movies.csv")

cnt = rdd1.count()
print(cnt)

9743


In [24]:
# 첫번째 데이터는 컬럼명
arr2 = rdd1.take(10)
print(arr2)

['movieId,title,genres', '1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy', '2,Jumanji (1995),Adventure|Children|Fantasy', '3,Grumpier Old Men (1995),Comedy|Romance', '4,Waiting to Exhale (1995),Comedy|Drama|Romance', '5,Father of the Bride Part II (1995),Comedy', '6,Heat (1995),Action|Crime|Thriller', '7,Sabrina (1995),Comedy|Romance', '8,Tom and Huck (1995),Adventure|Children', '9,Sudden Death (1995),Action']


#### takeOrdered(5)
* Sorting을 해서 가져오므로, 데이터가 큰 경우에는 사용X
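* RDD는 컬럼이 없으므로 컬럼명 기준으로 Sorting은 불가함 (대신 `takeOrdered(n, key=...)`처럼 key 함수로 정렬 기준을 지정할 수 있음. 아래 결과는 한 줄 전체를 문자열로 정렬한 것이라 '1,...' 다음에 '10,...', '100,...'이 옴)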

In [25]:
arr3 = rdd1.takeOrdered(5)
print(arr3)

['1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy', '10,GoldenEye (1995),Action|Adventure|Thriller', '100,City Hall (1996),Drama|Thriller', '100044,Human Planet (2011),Documentary', '100068,Comme un chef (2012),Comedy']


#### aggregate
* 공식문서 코드로 이해하기
  * 문서링크 : https://spark.apache.org/docs/latest/api/python/reference/api/pyspark.RDD.aggregate.html
* 샘플코드
```python
seqOp = (lambda x, y: (x[0] + y, x[1] + 1))
combOp = (lambda x, y: (x[0] + y[0], x[1] + y[1]))
sc.parallelize([1, 2, 3, 4]).aggregate((0, 0), seqOp, combOp)
>>> (10, 4)
```
  * seqOp로 각 파티션 계산 [주어진 항등원 (0,0)부터 시작]
    * (0, 0) → (1, 1) → (3, 2)
    * (0, 0) → (3, 1) → (7, 2)
  * combOp로 각 파티션 결과 취합
    * (3, 2) + (7, 2) = (10, 4)

In [32]:
rdd2 = sc.range(0, 1000, 1, 10)

seqOp = lambda v1, v2: v1 + v2
combOp = lambda v1, v2: v1 + v2

rdd2.aggregate(0, seqOp, combOp)

499500

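* 항등원(zeroValue)을 바꿔보니 결과에 zeroValue×11이 더해져 보여서(예: 9 → 499500 + 99 = 499599) 파티션의 수를 확인해보니 10개인 것을 확인함. zeroValue가 각 파티션의 seqOp 시작값으로 10번, 파티션 결과를 합치는 combOp의 시작값으로 1번 더 적용되기 때문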
* 파티션의 갯수는 공식문서를 확인해보니 range에서 지정한 부분이었음
  * 문서링크 : https://spark.apache.org/docs/latest/api/python/reference/api/pyspark.SparkContext.range.html
  * `sc.range(start, end, step, number of partition)`

In [34]:
rdd2.aggregate(9, seqOp, combOp)

499599

In [35]:
rdd2.aggregate(50, seqOp, combOp)

500050

In [36]:
print(rdd2.getNumPartitions())

10


* reduce, fold

In [38]:
rdd2.reduce(lambda v1, v2: v1 + v2)

499500

In [39]:
rdd2.fold(0, lambda v1, v2: v1 + v2)

499500

#### foreach

In [40]:
data = ["one", "two", "three"]
rdd3 = sc.parallelize(data)

rdd3.foreach(lambda v: print(v))

one
two
three


#### repartition
* 실습환경은 로컬이므로 파티션 1개인 상태여서 중요하지 않았음
* 업무환경에서는 많이 쓰이는 함수임

In [2]:
# 파티션 2개로 나눔
rdd1 = sc.textFile("../data/movies.csv")

rdd4 = rdd1.repartition(2)

#### foreachPartition
* 전달한 함수의 인자로 파티션별 이터레이터가 들어옴 (foreachPartition 자체의 return value는 없음)
* 파티션 별로 다른 함수를 호출해야하는 경우나, progress체크 등에 사용

In [3]:
rdd4.foreachPartition(lambda it: print(it))

<itertools.chain object at 0x7f789b7bb100>                          (0 + 1) / 1]
<itertools.chain object at 0x7f789b7bb9a0>
                                                                                

In [4]:
# 파티션 항목 확인

# 전체 확인 코드
#rdd4.foreachPartition(lambda it: [print(x) for x in it])

# 상위 5개만 확인
rdd4.foreachPartition(lambda it: [print(x) for i, x in enumerate(it) if i < 5])

movieId,title,genres
1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
2,Jumanji (1995),Adventure|Children|Fantasy
3,Grumpier Old Men (1995),Comedy|Romance
4,Waiting to Exhale (1995),Comedy|Drama|Romance
10,GoldenEye (1995),Action|Adventure|Thriller
11,"American President, The (1995)",Comedy|Drama|Romance
12,Dracula: Dead and Loving It (1995),Comedy|Horror
13,Balto (1995),Adventure|Animation|Children
14,Nixon (1995),Drama


In [5]:
# 어떤 파티션의 데이터인지까지 같이 출력 by ChatGPT
rdd4.mapPartitionsWithIndex(
    lambda partition_index, it: [(partition_index, x) for i, x in enumerate(it) if i < 5]
).foreach(lambda x: print(f"Partition: {x[0]}, Value: {x[1]}"))

Partition: 0, Value: movieId,title,genres
Partition: 0, Value: 1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
Partition: 0, Value: 2,Jumanji (1995),Adventure|Children|Fantasy
Partition: 0, Value: 3,Grumpier Old Men (1995),Comedy|Romance
Partition: 0, Value: 4,Waiting to Exhale (1995),Comedy|Drama|Romance
Partition: 1, Value: 10,GoldenEye (1995),Action|Adventure|Thriller
Partition: 1, Value: 11,"American President, The (1995)",Comedy|Drama|Romance
Partition: 1, Value: 12,Dracula: Dead and Loving It (1995),Comedy|Horror
Partition: 1, Value: 13,Balto (1995),Adventure|Animation|Children
Partition: 1, Value: 14,Nixon (1995),Drama


#### countByValue()
* 값의 갯수를 계산

In [8]:
rdd1.countByValue()

defaultdict(int,
            {'movieId,title,genres': 1,
             '1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy': 1,
             '2,Jumanji (1995),Adventure|Children|Fantasy': 1,
             '3,Grumpier Old Men (1995),Comedy|Romance': 1,
             '4,Waiting to Exhale (1995),Comedy|Drama|Romance': 1,
             '5,Father of the Bride Part II (1995),Comedy': 1,
             '6,Heat (1995),Action|Crime|Thriller': 1,
             '7,Sabrina (1995),Comedy|Romance': 1,
             '8,Tom and Huck (1995),Adventure|Children': 1,
             '9,Sudden Death (1995),Action': 1,
             '10,GoldenEye (1995),Action|Adventure|Thriller': 1,
             '11,"American President, The (1995)",Comedy|Drama|Romance': 1,
             '12,Dracula: Dead and Loving It (1995),Comedy|Horror': 1,
             '13,Balto (1995),Adventure|Animation|Children': 1,
             '14,Nixon (1995),Drama': 1,
             '15,Cutthroat Island (1995),Action|Adventure|Romance': 1,
      

#### countApproxDistinct()
* Spark내부의 Approx알고리즘이 Count해줌
  * 데이터가 작을수록 알고리즘 효율이 떨어짐
* 데이터가 큰 경우 Count만으로도 오래걸릴 수 있음
  * Accumulator를 활용하는게 더 빠를 수 있음 (아래는 이해만을 위한 틀린 코드)
    ```python
    rdd4.foreachPartition(
        accumulator ac
        ac.add(1)
        )
    ```

In [12]:
# Approx
rdd1.countApproxDistinct()

10275

In [13]:
# Actual
len(rdd1.distinct().collect())

9743

#### Cache

* 중간에 캐싱을 해 둠
* 캐싱을 해두었다면 메모리가 터졌을 때, 좀 더 빠르게 계산될 수 있음 (Optimization도움)
* 캐싱할 때마다, Spark내부적으로 메타데이터 연산(Statistics)을 조금씩 해줌 (Optimization도움)
* 데이터가 크다면 한번의 작업이 끝날때마다 캐싱을 해두는 것을 추천
  * 아래와 같이 작업마다 캐싱
  ```python
  rdd2 = rdd1.filter(...).where().select()
  rdd2.cache()
  ```

In [21]:
# cache
rdd2.cache()

PythonRDD[34] at RDD at PythonRDD.scala:53

#### CheckPoint

* 캐싱도 중간값을 저장해서 비슷하지만, CheckPoint는 명시적으로 어디에 저장할지 정의해야 함
* 테스트해 본 결과, 아래와 같이 생성되었음
  * \metacode_de-2024\data\temp\checkpoint\2024-08-23 072646.217657\8f6fcbb9-d532-4e6e-a928-39b809364967

In [23]:
from datetime import datetime

sc.setCheckpointDir("../data/temp/checkpoint/" + str(datetime.now()))

rdd2.checkpoint()

#### Persist
  * 어떤 StorageLevel에 쓸지 정의 가능 (Default는 메모리)
    * 무거운 연산을 해야한다면 일부러 디스크로 내릴 때 쓰기도 함
        ```python
        from pyspark import StorageLevel
        rdd2.persist(StorageLevel.DISK_ONLY)
        ```
    * 메모리에 올라가지 않아서 아예 멈추는 상황도 생길 수 있어 사용(어떻게든 계산을 시키기 위함)

In [20]:
from pyspark import StorageLevel
rdd2.persist(StorageLevel.MEMORY_ONLY)

print(rdd2.getStorageLevel())

Memory Serialized 1x Replicated


#### 파티션 관련

* getNumPartitions() : 파티션의 수

In [28]:
rdd1.getNumPartitions()

3

* partitioner : 파티션을 어떻게 나눌지에 대한 알고리즘이 들어있음
  * key-value RDD를 partitionBy 등으로 나눌 때의 기본값은 Hash
  * 파티셔너가 지정된 적 없는 RDD는 아래 출력처럼 None으로 표시됨

In [27]:
print(rdd1.partitioner)

None


* repartition() : 파티션의 수 지정
  * 기본 알고리즘인 Hashing이 사용되었을 것임

In [26]:
rdd1 = rdd1.repartition(3)

print(rdd1.getNumPartitions())

3


* coalesce() : 파티션 수 줄이기
  * **repartition()으로 줄이는 것보다 효율적**
    * repartition으로 줄이면 Hashing, DiskI/O 등으로 Shuffle이 발생할 수 밖에 없지만
    * coalesce는 Hashing을 다시 하지 않도록, 최소한의 이동이 일어나도록 함

In [32]:
rdd1 = rdd1.coalesce(2)

print(rdd1.getNumPartitions())

2


#### 기타

In [14]:
rdd1.isEmpty()

False

In [15]:
data = []
rdd5 = sc.parallelize(data)

rdd5.isEmpty()

True

In [17]:
# Max, Min
rdd2 = sc.range(0, 1000, 1, 10)

print(rdd2.max())
print(rdd2.min())

999
0


In [18]:
# Meta: inspect metadata of the RDD
rdd1 = sc.textFile("../data/movies.csv")

# NOTE(review): `id` and `toDebugString` are referenced without parentheses, so
# these two prints show the bound-method objects (as in the captured output),
# not the RDD id or its lineage. Call rdd1.id() / rdd1.toDebugString() to get
# the actual values.
print(rdd1.id) 
print(rdd1.context) # which SparkContext is in use
print(rdd1.toDebugString)

<bound method RDD.id of ../data/movies.csv MapPartitionsRDD[33] at textFile at NativeMethodAccessorImpl.java:0>
<SparkContext master=local appName=rdd-dataframe>
<bound method RDD.toDebugString of ../data/movies.csv MapPartitionsRDD[33] at textFile at NativeMethodAccessorImpl.java:0>


### (2) RDD Transformation

In [33]:
# 실습을 위한 기초코드
data = ["co1,tcol2,tA_B_C", "col,tcol3,tD_E_F"]
rdd1 = sc.parallelize(data)

rdd1.take(10)

['co1,tcol2,tA_B_C', 'col,tcol3,tD_E_F']

In [34]:
# map
rdd2 = rdd1.map(lambda v: v.upper())
rdd2.take(10)

['CO1,TCOL2,TA_B_C', 'COL,TCOL3,TD_E_F']

In [38]:
# map
rdd5 = rdd1.map(lambda v: v.split(','))
rdd5.take(10)

[['co1', 'tcol2', 'tA_B_C'], ['col', 'tcol3', 'tD_E_F']]

In [35]:
# mapPartitions
rdd3 = rdd1.mapPartitions(lambda it: map(lambda v: v.upper(), it))
rdd3.take(10)

['CO1,TCOL2,TA_B_C', 'COL,TCOL3,TD_E_F']

In [37]:
# mapPartitionsWithIndex (mapPartitions + Index)
rdd4 = rdd1.mapPartitionsWithIndex(lambda idx, it: map(lambda v: f"idx:{idx}, value:{v}", it))
rdd4.take(10)

['idx:0, value:co1,tcol2,tA_B_C', 'idx:0, value:col,tcol3,tD_E_F']

In [39]:
# flatMap
rdd6 = rdd1.flatMap(lambda v: v.split(","))
rdd6.take(10)

['co1', 'tcol2', 'tA_B_C', 'col', 'tcol3', 'tD_E_F']

In [40]:
# distinct
rdd8 = sc.parallelize([1, 2, 3, 3, 5, 6, 8, 8, 10]).distinct()
rdd8.take(10)

[1, 2, 3, 5, 6, 8, 10]

In [41]:
# subtract (겹치는 것을 제거)
r1 = sc.parallelize([1, 2, 3, 4, 5])
r2 = sc.parallelize([4, 5, 6, 7, 8])
rdd10 = r1.subtract(r2)
rdd10.take(10)

[2, 1, 3]

In [62]:
# glom: turns each partition's elements into one list per partition
rdd = sc.range(0, 50, 1, 5)
# NOTE(review): this is not the last expression of the cell and is not printed,
# so the per-partition lists are computed but never displayed; only the
# take(10) result below appears in the output. Wrap it in print() to see it.
rdd.glom().collect()
rdd.take(10)

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]

### (3) RDD with key-value pairs

In [63]:
# 실습을 위한 기초코드
data = ["a", "b", "c", "b", "b", "d"]
rdd1 = sc.parallelize(data)

rdd2 = rdd1.map(lambda v: (v, 1))
rdd2.take(10)

[('a', 1), ('b', 1), ('c', 1), ('b', 1), ('b', 1), ('d', 1)]

In [64]:
# mapValues : rdd1은 key-value pair가 아님 [mapValues는 key-value pair를 위한 API이므로 오류발생]
rdd1.mapValues(lambda v: v + 1).take(10) # error

24/08/23 12:21:30 ERROR Executor: Exception in task 0.0 in stage 62.0 (TID 113)
org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "/usr/local/spark/python/lib/pyspark.zip/pyspark/worker.py", line 604, in main
    process()
  File "/usr/local/spark/python/lib/pyspark.zip/pyspark/worker.py", line 596, in process
    serializer.dump_stream(out_iter, outfile)
  File "/usr/local/spark/python/lib/pyspark.zip/pyspark/serializers.py", line 259, in dump_stream
    vs = list(itertools.islice(iterator, batch))
  File "/usr/local/spark/python/pyspark/rdd.py", line 1560, in takeUpToNumLeft
    yield next(iterator)
  File "/usr/local/spark/python/lib/pyspark.zip/pyspark/util.py", line 73, in wrapper
    return f(*args, **kwargs)
  File "/usr/local/spark/python/pyspark/rdd.py", line 2276, in <lambda>
    map_values_fn = lambda kv: (kv[0], f(kv[1]))
IndexError: string index out of range

	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.handlePythonEx

Py4JJavaError: An error occurred while calling z:org.apache.spark.api.python.PythonRDD.runJob.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 62.0 failed 1 times, most recent failure: Lost task 0.0 in stage 62.0 (TID 113) (7b9e2a0ad7ba executor driver): org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "/usr/local/spark/python/lib/pyspark.zip/pyspark/worker.py", line 604, in main
    process()
  File "/usr/local/spark/python/lib/pyspark.zip/pyspark/worker.py", line 596, in process
    serializer.dump_stream(out_iter, outfile)
  File "/usr/local/spark/python/lib/pyspark.zip/pyspark/serializers.py", line 259, in dump_stream
    vs = list(itertools.islice(iterator, batch))
  File "/usr/local/spark/python/pyspark/rdd.py", line 1560, in takeUpToNumLeft
    yield next(iterator)
  File "/usr/local/spark/python/lib/pyspark.zip/pyspark/util.py", line 73, in wrapper
    return f(*args, **kwargs)
  File "/usr/local/spark/python/pyspark/rdd.py", line 2276, in <lambda>
    map_values_fn = lambda kv: (kv[0], f(kv[1]))
IndexError: string index out of range

	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.handlePythonException(PythonRunner.scala:517)
	at org.apache.spark.api.python.PythonRunner$$anon$3.read(PythonRunner.scala:652)
	at org.apache.spark.api.python.PythonRunner$$anon$3.read(PythonRunner.scala:635)
	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:470)
	at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
	at scala.collection.Iterator.foreach(Iterator.scala:941)
	at scala.collection.Iterator.foreach$(Iterator.scala:941)
	at org.apache.spark.InterruptibleIterator.foreach(InterruptibleIterator.scala:28)
	at scala.collection.generic.Growable.$plus$plus$eq(Growable.scala:62)
	at scala.collection.generic.Growable.$plus$plus$eq$(Growable.scala:53)
	at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:105)
	at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:49)
	at scala.collection.TraversableOnce.to(TraversableOnce.scala:315)
	at scala.collection.TraversableOnce.to$(TraversableOnce.scala:313)
	at org.apache.spark.InterruptibleIterator.to(InterruptibleIterator.scala:28)
	at scala.collection.TraversableOnce.toBuffer(TraversableOnce.scala:307)
	at scala.collection.TraversableOnce.toBuffer$(TraversableOnce.scala:307)
	at org.apache.spark.InterruptibleIterator.toBuffer(InterruptibleIterator.scala:28)
	at scala.collection.TraversableOnce.toArray(TraversableOnce.scala:294)
	at scala.collection.TraversableOnce.toArray$(TraversableOnce.scala:288)
	at org.apache.spark.InterruptibleIterator.toArray(InterruptibleIterator.scala:28)
	at org.apache.spark.api.python.PythonRDD$.$anonfun$runJob$1(PythonRDD.scala:166)
	at org.apache.spark.SparkContext.$anonfun$runJob$5(SparkContext.scala:2236)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
	at org.apache.spark.scheduler.Task.run(Task.scala:131)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:497)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1439)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:500)
	at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1128)
	at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:628)
	at java.base/java.lang.Thread.run(Thread.java:829)

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2258)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:2207)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:2206)
	at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
	at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2206)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:1079)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:1079)
	at scala.Option.foreach(Option.scala:407)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:1079)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2445)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2387)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2376)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:868)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2196)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2217)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2236)
	at org.apache.spark.api.python.PythonRDD$.runJob(PythonRDD.scala:166)
	at org.apache.spark.api.python.PythonRDD.runJob(PythonRDD.scala)
	at jdk.internal.reflect.GeneratedMethodAccessor77.invoke(Unknown Source)
	at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.base/java.lang.reflect.Method.invoke(Method.java:566)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.base/java.lang.Thread.run(Thread.java:829)
Caused by: org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "/usr/local/spark/python/lib/pyspark.zip/pyspark/worker.py", line 604, in main
    process()
  File "/usr/local/spark/python/lib/pyspark.zip/pyspark/worker.py", line 596, in process
    serializer.dump_stream(out_iter, outfile)
  File "/usr/local/spark/python/lib/pyspark.zip/pyspark/serializers.py", line 259, in dump_stream
    vs = list(itertools.islice(iterator, batch))
  File "/usr/local/spark/python/pyspark/rdd.py", line 1560, in takeUpToNumLeft
    yield next(iterator)
  File "/usr/local/spark/python/lib/pyspark.zip/pyspark/util.py", line 73, in wrapper
    return f(*args, **kwargs)
  File "/usr/local/spark/python/pyspark/rdd.py", line 2276, in <lambda>
    map_values_fn = lambda kv: (kv[0], f(kv[1]))
IndexError: string index out of range

	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.handlePythonException(PythonRunner.scala:517)
	at org.apache.spark.api.python.PythonRunner$$anon$3.read(PythonRunner.scala:652)
	at org.apache.spark.api.python.PythonRunner$$anon$3.read(PythonRunner.scala:635)
	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:470)
	at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
	at scala.collection.Iterator.foreach(Iterator.scala:941)
	at scala.collection.Iterator.foreach$(Iterator.scala:941)
	at org.apache.spark.InterruptibleIterator.foreach(InterruptibleIterator.scala:28)
	at scala.collection.generic.Growable.$plus$plus$eq(Growable.scala:62)
	at scala.collection.generic.Growable.$plus$plus$eq$(Growable.scala:53)
	at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:105)
	at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:49)
	at scala.collection.TraversableOnce.to(TraversableOnce.scala:315)
	at scala.collection.TraversableOnce.to$(TraversableOnce.scala:313)
	at org.apache.spark.InterruptibleIterator.to(InterruptibleIterator.scala:28)
	at scala.collection.TraversableOnce.toBuffer(TraversableOnce.scala:307)
	at scala.collection.TraversableOnce.toBuffer$(TraversableOnce.scala:307)
	at org.apache.spark.InterruptibleIterator.toBuffer(InterruptibleIterator.scala:28)
	at scala.collection.TraversableOnce.toArray(TraversableOnce.scala:294)
	at scala.collection.TraversableOnce.toArray$(TraversableOnce.scala:288)
	at org.apache.spark.InterruptibleIterator.toArray(InterruptibleIterator.scala:28)
	at org.apache.spark.api.python.PythonRDD$.$anonfun$runJob$1(PythonRDD.scala:166)
	at org.apache.spark.SparkContext.$anonfun$runJob$5(SparkContext.scala:2236)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
	at org.apache.spark.scheduler.Task.run(Task.scala:131)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:497)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1439)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:500)
	at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1128)
	at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:628)
	... 1 more


In [65]:
# mapValues : rdd2는 key-value pair가 맞음
rdd3 = rdd2.mapValues(lambda v: v + 1)
rdd3.take(10)

[('a', 2), ('b', 2), ('c', 2), ('b', 2), ('b', 2), ('d', 2)]

In [68]:
# flatMapValues
data = [("a", "1,2,3"), ("b", "4,5,6")]
rdd = sc.parallelize(data)
rdd.take(10)

[('a', '1,2,3'), ('b', '4,5,6')]

In [69]:
rdd4 = rdd.flatMapValues(lambda v: v.split(","))
rdd4.take(10)

[('a', '1'), ('a', '2'), ('a', '3'), ('b', '4'), ('b', '5'), ('b', '6')]

In [70]:
# reduceByKey

# rdd2 = [('a', 1), ('b', 1), ('c', 1), ('b', 1), ('b', 1), ('d', 1)]
rdd6 = rdd2.reduceByKey(lambda a, b: a + b)
rdd6.take(10)

[('a', 1), ('b', 3), ('c', 1), ('d', 1)]

In [71]:
# groupByKey
rdd7 = rdd6.groupByKey()
rdd7.map(lambda x : (x[0], list(x[1]))).collect()

[('a', [1]), ('b', [3]), ('c', [1]), ('d', [1])]

In [None]:
# cogroup

In [74]:
## group1 : rdd8
kv1 = [("k1", "v1"), ("k2", "v2"), ("k3", "v3")]
rdd8 = sc.parallelize(kv1)
rdd8.take(10)

[('k1', 'v1'), ('k2', 'v2'), ('k3', 'v3')]

In [75]:
## group2 : rdd9
kv2 = [("k1", "v4"), ("k2", "v5"), ("k3", "v6")]
rdd9 = sc.parallelize(kv2)
rdd9.take(10)

[('k1', 'v4'), ('k2', 'v5'), ('k3', 'v6')]

In [76]:
rdd10 = rdd8.cogroup(rdd9)
[(x, tuple(map(list, y))) for x, y in sorted(list(rdd10.collect()))]

[('k1', (['v1'], ['v4'])), ('k2', (['v2'], ['v5'])), ('k3', (['v3'], ['v6']))]

In [77]:
# join  실제로는 cogroup보다는 join을 많이 사용
rdd11 = rdd8.join(rdd9)
rdd11.take(10)

[('k2', ('v2', 'v5')), ('k1', ('v1', 'v4')), ('k3', ('v3', 'v6'))]

In [78]:
# combineByKey
r1 = sc.parallelize([("Math", 100), ("Eng", 80), ("Math", 50), ("Eng", 70), ("Eng", 90)])

def to_list(a):
    """Wrap a single value in a new one-element list.

    Passed as the first argument to ``combineByKey`` further down in this cell.
    """
    wrapped = [a]
    return wrapped

def append(a, b):
    """Add ``b`` to the end of ``a`` in place and return that same ``a``.

    Passed as the second argument to ``combineByKey`` further down in this cell.
    """
    push = a.append
    push(b)
    return a

def extend(a, b):
    """Add every item of ``b`` to ``a`` in place and return that same ``a``.

    Passed as the third argument to ``combineByKey`` further down in this cell.
    """
    merge = a.extend
    merge(b)
    return a

rdd12 = r1.combineByKey(to_list, append, extend)
rdd12.take(10)

[('Math', [100, 50]), ('Eng', [80, 70, 90])]

### (4) DataFrame

#### Row
* 보통은 데이터를 읽어오지만, Row를 활용해 데이터를 정의해줄 수도 있음

In [79]:
from pyspark.sql import Row

r1 = Row(1, "two", True)
r1

<Row(1, 'two', True)>

In [80]:
print(r1[0])
print(r1[1])
print(r1[2])

1
two
True


#### DataFrame
* 파이썬 list로 바로 DataFrame을 생성할 수는 없으며, Row 등으로 감싸준 경우는 가능
* 컬럼명 미지정시 _1, _2와 같이 생성됨

In [None]:
# DataFrame생성 불가능 예시 (TypeError 발생)
data1 = [1, 2, 3]

df = spark.createDataFrame(data1)
df.show()

TypeError: Can not infer schema for type: <class 'int'>

In [83]:
# DataFrame생성 가능 예시 (컬럼 미지정)
from pyspark.sql import Row

data2 = [Row(1), Row(2), Row(3)]

df = spark.createDataFrame(data2)
df.show()

+---+
| _1|
+---+
|  1|
|  2|
|  3|
+---+



In [84]:
# DataFrame생성 가능 예시 (컬럼 지정)
df = spark.createDataFrame(data2, ['num'])
df.show()

+---+
|num|
+---+
|  1|
|  2|
|  3|
+---+



In [85]:
# DataFrame생성 가능 예시 (Class를 활용)
class Person:
    """Simple record holding ``name``, ``age`` and ``job`` attributes.

    Python has no constructor overloading: a second ``__init__`` in the same
    class body replaces the first one. The defaults are therefore expressed
    as keyword defaults on a single ``__init__``.
    """

    def __init__(self, name="name", age=20, job="student"):
        """Store the given fields; omitted arguments fall back to the defaults."""
        self.name = name
        self.age = age
        self.job = job

p1 = Person("foo", 30, "programmer")
p2 = Person("bar", 10, "student")

spark.createDataFrame([p1, p2]).show(3)

+---+----------+----+
|age|       job|name|
+---+----------+----+
| 30|programmer| foo|
| 10|   student| bar|
+---+----------+----+



In [86]:
# DataFrame생성 가능 예시 (Tuple 활용)
t1 = ("foo", 30, "programmer")
t2 = ("bar", 10, "student")

spark.createDataFrame([t1, t2]).show(3)

+---+---+----------+
| _1| _2|        _3|
+---+---+----------+
|foo| 30|programmer|
|bar| 10|   student|
+---+---+----------+



In [87]:
# DataFrame생성 가능 예시 (Tuple 활용+컬럼지정)
t1 = ("foo", 30, "programmer")
t2 = ("bar", 10, "student")

spark.createDataFrame([t1, t2],['name','age','job']).show(3)

+----+---+----------+
|name|age|       job|
+----+---+----------+
| foo| 30|programmer|
| bar| 10|   student|
+----+---+----------+



In [92]:
# DataFrame생성 가능 예시 (스키마 정의)

from pyspark.sql.types import StructType,StructField,StringType,IntegerType

schema = StructType(
    [StructField("name", StringType(), nullable = True),
     StructField("age", IntegerType(), nullable = True),
     StructField("job", StringType(), nullable = True)]
)

rowRDD = sc.parallelize([Row("foo", 7, "programmer"), Row("bar", 13, "student")])
rowRDD.collect()

[<Row('foo', 7, 'programmer')>, <Row('bar', 13, 'student')>]

In [90]:
spark.createDataFrame(rowRDD, schema).show()

+----+---+----------+
|name|age|       job|
+----+---+----------+
| foo|  7|programmer|
| bar| 13|   student|
+----+---+----------+



In [93]:
# SparkSession을 활용해 DataFrame으로 생성 [spark.read.text()]
spark.read.text("../data/movies.csv").show(3)

# 참고 : sc.textFile() : SparkContext를 활용해 RDD로 생성

+--------------------+
|               value|
+--------------------+
|movieId,title,genres|
|1,Toy Story (1995...|
|2,Jumanji (1995),...|
+--------------------+
only showing top 3 rows



In [94]:
# SparkSession을 활용해 DataFrame으로 생성 [read.option()]
spark.read.option("sep", ",").csv("../data/movies.csv").show(3)

+-------+----------------+--------------------+
|    _c0|             _c1|                 _c2|
+-------+----------------+--------------------+
|movieId|           title|              genres|
|      1|Toy Story (1995)|Adventure|Animati...|
|      2|  Jumanji (1995)|Adventure|Childre...|
+-------+----------------+--------------------+
only showing top 3 rows



In [95]:
# SparkSession을 활용해 DataFrame으로 생성 [read.option(), 여러 데이터 소스]
spark.read.option("basePath", "../data/").csv("../data/*.csv").show(3)

+------+-------+------+---------+
|   _c0|    _c1|   _c2|      _c3|
+------+-------+------+---------+
|userId|movieId|rating|timestamp|
|     1|      1|   4.0|964982703|
|     1|      3|   4.0|964981247|
+------+-------+------+---------+
only showing top 3 rows



In [105]:
# Create a table and query it (note: this table cannot be relied on permanently)
## Only build the table when none named buckected_table exists yet (count == 0)
if spark.sql("show tables").where("tableName = 'buckected_table' ").count() == 0:
    ## Read the data; header=True makes the first line (movieId,title,genres)
    ## the column names instead of the generated _c0, _c1, _c2
    df = spark.read.option("header", True).csv("../data/movies.csv")
    ## Split into 3 buckets by movieId and save as buckected_table.
    ## movies.csv has no "id" column, so bucketing by "id" cannot be resolved.
    df.repartition(1).write.mode("Overwrite").bucketBy(3, "movieId").saveAsTable("buckected_table")

## Read the table and query it with SQL. This runs whether or not the table was
## just created, and .show() is needed because a bare DataFrame expression
## inside a cell does not trigger any action.
spark.read.table("buckected_table").show(3)
spark.sql("select * from buckected_table limit 10").show()

#### DataFrame API

In [109]:
# .select()를 사용해 쿼리

df1 = spark.read.json("../data/*.json")
df1.show(3)

+--------------------+----------+--------------------+--------------------+------+--------------------+------------------+
|               actor|created_at|                 org|             payload|public|                repo|              type|
+--------------------+----------+--------------------+--------------------+------+--------------------+------------------+
|{"id":35613825,"l...|2024-05-19|{"id":126833237,"...|{"comment":{"url"...|  true|{"id":749408001,"...|CommitCommentEvent|
|{"id":84257236,"l...|2024-05-19|                null|{"comment":{"url"...|  true|{"id":430924902,"...|CommitCommentEvent|
|{"id":41898282,"l...|2024-05-19|{"id":1040002,"lo...|{"comment":{"url"...|  true|{"id":170801983,"...|CommitCommentEvent|
+--------------------+----------+--------------------+--------------------+------+--------------------+------------------+
only showing top 3 rows



In [110]:
df1.printSchema()

root
 |-- actor: string (nullable = true)
 |-- created_at: string (nullable = true)
 |-- org: string (nullable = true)
 |-- payload: string (nullable = true)
 |-- public: boolean (nullable = true)
 |-- repo: string (nullable = true)
 |-- type: string (nullable = true)



In [111]:
df1.select("actor", "created_at").show(3)

+--------------------+----------+
|               actor|created_at|
+--------------------+----------+
|{"id":35613825,"l...|2024-05-19|
|{"id":84257236,"l...|2024-05-19|
|{"id":41898282,"l...|2024-05-19|
+--------------------+----------+
only showing top 3 rows



In [116]:
# SparkSQL (Expression) 활용
df1.selectExpr('actor').show(3, False)

+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|actor                                                                                                                                                                                                                    |
+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|{"id":35613825,"login":"vercel[bot]","display_login":"vercel","gravatar_id":"","url":"https://api.github.com/users/vercel[bot]","avatar_url":"https://avatars.githubusercontent.com/u/35613825?"}                        |
|{"id":84257236,"login":"DM-netizen","display_login":"DM-netizen","gravatar_id":"","url":"https://api.github.com/users/D

In [117]:
# SparkSQL (Expression) 활용
df1.selectExpr('upper(actor) as upperName').show(3, False)

+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|upperName                                                                                                                                                                                                                |
+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|{"ID":35613825,"LOGIN":"VERCEL[BOT]","DISPLAY_LOGIN":"VERCEL","GRAVATAR_ID":"","URL":"HTTPS://API.GITHUB.COM/USERS/VERCEL[BOT]","AVATAR_URL":"HTTPS://AVATARS.GITHUBUSERCONTENT.COM/U/35613825?"}                        |
|{"ID":84257236,"LOGIN":"DM-NETIZEN","DISPLAY_LOGIN":"DM-NETIZEN","GRAVATAR_ID":"","URL":"HTTPS://API.GITHUB.COM/USERS/D

In [113]:
# Query using col

## Author's note: every function in pyspark.sql.functions.* takes a Column as input
## Author's note: split('id',', ') did not work → split(col('id'),', ') was needed
## NOTE(review): the PySpark API docs list split's first argument as a Column or a
## column-name string, so the note above may not hold — confirm on the Spark version in use
from pyspark.sql.functions import col

# Column objects for the two fields to display
c1 = col("actor")
c2 = col("org")

df1.select(c1, c2).show(3)

+--------------------+--------------------+
|               actor|                 org|
+--------------------+--------------------+
|{"id":35613825,"l...|{"id":126833237,"...|
|{"id":84257236,"l...|                null|
|{"id":41898282,"l...|{"id":1040002,"lo...|
+--------------------+--------------------+
only showing top 3 rows



In [126]:
# filter: keep only the rows that satisfy a SQL condition string
data2 = [Row(value) for value in (1, 2, 3)]
df = spark.createDataFrame(data2)

matching_rows = df.filter("_1 == 2")
matching_rows.show(3, truncate=False)

+---+
|_1 |
+---+
|2  |
+---+



In [127]:
# where: takes the same SQL condition string as the filter cell; the output below is identical
data2 = [Row(1), Row(2), Row(3)]
df = spark.createDataFrame(data2)

df.where("_1 == 2").show(3, False)

+---+
|_1 |
+---+
|2  |
+---+



In [130]:
# orderBy: sort rows by the "_1" column, named by string (output below is ascending)
data2 = [Row(value) for value in (3, 2, 4)]
df = spark.createDataFrame(data2)

ordered = df.orderBy("_1")
ordered.show(3)

+---+
| _1|
+---+
|  2|
|  3|
|  4|
+---+



In [131]:
# sort: here the sort key is passed as a Column object (col("_1")) rather than a name string
data2 = [Row(3), Row(2), Row(4)]
df = spark.createDataFrame(data2)

df.sort(col("_1")).show(3)

+---+
| _1|
+---+
|  2|
|  3|
|  4|
+---+



In [141]:
# groupBy
import pyspark.sql.functions as F

df = spark.read.json("../data/*.json")

# aggregations: maximum created_at value within each "type" group
max_created_at = F.max(F.col("created_at"))
groupedDF = df.groupBy("type")
groupedDF.agg(max_created_at).show()

+------------------+---------------+
|              type|max(created_at)|
+------------------+---------------+
|CommitCommentEvent|     2024-05-19|
+------------------+---------------+



In [142]:
# Deduplication: print the row count before and after distinct()
print(df.count())
print(df.distinct().count())

228




228


                                                                                

#### 많이 사용하는 DataFrame API
* withColumn (많이 쓰임)
  * pyspark.sql.functions.*로 transformation 작업을 많이 함
    * `import pyspark.sql.functions as F` 로 많이 사용
  * 필터링해서 새로운 컬럼에 넣고, 기존 컬럼을 Drop하는 식으로 많이 사용

In [119]:
import pyspark.sql.functions as F

# Trim the actor column and store the result in a new column named newColumn
trimmed_actor = F.trim(F.col("actor"))
df.withColumn("newColumn", trimmed_actor).show(3, truncate=False)

+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+----------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

#### Column API

In [146]:
# Build a DataFrame with select + col
# First load the JSON files and print the schema of the loaded DataFrame
df1 = spark.read.json("../data/*.json")
df1.printSchema()

root
 |-- actor: string (nullable = true)
 |-- created_at: string (nullable = true)
 |-- org: string (nullable = true)
 |-- payload: string (nullable = true)
 |-- public: boolean (nullable = true)
 |-- repo: string (nullable = true)
 |-- type: string (nullable = true)



In [147]:
# Column objects for the two fields to keep
c1 = F.col("actor")
c2 = F.col("org")

# Project only those two columns into a new DataFrame and print its schema
df2 = df1.select(c1, c2)
df2.printSchema()

root
 |-- actor: string (nullable = true)
 |-- org: string (nullable = true)



In [151]:
# orderBy: sort by created_at while displaying only org
## created_at is not among the selected columns, yet the cell ran and produced the output below
df1.select("org").orderBy("created_at").show(5)

+--------------------+
|                 org|
+--------------------+
|{"id":126833237,"...|
|                null|
|{"id":1040002,"lo...|
|                null|
|                null|
+--------------------+
only showing top 5 rows



In [152]:
# F.split example (load the data)
## F.split(F.col("genres"), '[|]', 2): the last number caps how many pieces the split may produce; pass -1 for no limit

import pyspark.sql.functions as F
# Read the CSV with its first line treated as the header row
df1 = spark.read.options(header='True').csv("../data/movies.csv")
df1.show(3, False)

+-------+-----------------------+-------------------------------------------+
|movieId|title                  |genres                                     |
+-------+-----------------------+-------------------------------------------+
|1      |Toy Story (1995)       |Adventure|Animation|Children|Comedy|Fantasy|
|2      |Jumanji (1995)         |Adventure|Children|Fantasy                 |
|3      |Grumpier Old Men (1995)|Comedy|Romance                             |
+-------+-----------------------+-------------------------------------------+
only showing top 3 rows



In [156]:
# F.split with limit=2: at most two pieces; everything after the first "|" stays unsplit
cc = F.split(F.col("genres"), '[|]', limit=2)
df2 = df1.select(cc.alias("cc"))
df2.show(5, truncate=False)

+----------------------------------------------+
|cc                                            |
+----------------------------------------------+
|[Adventure, Animation|Children|Comedy|Fantasy]|
|[Adventure, Children|Fantasy]                 |
|[Comedy, Romance]                             |
|[Comedy, Drama|Romance]                       |
|[Comedy]                                      |
+----------------------------------------------+
only showing top 5 rows



In [157]:
# F.split with limit=-1: no cap on the number of pieces, so every "|" splits
cc = F.split(F.col("genres"), '[|]', limit=-1)
df2 = df1.select(cc.alias("cc"))
df2.show(5, truncate=False)

+-------------------------------------------------+
|cc                                               |
+-------------------------------------------------+
|[Adventure, Animation, Children, Comedy, Fantasy]|
|[Adventure, Children, Fantasy]                   |
|[Comedy, Romance]                                |
|[Comedy, Drama, Romance]                         |
|[Comedy]                                         |
+-------------------------------------------------+
only showing top 5 rows



In [158]:
# F.expr & select(cols)

# One SQL expression per array position (cc[0], cc[1]), projected together
cols = [F.expr(f"cc[{position}]") for position in range(2)]
df3 = df2.select(*cols)
df3.show(5, truncate=False)

+---------+---------+
|cc[0]    |cc[1]    |
+---------+---------+
|Adventure|Animation|
|Adventure|Children |
|Comedy   |Romance  |
|Comedy   |Drama    |
|Comedy   |null     |
+---------+---------+
only showing top 5 rows



In [167]:
# withColumn + F.lit: add a column and fill every row with the same constant value
korea_literal = F.lit("Korea")
df4 = df3.withColumn("c3", korea_literal)
df4.show(5, truncate=False)

+---------+---------+-----+
|cc[0]    |cc[1]    |c3   |
+---------+---------+-----+
|Adventure|Animation|Korea|
|Adventure|Children |Korea|
|Comedy   |Romance  |Korea|
|Comedy   |Drama    |Korea|
|Comedy   |null     |Korea|
+---------+---------+-----+
only showing top 5 rows



In [168]:
# Add a second constant column, c4, on top of df4
df5 = df4.withColumn("c4", F.lit("Test"))
df5.show(5, False)

+---------+---------+-----+----+
|cc[0]    |cc[1]    |c3   |c4  |
+---------+---------+-----+----+
|Adventure|Animation|Korea|Test|
|Adventure|Children |Korea|Test|
|Comedy   |Romance  |Korea|Test|
|Comedy   |Drama    |Korea|Test|
|Comedy   |null     |Korea|Test|
+---------+---------+-----+----+
only showing top 5 rows



In [172]:
# explode: split each row's list-like value so that every element becomes its own row
## In the example below, the first row of df2 becomes rows 1-5 of df6 and the second row becomes rows 6-8
df2.show(5,False)

+-------------------------------------------------+
|cc                                               |
+-------------------------------------------------+
|[Adventure, Animation, Children, Comedy, Fantasy]|
|[Adventure, Children, Fantasy]                   |
|[Comedy, Romance]                                |
|[Comedy, Drama, Romance]                         |
|[Comedy]                                         |
+-------------------------------------------------+
only showing top 5 rows



In [169]:
# Explode the cc array so each genre gets its own row (the output column is shown as "col").
# Uses F.col like the rest of this section, so the cell does not depend on the
# separate `from pyspark.sql.functions import col` cell having been run.
df6 = df2.select(F.explode(F.col("cc")))
df6.show(10, False)

+---------+
|col      |
+---------+
|Adventure|
|Animation|
|Children |
|Comedy   |
|Fantasy  |
|Adventure|
|Children |
|Fantasy  |
|Comedy   |
|Romance  |
+---------+
only showing top 10 rows



In [175]:
# drop: display df4 first, before any columns are removed
df4.show(5)

+---------+---------+-----+
|    cc[0]|    cc[1]|   c3|
+---------+---------+-----+
|Adventure|Animation|Korea|
|Adventure| Children|Korea|
|   Comedy|  Romance|Korea|
|   Comedy|    Drama|Korea|
|   Comedy|     null|Korea|
+---------+---------+-----+
only showing top 5 rows



In [176]:
# Drop both cc[1] and c3 in a single call, leaving only cc[0]
remaining = df4.drop("cc[1]", "c3")
remaining.show(5, truncate=False)

+---------+
|cc[0]    |
+---------+
|Adventure|
|Adventure|
|Comedy   |
|Comedy   |
|Comedy   |
+---------+
only showing top 5 rows



In [177]:
# groupBy & countDistinct
## Group by genres, count the distinct (unique) title values, and expose the count as titleCount
distinct_titles = F.countDistinct("title").alias("titleCount")
df7 = df1.groupBy("genres").agg(distinct_titles)
df7.show(5, truncate=False)



+------------------------------+----------+
|genres                        |titleCount|
+------------------------------+----------+
|Comedy|Horror|Thriller        |17        |
|Action|Drama|Horror           |1         |
|Adventure|Sci-Fi|Thriller     |4         |
|Action|Animation|Comedy|Sci-Fi|2         |
|Action|Adventure|Drama|Fantasy|6         |
+------------------------------+----------+
only showing top 5 rows





In [178]:
# sort & F.desc: order the genre groups by titleCount, largest first
by_count_desc = F.desc("titleCount")
df8 = df7.sort(by_count_desc)
df8.show(10, truncate=False)



+--------------------+----------+
|genres              |titleCount|
+--------------------+----------+
|Drama               |1053      |
|Comedy              |946       |
|Comedy|Drama        |435       |
|Comedy|Romance      |363       |
|Drama|Romance       |349       |
|Documentary         |339       |
|Comedy|Drama|Romance|276       |
|Drama|Thriller      |168       |
|Horror              |167       |
|Horror|Thriller     |135       |
+--------------------+----------+
only showing top 10 rows



                                                                                

## 2. 데이터셋 정제 방법 고민

## 3. 데이터셋 저장 스키마 결정 & 공유