## PySpark 설치

In [None]:
!pip install pyspark==3.3.1 py4j==0.10.9.5 

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


## 처리할 데이터 파일을 먼저 다운로드 받아온다

In [1]:
!wget https://s3-geospatial.s3-us-west-2.amazonaws.com/1800.csv

--2023-07-04 12:11:41--  https://s3-geospatial.s3-us-west-2.amazonaws.com/1800.csv
Resolving s3-geospatial.s3-us-west-2.amazonaws.com (s3-geospatial.s3-us-west-2.amazonaws.com)... 3.5.83.169, 3.5.84.174, 3.5.81.19, ...
Connecting to s3-geospatial.s3-us-west-2.amazonaws.com (s3-geospatial.s3-us-west-2.amazonaws.com)|3.5.83.169|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 62728 (61K) [text/csv]
Saving to: ‘1800.csv’


2023-07-04 12:11:41 (388 KB/s) - ‘1800.csv’ saved [62728/62728]



In [1]:
!ls -tl

total 216
-rw-r--r-- 1 jovyan users 22881 Jul  4 12:34 PySpark_설치_및_테스트.ipynb
-rw-r--r-- 1 jovyan users 25991 Jul  4 12:29 PySpark_DataFrame_1.ipynb
-rw-r--r-- 1 jovyan users 22056 Jul  4 12:19 PySpark_DataFrame_5.ipynb
-rw-r--r-- 1 jovyan users 32483 Jul  4 12:19 PySpark_DataFrame_4.ipynb
-rw-r--r-- 1 jovyan users 14392 Jul  4 12:19 PySpark_DataFrame_3.ipynb
-rw-r--r-- 1 jovyan users 25291 Jul  4 12:19 PySpark_DataFrame_2.ipynb
-rw-r--r-- 1 jovyan users 64553 Jul  4 12:19 1800.csv


In [2]:
!head -5 1800.csv

ITE00100554,18000101,TMAX,-75,,,E,
ITE00100554,18000101,TMIN,-148,,,E,
GM000010962,18000101,PRCP,0,,,E,
EZE00100082,18000101,TMAX,-86,,,E,
EZE00100082,18000101,TMIN,-135,,,E,


## 판다스 데이터프레임으로 처리해본다

In [1]:
import pandas as pd 

pd_df = pd.read_csv(
    "1800.csv",
    names=["stationID", "date", "measure_type", "temperature"],
    usecols=[0, 1, 2, 3]
)

In [2]:
pd_df.head()

Unnamed: 0,stationID,date,measure_type,temperature
0,ITE00100554,18000101,TMAX,-75
1,ITE00100554,18000101,TMIN,-148
2,GM000010962,18000101,PRCP,0
3,EZE00100082,18000101,TMAX,-86
4,EZE00100082,18000101,TMIN,-135


In [3]:
# Filter out all but TMIN entries
pd_minTemps = pd_df[pd_df['measure_type'] == "TMIN"]

In [4]:
pd_minTemps.head()

Unnamed: 0,stationID,date,measure_type,temperature
1,ITE00100554,18000101,TMIN,-148
4,EZE00100082,18000101,TMIN,-135
6,ITE00100554,18000102,TMIN,-125
9,EZE00100082,18000102,TMIN,-130
11,ITE00100554,18000103,TMIN,-46


In [5]:
# Select only stationID and temperature
pd_stationTemps = pd_minTemps[["stationID", "temperature"]]

In [6]:
# Aggregate to find minimum temperature for every station
pd_minTempsByStation = pd_stationTemps.groupby(["stationID"]).min("temperature")
pd_minTempsByStation.head()

Unnamed: 0_level_0,temperature
stationID,Unnamed: 1_level_1
EZE00100082,-135
ITE00100554,-148


## Spark으로 처리해본다

- spark와 pandas의 큰 차이는 서버의 개수 차이
- pandas는 한대의 서버, spark는 다수의 서버
- 처리할 수 있는 데이터의 크기 차이

In [35]:
!lscpu

Architecture:          aarch64
  CPU op-mode(s):      64-bit
  Byte Order:          Little Endian
CPU(s):                8
  On-line CPU(s) list: 0-7
Vendor ID:             0x00
  Model:               0
  Thread(s) per core:  1
  Core(s) per cluster: 8
  Socket(s):           -
  Cluster(s):          1
  Stepping:            0x0
  BogoMIPS:            48.00
  Flags:               fp asimd evtstrm aes pmull sha1 sha2 crc32 atomics fphp a
                       simdhp cpuid asimdrdm jscvt fcma lrcpc dcpop sha3 asimddp
                        sha512 asimdfhm dit uscat ilrcpc flagm ssbs sb dcpodp fl
                       agm2 frint
Vulnerabilities:       
  Itlb multihit:       Not affected
  L1tf:                Not affected
  Mds:                 Not affected
  Meltdown:            Not affected
  Mmio stale data:     Not affected
  Retbleed:            Not affected
  Spec store bypass:   Mitigation; Speculative Store Bypass disabled via prctl
  Spectre v1:          Mitigation; __user poi

In [34]:
!grep MemTotal /proc/meminfo

MemTotal:        6069428 kB


In [7]:
from pyspark.sql import SparkSession
from pyspark import SparkConf

conf = SparkConf()
conf.set("spark.app.name", "PySpark DataFrame #1")
conf.set("spark.master", "local[*]")

spark = SparkSession.builder\
        .config(conf=conf)\
        .getOrCreate()

In [36]:
spark

In [8]:
df = spark.read.format("csv").load("1800.csv") # spark.read.csv("1800.csv")

In [9]:
df.printSchema()

root
 |-- _c0: string (nullable = true)
 |-- _c1: string (nullable = true)
 |-- _c2: string (nullable = true)
 |-- _c3: string (nullable = true)
 |-- _c4: string (nullable = true)
 |-- _c5: string (nullable = true)
 |-- _c6: string (nullable = true)
 |-- _c7: string (nullable = true)



In [37]:
# dataframe의 이름을 지정하기
df = spark.read.format("csv")\
    .load("1800.csv")\
    .toDF("stationID", "date", "measure_type", "temperature", "_c4", "_c5", "_c6", "_c7")

In [38]:
df.printSchema()

root
 |-- stationID: string (nullable = true)
 |-- date: string (nullable = true)
 |-- measure_type: string (nullable = true)
 |-- temperature: string (nullable = true)
 |-- _c4: string (nullable = true)
 |-- _c5: string (nullable = true)
 |-- _c6: string (nullable = true)
 |-- _c7: string (nullable = true)



In [41]:
# option -> infer schema 추가
df = spark.read.format("csv")\
    .option("inferSchema", "true")\
    .load("1800.csv")\
    .toDF("stationID", "date", "measure_type", "temperature", "_c4", "_c5", "_c6", "_c7")

In [42]:
df.printSchema()

root
 |-- stationID: string (nullable = true)
 |-- date: integer (nullable = true)
 |-- measure_type: string (nullable = true)
 |-- temperature: integer (nullable = true)
 |-- _c4: string (nullable = true)
 |-- _c5: string (nullable = true)
 |-- _c6: string (nullable = true)
 |-- _c7: string (nullable = true)



In [46]:
from pyspark.sql.types import StringType, IntegerType, FloatType
from pyspark.sql.types import StructType, StructField

# StructType을 통한 칼럼 이름, 타입, null 값 허용 설정
schema = StructType([ \
                     StructField("stationID", StringType(), True), \
                     StructField("date", IntegerType(), True), \
                     StructField("measure_type", StringType(), True), \
                     StructField("temperature", FloatType(), True)])

In [47]:
# df = spark.read.schema(schema).format("csv").load("1800.csv")
df = spark.read.schema(schema).csv("1800.csv")

In [48]:
df.printSchema()

root
 |-- stationID: string (nullable = true)
 |-- date: integer (nullable = true)
 |-- measure_type: string (nullable = true)
 |-- temperature: float (nullable = true)



In [49]:
# Filter out all but TMIN entries
minTemps = df.filter(df.measure_type == "TMIN")

In [50]:
minTemps.count()

730

In [51]:
# Column expression으로 필터링 적용
minTemps = df.where(df.measure_type == "TMIN")

In [52]:
minTemps.count()

730

In [61]:
# SQL expression으로 필터링 적용
minTemps = df.where("measure_type = 'TMIN'")

In [62]:
minTemps.count()

730

In [63]:
# Aggregate to find minimum temperature for every station
minTempsByStation = minTemps.groupBy("stationID").min("temperature")
minTempsByStation.show()

+-----------+----------------+
|  stationID|min(temperature)|
+-----------+----------------+
|ITE00100554|          -148.0|
|EZE00100082|          -135.0|
+-----------+----------------+



In [64]:
# Select only stationID and temperature
stationTemps = minTemps[["stationID", "temperature"]]

In [65]:
stationTemps.show(5)

+-----------+-----------+
|  stationID|temperature|
+-----------+-----------+
|ITE00100554|     -148.0|
|EZE00100082|     -135.0|
|ITE00100554|     -125.0|
|EZE00100082|     -130.0|
|ITE00100554|      -46.0|
+-----------+-----------+
only showing top 5 rows



In [58]:
stationTemps = minTemps.select("stationID", "temperature")

In [27]:
stationTemps.show(5)

+-----------+-----------+
|  stationID|temperature|
+-----------+-----------+
|ITE00100554|     -148.0|
|EZE00100082|     -135.0|
|ITE00100554|     -125.0|
|EZE00100082|     -130.0|
|ITE00100554|      -46.0|
+-----------+-----------+
only showing top 5 rows



In [28]:
stationTemps.show(5)

+-----------+-----------+
|  stationID|temperature|
+-----------+-----------+
|ITE00100554|     -148.0|
|EZE00100082|     -135.0|
|ITE00100554|     -125.0|
|EZE00100082|     -130.0|
|ITE00100554|      -46.0|
+-----------+-----------+
only showing top 5 rows



In [66]:
# Collect, format, and print the results
results = minTempsByStation.collect()

In [67]:
for result in results:
    print(result[0] + "\t{:.2f}F".format(result[1]))

ITE00100554	-148.00F
EZE00100082	-135.00F


- DataFrame의 컬럼을 지칭하는 방식
    ```python
    from pyspark.sql.functions import col, column
    stationTemps = minTemps.select(
     "stationID",
     col("stationID"),
     column("stationID"),
     minTemps.stationID
    )
    ```

## Spark SQL로 처리해보기

pyspark.sql.types <br>
❖ IntegerType <br>
❖ LongType <br>
❖ FloatType <br>
❖ StringType <br>
❖ BooleanType <br>
❖ TimestampType <br>
❖ DateType <br>
❖ ArrayType <br>
❖ StructType <br>
❖ StructField <br>
❖ MapType <br> 

In [68]:
df.createOrReplaceTempView("station1800")

In [69]:
results = spark.sql("""SELECT stationID, MIN(temperature)
FROM station1800
WHERE measure_type = 'TMIN'
GROUP BY 1""").collect()

In [70]:
# pyspark.sql.Row는 DataFrame의 레코드에 해당하며 필드별로 이름이 존재# 
for r in results:
    print(r)

Row(stationID='ITE00100554', min(temperature)=-148.0)
Row(stationID='EZE00100082', min(temperature)=-135.0)
