In [2]:
from pyspark.sql import SparkSession, Row
from pyspark.sql.types import StructType, StructField, StringType, IntegerType
# Create (or reuse an already running) SparkSession on a local master,
# registered under the application name "test".
spark = (
    SparkSession.builder
    .master("local")
    .appName("test")
    .getOrCreate()
)

In [3]:
# Sample rows: (name, ticker, country, price, currency).
# NOTE(review): 'Netfilx' looks like a typo for 'Netflix'; it is kept as-is so
# the recorded cell outputs below stay consistent with the data.
stocks = [
    ('Google', 'GOOGL', 'USA', 2984, 'USD'),
    ('Netfilx', 'NFLX', 'USA', 645, 'USD'),
    ('Amazon', 'AMZN', 'USA', 3518, 'USD'),
    ('Tesla', 'TSLA', 'USA', 1222, 'USD'),
    ('Samsung', '005930', 'Korea', 70600, 'KRW'),
    ('Kakao', '035720', 'Korea', 125000, 'KRW'),
]

# Only column names are supplied here; the data types are assigned automatically.
schema = ["name", "ticker", "country", "price", "currency"]

df = spark.createDataFrame(stocks, schema)

In [5]:
# Print the DataFrame's schema as a tree (column name, type, nullability).
# The types shown were assigned automatically, since only column names were given.
df.printSchema()

root
 |-- name: string (nullable = true)
 |-- ticker: string (nullable = true)
 |-- country: string (nullable = true)
 |-- price: long (nullable = true)
 |-- currency: string (nullable = true)



In [6]:
# select: project three columns, keep only the USA rows, then sort by price.
usaStocksDf = (
    df.select("name", "country", "price")
    .where("country == 'USA'")
    .orderBy("price")
)
usaStocksDf.show()

+-------+-------+-----+
|   name|country|price|
+-------+-------+-----+
|Netfilx|    USA|  645|
|  Tesla|    USA| 1222|
| Google|    USA| 2984|
| Amazon|    USA| 3518|
+-------+-------+-----+



In [7]:
# groupBy: highest price within each currency.
(
    df.groupBy("currency")
    .max("price")
    .show()
)

+--------+----------+
|currency|max(price)|
+--------+----------+
|     KRW|    125000|
|     USD|      3518|
+--------+----------+



In [9]:
from pyspark.sql.functions import avg, count
# Group the rows by currency, then compute the average price of each group.
(
    df.groupBy("currency")
    .agg(avg("price"))
    .show()
)

+--------+----------+
|currency|avg(price)|
+--------+----------+
|     KRW|   97800.0|
|     USD|   2092.25|
+--------+----------+



In [10]:
# Group the rows by currency, then count the price values in each group.
(
    df.groupBy("currency")
    .agg(count("price"))
    .show()
)

+--------+------------+
|currency|count(price)|
+--------+------------+
|     KRW|           2|
|     USD|           4|
+--------+------------+

