# 서울시 열린데이터 년도별 따릉이 대여건수 합계 계산

## 스파크 기본설정

In [2]:
import os
import sys
#home=os.path.expanduser("~") # HOME이 설정되어 있지 않으면 expanduser('~')를 사용한다.
#osn.environ["PYSPARK_PYTHON"] = "/usr/bin/python"
os.environ["SPARK_HOME"]=os.path.join(os.path.expanduser("~"),r"C:\Users\user\spark-2.0.0-bin-hadoop2.7\spark-2.0.0-bin-hadoop2.7")
os.environ["PYLIB"]=os.path.join(os.environ["SPARK_HOME"],'python','lib')
sys.path.insert(0,os.path.join(os.environ["PYLIB"],'py4j-0.10.1-src.zip'))
sys.path.insert(0,os.path.join(os.environ["PYLIB"],'pyspark.zip'))

In [1]:
import pyspark
from os.path import expanduser, join
from pyspark.sql import SparkSession
from pyspark.sql import Row

myConf=pyspark.SparkConf()
spark = pyspark.sql.SparkSession.builder\
    .master("local")\
    .appName("myApp")\
    .config(conf=myConf)\
    .appName('csv file reader') \ # for read csv 
    .getOrCreate()

SyntaxError: unexpected character after line continuation character (<ipython-input-1-2691cd54af75>, line 7)

## csv파일 읽어와서 dataframe에 schema 만들어서 적용

In [89]:
from pyspark.sql.types import StructField, StructType, StringType, LongType

myBikeSchema = StructType([
    StructField("RENT_DATE", StringType(), True),
    StructField("RENT_COUNT", LongType(), True)
])

df = spark.read.format('csv')\
    .option("header",'True')\
    .option("encoding", "utf-8")\
    .schema(myBikeSchema)\
    .load('data/seoul_bike_rent_count_utf-8.csv')
df.show()


+----------+----------+
| RENT_DATE|RENT_COUNT|
+----------+----------+
|2018-01-01|      4950|
|2018-01-02|      7136|
|2018-01-03|      7156|
|2018-01-04|      7102|
|2018-01-05|      7705|
|2018-01-06|      5681|
|2018-01-07|      5220|
|2018-01-08|      6309|
|2018-01-09|      5988|
|2018-01-10|      4476|
|2018-01-11|      4337|
|2018-01-12|      4401|
|2018-01-13|      3756|
|2018-01-14|      4675|
|2018-01-15|      6993|
|2018-01-16|      7421|
|2018-01-17|      6990|
|2018-01-18|      7054|
|2018-01-19|      8329|
|2018-01-20|      6148|
+----------+----------+
only showing top 20 rows



## udf함수를 이용하여 Year 컬럼 생성

In [90]:
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType

year_udf = udf(lambda x: x[:4],StringType())
df = df.withColumn("Year",year_udf(df.RENT_DATE))
df.printSchema()
df.show()

root
 |-- RENT_DATE: string (nullable = true)
 |-- RENT_COUNT: long (nullable = true)
 |-- Year: string (nullable = true)

+----------+----------+----+
| RENT_DATE|RENT_COUNT|Year|
+----------+----------+----+
|2018-01-01|      4950|2018|
|2018-01-02|      7136|2018|
|2018-01-03|      7156|2018|
|2018-01-04|      7102|2018|
|2018-01-05|      7705|2018|
|2018-01-06|      5681|2018|
|2018-01-07|      5220|2018|
|2018-01-08|      6309|2018|
|2018-01-09|      5988|2018|
|2018-01-10|      4476|2018|
|2018-01-11|      4337|2018|
|2018-01-12|      4401|2018|
|2018-01-13|      3756|2018|
|2018-01-14|      4675|2018|
|2018-01-15|      6993|2018|
|2018-01-16|      7421|2018|
|2018-01-17|      6990|2018|
|2018-01-18|      7054|2018|
|2018-01-19|      8329|2018|
|2018-01-20|      6148|2018|
+----------+----------+----+
only showing top 20 rows



## agg() 함수를 이용하여 년도별 대여건수 합계

In [91]:
df.groupBy('Year').agg({"Rent_Count":"sum"}).show()

+----+---------------+
|Year|sum(Rent_Count)|
+----+---------------+
|2019|        1871935|
|2018|       10124874|
+----+---------------+

