<a href="https://colab.research.google.com/github/io-uty/2024-spark/blob/main/io-uty/weatherDB.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q http://archive.apache.org/dist/spark/spark-3.5.1/spark-3.5.1-bin-hadoop3.tgz
!tar xf spark-3.5.1-bin-hadoop3.tgz
!pip install -q findspark

In [None]:
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.5.1-bin-hadoop3"

In [None]:
import findspark
findspark.init()
from pyspark.sql import SparkSession
spark = SparkSession.builder.master("local[*]").getOrCreate()
spark.conf.set("spark.sql.repl.eagerEval.enabled", True) # Property used to format output tables better
spark

In [None]:
dfApril = spark.read.csv('202304.csv', header = True, sep = ",")
dfMay = spark.read.csv('202305.csv', header = True, sep = ",")
dfJune = spark.read.csv('202306.csv', header = True, sep = ",")
dfJuly = spark.read.csv('202307.csv', header = True, sep = ",")
dfAugust = spark.read.csv('202308.csv', header = True, sep = ",")
dfSeptember = spark.read.csv('202309.csv', header = True, sep = ",")
dfAll = dfApril.union(dfMay).union(dfJune).union(dfJuly).union(dfAugust).union(dfSeptember)


# **서울의 기온과 폭염특보의 관계 비교**
문제 설명 : 서울(108)의 월별 평균기온과 폭염특보가 뜬 개수를 구해 1개월 단위로 비교하십시오.
지역 : 서울

1. 2023년 4~9월 테이블을 사용합니다.
2. 해당 지역의 최고기온, 평균기온, 최저기온의 각 평균을 구해 분석합니다.
3. 해당 지역에 한 달간 폭염특보가 내려진 횟수를 계산합니다.
4. 폭염특보의 횟수가 높은 순서로 결과를 정렬합니다. 횟수가 동일할 경우 평균기온이 높은 순서대로 정렬합니다.

In [None]:
aprilOfSeoul = dfApril.filter(dfApril['지점'] == '서울(108)')
mayOfSeoul = dfMay.filter(dfMay['지점'] == '서울(108)')
juneOfSeoul = dfJune.filter(dfJune['지점'] == '서울(108)')
julyOfSeoul = dfJuly.filter(dfJuly['지점'] == '서울(108)')
augustOfSeoul = dfAugust.filter(dfAugust['지점'] == '서울(108)')
septemberOfSeoul = dfSeptember.filter(dfSeptember['지점'] == '서울(108)')

In [None]:
from pyspark.sql.functions import avg, sum, when

aprilAvg = spark.createDataFrame([("April",)], ["month"])\
  .join(aprilOfSeoul.agg(
    avg("최고기온(°C)").alias("avgHightmp"),
    avg("평균기온(°C)").alias("avgtmp"),
    avg("최저기온(°C)").alias("avgLowtmp"),
    sum(when(dfApril['폭염특보(O/X)'] == 'O', 1).otherwise(0)).alias("soHot")
))

mayAvg = spark.createDataFrame([("May",)], ["month"])\
  .join(mayOfSeoul.agg(
    avg("최고기온(°C)").alias("avgHightmp"),
    avg("평균기온(°C)").alias("avgtmp"),
    avg("최저기온(°C)").alias("avgLowtmp"),
    sum(when(dfMay['폭염특보(O/X)'] == 'O', 1).otherwise(0)).alias("soHot")
))
juneAvg = spark.createDataFrame([("June",)], ["month"])\
  .join(juneOfSeoul.agg(
    avg("최고기온(°C)").alias("avgHightmp"),
    avg("평균기온(°C)").alias("avgtmp"),
    avg("최저기온(°C)").alias("avgLowtmp"),
    sum(when(dfJune['폭염특보(O/X)'] == 'O', 1).otherwise(0)).alias("soHot")
))
julyAvg = spark.createDataFrame([("July",)], ["month"])\
  .join(julyOfSeoul.agg(
    avg("최고기온(°C)").alias("avgHightmp"),
    avg("평균기온(°C)").alias("avgtmp"),
    avg("최저기온(°C)").alias("avgLowtmp"),
    sum(when(dfJuly['폭염특보(O/X)'] == 'O', 1).otherwise(0)).alias("soHot")
))
augustAvg = spark.createDataFrame([("August",)], ["month"])\
  .join(augustOfSeoul.agg(
    avg("최고기온(°C)").alias("avgHightmp"),
    avg("평균기온(°C)").alias("avgtmp"),
    avg("최저기온(°C)").alias("avgLowtmp"),
    sum(when(dfAugust['폭염특보(O/X)'] == 'O', 1).otherwise(0)).alias("soHot")
))
septemberAvg = spark.createDataFrame([("September",)], ["month"])\
  .join(septemberOfSeoul.agg(
    avg("최고기온(°C)").alias("avgHightmp"),
    avg("평균기온(°C)").alias("avgtmp"),
    avg("최저기온(°C)").alias("avgLowtmp"),
    sum(when(dfSeptember['폭염특보(O/X)'] == 'O', 1).otherwise(0)).alias("soHot")
))


In [None]:
dfAvgs = aprilAvg\
  .union(mayAvg)\
  .union(juneAvg)\
  .union(julyAvg)\
  .union(augustAvg)\
  .union(septemberAvg)

dfAvgs.orderBy("soHot", ascending = False).show()

+---------+------------------+------------------+------------------+-----+
|    month|        avgHightmp|            avgtmp|         avgLowtmp|soHot|
+---------+------------------+------------------+------------------+-----+
|   August| 30.76451612903226|27.187096774193545|24.309677419354834|   19|
|     July|30.193548387096772| 26.70645161290323|23.851612903225806|   14|
|     June| 27.85333333333334|23.383333333333333| 19.74666666666667|    3|
|September|27.616666666666674|23.663333333333338|20.233333333333338|    2|
|    April| 18.95333333333333|13.799999999999999| 8.806666666666665|    0|
|      May|25.041935483870965| 19.52258064516129|14.606451612903227|    0|
+---------+------------------+------------------+------------------+-----+



# **자외선 지수와 온도, 습도의 상관관계 분석**

In [None]:
from pyspark.sql.functions import avg, sum, when, lpad, col
dfAll =dfAll.withColumn("일시", lpad(col("일시"), 10, " "))
avgAll = dfAll.groupBy('일시')\
  .agg(
    avg("최고기온(°C)").alias("avgHightmp"),
    avg("평균상대습도(%)").alias("avgHumidity"),
    avg(when(dfAll['자외선지수(단계)'] == '낮음', 1.5)\
        .when(dfAll['자외선지수(단계)'] == '보통', 4.5)\
        .when(dfAll['자외선지수(단계)'] == '높음', 7)\
        .when(dfAll['자외선지수(단계)'] == '매우높음', 9.5)\
        .when(dfAll['자외선지수(단계)'] == '위험', 13)).alias("avgUV")
)\
  .orderBy("일시")
avgHighValue = avgAll.select(avg("avgHightmp")).collect()[0][0]
HOTavgUVValue = avgAll.filter(avgAll['avgHightmp'] >= avgHighValue).select("avgUV").collect()[0][0]
COOLavgUVValue = avgAll.filter(avgAll['avgHightmp'] < avgHighValue).select("avgUV").collect()[0][0]
print("가장 더울 때의 자외선 지수 : ", HOTavgUVValue)
print("가장 시원할 때의 자외선 지수 : ", COOLavgUVValue)
avgHumidityValue = avgAll.select("avgHumidity").collect()[0][0]
HighHumUVValue = avgAll.filter(avgAll['avgHumidity'] >= avgHumidityValue).select("avgUV").collect()[0][0]
LowHumUVValue = avgAll.filter(avgAll['avgHumidity'] < avgHumidityValue).select("avgUV").collect()[0][0]
print("가장 습할 때의 자외선 지수 : ", HighHumUVValue)
print("가장 뽀송할 때의 자외선 지수 : ", LowHumUVValue)

가장 더울 때의 자외선 지수 :  8.697530864197532
가장 시원할 때의 자외선 지수 :  6.814814814814815
가장 습할 때의 자외선 지수 :  6.814814814814815
가장 뽀송할 때의 자외선 지수 :  7.0


# **매 월마다 폭염특보가 가장 많이 발효된 지역**

In [None]:
heatApril = dfApril.groupBy('지점')\
.agg(sum(when(dfApril['폭염특보(O/X)'] == 'O', 1).otherwise(0)).alias("heatwave_count"))\
.orderBy("heatwave_count", "지점", ascending = False)\

heatMay = dfMay.groupBy('지점')\
.agg(sum(when(dfMay['폭염특보(O/X)'] == 'O', 1).otherwise(0)).alias("heatwave_count"))\
.orderBy("heatwave_count","지점", ascending = False)\

heatJune = dfJune.groupBy('지점')\
.agg(sum(when(dfJune['폭염특보(O/X)'] == 'O', 1).otherwise(0)).alias("heatwave_count"))\
.orderBy("heatwave_count", "지점",ascending = False)\

heatJuly = dfJuly.groupBy('지점')\
.agg(sum(when(dfJuly['폭염특보(O/X)'] == 'O', 1).otherwise(0)).alias("heatwave_count"))\
.orderBy("heatwave_count","지점", ascending = False)\

heatAugust = dfAugust.groupBy('지점')\
.agg(sum(when(dfAugust['폭염특보(O/X)'] == 'O', 1).otherwise(0)).alias("heatwave_count"))\
.orderBy("heatwave_count", "지점",ascending = False)\

heatSeptember =dfSeptember.groupBy('지점')\
.agg(sum(when(dfSeptember['폭염특보(O/X)'] == 'O', 1).otherwise(0)).alias("heatwave_count"))\
.orderBy("heatwave_count", "지점",ascending = False)\



In [None]:
april = spark.createDataFrame([("4",)], ["month"])\
  .join(heatApril.limit(1)) # Use limit(1) to get a DataFrame with one row
may = spark.createDataFrame([("5",)], ["month"])\
  .join(heatMay.limit(1))
june = spark.createDataFrame([("6",)], ["month"])\
  .join(heatJune.limit(1))
july = spark.createDataFrame([("7",)], ["month"])\
  .join(heatJuly.limit(1))
august = spark.createDataFrame([("8",)], ["month"])\
  .join(heatAugust.limit(1))
september = spark.createDataFrame([("9",)], ["month"])\
  .join(heatSeptember.limit(1))
dfAvgs = april\
  .union(may)\
  .union(june)\
  .union(july)\
  .union(august)\
  .union(september)

dfAvgs.orderBy("heatwave_count", ascending = False).show()

+-----+-----------+--------------+
|month|       지점|heatwave_count|
+-----+-----------+--------------+
|    8|  합천(285)|            26|
|    7|  합천(285)|            20|
|    9|  화순(741)|             6|
|    6|  홍천(212)|             6|
|    4|흑산도(169)|             0|
|    5|흑산도(169)|             0|
+-----+-----------+--------------+



# **대한민국 행정구역별 평균 온도 분석**

In [None]:
regions = {"서울특별시": ["108"],
"인천광역시": ["112"],
"부산광역시": ["159"],
"대구광역시": ["143"],
"대전광역시": ["133"],
"울산광역시": ["152"],
"세종특별자치시": ["239"],
"경기도": [
    119, 202, 203, 551, 549, 434, 433, 437, 438, 441, 444, 445,
    505, 504, 516, 532, 540, 541, 545, 546, 548, 550, 555, 556,
    565, 569, 571, 572, 590, 598, 601, 602, 603, 604
],
"강원도": [
    104, 106, 114, 115, 121, 211, 212, 216, 217, 526, 555, 556,
    606, 90, 876, 93
],
"충청북도": [127, 131, 221, 226, 601, 602, 603, 604],
"충청남도": [
    129, 235, 236, 238, 232, 612, 615, 616, 618, 619, 627, 628,
    634, 636
],
"전라북도": [140, 146, 702, 245, 247, 248, 254, 734, 737],
"전라남도": [
    165, 168, 169, 170, 172, 712, 713, 730, 731, 732, 706, 709,
    710, 741, 754, 768, 789, 259, 260, 261, 262
],
"경상북도": [
    135, 136, 137, 138, 271, 272, 273, 276, 277, 278, 279, 281,
    283, 284, 285, 288, 289, 294, 295, 801, 810, 812, 813, 815,
    822, 823, 825, 827
],
"경상남도": [155, 162, 192, 253, 257, 264, 263, 268, 920],
"제주특별자치도": ["184"]}

In [None]:
allTmp = dfAll.groupby("지점")\
  .agg(avg("평균기온(°C)").alias("avgTmp"))

allTmp.orderBy("avgTmp", ascending=False).show()

+-----------+------------------+
|       지점|            avgTmp|
+-----------+------------------+
|  제주(184)|23.403888888888886|
|  청주(131)|23.071584699453553|
|양산시(257)| 22.91639344262295|
|  광명(437)|22.914285714285718|
|  대구(143)|22.661202185792348|
|순천시(712)|22.586263736263735|
|  밀양(288)|22.568681318681318|
|광양읍(713)|22.550273224043714|
|  광주(156)|22.521857923497272|
|  경산(827)|22.502747252747252|
|  포항(138)|22.468306010928956|
|  전주(146)|22.444505494505496|
|  칠곡(825)| 22.43934426229508|
|  창녕(919)|22.426775956284153|
|  서울(108)|22.411475409836072|
|의령군(263)| 22.36353591160221|
|김해시(253)|22.278688524590162|
|  성주(810)|22.240437158469945|
|  옥천(604)| 22.22252747252747|
|  논산(615)| 22.19344262295082|
+-----------+------------------+
only showing top 20 rows



# **폭염 & 자외선 기준으로 최고체감온도 집계하기**

In [None]:
columns_to_check = [col for col in dfAll.columns if col != '폭염영향예보(단계)']
filtered_df = dfAll.dropna(subset=columns_to_check)

filtered_df.count()

27335

In [None]:
import pyspark.sql.functions as F

heatOX = filtered_df.groupBy(filtered_df['폭염여부(O/X)'])\
    .agg(
        F.count(F.lit(1)).alias("heatwave_count"),
        F.min(filtered_df["최고체감온도(°C)"]).alias("minTmp"),
        F.avg(filtered_df["최고체감온도(°C)"]).alias("avgTmp"),
        F.max(filtered_df["최고체감온도(°C)"]).alias("maxTmp")
    )
heatOX.show()

heatWarningOX = filtered_df.groupBy(filtered_df['폭염특보(O/X)'])\
    .agg(
        F.count(F.lit(1)).alias("heatWarning_count"),
        F.min(filtered_df["최고체감온도(°C)"]).alias("minTmp"),
        F.avg(filtered_df["최고체감온도(°C)"]).alias("avgTmp"),
        F.max(filtered_df["최고체감온도(°C)"]).alias("maxTmp")
    )
heatWarningOX.show()
heatImpact = filtered_df.groupBy(filtered_df['폭염영향예보(단계)'])\
    .agg(
        F.count(F.lit(1)).alias("heatwaveImpact_count"),
        F.min(filtered_df["최고체감온도(°C)"]).alias("minTmp"),
        F.avg(filtered_df["최고체감온도(°C)"]).alias("avgTmp"),
        F.max(filtered_df["최고체감온도(°C)"]).alias("maxTmp")
    )
heatImpact.show()
UVLevel = filtered_df.groupBy(filtered_df['자외선지수(단계)'])\
    .agg(
        F.count(F.lit(1)).alias("UVlevel_count"),
        F.max(filtered_df["평균기온(°C)"]).alias("maxTmp"),
        F.avg(filtered_df["평균기온(°C)"]).alias("avgTmp"),
        F.min(filtered_df["평균기온(°C)"]).alias("minTmp")
    )
UVLevel.show()

+-------------+--------------+------+------------------+------+
|폭염여부(O/X)|heatwave_count|minTmp|            avgTmp|maxTmp|
+-------------+--------------+------+------------------+------+
|            O|          3302|    33|34.159721380981225|  37.8|
|            X|         24033|   1.7|25.538026879707047|   9.9|
+-------------+--------------+------+------------------+------+

+-------------+-----------------+------+------------------+------+
+-------------+-----------------+------+------------------+------+
|            O|             5625|  25.4|33.105831111111115|  37.8|
|            X|            21710|   1.7| 24.88855366190694|   9.9|
+-------------+-----------------+------+------------------+------+

+------------------+--------------------+------+------------------+------+
|폭염영향예보(단계)|heatwaveImpact_count|minTmp|            avgTmp|maxTmp|
+------------------+--------------------+------+------------------+------+
|                  |               18514|   1.7|23.892459760181477