In [None]:
pip --version

pip 24.1.2 from /usr/local/lib/python3.11/dist-packages/pip (python 3.11)


In [None]:
!pip install pyspark py4j



In [29]:
from pyspark.sql import SparkSession
from pyspark.sql.types import DateType, StringType
from pyspark.sql.functions import col, when, year, month, avg, sum as _sum

# Создание объекта SparkSession
spark = SparkSession.builder \
    .appName("MySparkApp") \
    .config("spark.master", "local[*]") \
    .getOrCreate()


In [30]:
# Данные из csv
df = spark.read.csv("/content/sample_data/weather_data2.csv", header=True, inferSchema=True)

df.show(5) # обратить внимание на station_1|2019-01-01 (temperature тут NULL)



+----------+----------+------------------+------------------+-----------------+
|station_id|      date|       temperature|     precipitation|       wind_speed|
+----------+----------+------------------+------------------+-----------------+
| station_1|2019-01-01|              NULL| 23.67004407474563|5.458999894629757|
| station_7|2022-06-28|-6.751842212861652| 23.67004407474563|5.458999894629757|
| station_4|2020-04-07| -9.57484426026233|2.9858244485444665|6.828505392085966|
| station_8|2018-12-22| 19.34342035369938| 33.58211407730149|8.975576079892296|
| station_5|2021-09-09|30.880953114964086|29.110437988411558|18.26465360842913|
+----------+----------+------------------+------------------+-----------------+
only showing top 5 rows



In [31]:
df.dtypes

[('station_id', 'string'),
 ('date', 'date'),
 ('temperature', 'double'),
 ('precipitation', 'double'),
 ('wind_speed', 'double')]

In [32]:
# Проверяем тип данных столбца 'date' с помощью встроенной ф-ции python
# isinstance(object, classinfo)
# и если это не DateType, то изменим ему тип

if not isinstance(df.schema['date'].dataType, DateType):
    df = df.withColumn('date', col('date').cast(DateType()))

df.printSchema()
df.show(5)


root
 |-- station_id: string (nullable = true)
 |-- date: date (nullable = true)
 |-- temperature: double (nullable = true)
 |-- precipitation: double (nullable = true)
 |-- wind_speed: double (nullable = true)

+----------+----------+------------------+------------------+-----------------+
|station_id|      date|       temperature|     precipitation|       wind_speed|
+----------+----------+------------------+------------------+-----------------+
| station_1|2019-01-01|              NULL| 23.67004407474563|5.458999894629757|
| station_7|2022-06-28|-6.751842212861652| 23.67004407474563|5.458999894629757|
| station_4|2020-04-07| -9.57484426026233|2.9858244485444665|6.828505392085966|
| station_8|2018-12-22| 19.34342035369938| 33.58211407730149|8.975576079892296|
| station_5|2021-09-09|30.880953114964086|29.110437988411558|18.26465360842913|
+----------+----------+------------------+------------------+-----------------+
only showing top 5 rows



In [33]:
# Заполнение пропущенных значений средними значениями по метеостанциям

# Для этого сначала создадим новый DataFrame avg_val_for_station
# в котором вычислим средние значения для каждого столбца

avg_val_for_station = df.groupBy('station_id').agg( # agg позволяет агрегировать несколько ф-ций
    avg('temperature').alias('avg_temp'),
    avg('precipitation').alias('avg_prec'),
    avg('wind_speed').alias('avg_wind_speed')
)

avg_val_for_station.show()


+----------+------------------+------------------+-----------------+
|station_id|          avg_temp|          avg_prec|   avg_wind_speed|
+----------+------------------+------------------+-----------------+
| station_3| 9.740896689073265| 29.30794836488954|9.549718299425892|
| station_2|13.548804373583142| 23.87917025369124|9.442878291606355|
| station_6|7.4142338849559595|26.324503886442812|8.838197472682314|
| station_1| 7.592646639928555| 26.11867608871683|10.47702484270486|
|station_10|10.315596733581119| 27.47307945637249|9.189041643479774|
| station_7|11.346867821763226|26.187226363752927|9.483246108111171|
| station_4|11.525432311947469|24.391630446521486|11.11827270875864|
| station_8| 9.303162647953123|25.400828618016806|8.415283574279137|
| station_5|  9.02260103202405|24.637566483956036|9.668542320392868|
| station_9|10.904335933104583| 25.38019287227167|9.327308366972966|
+----------+------------------+------------------+-----------------+



In [34]:
# Объединяем исходный DataFrame со средними значениями

df = df.join(avg_val_for_station, on='station_id', how='left')

df.show(5)

+----------+----------+------------------+------------------+-----------------+------------------+------------------+-----------------+
|station_id|      date|       temperature|     precipitation|       wind_speed|          avg_temp|          avg_prec|   avg_wind_speed|
+----------+----------+------------------+------------------+-----------------+------------------+------------------+-----------------+
| station_1|2019-01-01|              NULL| 23.67004407474563|5.458999894629757| 7.592646639928555| 26.11867608871683|10.47702484270486|
| station_7|2022-06-28|-6.751842212861652| 23.67004407474563|5.458999894629757|11.346867821763226|26.187226363752927|9.483246108111171|
| station_4|2020-04-07| -9.57484426026233|2.9858244485444665|6.828505392085966|11.525432311947469|24.391630446521486|11.11827270875864|
| station_8|2018-12-22| 19.34342035369938| 33.58211407730149|8.975576079892296| 9.303162647953123|25.400828618016806|8.415283574279137|
| station_5|2021-09-09|30.880953114964086|29.110

In [35]:
# Заполним пропущенных значений средними значениями. when().otherwise()
# -- обратить внимание на station_1|2019-01-01
df = df.withColumn(
    'temperature',
    when(col('temperature').isNull(), col('avg_temp')).otherwise(col('temperature'))
).withColumn(
    'precipitation',
    when(col('precipitation').isNull(), col('avg_prec')).otherwise(col('precipitation'))
).withColumn(
    'wind_speed',
    when(col('wind_speed').isNull(), col('avg_wind_speed')).otherwise(col('precipitation'))
)

df.show(5)


+----------+----------+------------------+------------------+------------------+------------------+------------------+-----------------+
|station_id|      date|       temperature|     precipitation|        wind_speed|          avg_temp|          avg_prec|   avg_wind_speed|
+----------+----------+------------------+------------------+------------------+------------------+------------------+-----------------+
| station_1|2019-01-01| 7.592646639928555| 23.67004407474563| 23.67004407474563| 7.592646639928555| 26.11867608871683|10.47702484270486|
| station_7|2022-06-28|-6.751842212861652| 23.67004407474563| 23.67004407474563|11.346867821763226|26.187226363752927|9.483246108111171|
| station_4|2020-04-07| -9.57484426026233|2.9858244485444665|2.9858244485444665|11.525432311947469|24.391630446521486|11.11827270875864|
| station_8|2018-12-22| 19.34342035369938| 33.58211407730149| 33.58211407730149| 9.303162647953123|25.400828618016806|8.415283574279137|
| station_5|2021-09-09|30.880953114964086

In [36]:
# Удаляем временные столбцы со средними значениями
df = df.drop('avg_temp', 'avg_prec', 'avg_wind_speed')

df.show(5)

+----------+----------+------------------+------------------+------------------+
|station_id|      date|       temperature|     precipitation|        wind_speed|
+----------+----------+------------------+------------------+------------------+
| station_1|2019-01-01| 7.592646639928555| 23.67004407474563| 23.67004407474563|
| station_7|2022-06-28|-6.751842212861652| 23.67004407474563| 23.67004407474563|
| station_4|2020-04-07| -9.57484426026233|2.9858244485444665|2.9858244485444665|
| station_8|2018-12-22| 19.34342035369938| 33.58211407730149| 33.58211407730149|
| station_5|2021-09-09|30.880953114964086|29.110437988411558|29.110437988411558|
+----------+----------+------------------+------------------+------------------+
only showing top 5 rows



In [37]:
# топ-5 самых жарких дней за все время наблюдений.
hot_days = df.orderBy(df.temperature.desc())\
  .limit(5).select('date', 'temperature')
hot_days.show()

+----------+------------------+
|      date|       temperature|
+----------+------------------+
|2021-08-20|39.982828249354846|
|2023-12-02| 39.96797489293784|
|2022-03-28|  39.8246894248997|
|2019-02-11| 39.76737697836647|
|2020-06-10| 39.69147838355929|
+----------+------------------+



In [27]:
df.printSchema()

root
 |-- station_id: string (nullable = true)
 |-- date: date (nullable = true)
 |-- temperature: double (nullable = true)
 |-- precipitation: double (nullable = true)
 |-- wind_speed: double (nullable = true)



In [44]:
#метеостанция с наибольшим количеством осадков за последний год.

last_year = df.sort(df.date.desc()) \
  .limit(1) \
  .select(year('date').alias('year')).collect()[0]['year'] # тут получаем
# DataFrame DataFrame[year: int], и с помощью collect() 'делаем' его списком
#[Row(year=2023)], из которого берем берем первый элимент[0] Row(year=2023)
#и дергаем оттуда год ['year'] в итоге тип last_year будет int и равен 2023

station_precip = df.filter(year('date') == last_year)\
  .groupBy('station_id') \
  .agg(_sum('precipitation').alias('sum_prec')) \
  .orderBy(col('sum_prec').desc()) \
  .limit(1)

station_precip.show()



2023


In [13]:
# средняя температура по месяцам за все время наблюдений.

month_avg_temp = df.withColumn('month', month('date'))\
  .groupBy('month').agg(avg('temperature').alias('temperature'))\
  .orderBy('month')

month_avg_temp.show()

+-----+------------------+
|month|       temperature|
+-----+------------------+
|    1|11.313747191839138|
|    2| 9.067229891101926|
|    3| 7.244080205633994|
|    4|12.024529009744693|
|    5| 9.902883346912718|
|    6|13.421092297254138|
|    7|6.1857183016954576|
|    8|  10.9678002814186|
|    9| 9.596744236573942|
|   10|  9.09884344821895|
|   11| 7.265889994697494|
|   12|11.218592100674337|
+-----+------------------+



In [137]:
spark.stop()