# Analyzing Weather Data

## Packages and Presets

In [26]:

import kaggle
import polars as pl
import missingno as msno
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np


import plotnine as pn



%load_ext blackcellmagic

## Fetching the Data

In [3]:
dataset = 'guillemservera/global-daily-climate-data'
# Download the dataset
kaggle.api.dataset_download_files(dataset, path='datasets/', unzip=True)

Dataset URL: https://www.kaggle.com/datasets/guillemservera/global-daily-climate-data


In [6]:
cities = pl.read_csv('datasets/cities.csv')
countries = pl.read_csv('datasets/countries.csv')
daily_weather = pl.read_parquet('datasets/daily_weather.parquet')

## EDA

In [15]:
print(cities.shape)
cities.head()


(1245, 8)


station_id,city_name,country,state,iso2,iso3,latitude,longitude
str,str,str,str,str,str,f64,f64
"""41515""","""Asadabad""","""Afghanistan""","""Kunar""","""AF""","""AFG""",34.866,71.150005
"""38954""","""Fayzabad""","""Afghanistan""","""Badakhshan""","""AF""","""AFG""",37.129761,70.579247
"""41560""","""Jalalabad""","""Afghanistan""","""Nangarhar""","""AF""","""AFG""",34.441527,70.436103
"""38947""","""Kunduz""","""Afghanistan""","""Kunduz""","""AF""","""AFG""",36.727951,68.87253
"""38987""","""Qala i Naw""","""Afghanistan""","""Badghis""","""AF""","""AFG""",34.983,63.1333


In [17]:
print(countries.shape)
countries.head()

(214, 11)


country,native_name,iso2,iso3,population,area,capital,capital_lat,capital_lng,region,continent
str,str,str,str,f64,f64,str,f64,f64,str,str
"""Afghanistan""","""افغانستان""","""AF""","""AFG""",26023100.0,652230.0,"""Kabul""",34.526011,69.177684,"""Southern and C…","""Asia"""
"""Albania""","""Shqipëria""","""AL""","""ALB""",2895947.0,28748.0,"""Tirana""",41.326873,19.818791,"""Southern Europ…","""Europe"""
"""Algeria""","""الجزائر""","""DZ""","""DZA""",38700000.0,2381741.0,"""Algiers""",36.775361,3.060188,"""Northern Afric…","""Africa"""
"""American Samoa…","""American Samoa…","""AS""","""ASM""",55519.0,199.0,"""Pago Pago""",-14.275479,-170.70483,"""Polynesia""","""Oceania"""
"""Angola""","""Angola""","""AO""","""AGO""",24383301.0,1246700.0,"""Luanda""",-8.82727,13.243951,"""Central Africa…","""Africa"""


In [19]:
print(daily_weather.shape)
daily_weather.head()

(27635763, 15)


station_id,city_name,date,season,avg_temp_c,min_temp_c,max_temp_c,precipitation_mm,snow_depth_mm,avg_wind_dir_deg,avg_wind_speed_kmh,peak_wind_gust_kmh,avg_sea_level_pres_hpa,sunshine_total_min,__index_level_0__
cat,cat,datetime[ns],cat,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,i64
"""41515""","""Asadabad""",1957-07-01 00:00:00,"""Summer""",27.0,21.1,35.6,0.0,,,,,,,0
"""41515""","""Asadabad""",1957-07-02 00:00:00,"""Summer""",22.8,18.9,32.2,0.0,,,,,,,1
"""41515""","""Asadabad""",1957-07-03 00:00:00,"""Summer""",24.3,16.7,35.6,1.0,,,,,,,2
"""41515""","""Asadabad""",1957-07-04 00:00:00,"""Summer""",26.6,16.1,37.8,4.1,,,,,,,3
"""41515""","""Asadabad""",1957-07-05 00:00:00,"""Summer""",30.8,20.0,41.7,0.0,,,,,,,4


In [30]:
cities.group_by("country").agg(pl.count("country").alias("count")).sort(
    "count", descending=True
).head(10)

country,count
str,u32
"""Russia""",77
"""United States …",49
"""Turkey""",44
"""Thailand""",38
"""India""",29
"""Algeria""",27
"""Iran""",26
"""China""",25
"""Mexico""",25
"""Indonesia""",24


We can see that both the cities dataaframe and the daily_weather dataframe have a column called "station_id" meaning that we can join the two dataframes on this column. Additionally, we can add information about the countries by joining the cities dataframe with the countries dataframe on the "country" column.

In [36]:
daily_weather = daily_weather.with_columns([
    (pl.col("station_id").cast(pl.Utf8)).alias("station_id"),
])
weather_country = daily_weather.join(cities, on="station_id", how="left")

In [39]:
weather_full = weather_country.join(countries, on="country", how="left")
print(weather_full.shape)
weather_full.head()

(28388191, 32)


station_id,city_name,date,season,avg_temp_c,min_temp_c,max_temp_c,precipitation_mm,snow_depth_mm,avg_wind_dir_deg,avg_wind_speed_kmh,peak_wind_gust_kmh,avg_sea_level_pres_hpa,sunshine_total_min,__index_level_0__,city_name_right,country,state,iso2,iso3,latitude,longitude,native_name,iso2_right,iso3_right,population,area,capital,capital_lat,capital_lng,region,continent
str,cat,datetime[ns],cat,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,i64,str,str,str,str,str,f64,f64,str,str,str,f64,f64,str,f64,f64,str,str
"""41515""","""Asadabad""",1957-07-01 00:00:00,"""Summer""",27.0,21.1,35.6,0.0,,,,,,,0,"""Asadabad""","""Afghanistan""","""Kunar""","""AF""","""AFG""",34.866,71.150005,"""افغانستان""","""AF""","""AFG""",26023100.0,652230.0,"""Kabul""",34.526011,69.177684,"""Southern and C…","""Asia"""
"""41515""","""Asadabad""",1957-07-02 00:00:00,"""Summer""",22.8,18.9,32.2,0.0,,,,,,,1,"""Asadabad""","""Afghanistan""","""Kunar""","""AF""","""AFG""",34.866,71.150005,"""افغانستان""","""AF""","""AFG""",26023100.0,652230.0,"""Kabul""",34.526011,69.177684,"""Southern and C…","""Asia"""
"""41515""","""Asadabad""",1957-07-03 00:00:00,"""Summer""",24.3,16.7,35.6,1.0,,,,,,,2,"""Asadabad""","""Afghanistan""","""Kunar""","""AF""","""AFG""",34.866,71.150005,"""افغانستان""","""AF""","""AFG""",26023100.0,652230.0,"""Kabul""",34.526011,69.177684,"""Southern and C…","""Asia"""
"""41515""","""Asadabad""",1957-07-04 00:00:00,"""Summer""",26.6,16.1,37.8,4.1,,,,,,,3,"""Asadabad""","""Afghanistan""","""Kunar""","""AF""","""AFG""",34.866,71.150005,"""افغانستان""","""AF""","""AFG""",26023100.0,652230.0,"""Kabul""",34.526011,69.177684,"""Southern and C…","""Asia"""
"""41515""","""Asadabad""",1957-07-05 00:00:00,"""Summer""",30.8,20.0,41.7,0.0,,,,,,,4,"""Asadabad""","""Afghanistan""","""Kunar""","""AF""","""AFG""",34.866,71.150005,"""افغانستان""","""AF""","""AFG""",26023100.0,652230.0,"""Kabul""",34.526011,69.177684,"""Southern and C…","""Asia"""
