# 1. Importing Required Libraries

In [1]:
import pandas as pd

# 2. Getting Data
## Weather data is from Kaggle: https://www.kaggle.com/datasets/guillemservera/global-daily-climate-data

In [2]:
# Read the large daily observations table, then the two small lookup tables.
daily_weather = pd.read_parquet("daily_weather.parquet")
countries, cities = (
    pd.read_csv(csv_name) for csv_name in ("countries.csv", "cities.csv")
)

In [3]:
# Print a summary of the full weather table: index range, column dtypes, and memory usage.
daily_weather.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 27635763 entries, 0 to 24220
Data columns (total 14 columns):
 #   Column                  Dtype         
---  ------                  -----         
 0   station_id              category      
 1   city_name               category      
 2   date                    datetime64[ns]
 3   season                  category      
 4   avg_temp_c              float64       
 5   min_temp_c              float64       
 6   max_temp_c              float64       
 7   precipitation_mm        float64       
 8   snow_depth_mm           float64       
 9   avg_wind_dir_deg        float64       
 10  avg_wind_speed_kmh      float64       
 11  peak_wind_gust_kmh      float64       
 12  avg_sea_level_pres_hpa  float64       
 13  sunshine_total_min      float64       
dtypes: category(3), datetime64[ns](1), float64(10)
memory usage: 2.6 GB


In [4]:
# Preview the first five rows to see what a typical record looks like.
daily_weather.head(n=5)

Unnamed: 0,station_id,city_name,date,season,avg_temp_c,min_temp_c,max_temp_c,precipitation_mm,snow_depth_mm,avg_wind_dir_deg,avg_wind_speed_kmh,peak_wind_gust_kmh,avg_sea_level_pres_hpa,sunshine_total_min
0,41515,Asadabad,1957-07-01,Summer,27.0,21.1,35.6,0.0,,,,,,
1,41515,Asadabad,1957-07-02,Summer,22.8,18.9,32.2,0.0,,,,,,
2,41515,Asadabad,1957-07-03,Summer,24.3,16.7,35.6,1.0,,,,,,
3,41515,Asadabad,1957-07-04,Summer,26.6,16.1,37.8,4.1,,,,,,
4,41515,Asadabad,1957-07-05,Summer,30.8,20.0,41.7,0.0,,,,,,


# 3. Subsetting the data to target location, variables, and dropping Null values
## Target location is Jefferson City 
## Target variables are date, season, average temp, min temp, max temp, precipitation, average wind speed, and average sea level pressure

In [5]:
# Build a boolean mask for the target city and keep only the matching rows.
selected_city = ['Jefferson City']
selection_boolean = daily_weather['city_name'].isin(selected_city)
weather_JF_df = daily_weather.loc[selection_boolean]

# Summarize the subset, including per-column non-null counts.
weather_JF_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 19295 entries, 0 to 19294
Data columns (total 14 columns):
 #   Column                  Non-Null Count  Dtype         
---  ------                  --------------  -----         
 0   station_id              19295 non-null  category      
 1   city_name               19295 non-null  category      
 2   date                    19295 non-null  datetime64[ns]
 3   season                  19295 non-null  category      
 4   avg_temp_c              17778 non-null  float64       
 5   min_temp_c              19291 non-null  float64       
 6   max_temp_c              19293 non-null  float64       
 7   precipitation_mm        19292 non-null  float64       
 8   snow_depth_mm           19100 non-null  float64       
 9   avg_wind_dir_deg        11981 non-null  float64       
 10  avg_wind_speed_kmh      17884 non-null  float64       
 11  peak_wind_gust_kmh      9386 non-null   float64       
 12  avg_sea_level_pres_hpa  17072 non-null  float64

In [6]:
# Count the missing values in each column of the Jefferson City subset.
weather_JF_df.isna().sum()

station_id                   0
city_name                    0
date                         0
season                       0
avg_temp_c                1517
min_temp_c                   4
max_temp_c                   2
precipitation_mm             3
snow_depth_mm              195
avg_wind_dir_deg          7314
avg_wind_speed_kmh        1411
peak_wind_gust_kmh        9909
avg_sea_level_pres_hpa    2223
sunshine_total_min        8325
dtype: int64

In [7]:
# Columns retained for the analysis; the remaining columns are left out of core_weather.
core_columns = [
    "date",
    "season",
    "avg_temp_c",
    "min_temp_c",
    "max_temp_c",
    "precipitation_mm",
    "avg_wind_speed_kmh",
    "avg_sea_level_pres_hpa",
]

# .copy() makes core_weather an independent frame rather than a view of weather_JF_df.
core_weather = weather_JF_df.loc[:, core_columns].copy()
core_weather

Unnamed: 0,date,season,avg_temp_c,min_temp_c,max_temp_c,precipitation_mm,avg_wind_speed_kmh,avg_sea_level_pres_hpa
0,1969-11-01,Autumn,,4.4,7.8,0.0,,
1,1969-11-02,Autumn,,4.4,6.7,0.0,,
2,1969-11-03,Autumn,,-0.6,6.7,0.3,,
3,1969-11-04,Autumn,,-0.6,7.2,0.0,,
4,1969-11-05,Autumn,,-1.1,18.3,0.0,,
...,...,...,...,...,...,...,...,...
19290,2023-08-24,Summer,30.4,25.6,36.0,0.0,11.8,1014.4
19291,2023-08-25,Summer,30.8,25.6,36.7,0.0,9.0,1013.4
19292,2023-08-26,Summer,24.5,22.2,27.0,10.8,12.2,1014.5
19293,2023-08-27,Summer,22.5,19.1,26.6,0.0,12.3,1015.5


In [8]:
# Discard every row that is missing a value in any retained column.
Jeff_Weather = core_weather.dropna(axis=0, how="any")

# Confirm that no missing values remain in any column.
Jeff_Weather.isna().sum()

date                      0
season                    0
avg_temp_c                0
min_temp_c                0
max_temp_c                0
precipitation_mm          0
avg_wind_speed_kmh        0
avg_sea_level_pres_hpa    0
dtype: int64

In [9]:
# Persist the cleaned Jefferson City data; to_csv also writes the DataFrame index
# as the first (unnamed) column because `index` defaults to True.
Jeff_Weather.to_csv('Jeff_Weather.csv') # Saving data to CSV