In [11]:
import pandas as pd
import numpy as np
import datetime as dt

In [12]:
# Load the landing-zone weather extract and report how many rows it contains

df_l = pd.read_csv('../data/landing/weather_raw.csv')
df_l.shape[0]

4392

In [15]:
# Using lower case convention for the columns' names

# Dataframe of landing data
df_l = pd.read_csv('../data/landing/weather_raw.csv')

# Mapping from each original header to its lower-case form
new_cols = {col: col.lower() for col in df_l.columns}
lower_cols = list(new_cols.values())

# Dataframe of raw data (renamed)
df_r = df_l.rename(columns=new_cols)

# Write the *renamed* frame (df_r), not the landing frame (df_l), so the file
# in the raw folder actually carries the lower-case headers.
# NOTE(review): to_csv writes the index as an extra unnamed column by default.
df_r.to_csv('../data/raw/weather_renamed.csv')

In [16]:
# Feature availability check: which columns are populated well enough to use?

# Share of missing entries per column (a fraction between 0 and 1)
missing_percentage = df_r.isnull().mean()

# Keep the columns whose missing share is strictly below 20%
is_available = missing_percentage.lt(0.2)
cols = missing_percentage.loc[is_available].index
df_r.loc[:, cols].columns

Index(['unnamed: 0', 'station', 'date', 'source', 'latitude', 'longitude',
       'elevation', 'name', 'report_type', 'call_sign', 'quality_control',
       'wnd', 'cig', 'vis', 'tmp', 'dew', 'slp', 'aa1', 'ga1', 'gd1', 'gf1',
       'ma1', 'rem'],
      dtype='object')

In [17]:
# Parse the timestamp column once, then store both the parsed value and its hour
timestamps = pd.to_datetime(df_r['date'])
df_r['date'] = timestamps
df_r['hour'] = timestamps.dt.hour

In [18]:
# Keep only the calendar date. Re-parsing first makes this cell safe to re-run:
# once the column holds plain date objects the `.dt` accessor is unavailable,
# so a bare `df_r['date'].dt.date` would raise AttributeError on a second run.
df_r['date'] = pd.to_datetime(df_r['date']).dt.date

Because we want to keep as many usable instances as possible, we select features from the mandatory data section of the dataset, which contains 6 weather factors. However, 'cig' and 'vis' are both highly skewed, so we exclude them from the model.

In [19]:
# Unscaling and converting missing value to NaN


def _decode_scaled_field(series, position, missing_code):
    """Pull one numeric component out of a comma-separated column and unscale it.

    Parameters
    ----------
    series : pd.Series
        String column whose entries are comma-separated components.
    position : int
        Zero-based index of the component to keep.
    missing_code : float
        Value, after dividing by 10, that marks a missing observation.

    Returns
    -------
    pd.Series
        Float series holding the chosen component divided by 10, with every
        occurrence of ``missing_code`` replaced by NaN.
    """
    # Vectorised string handling instead of a row-wise lambda; float parsing
    # accepts a leading sign and lets cells that are already NaN pass through
    # rather than raising.
    values = series.str.split(',').str[position].astype(float) / 10
    return values.replace(missing_code, np.nan)


decoded = df_r.assign(
    # Wind speed rate (4th component of the field)
    wnd=_decode_scaled_field(df_r['wnd'], 3, 999.9),
    # Air temperature
    tmp=_decode_scaled_field(df_r['tmp'], 0, 999.9),
    # Dew point temperature
    dew=_decode_scaled_field(df_r['dew'], 0, 999.9),
    # Sea level pressure
    slp=_decode_scaled_field(df_r['slp'], 0, 9999.9),
)

# Selecting all features we are interested in, then forward-filling gaps.
# ffill() returns a new frame, which avoids calling an in-place method on a
# column slice (the pattern that triggers SettingWithCopyWarning).
# Rows before the first valid observation of a column stay NaN.
df_r = decoded[['date', 'hour', 'wnd', 'tmp', 'dew', 'slp']].ffill()

In [20]:
# Storing at curated folder
# Persist the cleaned weather table into the curated data folder
# NOTE(review): the index is written as an extra unnamed first column
# (to_csv default) - confirm downstream readers expect that.
curated_path = '../data/curated/weather_processed.csv'
df_r.to_csv(curated_path)