# Weather data preprocessing

In [91]:
import pandas as pd
from matplotlib import pyplot
import seaborn as sn
import matplotlib.pyplot as plt
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
import datetime as dt
import numpy as np

In [92]:
import warnings
warnings.filterwarnings("ignore") #:D

### files to load

In [93]:
# Notebook-wide path settings - edit these to point at a different dataset.
# file_name      : CSV that is read into `all_data`
# df_name        : CSV that receives the first part of the split (training/testing)
# df_name_update : CSV that receives the remaining part of the split (update data)
file_name = "data_for_main_model/data_lodz_main_data.csv"
df_name = "data_distance_from_Lodz/data_ready_for_training.csv"
df_name_update = "data_distance_from_Lodz/data_ready_for_update.csv"

# Alternative settings for preprocessing the 2022 data of another city (not Lodz).
# Uncomment together with the final `dft.to_csv(city_name)` cell when needed.
# file_name = "data_distance_from_Lodz/Lodz.csv"
# city_name = "data_distance_from_Lodz/Lodz_2022.csv"

In [94]:
# Load the raw observations from the CSV configured in `file_name`.
# The displayed frame shows an "Unnamed: 0" column, i.e. the file carries a
# saved row index as its first column; it is ignored by the later column selection.
all_data = pd.read_csv(file_name)

In [95]:
# Preview the first rows to check the columns and the start of the time range.
all_data.head()

Unnamed: 0,station,valid,tmpc,relh,sknt
0,EPLL,2022-01-01 00:00,10.0,93.5,11.0
1,EPLL,2022-01-01 00:30,10.0,93.5,12.0
2,EPLL,2022-01-01 01:00,10.0,93.5,13.0
3,EPLL,2022-01-01 01:30,10.0,93.5,13.0
4,EPLL,2022-01-01 02:00,10.0,93.5,13.0


In [96]:
# Preview the last rows to check where the time range ends.
all_data.tail()

Unnamed: 0,station,valid,tmpc,relh,sknt
17455,EPLL,2022-12-30 21:30,4.0,86.79,4.0
17456,EPLL,2022-12-30 22:00,4.0,86.79,5.0
17457,EPLL,2022-12-30 22:30,3.0,93.14,4.0
17458,EPLL,2022-12-30 23:00,4.0,86.79,6.0
17459,EPLL,2022-12-30 23:30,4.0,86.79,6.0


### Note - columns:

valid:
timestamp of the observation - every half hour

tmpc:
Air Temperature in Celsius

relh:
Relative Humidity in %

sknt:
Wind Speed in knots

In [97]:
# Keep only the timestamp and the three measured variables.
# `.copy()` makes `df` an independent frame, so later column assignments do
# not operate on a slice of `all_data` (the cause of SettingWithCopyWarning).
df = all_data[["valid", "tmpc", "relh", "sknt"]].copy()

# The string 'M' is treated as the missing-value marker: turn it into NaN so
# pandas recognises the gaps. `pd.np` was removed in pandas 2.0, so the NumPy
# module imported at the top of the notebook is used directly.
df = df.replace('M', np.nan)

# Fraction of missing values per column.
df.isna().mean()

valid    0.0
tmpc     0.0
relh     0.0
sknt     0.0
dtype: float64

### Handling missing values (NaN):
### - 'relh': reuse the value from the previous time step (missing values in this column occur in sequences)
### - 'tmpc' and 'sknt': use the mean of the previous and the next time step

In [98]:
# The measurement columns can be object-typed after the 'M' markers were
# replaced, so cast them to float first. `astype` returns a new frame, which
# avoids writing into a possible view of `all_data`.
df = df.astype({'tmpc': float, 'relh': float, 'sknt': float})

# Fill the gaps with vectorised operations instead of per-row chained
# assignment (`df[col][i] = ...`), which does not update `df` when pandas
# Copy-on-Write is active and fails with KeyError for a gap in the first or
# last row.
df = df.assign(
    # 'tmpc' / 'sknt': linear interpolation between the neighbouring valid
    # values. For a single missing row this is exactly the mean of the
    # previous and next value; runs of NaN and gaps at either end of the
    # series are filled as well.
    tmpc=df['tmpc'].interpolate(method='linear', limit_direction='both'),
    sknt=df['sknt'].interpolate(method='linear', limit_direction='both'),
    # 'relh': carry the previous observation forward (gaps come in runs);
    # a gap at the very start has no predecessor and takes the next value.
    relh=df['relh'].ffill().bfill(),
)

# Fraction of missing values per column - expected to be 0 everywhere now.
df.isna().mean()

valid    0.0
tmpc     0.0
relh     0.0
sknt     0.0
dtype: float64

In [99]:
# Conversion factor: 1 knot = 1.852 km/h.
knots_into_kmph = 1.852

# Parse the observation timestamps once; the calendar parts are derived from it.
timestamps = pd.to_datetime(df['valid'])

# Build `dft` as a NEW frame instead of aliasing `df` and mutating it in
# place. The cell can therefore be re-run safely and `df` keeps its original
# columns.
dft = (
    df
    .rename(columns={'tmpc': 'temp'})
    .assign(
        # wind speed: knots => km/h
        skph=df['sknt'] * knots_into_kmph,
        # calendar parts of the timestamp
        day=timestamps.dt.day,
        month=timestamps.dt.month,
        year=timestamps.dt.year,
        time=timestamps.dt.time,
        # kept so that only full hours can be selected later
        minutes=timestamps.dt.minute,
    )
    # the raw wind speed and the raw timestamp are no longer needed
    .drop(columns=['sknt', 'valid'])
)

dft.head(10)

Unnamed: 0,temp,relh,skph,day,month,year,time,minutes
0,10.0,93.5,20.372,1,1,2022,00:00:00,0
1,10.0,93.5,22.224,1,1,2022,00:30:00,30
2,10.0,93.5,24.076,1,1,2022,01:00:00,0
3,10.0,93.5,24.076,1,1,2022,01:30:00,30
4,10.0,93.5,24.076,1,1,2022,02:00:00,0
5,10.0,93.5,25.928,1,1,2022,02:30:00,30
6,10.0,93.5,25.928,1,1,2022,03:00:00,0
7,10.0,93.5,24.076,1,1,2022,03:30:00,30
8,10.0,93.5,24.076,1,1,2022,04:00:00,0
9,10.0,93.5,14.816,1,1,2022,04:30:00,30


# Steps for the main (Lodz) dataset only - they apply until the section "Data from different cities preprocessing - uncomment if in use"

### Checking the number of rows - is there any missing data?

In [100]:
# Lodz data only: print each year from 2010 to 2022 followed by the shape
# (rows, columns) of that year's subset, to spot years with missing data.

for year in range(2010, 2023):
    rows_of_year = dft.loc[dft['year'] == year]
    print(year)
    print(rows_of_year.shape)

### Drop the years 2010 and 2011

In [101]:
# Lodz data only: remove every row recorded in 2010 or 2011.
# The drop is done in place, so `dft` remains the same object.

rows_to_drop = dft[dft['year'].isin([2010, 2011])].index
dft.drop(rows_to_drop, inplace=True)

In [102]:
# NOTE(review): `window_size` is only defined here and not used by the
# visible cells - presumably consumed by the modelling code; confirm.
window_size = 6

# Share of the rows (taken from the top of `dft`) that goes into the
# training/testing file; the remainder becomes the update set.
percentage_of_data_learn = 0.75
size = int(percentage_of_data_learn * len(dft))

# Positional split: first `size` rows vs. everything after them.
df_train_and_test, df_update = dft.iloc[:size], dft.iloc[size:]

In [103]:
# Display the training/testing part of the split for a visual check.
df_train_and_test

In [104]:
# Write both parts of the split to the CSV paths configured at the top of
# the notebook (the row index is written as well, as with a plain `to_csv`).
for frame, destination in ((df_train_and_test, df_name), (df_update, df_name_update)):
    frame.to_csv(destination)

# Data from different cities preprocessing - uncomment if in use

In [105]:
# dft.to_csv(city_name)