# **Pre-processing**

---

## **Import Libraries & Data Loading**

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import os.path as osp

In [2]:
# Folder holding the input CSVs, resolved relative to the notebook's working directory.
data_folder = osp.join(osp.curdir, 'dataset')

In [3]:
def _read_table(csv_name):
    """Load one CSV from `data_folder` into a DataFrame."""
    return pd.read_csv(osp.join(data_folder, csv_name))


# One table per measurement; the reads happen in the same order as before.
humidity_df = _read_table('humidity.csv')
temp_df = _read_table('temperature.csv')
pressure_df = _read_table('pressure.csv')
wind_direct_df = _read_table('wind_direction.csv')
wind_speed_df = _read_table('wind_speed.csv')

In [4]:
# Weather description table, read from the same dataset folder as the measurements.
weather_descript_path = osp.join(data_folder, 'weather_description.csv')
weather_descript_df = pd.read_csv(weather_descript_path)

In [5]:
# City attribute table; it is joined onto the weather rows by city name further down.
city_attri_path = osp.join(data_folder, 'city_attributes.csv')
city_attri_df = pd.read_csv(city_attri_path)

## **Metadata**

In [6]:
# Show dtypes and non-null counts of the humidity table (a 'datetime' column plus one column per city).
humidity_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45253 entries, 0 to 45252
Data columns (total 37 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   datetime           45253 non-null  object 
 1   Vancouver          43427 non-null  float64
 2   Portland           44804 non-null  float64
 3   San Francisco      44311 non-null  float64
 4   Seattle            44964 non-null  float64
 5   Los Angeles        45101 non-null  float64
 6   San Diego          44909 non-null  float64
 7   Las Vegas          44411 non-null  float64
 8   Phoenix            43945 non-null  float64
 9   Albuquerque        44543 non-null  float64
 10  Denver             43445 non-null  float64
 11  San Antonio        44689 non-null  float64
 12  Dallas             44934 non-null  float64
 13  Houston            45132 non-null  float64
 14  Kansas City        44741 non-null  float64
 15  Minneapolis        44743 non-null  float64
 16  Saint Louis        439

## **Merge DataFrames**

In [7]:
def _melt_by_city(wide_df, value_name):
    """Reshape a wide table into long format keyed by (datetime, city).

    Parameters
    ----------
    wide_df : pandas.DataFrame
        Table with a 'datetime' column; every other column is treated as a city.
    value_name : str
        Name of the column that receives the melted values.

    Returns
    -------
    pandas.DataFrame
        Long table with columns 'datetime', 'city' and `value_name`.
    """
    return wide_df.melt(id_vars='datetime', var_name='city', value_name=value_name)


# Melt: one long table per measurement. The intermediate names are kept so
# any other cell that refers to them keeps working.
humidity_melt = _melt_by_city(humidity_df, 'humidity')
temp_melt = _melt_by_city(temp_df, 'temp')
pressure_melt = _melt_by_city(pressure_df, 'pressure')
wind_direct_melt = _melt_by_city(wind_direct_df, 'wind_direct')
wind_speed_melt = _melt_by_city(wind_speed_df, 'wind_speed')
weather_descript_melt = _melt_by_city(weather_descript_df, 'weather_descript')

# Merge: inner-join each measurement onto humidity by (datetime, city).
# validate='one_to_one' makes pandas raise MergeError if a key pair is
# duplicated on either side, instead of silently multiplying rows.
merge_keys = ['datetime', 'city']
weather_df = humidity_melt
for measurement_melt in (
    temp_melt,
    pressure_melt,
    wind_direct_melt,
    wind_speed_melt,
    weather_descript_melt,
):
    weather_df = weather_df.merge(measurement_melt, on=merge_keys, validate='one_to_one')

print(weather_df.head())



              datetime       city  humidity        temp  pressure  \
0  2012-10-01 12:00:00  Vancouver       NaN         NaN       NaN   
1  2012-10-01 13:00:00  Vancouver      76.0  284.630000       NaN   
2  2012-10-01 14:00:00  Vancouver      76.0  284.629041       NaN   
3  2012-10-01 15:00:00  Vancouver      76.0  284.626998       NaN   
4  2012-10-01 16:00:00  Vancouver      77.0  284.624955       NaN   

   wind_direct  wind_speed weather_descript  
0          NaN         NaN              NaN  
1          0.0         0.0             mist  
2          6.0         0.0    broken clouds  
3         20.0         0.0    broken clouds  
4         34.0         0.0    broken clouds  


In [8]:
# Inspect dtypes and non-null counts of the merged long table before missing values are handled.
weather_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1629108 entries, 0 to 1629107
Data columns (total 8 columns):
 #   Column            Non-Null Count    Dtype  
---  ------            --------------    -----  
 0   datetime          1629108 non-null  object 
 1   city              1629108 non-null  object 
 2   humidity          1600457 non-null  float64
 3   temp              1621078 non-null  float64
 4   pressure          1612428 non-null  float64
 5   wind_direct       1621133 non-null  float64
 6   wind_speed        1621115 non-null  float64
 7   weather_descript  1621153 non-null  object 
dtypes: float64(5), object(3)
memory usage: 99.4+ MB


In [9]:
# Keep only rows with no missing value in any column (dropna's default is how='any').
# The index is not reset here, so the surviving rows keep their original labels.
weather_df = weather_df.dropna()

In [10]:
# Per-column count of missing values; each should be 0 once rows with NaNs are dropped.
null_counts = weather_df.isnull().sum()
print(null_counts)

datetime            0
city                0
humidity            0
temp                0
pressure            0
wind_direct         0
wind_speed          0
weather_descript    0
dtype: int64


In [11]:
# Re-check row count and non-null counts after the rows with missing values were dropped.
weather_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1596319 entries, 21 to 1628315
Data columns (total 8 columns):
 #   Column            Non-Null Count    Dtype  
---  ------            --------------    -----  
 0   datetime          1596319 non-null  object 
 1   city              1596319 non-null  object 
 2   humidity          1596319 non-null  float64
 3   temp              1596319 non-null  float64
 4   pressure          1596319 non-null  float64
 5   wind_direct       1596319 non-null  float64
 6   wind_speed        1596319 non-null  float64
 7   weather_descript  1596319 non-null  object 
dtypes: float64(5), object(3)
memory usage: 109.6+ MB


In [12]:
# Merge city_attributes: attach the city attribute columns to every weather row.
# how='left' keeps all weather rows. validate='many_to_one' raises MergeError if a
# city name appears more than once in city_attri_df, which would otherwise
# silently duplicate the matching weather rows.
weather_df = weather_df.merge(
    city_attri_df,
    left_on='city',
    right_on='City',
    how='left',
    validate='many_to_one',
)

In [13]:
# 'City' was only the right-hand join key and repeats the existing 'city' column, so remove it.
weather_df = weather_df.drop(columns='City')

In [14]:
# Preview the first rows now that the city attribute columns are attached.
weather_df.head()

Unnamed: 0,datetime,city,humidity,temp,pressure,wind_direct,wind_speed,weather_descript,Country,Latitude,Longitude
0,2012-10-02 09:00:00,Vancouver,87.0,284.590217,807.0,268.0,0.0,broken clouds,Canada,49.24966,-123.119339
1,2012-10-02 10:00:00,Vancouver,88.0,284.588174,849.0,281.0,0.0,broken clouds,Canada,49.24966,-123.119339
2,2012-10-02 11:00:00,Vancouver,89.0,284.58613,890.0,295.0,0.0,broken clouds,Canada,49.24966,-123.119339
3,2012-10-02 12:00:00,Vancouver,89.0,284.584087,932.0,309.0,0.0,broken clouds,Canada,49.24966,-123.119339
4,2012-10-02 13:00:00,Vancouver,90.0,284.582043,973.0,323.0,0.0,broken clouds,Canada,49.24966,-123.119339


In [46]:
# Confirm the row count is unchanged by the left merge and that the new columns have no nulls.
# NOTE(review): this cell's execution count (46) is out of sequence with its neighbours (14, 15);
# re-run with Restart Kernel -> Run All to confirm the shown output is reproducible.
weather_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1596319 entries, 0 to 1596318
Data columns (total 11 columns):
 #   Column            Non-Null Count    Dtype  
---  ------            --------------    -----  
 0   datetime          1596319 non-null  object 
 1   city              1596319 non-null  object 
 2   humidity          1596319 non-null  float64
 3   temp              1596319 non-null  float64
 4   pressure          1596319 non-null  float64
 5   wind_direct       1596319 non-null  float64
 6   wind_speed        1596319 non-null  float64
 7   weather_descript  1596319 non-null  object 
 8   Country           1596319 non-null  object 
 9   Latitude          1596319 non-null  float64
 10  Longitude         1596319 non-null  float64
dtypes: float64(7), object(4)
memory usage: 134.0+ MB


In [15]:
# Parse the timestamp strings once, then derive separate calendar-date and clock-time columns.
parsed_ts = pd.to_datetime(weather_df['datetime'])
weather_df['datetime'] = parsed_ts
weather_df['date'] = parsed_ts.dt.date
weather_df['time'] = parsed_ts.dt.time

datetime_preview = weather_df[['datetime', 'date', 'time']].head()
print(datetime_preview)


             datetime        date      time
0 2012-10-02 09:00:00  2012-10-02  09:00:00
1 2012-10-02 10:00:00  2012-10-02  10:00:00
2 2012-10-02 11:00:00  2012-10-02  11:00:00
3 2012-10-02 12:00:00  2012-10-02  12:00:00
4 2012-10-02 13:00:00  2012-10-02  13:00:00


In [16]:
# The combined timestamp is no longer needed once 'date' and 'time' exist as separate columns.
weather_df = weather_df.drop(columns='datetime')

In [17]:
# Preview the frame after 'datetime' has been replaced by the separate 'date' and 'time' columns.
weather_df.head()

Unnamed: 0,city,humidity,temp,pressure,wind_direct,wind_speed,weather_descript,Country,Latitude,Longitude,date,time
0,Vancouver,87.0,284.590217,807.0,268.0,0.0,broken clouds,Canada,49.24966,-123.119339,2012-10-02,09:00:00
1,Vancouver,88.0,284.588174,849.0,281.0,0.0,broken clouds,Canada,49.24966,-123.119339,2012-10-02,10:00:00
2,Vancouver,89.0,284.58613,890.0,295.0,0.0,broken clouds,Canada,49.24966,-123.119339,2012-10-02,11:00:00
3,Vancouver,89.0,284.584087,932.0,309.0,0.0,broken clouds,Canada,49.24966,-123.119339,2012-10-02,12:00:00
4,Vancouver,90.0,284.582043,973.0,323.0,0.0,broken clouds,Canada,49.24966,-123.119339,2012-10-02,13:00:00
