## Import packages

In [1]:
%matplotlib inline
import re
from datetime import date

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn import set_config
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.utils import shuffle

# Explore data

* Read file `weather_hist.xlsx`

In [2]:
# Load the weather history workbook and preview the first five rows
# (five is the default for head()).
weather_path = './data/weather_hist.xlsx'
df = pd.read_excel(weather_path)
df.head()

Unnamed: 0,date,time,temperature,dew_point,humidity,wind,wind_speed,wind_gust,pressure,precip.,condition
0,2021-07-01,12:00 AM,81,79,94,WSW,6,0,29.76,0.0,Partly Cloudy
1,2021-07-01,12:30 AM,81,79,94,WSW,7,0,29.76,0.0,Partly Cloudy
2,2021-07-01,1:00 AM,82,79,89,SW,6,0,29.76,0.0,Fair
3,2021-07-01,1:30 AM,81,79,94,SW,6,0,29.76,0.0,Fair
4,2021-07-01,2:00 AM,81,79,94,SSW,7,0,29.73,0.0,Fair


* Data shape (rows, columns)

In [3]:
# (rows, columns) of the table as loaded
df.shape

(3162, 11)

### Are there any duplicate rows?

In [4]:
# Count rows whose values in every column repeat an earlier row.
# DataFrame.duplicated compares row contents; Index.duplicated would only
# compare index labels, which are unique for a freshly loaded frame.
num_duplicate_rows = df.duplicated(keep='first').sum()
num_duplicate_rows

0

### Are there any missing values?

In [5]:
# Per-column count of missing entries
missing_values = df.isnull().sum()
missing_values

date           0
time           0
temperature    0
dew_point      0
humidity       0
wind           0
wind_speed     0
wind_gust      0
pressure       0
precip.        0
condition      0
dtype: int64

### What is the meaning of each row?

Each row records the weather conditions at a specific date and time; observations are collected every 30 minutes.

### What is the meaning of each column?

There are 11 columns:

* `date`: date that data is collected (YYYY-MM-DD)
* `time`: 12:00 AM -> 11:30 PM
* `temperature`: Fahrenheit
* `dew_point`: the atmospheric temperature (varying according to pressure and humidity) below which water droplets begin to condense and dew can form. (Fahrenheit)
* `humidity`: atmospheric moisture (percentage)
* `wind`: wind's direction code
* `wind_speed`: the rate at which the wind passes a given point (mph : miles per hour)
* `wind_gust`: a brief increase in the speed of the wind (mph)
* `pressure`: sea level pressure (inches Hg)
* `precip.`: any liquid or frozen water that forms in the atmosphere and falls to the Earth (inches)
* `condition`: description of the weather

### What are the data types of these columns?

In [6]:
# dtype of each column as inferred on load
df.dtypes

date            object
time            object
temperature      int64
dew_point        int64
humidity         int64
wind            object
wind_speed       int64
wind_gust        int64
pressure       float64
precip.        float64
condition       object
dtype: object

In [7]:
# Split column names by dtype: everything that is not `object` is numeric here,
# the `object` columns are treated as categorical.
num_cols = df.select_dtypes(exclude='object').columns.tolist()
cate_cols = df.select_dtypes(include='object').columns.tolist()

# A question

Can we predict whether it will rain in the next 30 minutes, based on the weather conditions at the current time?

This prediction is helpful because it lets us prepare in advance and avoid getting wet when going out, for example by bringing an umbrella or wearing a raincoat.

# Preprocess Data

Each record currently pairs the weather measurements with the weather label observed at the same time. To serve the problem of predicting future weather from current information, we shift the values of the "condition" column up by one row, based on the "time" and "date" columns. Each new row then holds the current weather measurements, and its "condition" value is the label observed 30 minutes later.

* We check that the "time" and "date" columns advance continuously from row to row, shift the "condition" values, and eliminate the rows at a discontinuity, since they have no corresponding "condition" value for the next 30 minutes.

In [8]:
# Preview the first five rows
df.head(5)

Unnamed: 0,date,time,temperature,dew_point,humidity,wind,wind_speed,wind_gust,pressure,precip.,condition
0,2021-07-01,12:00 AM,81,79,94,WSW,6,0,29.76,0.0,Partly Cloudy
1,2021-07-01,12:30 AM,81,79,94,WSW,7,0,29.76,0.0,Partly Cloudy
2,2021-07-01,1:00 AM,82,79,89,SW,6,0,29.76,0.0,Fair
3,2021-07-01,1:30 AM,81,79,94,SW,6,0,29.76,0.0,Fair
4,2021-07-01,2:00 AM,81,79,94,SSW,7,0,29.73,0.0,Fair


In [9]:
def CheckDate(date_df):
    """Check that the dates are valid and never skip or go back a calendar day.

    Parameters
    ----------
    date_df : pandas.Series
        Date strings formatted as ``YYYY-MM-DD``, in observation order.

    Returns
    -------
    bool
        ``True`` when every value is a real calendar date and each row carries
        either the same date as the previous row or the day right after it;
        ``False`` otherwise. An empty series yields ``True``.
    """
    previous = None
    for _, value in date_df.items():
        try:
            year, month, day = (int(part) for part in value.split('-'))
            # date() rejects impossible month/day values and handles leap years
            current = date(year, month, day)
        except ValueError:
            return False
        # Several rows share one date; only a change of date must be +1 day
        if previous is not None and current != previous:
            if (current - previous).days != 1:
                return False
        previous = current
    return True

# Run the date continuity check on the `date` column
CheckDate(df['date'])

True

In [10]:
def CheckTime(time_df):
    """Check that consecutive observation times are exactly 30 minutes apart.

    Parameters
    ----------
    time_df : pandas.Series
        12-hour clock strings such as ``'12:30 AM'``, in observation order.
        The index labels may be arbitrary; only row order matters.

    Returns
    -------
    bool
        ``True`` if every row follows the previous one by 30 minutes (wrapping
        past midnight). Otherwise the index label of the first offending row
        is printed and ``False`` is returned.

    Notes
    -----
    Only the time of day is compared, so a jump of a whole number of days
    plus 30 minutes is not detected.
    """
    minutes_per_day = 24 * 60
    previous = None
    for index, value in time_df.items():
        meridiem = value[-2:]
        parts = value[:-3].split(':')
        # 12 AM / 12 PM map to hour 0 of their half-day
        hour, minute = int(parts[0]) % 12, int(parts[1]) % 60
        now = hour * 60 + minute
        if meridiem.upper() == 'PM':
            now += 12 * 60
        # The modulo makes 11:30 PM -> 12:00 AM count as a 30-minute step
        if previous is not None and (now - previous) % minutes_per_day != 30:
            print(index)
            return False
        previous = now
    return True

# Scan the `time` column; the label of the first discontinuity is printed if one is found
CheckTime(df.time)

1247


False

In [11]:
# Labels for "30 minutes later": the condition of row i+1, re-indexed so that
# it lines up with row i.
condition_temp = df['condition'].iloc[1:].reset_index(drop=True)
print(condition_temp.shape)
# The final observation has no successor, so it cannot receive a label.
df = df.drop(index=len(df) - 1)
df['condition'] = condition_temp

(3161,)


In [12]:
# Inspect the first 18 labels
df['condition'].head(18)

0     Partly Cloudy
1              Fair
2              Fair
3              Fair
4              Fair
5              Fair
6              Fair
7              Fair
8              Fair
9              Fair
10             Fair
11             Fair
12             Fair
13             Fair
14             Fair
15             Fair
16    Partly Cloudy
17    Partly Cloudy
Name: condition, dtype: object

In [13]:
def CheckDateTime(time_df):
    """Flag, row by row, whether each time follows the previous one by 30 minutes.

    Parameters
    ----------
    time_df : pandas.Series
        12-hour clock strings such as ``'12:30 AM'``, in observation order.
        The index labels may be arbitrary; flags are positional.

    Returns
    -------
    numpy.ndarray
        Float array with one entry per row: ``1.0`` when the row comes exactly
        30 minutes after the previous row (wrapping past midnight), ``0.0``
        when it does not. The first row is always ``1.0``.

    Notes
    -----
    Only the time of day is compared, so a jump of a whole number of days
    plus 30 minutes is not flagged.
    """
    minutes_per_day = 24 * 60
    ans = np.ones(time_df.shape[0])
    previous = None
    # enumerate gives positions, so `ans` is filled correctly for any index
    for position, value in enumerate(time_df):
        meridiem = value[-2:]
        parts = value[:-3].split(':')
        # 12 AM / 12 PM map to hour 0 of their half-day
        hour, minute = int(parts[0]) % 12, int(parts[1]) % 60
        now = hour * 60 + minute
        if meridiem.upper() == 'PM':
            now += 12 * 60
        if previous is not None and (now - previous) % minutes_per_day != 30:
            ans[position] = 0
        previous = now
    return ans

# A flag of 0 at position j means row j does NOT come 30 minutes after row j-1.
# `condition` was shifted up by one row earlier, so the row carrying a wrong
# label is the one just before the gap (j-1): its label was taken from row j.
ans = CheckDateTime(df['time'])
gap_positions = np.flatnonzero(ans == 0)
# The first flag is always 1, so gap_positions - 1 never drops below 0.
# NOTE(review): a gap between the last remaining row and the final row removed
# during the label shift cannot be detected from the remaining `time` values.
df = df.drop(index=df.index[gap_positions - 1]).reset_index(drop=True)

In [14]:
# Preview the first five rows of the cleaned frame
df.head(5)

Unnamed: 0,date,time,temperature,dew_point,humidity,wind,wind_speed,wind_gust,pressure,precip.,condition
0,2021-07-01,12:00 AM,81,79,94,WSW,6,0,29.76,0.0,Partly Cloudy
1,2021-07-01,12:30 AM,81,79,94,WSW,7,0,29.76,0.0,Fair
2,2021-07-01,1:00 AM,82,79,89,SW,6,0,29.76,0.0,Fair
3,2021-07-01,1:30 AM,81,79,94,SW,6,0,29.76,0.0,Fair
4,2021-07-01,2:00 AM,81,79,94,SSW,7,0,29.73,0.0,Fair


# Split data into training, validation and testing sets

* We shuffle data

In [15]:
# Permute the rows; the fixed random_state keeps the order reproducible.
# NOTE(review): after shuffling, rows that are 30 minutes apart can end up in
# different splits — confirm this is acceptable for a forecasting task.
df_shuffle = shuffle(df, random_state=4)
df_shuffle.head(5)

Unnamed: 0,date,time,temperature,dew_point,humidity,wind,wind_speed,wind_gust,pressure,precip.,condition
573,2021-07-12,10:30 PM,81,77,89,VAR,3,0,29.79,0.0,Partly Cloudy
1875,2021-08-09,4:30 AM,82,79,89,SW,7,0,29.73,0.0,Partly Cloudy
2966,2021-08-31,11:00 PM,79,77,94,CALM,0,0,29.79,0.0,Partly Cloudy
2791,2021-08-28,7:30 AM,79,77,94,CALM,0,0,29.73,0.0,Partly Cloudy
2056,2021-08-12,11:00 PM,82,77,84,SSW,5,0,29.82,0.0,Partly Cloudy


* Separate `condition` column from data

In [16]:
# Features keep every column except the label; the label is `condition`.
target_col = 'condition'
X_df = df_shuffle.drop(columns=[target_col])
y_sr = df_shuffle[target_col]

* We split data into training and testing set with $80\%$, $20\%$ respectively

In [17]:
# Hold out 20% of the rows as the test set. X_df / y_sr are rebound to the
# remaining 80%, so re-running this cell alone would shrink them again.
X_df, test_X_df, y_sr, test_y_sr = train_test_split(
    X_df,
    y_sr,
    test_size=0.2,
    random_state=0,
)

In [18]:
# Features remaining for training + validation
X_df.shape

(2524, 10)

In [19]:
# Labels remaining for training + validation
y_sr.shape

(2524,)

In [20]:
# Held-out test features
test_X_df.shape

(631, 10)

In [21]:
# Held-out test labels
test_y_sr.shape

(631,)

* Next, we split training data into training and validation set

In [22]:
# Carve the validation set out of the remaining rows: 25% of them goes to
# validation, the rest to training.
train_X_df, val_X_df, train_y_sr, val_y_sr = train_test_split(
    X_df,
    y_sr,
    test_size=0.25,
    random_state=0,
)

In [23]:
# Training features
train_X_df.shape

(1893, 10)

In [24]:
# Training labels
train_y_sr.shape

(1893,)

In [25]:
# Validation features
val_X_df.shape

(631, 10)

In [26]:
# Validation labels
val_y_sr.shape

(631,)

* Check the distribution of labels

In [27]:
# Frequency of each label in the training split
train_y_sr.value_counts()

Partly Cloudy                1043
Mostly Cloudy                 526
Fair                          108
Light Rain                     95
Light Rain Shower              42
Light Rain with Thunder        23
T-Storm                        10
Thunder in the Vicinity        10
Showers in the Vicinity         4
Fog                             4
Thunder                         4
Heavy T-Storm / Windy           4
Heavy T-Storm                   4
Light Rain Shower / Windy       3
Rain Shower                     3
Partly Cloudy / Windy           3
Heavy Rain Shower               2
Mostly Cloudy / Windy           2
Heavy Rain Shower / Windy       2
Rain Shower / Windy             1
Name: condition, dtype: int64

In [28]:
# Frequency of each label in the validation split
val_y_sr.value_counts()

Partly Cloudy                323
Mostly Cloudy                195
Fair                          36
Light Rain                    31
Light Rain Shower             14
Light Rain with Thunder        7
T-Storm                        5
Thunder in the Vicinity        5
Heavy T-Storm                  4
Showers in the Vicinity        3
Rain Shower                    2
Partly Cloudy / Windy          2
Light Rain Shower / Windy      1
Thunder                        1
Fog                            1
Rain Shower / Windy            1
Name: condition, dtype: int64

* Save training, validation, testing data

In [29]:
# Write each split as one CSV with the label as the last column and no index.
split_outputs = [
    ('data/training.csv', train_X_df, train_y_sr),
    ('data/validation.csv', val_X_df, val_y_sr),
    ('data/testing.csv', test_X_df, test_y_sr),
]
for csv_path, features, labels in split_outputs:
    pd.concat([features, labels], axis=1).to_csv(csv_path, index=False)