In [110]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sys
from sklearn.preprocessing import StandardScaler
import numpy as np

In [111]:
df = pd.read_csv('weatherAUS.csv')

binary = ['RainToday', 'RainTomorrow']
direction = ['WindGustDir', 'WindDir9am', 'WindDir3pm']
categorical = ['Location']
numerical = ['MinTemp', 'MaxTemp', 'Rainfall', 'WindGustSpeed', 'Evaporation', 'Sunshine', 'WindSpeed9am', 'WindSpeed3pm', 'Humidity9am', 'Humidity3pm', 'Pressure9am', 'Pressure3pm', 'Cloud9am', 'Cloud3pm', 'Temp9am', 'Temp3pm']
date = ['Date'] 

windDir = {
    'N': 0, 'NNE': 22.5, 'NE': 45,
    'ENE': 67.5, 'E': 90, 'ESE': 112.5,
    'SE': 135, 'SSE': 157.5, 'S': 180, 'SSW': 202.5, 'SW': 225,
    'WSW': 247.5, 'W': 270, 'WNW': 292.5,
    'NW': 315, 'NNW': 337.5
}
def circleEncoding(direction):
    if (direction not in windDir):
        return (0, 0)
    num = windDir[direction]
    angle = np.deg2rad(num)
    return round(np.cos(angle), 3), round(np.sin(angle), 3)

columns = set(df.columns.values)
print(f"are all the columns in the dataframe: {columns == set(categorical + numerical + date)}")
print(f"columns that are not in the dataframe: {columns - set(categorical + numerical + date)}")


are all the columns in the dataframe: False
columns that are not in the dataframe: {'RainToday', 'WindDir3pm', 'WindGustDir', 'WindDir9am', 'RainTomorrow'}


In [112]:
""" Standardize the numerical columns and one-hot encode the categorical columns.
    convert the binary columns to 0 and 1. And convert the date column to datetime."""

scaler = StandardScaler()
df[numerical] = scaler.fit_transform(df[numerical])
df[numerical] = df[numerical].fillna(0)
df[numerical] = df[numerical].astype('float16')

df[binary] = df[binary].replace({'No': 0, 'Yes': 1})

# df = pd.get_dummies(df, columns=categorical)
df = df.drop(columns=categorical)

df['Date'] = pd.to_datetime(df['Date']).dt.month
df = df.rename(columns={'Date': 'Month'})

for col in direction: # Apply circle encoding and return the sin and cos values into features
    df[col + '_cos'], df[col + '_sin'] = zip(*df[col].apply(circleEncoding))
df = df.drop(columns=direction)

df = df.dropna()

display(df.info())
df.to_csv('weatherAUS_cleaned.csv', index=False)

<class 'pandas.core.frame.DataFrame'>
Index: 140787 entries, 0 to 145458
Data columns (total 25 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   Month            140787 non-null  int32  
 1   MinTemp          140787 non-null  float16
 2   MaxTemp          140787 non-null  float16
 3   Rainfall         140787 non-null  float16
 4   Evaporation      140787 non-null  float16
 5   Sunshine         140787 non-null  float16
 6   WindGustSpeed    140787 non-null  float16
 7   WindSpeed9am     140787 non-null  float16
 8   WindSpeed3pm     140787 non-null  float16
 9   Humidity9am      140787 non-null  float16
 10  Humidity3pm      140787 non-null  float16
 11  Pressure9am      140787 non-null  float16
 12  Pressure3pm      140787 non-null  float16
 13  Cloud9am         140787 non-null  float16
 14  Cloud3pm         140787 non-null  float16
 15  Temp9am          140787 non-null  float16
 16  Temp3pm          140787 non-null  float16
 

None

In [115]:
from keras.layers import Conv2D, MaxPooling2D, Flatten, Dense
from keras.models import Sequential
from keras.initializers import HeNormal
from keras.regularizers import l2
from keras.callbacks import LearningRateScheduler
from keras.optimizers.schedules import ExponentialDecay
from keras.layers import Dropout
from sklearn.base import BaseEstimator, ClassifierMixin


class DenseModel(BaseEstimator, ClassifierMixin):
    def __init__(self, numClasses=10):
        self.model = Sequential([
            self.DenseLayer(1, activation='relu'),
            Dropout(0.2),
            self.DenseLayer(numClasses, activation='softmax'),
        ])
    
    # Customer Dense layer
    def DenseLayer(self, nodes, activation='relu'):
        return Dense(
            nodes, activation=activation, 
            kernel_initializer=HeNormal(), bias_initializer=HeNormal(),
            kernel_regularizer=l2(0.01), bias_regularizer=l2(0.01)
        )

    # Resets weights to HeNormal
    def reset_weights(self):
        initial_weights = self.model.get_weights()
        self.model.set_weights(initial_weights)

    # compile the model
    def compile(self):
        self.model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

    # Run the model. Forward fit using a learning rate scheduler
    def fit(self, training_images, training_labels, epochs=1, batch_size=32):
        lr_scheduler = ExponentialDecay(initial_learning_rate=0.001, decay_steps=1, decay_rate=.1)
        self.model.fit(training_images, training_labels, epochs=epochs, 
                    batch_size=batch_size, callbacks=[LearningRateScheduler(lr_scheduler)])


In [118]:
# Create and train model 
from sklearn.model_selection import train_test_split

y = df['RainTomorrow']
X = df.drop(columns=['RainTomorrow'])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = DenseModel()
model.compile()
model.fit(X_train, y_train, epochs=1, batch_size=32)
model.reset_weights()
model.model.evaluate(X_test, y_test)

Index(['Month', 'MinTemp', 'MaxTemp', 'Rainfall', 'Evaporation', 'Sunshine',
       'WindGustSpeed', 'WindSpeed9am', 'WindSpeed3pm', 'Humidity9am',
       'Humidity3pm', 'Pressure9am', 'Pressure3pm', 'Cloud9am', 'Cloud3pm',
       'Temp9am', 'Temp3pm', 'RainToday', 'WindGustDir_cos', 'WindGustDir_sin',
       'WindDir9am_cos', 'WindDir9am_sin', 'WindDir3pm_cos', 'WindDir3pm_sin'],
      dtype='object')
