# Australian rain forecast using SVM algorithm

## import packages

In [1]:
import numpy as np
import pandas as pd

## read the csv file and show a few top rows

In [2]:
df = pd.read_csv("D:\\1. Data\\OneDrive\\OneDrive - University of Calgary\\PhD\\Datasets\\Weather Australia\\weatherAUS.csv")
df.head(10)

Unnamed: 0,Date,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,...,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RainTomorrow
0,2008-12-01,Albury,13.4,22.9,0.6,,,W,44.0,W,...,71.0,22.0,1007.7,1007.1,8.0,,16.9,21.8,No,No
1,2008-12-02,Albury,7.4,25.1,0.0,,,WNW,44.0,NNW,...,44.0,25.0,1010.6,1007.8,,,17.2,24.3,No,No
2,2008-12-03,Albury,12.9,25.7,0.0,,,WSW,46.0,W,...,38.0,30.0,1007.6,1008.7,,2.0,21.0,23.2,No,No
3,2008-12-04,Albury,9.2,28.0,0.0,,,NE,24.0,SE,...,45.0,16.0,1017.6,1012.8,,,18.1,26.5,No,No
4,2008-12-05,Albury,17.5,32.3,1.0,,,W,41.0,ENE,...,82.0,33.0,1010.8,1006.0,7.0,8.0,17.8,29.7,No,No
5,2008-12-06,Albury,14.6,29.7,0.2,,,WNW,56.0,W,...,55.0,23.0,1009.2,1005.4,,,20.6,28.9,No,No
6,2008-12-07,Albury,14.3,25.0,0.0,,,W,50.0,SW,...,49.0,19.0,1009.6,1008.2,1.0,,18.1,24.6,No,No
7,2008-12-08,Albury,7.7,26.7,0.0,,,W,35.0,SSE,...,48.0,19.0,1013.4,1010.1,,,16.3,25.5,No,No
8,2008-12-09,Albury,9.7,31.9,0.0,,,NNW,80.0,SE,...,42.0,9.0,1008.9,1003.6,,,18.3,30.2,No,Yes
9,2008-12-10,Albury,13.1,30.1,1.4,,,W,28.0,S,...,58.0,27.0,1007.0,1005.7,,,20.1,28.2,Yes,No


## 1. Drop Nan values in target column

In [3]:
df2 = df.dropna(axis=0,subset=['RainTomorrow'])
# df2.isnull().sum()

## 2. Break input data frame

1- Separate X and y values

2- split training, validation, and test sets with the share of 60%, 20%, and 20% respectively.

In [4]:
from sklearn.model_selection import train_test_split

x_in = df2.copy()
y_in = x_in.pop('RainTomorrow')

x_trnval, x_tst, y_trnval, y_tst = train_test_split(x_in, y_in, test_size=0.2, shuffle=True)
# to have the same number for validation as test, the following portion will be 0.25 (25% of the remaining 80%)
x_trn, x_val, y_trn, y_val = train_test_split(x_trnval, y_trnval, test_size=0.25, shuffle=True)

### Convert Date column from object data type into number data type

In [5]:
# from datetime import datetime
# datetime.strptime(x_trn.Date[0], '%Y-%m-%d')
def date_to_days(date_in):
    '''
    This function get a dataseries containing a date in str format
    Then return the number of days after year 2007 because we know that the database year start after 2007
    '2007-02-03'=(2007-2007)*365 + dayofyear('02-03')
    Note: we assume all years are 365 days for simplicity
    '''
    yrs_day = (pd.to_datetime(date_in).dt.year - 2007) * 365
    day_num = pd.to_datetime(date_in).dt.dayofyear
    
    return yrs_day + day_num


x_trn = x_trn.assign(Date=date_to_days(x_trn.Date))
x_val = x_val.assign(Date=date_to_days(x_val.Date))
x_tst = x_tst.assign(Date=date_to_days(x_tst.Date))

### Convert target column into int data type

In [11]:
# Targets are yes/no (object data type)
y_in.unique()

array(['No', 'Yes'], dtype=object)

## 3. Impute nan values
find the object and number type columns

In [6]:
col_obj = x_trn.select_dtypes('object').columns.tolist()
col_num = x_trn.select_dtypes('number').columns.tolist()

### 3.1. Simple Imputer

In [7]:
from sklearn.impute import SimpleImputer

si_obj = SimpleImputer(strategy='most_frequent')
si_num = SimpleImputer(strategy='mean')

x_trn_si = x_trn.copy()
x_val_si = x_val.copy()
x_tst_si = x_tst.copy()

x_trn_si.loc[:,col_obj] = si_obj.fit_transform(x_trn[col_obj])
x_trn_si.loc[:,col_num] = si_num.fit_transform(x_trn[col_num])

x_val_si.loc[:,col_obj] = si_obj.transform(x_val[col_obj])
x_val_si.loc[:,col_num] = si_num.transform(x_val[col_num])

x_tst_si.loc[:,col_obj] = si_obj.transform(x_tst[col_obj])
x_tst_si.loc[:,col_num] = si_num.transform(x_tst[col_num])

### 3.2. Iterative Imputer

In [8]:
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

ii_num = IterativeImputer(max_iter=20, tol=0.1, n_nearest_features=4, initial_strategy='mean')
# IterativeImputer does not work with object (categorical) data types
# !!!!! need further investigation
ii_obj = SimpleImputer(strategy='most_frequent') # IterativeImputer(max_iter=20, tol=1, n_nearest_features=5, initial_strategy='most_frequent')

x_trn_ii = x_trn.copy()
x_val_ii = x_val.copy()
x_tst_ii = x_tst.copy()

x_trn_ii.loc[:,col_num] = ii_num.fit_transform(x_trn[col_num])
x_trn_ii.loc[:,col_obj] = ii_obj.fit_transform(x_trn[col_obj])

x_val_ii.loc[:,col_num] = ii_num.transform(x_val[col_num])
x_val_ii.loc[:,col_obj] = ii_obj.transform(x_val[col_obj])

x_tst_ii.loc[:,col_num] = ii_num.transform(x_tst[col_num])
x_tst_ii.loc[:,col_obj] = ii_obj.transform(x_tst[col_obj])

### 3.3. KNN Imputer

In [9]:
from sklearn.impute import KNNImputer

ki_num = KNNImputer(n_neighbors=5, weights='uniform')
# KNNImputer does not work with object (categorical) data types
# !!!!! need further investigation
ki_obj = SimpleImputer(strategy='most_frequent')

x_trn_ki = x_trn.copy()
x_val_ki = x_val.copy()
x_tst_ki = x_tst.copy()

x_trn_ki.loc[:,col_num] = ki_num.fit_transform(x_trn[col_num])
x_trn_ki.loc[:,col_obj] = ki_obj.fit_transform(x_trn[col_obj])

x_val_ki.loc[:,col_num] = ki_num.transform(x_val[col_num])
x_val_ki.loc[:,col_obj] = ki_obj.transform(x_val[col_obj])

x_tst_ki.loc[:,col_num] = ki_num.transform(x_tst[col_num])
x_tst_ki.loc[:,col_obj] = ki_obj.transform(x_tst[col_obj])

## 4. Convert object columns into number format

In [30]:
from sklearn.preprocessing import LabelEncoder

le_si = LabelEncoder()
le_ii = LabelEncoder()
le_ki = LabelEncoder()
le_y  = LabelEncoder()

X_train_si = x_trn_si.copy()
X_train_ii = x_trn_ii.copy()
X_train_ki = x_trn_ki.copy()

X_valid_si = x_val_si.copy()
X_valid_ii = x_val_ii.copy()
X_valid_ki = x_val_ki.copy()

X_test_si  = x_tst_si.copy()
X_test_ii  = x_tst_ii.copy()
X_test_ki  = x_tst_ki.copy()

for col in col_obj:
    X_train_si.loc[:,col] = le_si.fit_transform(x_trn_si[col])
    X_train_ii.loc[:,col] = le_ii.fit_transform(x_trn_ii[col])
    X_train_ki.loc[:,col] = le_ki.fit_transform(x_trn_ki[col])
    
    X_valid_si.loc[:,col] = le_si.transform(x_val_si[col])
    X_valid_ii.loc[:,col] = le_ii.transform(x_val_ii[col])
    X_valid_ki.loc[:,col] = le_ki.transform(x_val_ki[col])
    
    X_test_si.loc[:,col] = le_si.transform(x_tst_si[col])
    X_test_ii.loc[:,col] = le_ii.transform(x_tst_ii[col])
    X_test_ki.loc[:,col] = le_ki.transform(x_tst_ki[col])
    
y_train = le_y.fit_transform(y_trn)
y_valid = le_y.transform(y_val)
y_test  = le_y.transform(y_tst)

# 5. SVM model

### 5.1 SVC

In [32]:
# Support Vector Classification
from sklearn.svm import SVC
from sklearn.metrics import mean_absolute_error

# Polynomial with max degree of 4 and One-vs-Rest
svc_model = SVC(C=0.5, kernel='poly', degree=4, coef0=0, tol=0.5, max_iter=300, decision_function_shape='ovr')

svc_model.fit(X_train_si, y_train)
svc_model.fit(X_train_ii, y_train)
svc_model.fit(X_train_ki, y_train)

y_train_si_pred = svc_model.predict(X_train_si)
y_train_ii_pred = svc_model.predict(X_train_ii)
y_train_ki_pred = svc_model.predict(X_train_ki)
y_valid_si_pred = svc_model.predict(X_valid_si)
y_valid_ii_pred = svc_model.predict(X_valid_ii)
y_valid_ki_pred = svc_model.predict(X_valid_ki)
y_test_si_pred = svc_model.predict(X_test_si)
y_test_ii_pred = svc_model.predict(X_test_ii)
y_test_ki_pred = svc_model.predict(X_test_ki)

score_train_si = mean_absolute_error(y_train, y_train_si_pred)
score_train_ii = mean_absolute_error(y_train, y_train_ii_pred)
score_train_ki = mean_absolute_error(y_train, y_train_ki_pred)

score_valid_si = mean_absolute_error(y_valid, y_valid_si_pred)
score_valid_ii = mean_absolute_error(y_valid, y_valid_ii_pred)
score_valid_ki = mean_absolute_error(y_valid, y_valid_ki_pred)

score_test_si = mean_absolute_error(y_test, y_test_si_pred)
score_test_ii = mean_absolute_error(y_test, y_test_ii_pred)
score_test_ki = mean_absolute_error(y_test, y_test_ki_pred)



In [35]:
print('score_train_si=', score_train_si)
print('score_train_ii=', score_train_ii)
print('score_train_ki=', score_train_ki)
print('score_valid_si=', score_valid_si)
print('score_valid_ii=', score_valid_ii)
print('score_valid_ki=', score_valid_ki)
print('score_test_si=', score_test_si)
print('score_test_ii=', score_test_ii)
print('score_test_ki=', score_test_ki)

score_train_si= 0.6725077653402098
score_train_ii= 0.6720037508058372
score_train_ki= 0.6714645724667408
score_valid_si= 0.6744259643447379
score_valid_ii= 0.673968845599353
score_valid_ki= 0.673652378775625
score_test_si= 0.6706283624600021
score_test_ii= 0.6701009177537888
score_test_ki= 0.6700657547733746
