# Australia Rain Prediction Project #

##### By Kacper Ormaniec 

### Data source

Australian Government's Bureau of Meteorology and the latest data can be gathered from http://www.bom.gov.au/climate/dwo/.

### Importing required libraries

In [45]:
# Surpress warnings:
def warn(*args, **kwargs):
    pass
import warnings
warnings.warn = warn

In [46]:
import piplite
await piplite.install(['pandas'])
await piplite.install(['numpy'])


In [47]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn import preprocessing
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn import svm
from sklearn.metrics import jaccard_score
from sklearn.metrics import f1_score
from sklearn.metrics import log_loss
from sklearn.metrics import confusion_matrix, accuracy_score
import sklearn.metrics as metrics
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

### Importing the Dataset

In [48]:
from pyodide.http import pyfetch

async def download(url, filename):
    response = await pyfetch(url)
    if response.status == 200:
        with open(filename, "wb") as f:
            f.write(await response.bytes())

In [49]:
path='https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBMDeveloperSkillsNetwork-ML0101EN-SkillUp/labs/ML-FinalAssignment/Weather_Data.csv'

In [50]:
await download(path, "Weather_Data.csv")
filename ="Weather_Data.csv"

In [51]:
df = pd.read_csv("Weather_Data.csv")
df.head()

Unnamed: 0,Date,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,WindDir3pm,...,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RainTomorrow
0,2/1/2008,19.5,22.4,15.6,6.2,0.0,W,41,S,SSW,...,92,84,1017.6,1017.4,8,8,20.7,20.9,Yes,Yes
1,2/2/2008,19.5,25.6,6.0,3.4,2.7,W,41,W,E,...,83,73,1017.9,1016.4,7,7,22.4,24.8,Yes,Yes
2,2/3/2008,21.6,24.5,6.6,2.4,0.1,W,41,ESE,ESE,...,88,86,1016.7,1015.6,7,8,23.5,23.0,Yes,Yes
3,2/4/2008,20.2,22.8,18.8,2.2,0.0,W,41,NNE,E,...,83,90,1014.2,1011.8,8,8,21.4,20.9,Yes,Yes
4,2/5/2008,19.7,25.7,77.4,4.8,0.0,W,41,NNE,W,...,88,74,1008.3,1004.8,8,8,22.5,25.5,Yes,Yes


### Preprocessing Data

In [52]:
#Categorical variables into binary variables
df_sydney_processed = pd.get_dummies(data=df, columns=['RainToday', 'WindGustDir', 'WindDir9am', 'WindDir3pm'])

In [53]:
#Categorical column into variable column - Target column
df_sydney_processed.replace(['No', 'Yes'], [0,1], inplace=True)

### Training and Test Data

In [54]:
df_sydney_processed.drop('Date', axis=1, inplace=True)
df_sydney_processed = df_sydney_processed.astype(float)

In [55]:
#x - features, y - target 
x = df_sydney_processed.drop(columns='RainTomorrow', axis=1)
y = df_sydney_processed['RainTomorrow']

In [56]:
x_train, x_test, y_train, y_test = train_test_split(x,y, test_size=0.2, random_state=10)

### Linear Regression

In [57]:
LinearReg = LinearRegression()
LinearReg.fit(x_train, y_train)

LinearRegression()

In [58]:
predictions = LinearReg.predict(x_test)

In [59]:
LinearRegression_MAE = mean_absolute_error(predictions, y_test)
LinearRegression_MSE = mean_squared_error(predictions, y_test)
LinearRegression_R2 = r2_score(predictions, y_test)

In [60]:
Report_lr = {"Metrics":["MAE","MSE","R2"],"Result": 
[LinearRegression_MAE,LinearRegression_MSE,LinearRegression_R2]}
pd.DataFrame(Report)

Unnamed: 0,Metrics,Result
0,MAE,0.256319
1,MSE,0.11572
2,R2,-0.384754


### KNN 

In [61]:
KNN = KNeighborsClassifier(n_neighbors=4)
KNN.fit(x_train, y_train)

KNeighborsClassifier(n_neighbors=4)

In [62]:
predictions_knn = KNN.predict(x_test)

In [63]:
KNN_Accuracy_Score = accuracy_score(predictions_knn, y_test)
KNN_JaccardIndex = jaccard_score(predictions_knn, y_test)
KNN_F1_Score = f1_score( predictions_knn, y_test)

In [64]:
report_knn = {"Metrics":["ACC","Jaccard","F1"],"Result": 
[KNN_Accuracy_Score,KNN_JaccardIndex,KNN_F1_Score]}
pd.DataFrame(report_knn)

Unnamed: 0,Metrics,Result
0,ACC,0.818321
1,Jaccard,0.425121
2,F1,0.59661


### Decision Tree

In [65]:
Tree = DecisionTreeClassifier()
Tree.fit(x_train, y_train)

DecisionTreeClassifier()

In [66]:
predictions =  Tree.predict(x_test)

In [67]:
Tree_Accuracy_Score = accuracy_score(predictions, y_test)
Tree_JaccardIndex = jaccard_score(predictions, y_test)
Tree_F1_Score = f1_score(predictions, y_test)

In [68]:
report_TREE = {"Metrics":["ACC","Jaccard","F1"],"Result": 
[Tree_Accuracy_Score,Tree_JaccardIndex,Tree_F1_Score]}
pd.DataFrame(report_TREE)

Unnamed: 0,Metrics,Result
0,ACC,0.763359
1,Jaccard,0.408397
2,F1,0.579946


### Logistic Regression

In [69]:
x_train, x_test, y_train, y_test = train_test_split(x,y, test_size=0.2, random_state=1)

In [70]:
LR = LogisticRegression(solver='liblinear')
LR.fit(x_train, y_train)

LogisticRegression(solver='liblinear')

In [71]:
predictions = LR.predict(x_test)

In [72]:
LR_Accuracy_Score = accuracy_score(predictions, y_test)
LR_JaccardIndex = jaccard_score(predictions, y_test)
LR_F1_Score = f1_score(predictions, y_test)
LR_Log_Loss = log_loss(predictions, y_test)

In [73]:
report_LR = {"Metrics":["ACC","Jaccard","F1","LL"],"Result": 
[LR_Accuracy_Score,LR_JaccardIndex,LR_F1_Score,LR_Log_Loss]}
pd.DataFrame(report_LR)

Unnamed: 0,Metrics,Result
0,ACC,0.835115
1,Jaccard,0.504587
2,F1,0.670732
3,LL,5.695031


### SVM

In [74]:
SVM = svm.SVC()
SVM.fit(x_train, y_train)

SVC()

In [75]:
predictions = SVM.predict(x_test)

In [76]:
SVM_Accuracy_Score = accuracy_score(predictions, y_test)
SVM_JaccardIndex = jaccard_score(predictions, y_test)
SVM_F1_Score = f1_score(predictions, y_test)

In [77]:
report_SVM = {"Metrics":["ACC","Jaccard","F1"],"Result": 
[SVM_Accuracy_Score,SVM_JaccardIndex,SVM_F1_Score]}
pd.DataFrame(report_SVM)

Unnamed: 0,Metrics,Result
0,ACC,0.722137
1,Jaccard,0.0
2,F1,0.0


### Final Report

In [78]:
Algorithm=['KNN','Decision Tree','Logistic Regression','SVM']
Accuracy_score = [KNN_Accuracy_Score,Tree_Accuracy_Score,LR_Accuracy_Score,SVM_Accuracy_Score]
Jaccard = [KNN_JaccardIndex,Tree_JaccardIndex,LR_JaccardIndex,SVM_JaccardIndex]
F1_score = [KNN_F1_Score, Tree_F1_Score, LR_F1_Score, SVM_F1_Score]

In [79]:
Report_all = pd.DataFrame({
    "Algorithm": Algorithm,
    "Accuracy Score": Accuracy_score,
    "Jaccard": Jaccard,
    "F1-Score": F1_score,
    "LogLoss":[np.NAN,np.NAN,LR_Log_Loss,np.NAN]})

In [80]:
Report_all

Unnamed: 0,Algorithm,Accuracy Score,Jaccard,F1-Score,LogLoss
0,KNN,0.818321,0.425121,0.59661,
1,Decision Tree,0.763359,0.408397,0.579946,
2,Logistic Regression,0.835115,0.504587,0.670732,5.695031
3,SVM,0.722137,0.0,0.0,
