# Rain Prediction in Australia

In [13]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn import preprocessing
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn import svm
from sklearn.metrics import jaccard_score
from sklearn.metrics import f1_score
from sklearn.metrics import log_loss
from sklearn.metrics import confusion_matrix, accuracy_score
import sklearn.metrics as metrics
import requests as re

In [25]:
def download(url, file_name):
    response = re.get(url)
    if response.status_code == 200:
        with open(file_name,'wb') as file:
            file.write(response.content)

In [29]:
URL = 'https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBMDeveloperSkillsNetwork-ML0101EN-SkillUp/labs/ML-FinalAssignment/Weather_Data.csv'
FILE_NAME = 'weather_data.csv'
download(URL, FILE_NAME)

In [39]:
df = pd.read_csv(FILE_NAME)
df.head()

Unnamed: 0,Date,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,WindDir3pm,...,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RainTomorrow
0,2/1/2008,19.5,22.4,15.6,6.2,0.0,W,41,S,SSW,...,92,84,1017.6,1017.4,8,8,20.7,20.9,Yes,Yes
1,2/2/2008,19.5,25.6,6.0,3.4,2.7,W,41,W,E,...,83,73,1017.9,1016.4,7,7,22.4,24.8,Yes,Yes
2,2/3/2008,21.6,24.5,6.6,2.4,0.1,W,41,ESE,ESE,...,88,86,1016.7,1015.6,7,8,23.5,23.0,Yes,Yes
3,2/4/2008,20.2,22.8,18.8,2.2,0.0,W,41,NNE,E,...,83,90,1014.2,1011.8,8,8,21.4,20.9,Yes,Yes
4,2/5/2008,19.7,25.7,77.4,4.8,0.0,W,41,NNE,W,...,88,74,1008.3,1004.8,8,8,22.5,25.5,Yes,Yes


## Preprocessing

### One Hot Encoding

In [67]:
df_sydney_processed = pd.get_dummies(data=df, columns=['RainToday', 'WindGustDir', 'WindDir9am', 'WindDir3pm'])

### Categorizing target variable

In [69]:
df_sydney_processed['RainTomorrow'] = np.where(df['RainTomorrow'] == 'Yes', 1, 0)

## Splitting dataset

In [79]:
df_sydney_processed.drop('Date',axis=1,inplace=True)

In [83]:
df_sydney_processed = df_sydney_processed.astype(float)

In [93]:
df_sydney_processed.dtypes

MinTemp           float64
MaxTemp           float64
Rainfall          float64
Evaporation       float64
Sunshine          float64
                   ...   
WindDir3pm_SSW    float64
WindDir3pm_SW     float64
WindDir3pm_W      float64
WindDir3pm_WNW    float64
WindDir3pm_WSW    float64
Length: 67, dtype: object

In [97]:
features = df_sydney_processed.drop(columns='RainTomorrow', axis = 1)
Y = df_sydney_processed['RainTomorrow']

In [99]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(features, Y, test_size=0.2, random_state=10)

# Linear Regression

In [102]:
LinearReg = LinearRegression()

In [104]:
LinearReg.fit(X_train, y_train)

In [108]:
predictions = LinearReg.predict(X_test)

In [110]:
LinearRegression_MAE = metrics.mean_absolute_error(y_test, predictions)
LinearRegression_MSE = metrics.mean_squared_error(y_test, predictions)
LinearRegression_R2 = metrics.r2_score(y_test, predictions)

In [118]:
report = pd.DataFrame({'Metric':['MAE','MSE','R2'],'Value':[LinearRegression_MAE,LinearRegression_MSE,LinearRegression_R2]})

In [120]:
report

Unnamed: 0,Metric,Value
0,MAE,0.256319
1,MSE,0.115723
2,R2,0.427121


# KNN

In [123]:
KNN = KNeighborsClassifier(n_neighbors=5)

In [125]:
KNN.fit(X_train,y_train)

In [129]:
predictions = KNN.predict(X_test)

In [131]:
KNN_Accuracy_Score = accuracy_score(y_test, predictions)
KNN_JaccardIndex = accuracy_score(y_test, predictions)
KNN_F1_Score = f1_score(y_test, predictions)

In [133]:
KNN_report = pd.DataFrame({'Metric':['AS','JI','F1'],'Value':[KNN_Accuracy_Score,KNN_JaccardIndex,KNN_F1_Score]})

In [135]:
KNN_report

Unnamed: 0,Metric,Value
0,AS,0.819847
1,JI,0.466063
2,F1,0.635802


# Decision Tree

In [138]:
Tree = DecisionTreeClassifier()

In [140]:
Tree.fit(X_train, y_train)

In [142]:
predictions = Tree.predict(X_test)

In [144]:
Tree_Accuracy_Score = accuracy_score(y_test, predictions)
Tree_JaccardIndex = accuracy_score(y_test, predictions)
Tree_F1_Score = f1_score(y_test, predictions)

In [146]:
Tree_report = pd.DataFrame({'Metric':['AS','JI','F1'],'Value':[Tree_Accuracy_Score,Tree_JaccardIndex,Tree_F1_Score]})

In [148]:
Tree_report

Unnamed: 0,Metric,Value
0,AS,0.754198
1,JI,0.754198
2,F1,0.558904


# Logistic Regression

In [151]:
X_train, X_test, y_train, y_test = train_test_split(features, Y, test_size=0.2, random_state=1)

In [153]:
LR = LogisticRegression(solver='liblinear')

In [155]:
LR.fit(X_train, y_train)

In [157]:
predictions = LR.predict(X_test)

In [161]:
predict_prob = LR.predict_proba(X_test)

In [171]:
LR_Accuracy_Score = accuracy_score(y_test, predictions)
LR_JaccardIndex = accuracy_score(y_test, predictions)
LR_F1_Score = f1_score(y_test, predictions)
LR_Log_Loss = log_loss(y_test, predictions)

In [173]:
LR_report = pd.DataFrame({'Metric':['AS','JI','F1','LL'],'Value':[LR_Accuracy_Score,LR_JaccardIndex,LR_F1_Score,LR_Log_Loss]})

In [175]:
LR_report

Unnamed: 0,Metric,Value
0,AS,0.836641
1,JI,0.836641
2,F1,0.674772
3,LL,5.888047


# SVM

In [178]:
SVM = svm.SVC()

In [180]:
SVM.fit(X_train, y_train)

In [182]:
predictions = SVM.predict(X_test)

In [184]:
SVM_Accuracy_Score = accuracy_score(y_test, predictions)
SVM_JaccardIndex = accuracy_score(y_test, predictions)
SVM_F1_Score = f1_score(y_test, predictions)

In [193]:
SVM_report = pd.DataFrame({'Metric':['AS','JI','F1'],'Value':[SVM_Accuracy_Score,SVM_JaccardIndex,SVM_F1_Score]})
SVM_report

Unnamed: 0,Metric,Value
0,AS,0.722137
1,JI,0.722137
2,F1,0.0


# Overall Report

In [195]:
Report = pd.DataFrame({
    'Model':['KNN','Decision Tree','Logistic Regression','SVM'],
    'Accuracy':[KNN_Accuracy_Score,Tree_Accuracy_Score,LR_Accuracy_Score,SVM_Accuracy_Score],
    'Jaccard Index':[KNN_JaccardIndex, Tree_JaccardIndex, LR_JaccardIndex, SVM_JaccardIndex],
    'F1-Score':[KNN_F1_Score,Tree_F1_Score,LR_F1_Score,SVM_F1_Score],
    'LogLoss':[np.nan, np.nan, LR_Log_Loss, np.nan]})

In [197]:
Report

Unnamed: 0,Model,Accuracy,Jaccard Index,F1-Score,LogLoss
0,KNN,0.819847,0.466063,0.635802,
1,Decision Tree,0.754198,0.754198,0.558904,
2,Logistic Regression,0.836641,0.836641,0.674772,5.888047
3,SVM,0.722137,0.722137,0.0,
