# Author: Sandesh Basnet

# Australia Rainfall Prediction

In [1]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn import preprocessing
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.metrics import jaccard_score, f1_score, log_loss, accuracy_score, confusion_matrix, mean_absolute_error, mean_squared_error, r2_score

### Importing the Dataset


In [2]:
# https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBMDeveloperSkillsNetwork-ML0101EN-SkillUp/labs/ML-FinalAssignment/Weather_Data.csv

In [3]:
url = 'https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBMDeveloperSkillsNetwork-ML0101EN-SkillUp/labs/ML-FinalAssignment/Weather_Data.csv'

df= pd.read_csv(url)


In [4]:
df.head()

Unnamed: 0,Date,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,WindDir3pm,...,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RainTomorrow
0,2/1/2008,19.5,22.4,15.6,6.2,0.0,W,41,S,SSW,...,92,84,1017.6,1017.4,8,8,20.7,20.9,Yes,Yes
1,2/2/2008,19.5,25.6,6.0,3.4,2.7,W,41,W,E,...,83,73,1017.9,1016.4,7,7,22.4,24.8,Yes,Yes
2,2/3/2008,21.6,24.5,6.6,2.4,0.1,W,41,ESE,ESE,...,88,86,1016.7,1015.6,7,8,23.5,23.0,Yes,Yes
3,2/4/2008,20.2,22.8,18.8,2.2,0.0,W,41,NNE,E,...,83,90,1014.2,1011.8,8,8,21.4,20.9,Yes,Yes
4,2/5/2008,19.7,25.7,77.4,4.8,0.0,W,41,NNE,W,...,88,74,1008.3,1004.8,8,8,22.5,25.5,Yes,Yes


In [5]:
df.columns

Index(['Date', 'MinTemp', 'MaxTemp', 'Rainfall', 'Evaporation', 'Sunshine',
       'WindGustDir', 'WindGustSpeed', 'WindDir9am', 'WindDir3pm',
       'WindSpeed9am', 'WindSpeed3pm', 'Humidity9am', 'Humidity3pm',
       'Pressure9am', 'Pressure3pm', 'Cloud9am', 'Cloud3pm', 'Temp9am',
       'Temp3pm', 'RainToday', 'RainTomorrow'],
      dtype='object')

In [6]:
df.dtypes

Date              object
MinTemp          float64
MaxTemp          float64
Rainfall         float64
Evaporation      float64
Sunshine         float64
WindGustDir       object
WindGustSpeed      int64
WindDir9am        object
WindDir3pm        object
WindSpeed9am       int64
WindSpeed3pm       int64
Humidity9am        int64
Humidity3pm        int64
Pressure9am      float64
Pressure3pm      float64
Cloud9am           int64
Cloud3pm           int64
Temp9am          float64
Temp3pm          float64
RainToday         object
RainTomorrow      object
dtype: object

Performing one hot encoding to convert categorical variables to binary variables.


In [7]:
df_sydney_processed = pd.get_dummies(data=df, columns=['RainToday', 'WindGustDir', 'WindDir9am', 'WindDir3pm'])

In [8]:
df_sydney_processed.replace(['No', 'Yes'], [0,1], inplace=True)

### Training Data and Test Data


In [9]:
df_sydney_processed.drop('Date', axis=1, inplace=True)

In [10]:
df_sydney_processed.sample(5)

Unnamed: 0,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustSpeed,WindSpeed9am,WindSpeed3pm,Humidity9am,Humidity3pm,...,WindDir3pm_NNW,WindDir3pm_NW,WindDir3pm_S,WindDir3pm_SE,WindDir3pm_SSE,WindDir3pm_SSW,WindDir3pm_SW,WindDir3pm_W,WindDir3pm_WNW,WindDir3pm_WSW
2779,22.3,29.3,0.0,8.0,6.9,31,15,20,75,62,...,0,0,0,0,0,0,0,0,0,0
171,7.2,15.1,0.0,3.8,10.0,41,19,22,52,31,...,0,0,0,0,0,1,0,0,0,0
316,16.7,23.8,0.0,9.8,12.1,41,24,20,39,43,...,0,0,0,1,0,0,0,0,0,0
3140,21.6,24.0,0.0,9.0,1.9,44,17,24,68,73,...,0,0,0,0,1,0,0,0,0,0
137,13.6,19.4,0.0,1.6,4.1,41,13,2,90,69,...,0,0,0,0,0,0,0,0,0,0


In [11]:
df_sydney_processed = df_sydney_processed.astype(float)

In [12]:
X = df_sydney_processed.drop(columns='RainTomorrow', axis=1)
y = df_sydney_processed['RainTomorrow']

### Linear Regression


In [13]:
X_train, X_test, y_train, y_test =train_test_split(X, y, 
                                                   test_size= 0.2, 
                                                   random_state= 10)


In [14]:
LinearReg = LinearRegression()
LinearReg.fit(X_train, y_train)

In [15]:
predictions = LinearReg.predict(X_test)

In [16]:
LinearRegression_MAE = mean_absolute_error(y_test, predictions)
LinearRegression_MSE = mean_squared_error(y_test, predictions)
LinearRegression_R2 = r2_score(y_test, predictions)

In [17]:
Report = {
    'Metric': ['MAE', 'MSE', 'R2'],
    'Value': [LinearRegression_MAE, LinearRegression_MSE, LinearRegression_R2]
}
metrics = pd.DataFrame(Report)
metrics

Unnamed: 0,Metric,Value
0,MAE,0.256325
1,MSE,0.115723
2,R2,0.427119


### KNN


In [18]:
KNN = KNeighborsClassifier(n_neighbors= 4)
KNN.fit(X_train, y_train)

In [19]:
predictions = KNN.predict(X_test)

In [20]:
KNN_Accuracy_Score = accuracy_score(y_test, predictions)
KNN_JaccardIndex = jaccard_score(y_test, predictions)
KNN_F1_Score = f1_score(y_test, predictions)

### Decision Tree


In [21]:
Tree = DecisionTreeClassifier()
Tree.fit(X_train, y_train)

In [22]:
predictions = Tree.predict(X_test)

In [23]:
Tree_Accuracy_Score = accuracy_score(y_test, predictions)
Tree_JaccardIndex = jaccard_score(y_test, predictions)
Tree_F1_Score = f1_score(y_test, predictions)

### Logistic Regression


In [24]:
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size= 0.2, 
                                                    random_state= 1)

In [25]:
LR = LogisticRegression(max_iter= 10000, solver= 'liblinear')
LR.fit(X_train, y_train)

In [26]:
predictions = LR.predict(X_test)

In [27]:
predict_proba = LR.predict_proba(X_test)

In [28]:
LR_Accuracy_Score = accuracy_score(y_test, predictions)
LR_JaccardIndex = jaccard_score(y_test, predictions)
LR_F1_Score = f1_score(y_test, predictions)
LR_Log_Loss = log_loss(y_test, predictions)

### SVM


In [29]:
svc = SVC()
svc.fit(X_train, y_train)

In [30]:
predictions = svc.predict(X_test)

In [31]:
SVM_Accuracy_Score = accuracy_score(y_test, predictions)
SVM_JaccardIndex = jaccard_score(y_test, predictions)
SVM_F1_Score = f1_score(y_test, predictions)

### Report


In [32]:
Report = {
    "Model": ["Logistic Regression", "Decision Tree", "KNN", "SVM"],
    "Accuracy": [LR_Accuracy_Score, Tree_Accuracy_Score, KNN_Accuracy_Score, SVM_Accuracy_Score],
    "Jaccard Index": [LR_JaccardIndex, Tree_JaccardIndex, KNN_JaccardIndex, SVM_JaccardIndex],
    "F1-Score": [LR_F1_Score, Tree_F1_Score, KNN_F1_Score, SVM_F1_Score],
    "Log-Score": [LR_Log_Loss, 'NA', 'NA', 'NA'],
    
}
score = pd.DataFrame(Report)
score

Unnamed: 0,Model,Accuracy,Jaccard Index,F1-Score,Log-Score
0,Logistic Regression,0.833588,0.5,0.666667,5.998104
1,Decision Tree,0.748092,0.388889,0.56,
2,KNN,0.818321,0.425121,0.59661,
3,SVM,0.722137,0.0,0.0,
