## **Import the required libraries**


In [2]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn import preprocessing
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn import svm
from sklearn.metrics import jaccard_score
from sklearn.metrics import f1_score
from sklearn.metrics import log_loss
from sklearn.metrics import confusion_matrix, accuracy_score
import sklearn.metrics as metrics
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error

### Importing the Dataset


In [3]:
df = pd.read_csv("Weather_Data.csv")
df.head()

Unnamed: 0,Date,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,WindDir3pm,...,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RainTomorrow
0,2/1/2008,19.5,22.4,15.6,6.2,0.0,W,41,S,SSW,...,92,84,1017.6,1017.4,8,8,20.7,20.9,Yes,Yes
1,2/2/2008,19.5,25.6,6.0,3.4,2.7,W,41,W,E,...,83,73,1017.9,1016.4,7,7,22.4,24.8,Yes,Yes
2,2/3/2008,21.6,24.5,6.6,2.4,0.1,W,41,ESE,ESE,...,88,86,1016.7,1015.6,7,8,23.5,23.0,Yes,Yes
3,2/4/2008,20.2,22.8,18.8,2.2,0.0,W,41,NNE,E,...,83,90,1014.2,1011.8,8,8,21.4,20.9,Yes,Yes
4,2/5/2008,19.7,25.7,77.4,4.8,0.0,W,41,NNE,W,...,88,74,1008.3,1004.8,8,8,22.5,25.5,Yes,Yes


### Data Preprocessing


In [4]:
df_sydney_processed = pd.get_dummies(data=df, columns=['RainToday', 'WindGustDir', 'WindDir9am', 'WindDir3pm'])

Next, we replace the values of the 'RainTomorrow' column changing them from a categorical column to a binary column. We do not use the `get_dummies` method because we would end up with two columns for 'RainTomorrow' and we do not want, since 'RainTomorrow' is our target.


In [5]:
df_sydney_processed.replace(['No', 'Yes'], [0,1], inplace=True)

### Training Data and Test Data


In [6]:
df_sydney_processed.drop('Date',axis=1,inplace=True)

In [7]:
df_sydney_processed = df_sydney_processed.astype(float)

In [8]:
features = df_sydney_processed.drop(columns='RainTomorrow', axis=1)
Y = df_sydney_processed['RainTomorrow']

### Linear Regression


In [9]:
x_train, x_test, y_train, y_test = train_test_split(features, Y, test_size = 0.2, random_state = 10)

In [10]:
LinearReg = LinearRegression()
LinearReg.fit(x_train, y_train)

In [11]:
predictions = LinearReg.predict(x_test)

In [13]:
LinearRegression_MAE = mean_absolute_error(y_test, predictions)
LinearRegression_MSE = mean_squared_error(y_test, predictions)
LinearRegression_R2 = LinearReg.score(x_test, y_test)

In [33]:
data = {
    'Metric': ['MAE', 'MSE', 'R2'],
    'Value': [LinearRegression_MAE, LinearRegression_MSE, LinearRegression_R2]
}

Report = pd.DataFrame(data)
print(Report)

  Metric     Value
0    MAE  0.256318
1    MSE  0.115719
2     R2  0.427138


### KNN


In [17]:
knn = KNeighborsClassifier(n_neighbors=4) 
knn.fit(x_train, y_train)

In [18]:
predictions = knn.predict(x_test)

In [19]:
KNN_Accuracy_Score = accuracy_score(y_test, predictions)
KNN_JaccardIndex = jaccard_score(y_test, predictions)
KNN_F1_Score = f1_score(y_test, predictions)


print(KNN_Accuracy_Score)
print(KNN_JaccardIndex)
print(KNN_F1_Score)

0.8183206106870229
0.4251207729468599
0.5966101694915255


### Decision Tree


In [20]:
Tree = DecisionTreeClassifier()

# Train the model
Tree.fit(x_train, y_train)

In [21]:
predictions = Tree.predict(x_test)

In [22]:
Tree_Accuracy_Score = accuracy_score(y_test, predictions)
Tree_JaccardIndex = jaccard_score(y_test, predictions)
Tree_F1_Score = f1_score(y_test, predictions)


print(Tree_Accuracy_Score)
print(Tree_JaccardIndex)
print(Tree_F1_Score)
                    

0.7618320610687023
0.4068441064638783
0.5783783783783784


### Logistic Regression


In [23]:
x_train, x_test, y_train, y_test = train_test_split(features, Y, test_size = 0.2, random_state = 1)

In [24]:
LR = LogisticRegression(solver='liblinear')

LR.fit(x_train, y_train)

In [25]:
predictions = LR.predict(x_test)

In [26]:
predict_proba = LR.predict_proba(x_test)

In [27]:
LR_Accuracy_Score = accuracy_score(y_test, predictions)
LR_JaccardIndex = jaccard_score(y_test, predictions)
LR_F1_Score = f1_score(y_test, predictions)
LR_Log_Loss = log_loss(y_test, predict_proba)


print(LR_Accuracy_Score)
print(LR_JaccardIndex)
print(LR_F1_Score)
print(LR_Log_Loss)

0.8366412213740458
0.5091743119266054
0.6747720364741641
0.3808204103167027


### SVM


In [28]:
from sklearn.svm import SVC

In [29]:
SVM = SVC()

# Train the model
SVM.fit(x_train, y_train)

In [30]:
predictions = SVM.predict(x_test)

In [31]:
SVM_Accuracy_Score = accuracy_score(y_test, predictions)
SVM_JaccardIndex = jaccard_score(y_test, predictions)
SVM_F1_Score = f1_score(y_test, predictions)

print(SVM_Accuracy_Score)
print(SVM_JaccardIndex)
print(SVM_F1_Score)

0.7221374045801526
0.0
0.0


### Report


In [32]:
data = {
    'Model': ['KNN', 'Logistic Regression', 'Decision Tree', 'SVM'],
    'Accuracy': [KNN_Accuracy_Score, LR_Accuracy_Score, Tree_Accuracy_Score, SVM_Accuracy_Score],
    'Jaccard Index': [KNN_JaccardIndex, LR_JaccardIndex, Tree_JaccardIndex, SVM_JaccardIndex],
    'F1-Score': [KNN_F1_Score, LR_F1_Score, Tree_F1_Score, SVM_F1_Score],
    'Log Loss': [LR_Log_Loss, 0, 0, 0]
}

Report = pd.DataFrame(data)

Report  

Unnamed: 0,Model,Accuracy,Jaccard Index,F1-Score,Log Loss
0,KNN,0.818321,0.425121,0.59661,0.38082
1,Logistic Regression,0.836641,0.509174,0.674772,0.0
2,Decision Tree,0.761832,0.406844,0.578378,0.0
3,SVM,0.722137,0.0,0.0,0.0
