In [1]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn import preprocessing
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn import svm
from sklearn.metrics import jaccard_score
from sklearn.metrics import f1_score
from sklearn.metrics import log_loss
from sklearn.metrics import confusion_matrix, accuracy_score
import sklearn.metrics as metrics

### Importing the Dataset

In [2]:
path='https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBMDeveloperSkillsNetwork-ML0101EN-SkillUp/labs/ML-FinalAssignment/Weather_Data.csv'

In [3]:
df = pd.read_csv("Weather_Data.csv")
df.head()

Unnamed: 0,Date,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,WindDir3pm,...,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RainTomorrow
0,2/1/2008,19.5,22.4,15.6,6.2,0.0,W,41,S,SSW,...,92,84,1017.6,1017.4,8,8,20.7,20.9,Yes,Yes
1,2/2/2008,19.5,25.6,6.0,3.4,2.7,W,41,W,E,...,83,73,1017.9,1016.4,7,7,22.4,24.8,Yes,Yes
2,2/3/2008,21.6,24.5,6.6,2.4,0.1,W,41,ESE,ESE,...,88,86,1016.7,1015.6,7,8,23.5,23.0,Yes,Yes
3,2/4/2008,20.2,22.8,18.8,2.2,0.0,W,41,NNE,E,...,83,90,1014.2,1011.8,8,8,21.4,20.9,Yes,Yes
4,2/5/2008,19.7,25.7,77.4,4.8,0.0,W,41,NNE,W,...,88,74,1008.3,1004.8,8,8,22.5,25.5,Yes,Yes


### Data Preprocessing

#### One Hot Encoding

First, we need to perform one hot encoding to convert categorical variables to binary variables.

In [4]:
df_sydney_processed = pd.get_dummies(data=df, columns=['RainToday', 'WindGustDir', 'WindDir9am', 'WindDir3pm'])

Next, we replace the values of the 'RainTomorrow' column changing them from a categorical column to a binary column. We do not use the `get_dummies` method because we would end up with two columns for 'RainTomorrow' and we do not want, since 'RainTomorrow' is our target.


In [5]:
df_sydney_processed.replace(['No', 'Yes'], [0,1], inplace=True)

### Training Data and Test Data

Now, we set our 'features' or x values and our Y or target variable.

In [6]:
df_sydney_processed.drop('Date',axis=1,inplace=True)

In [7]:
df_sydney_processed = df_sydney_processed.astype(float)

In [8]:
features = df_sydney_processed.drop(columns='RainTomorrow', axis=1)
Y = df_sydney_processed['RainTomorrow']

### Linear Regression

#### Q1) Use the `train_test_split` function to split the `features` and `Y` dataframes with a `test_size` of `0.2` and the `random_state` set to `10`.

In [9]:
x_train, x_test, y_train, y_test= train_test_split(features, Y, test_size=0.2, random_state=10)
x_train, x_test, y_train, y_test

(      MinTemp  MaxTemp  Rainfall  Evaporation  Sunshine  WindGustSpeed  \
 3188     14.8     22.0      33.8          4.2       1.5           50.0   
 2582      8.1     18.4       0.0          4.8       8.5           41.0   
 815      15.4     21.1       0.0          3.8       5.9           41.0   
 1024     20.0     26.5       0.0          8.6      13.1           30.0   
 1320     14.8     18.3      38.8          7.4       0.1           48.0   
 ...       ...      ...       ...          ...       ...            ...   
 3195     11.7     17.9       8.8          7.8       6.1           54.0   
 1344     17.0     21.7       0.0          5.4      10.7           26.0   
 527       6.7     17.3       0.0          2.6       6.9           41.0   
 3197     15.0     22.7       9.4          4.2       2.8           43.0   
 1289     15.9     30.1       0.2          7.6      12.2           33.0   
 
       WindSpeed9am  WindSpeed3pm  Humidity9am  Humidity3pm  ...  \
 3188          19.0          2

#### Q2) Create and train a Linear Regression model called LinearReg using the training data (`x_train`, `y_train`).

In [10]:
# Create the Linear Regression model
LinearReg = LinearRegression()

# Train the model using the training data
LinearReg.fit(x_train, y_train)

LinearReg

LinearRegression()

#### Q3) Now use the `predict` method on the testing data (`x_test`) and save it to the array `predictions`.

In [11]:
predictions = LinearReg.predict(x_test)
predictions

array([ 1.31813049e-01,  2.76214600e-01,  9.78200912e-01,  2.87437439e-01,
        1.32423401e-01,  4.60474014e-01,  3.56777191e-01,  8.56437683e-01,
        6.75012589e-01,  3.82213593e-02,  4.75692749e-03,  2.81200409e-01,
        3.39061737e-01,  7.81154633e-02,  6.25858307e-02,  5.64435959e-01,
       -6.15749359e-02,  5.24192810e-01,  1.53690338e-01,  3.59710693e-01,
        6.05564117e-02,  9.03549194e-01,  4.67275620e-01,  2.03372955e-01,
       -7.10105896e-02,  3.83895874e-01,  5.36081314e-01, -2.28939056e-02,
        6.40096664e-01, -9.56516266e-02,  3.78076553e-01,  1.20264053e-01,
       -1.81388855e-02,  5.53817749e-02,  5.63533783e-01,  1.06299210e+00,
       -6.71958923e-03,  5.14400482e-01, -8.83979797e-02,  6.91871643e-02,
        2.44693756e-02,  8.71761322e-01,  2.44667053e-01,  3.94721985e-01,
        2.67553329e-01,  4.46792603e-01, -4.75807190e-02,  1.89435959e-01,
        7.76603699e-01,  1.57752991e-01,  3.97872925e-03, -5.19638062e-02,
        2.07319260e-01, -

#### Q4) Using the `predictions` and the `y_test` dataframe calculate the value for each metric using the appropriate function.


In [12]:
LinearRegression_MAE = metrics.mean_absolute_error(y_test, predictions)
LinearRegression_MSE = metrics.mean_squared_error(y_test, predictions)
LinearRegression_R2 = metrics.r2_score(y_test, predictions)

print("LinearRegression_MAE:",LinearRegression_MAE)
print("LinearRegression_MSE:",LinearRegression_MSE)
print("LinearRegression_R2:",LinearRegression_R2)

LinearRegression_MAE: 0.25631593311105977
LinearRegression_MSE: 0.11571951119153882
LinearRegression_R2: 0.42713741243249836


#### Q5) Show the MAE, MSE, and R2 in a tabular format using data frame for the linear model.


In [13]:
Report = pd.DataFrame({
    'Metric': ['Mean Absolute Error', 'Mean Squared Error', 'R-squared'],
    'Value': [LinearRegression_MAE, LinearRegression_MSE, LinearRegression_R2]
})

print(Report)

                Metric     Value
0  Mean Absolute Error  0.256316
1   Mean Squared Error  0.115720
2            R-squared  0.427137


### KNN

#### Q6) Create and train a KNN model called KNN using the training data (`x_train`, `y_train`) with the `n_neighbors` parameter set to `4`.

In [14]:
KNN = KNeighborsClassifier(n_neighbors=4)

KNN.fit(x_train,y_train)

KNN

KNeighborsClassifier(n_neighbors=4)

#### Q7) Now use the `predict` method on the testing data (`x_test`) and save it to the array `predictions`.

In [15]:
predictions = KNN.predict(x_test)
predictions

array([0., 0., 1., 0., 0., 0., 0., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0.,
       0., 1., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 1., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 1., 0., 0., 0., 0.,
       1., 0., 1., 1., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 1., 0.,
       0., 0., 0., 1., 0., 0., 0., 0., 1., 0., 0., 0., 0., 1., 0., 0., 0.,
       0., 0., 0., 0., 1., 0., 1., 1., 0., 0., 0., 0., 0., 0., 0., 1., 0.,
       1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 1., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       1., 0., 0., 0., 0., 0., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 1.,
       0., 0., 0., 0., 0.

#### Q8) Using the `predictions` and the `y_test` dataframe calculate the value for each metric using the appropriate function.

In [16]:
KNN_Accuracy_Score = metrics.accuracy_score(y_test, predictions)
KNN_JaccardIndex = metrics.jaccard_score(y_test, predictions)
KNN_F1_Score = metrics.f1_score(y_test, predictions)

print("KNN_Accuracy_Score:",KNN_Accuracy_Score)
print("KNN_JaccardIndex:",KNN_JaccardIndex)
print("KNN_F1_Score:",KNN_F1_Score)

KNN_Accuracy_Score: 0.8183206106870229
KNN_JaccardIndex: 0.4251207729468599
KNN_F1_Score: 0.5966101694915255


### Decision Tree

#### Q9) Create and train a Decision Tree model called Tree using the training data (`x_train`, `y_train`).

In [17]:
# Create the Decision Tree model
Tree = DecisionTreeClassifier()

# Train the model using the training data
Tree.fit(x_train, y_train)

Tree

DecisionTreeClassifier()

#### Q10) Now use the `predict` method on the testing data (`x_test`) and save it to the array `predictions`.

In [18]:
predictions = Tree.predict(x_test)
predictions

array([0., 0., 1., 0., 0., 1., 0., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 1., 0., 0., 0., 1., 0., 0., 1., 0., 0., 0., 0., 0.,
       0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 1., 0., 0.,
       0., 0., 0., 0., 1., 0., 1., 1., 1., 0., 0., 1., 1., 0., 0., 1., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 1., 0., 0., 0., 0., 1.,
       0., 1., 1., 1., 0., 0., 0., 0., 0., 0., 1., 0., 1., 0., 0., 0., 0.,
       1., 1., 1., 1., 0., 1., 0., 1., 0., 0., 0., 0., 0., 1., 0., 1., 0.,
       0., 0., 0., 0., 0., 0., 0., 1., 1., 0., 1., 1., 0., 1., 0., 0., 0.,
       0., 0., 0., 1., 1., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 1., 0.,
       0., 0., 0., 0., 1., 0., 1., 1., 0., 0., 0., 0., 0., 0., 1., 0., 0.,
       1., 0., 1., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 1., 0., 0., 0.,
       0., 1., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0.,
       1., 0., 0., 0., 1., 1., 0., 0., 1., 0., 0., 0., 0., 0., 1., 0., 1.,
       0., 0., 0., 0., 1.

#### Q11) Using the `predictions` and the `y_test` dataframe calculate the value for each metric using the appropriate function.

In [19]:
Tree_Accuracy_Score = metrics.accuracy_score(y_test, predictions)
Tree_JaccardIndex = metrics.jaccard_score(y_test, predictions)
Tree_F1_Score = metrics.f1_score(y_test, predictions)

print("Tree_Accuracy_Score:",Tree_Accuracy_Score)
print("Tree_JaccardIndex:",Tree_JaccardIndex)
print("Tree_F1_Score:",Tree_F1_Score)

Tree_Accuracy_Score: 0.766412213740458
Tree_JaccardIndex: 0.4226415094339623
Tree_F1_Score: 0.5941644562334217


### Logistic Regression

#### Q12) Use the `train_test_split` function to split the `features` and `Y` dataframes with a `test_size` of `0.2` and the `random_state` set to `1`.


In [20]:
x_train, x_test, y_train, y_test = train_test_split(features, Y, test_size =0.2, random_state = 1)

x_train, x_test, y_train, y_test

(      MinTemp  MaxTemp  Rainfall  Evaporation  Sunshine  WindGustSpeed  \
 1264     12.3     19.7      30.0          4.0       7.2           44.0   
 3113     25.1     38.1       0.0         13.8      11.1           59.0   
 1230     12.2     17.5       0.0          2.8       3.0           41.0   
 1221     10.3     16.5       1.0          2.2       0.8           24.0   
 3070     20.4     23.6       3.2          7.0       0.0           33.0   
 ...       ...      ...       ...          ...       ...            ...   
 2763     19.5     28.7       9.0          3.0       3.0           70.0   
 905       6.8     16.1       0.0          4.2      10.2           41.0   
 1096     20.6     28.1       1.0          3.2       9.8           41.0   
 235      10.3     18.6       0.0          4.0      10.9           41.0   
 1061     24.2     41.5       0.0          8.4      12.4           56.0   
 
       WindSpeed9am  WindSpeed3pm  Humidity9am  Humidity3pm  ...  \
 1264          24.0          1

#### Q13) Create and train a LogisticRegression model called LR using the training data (`x_train`, `y_train`) with the `solver` parameter set to `liblinear`

In [21]:
LR = LogisticRegression(solver ="liblinear").fit(x_train,y_train)

LR

LogisticRegression(solver='liblinear')

#### Q14) Now, use the `predict` and `predict_proba` methods on the testing data (`x_test`) and save it as 2 arrays `predictions` and `predict_proba`.

In [22]:
predictions = LR.predict(x_test)
predictions

array([0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 1., 1., 0., 0., 0., 0., 1.,
       0., 0., 0., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.,
       1., 1., 1., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.,
       0., 1., 0., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.,
       0., 0., 0., 1., 0., 0., 1., 0., 0., 1., 0., 0., 0., 0., 0., 0., 1.,
       0., 1., 1., 0., 0., 0., 0., 0., 0., 0., 1., 1., 0., 0., 1., 0., 0.,
       0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 1.,
       0., 0., 0., 1., 0., 1., 0., 0., 0., 1., 0., 1., 1., 0., 0., 0., 1.,
       0., 0., 1., 0., 0., 1., 0., 1., 0., 1., 1., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 1., 0., 0., 0., 0., 0., 1., 1., 0., 0., 0., 1., 1., 0.,
       0., 0., 0., 0., 0., 1., 0., 0., 1., 0., 1., 0., 0., 0., 0., 0., 0.,
       0., 1., 1., 0., 0., 1., 0., 0., 0., 1., 0., 0., 0., 1., 0., 0., 0.,
       1., 0., 1., 1., 0.

In [23]:
predict_proba = LR.predict(x_test)

predict_proba

array([0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 1., 1., 0., 0., 0., 0., 1.,
       0., 0., 0., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.,
       1., 1., 1., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.,
       0., 1., 0., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.,
       0., 0., 0., 1., 0., 0., 1., 0., 0., 1., 0., 0., 0., 0., 0., 0., 1.,
       0., 1., 1., 0., 0., 0., 0., 0., 0., 0., 1., 1., 0., 0., 1., 0., 0.,
       0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 1.,
       0., 0., 0., 1., 0., 1., 0., 0., 0., 1., 0., 1., 1., 0., 0., 0., 1.,
       0., 0., 1., 0., 0., 1., 0., 1., 0., 1., 1., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 1., 0., 0., 0., 0., 0., 1., 1., 0., 0., 0., 1., 1., 0.,
       0., 0., 0., 0., 0., 1., 0., 0., 1., 0., 1., 0., 0., 0., 0., 0., 0.,
       0., 1., 1., 0., 0., 1., 0., 0., 0., 1., 0., 0., 0., 1., 0., 0., 0.,
       1., 0., 1., 1., 0.

#### Q15) Using the `predictions`, `predict_proba` and the `y_test` dataframe calculate the value for each metric using the appropriate function.

In [26]:
LR_Accuracy_Score = metrics.accuracy_score(y_test, predictions)
LR_JaccardIndex = metrics.jaccard_score(y_test, predictions)
LR_F1_Score = metrics.f1_score(y_test, predictions)
LR_Log_Loss = metrics.log_loss(y_test,predict_proba)

print("LR_Accuracy_Score:",LR_Accuracy_Score)
print("LR_JaccardIndex:",LR_JaccardIndex)
print("LR_F1_Score:",LR_F1_Score)
print("LR_Log_Loss:",LR_Log_Loss)

LR_Accuracy_Score: 0.8351145038167939
LR_JaccardIndex: 0.5045871559633027
LR_F1_Score: 0.6707317073170731
LR_Log_Loss: 5.69498723077533


### SVM


#### Q16) Create and train a SVM model called SVM using the training data (`x_train`, `y_train`).

In [27]:
# Create the SVM model
SVM = svm.SVC()

# Train the SVM model using the training data
SVM.fit(x_train, y_train)

SVC()

#### Q17) Now use the `predict` method on the testing data (`x_test`) and save it to the array `predictions`.

In [28]:
predictions = SVM.predict(x_test)
predictions

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0.

#### Q18) Using the `predictions` and the `y_test` dataframe calculate the value for each metric using the appropriate function.

In [29]:
SVM_Accuracy_Score = metrics.accuracy_score(y_test, predictions)
SVM_JaccardIndex = metrics.jaccard_score(y_test, predictions)
SVM_F1_Score = metrics.f1_score(y_test, predictions)

print("SVM_Accuracy_Score:",SVM_Accuracy_Score)
print("SVM_JaccardIndex:",SVM_JaccardIndex)
print("SVM_F1_Score:",SVM_F1_Score)

SVM_Accuracy_Score: 0.7221374045801526
SVM_JaccardIndex: 0.0
SVM_F1_Score: 0.0


### Report

#### Q19) Show the Accuracy,Jaccard Index,F1-Score and LogLoss in a tabular format using data frame for all of the above models.

\*LogLoss is only for Logistic Regression Model

In [36]:
Report = {
    'Model': ['Linear Regression', 'KNN', 'Decision Tree', 'Logistic Regression', 'SVM'],
    'Accuracy': [LR_Accuracy_Score, KNN_Accuracy_Score, Tree_Accuracy_Score, LR_Accuracy_Score, SVM_Accuracy_Score],
    'Jaccard Index': [LR_JaccardIndex, KNN_JaccardIndex, Tree_JaccardIndex, LR_JaccardIndex, SVM_JaccardIndex],
    'F1-Score': [LR_F1_Score, KNN_F1_Score, Tree_F1_Score, LR_F1_Score, SVM_F1_Score],
    'Log Loss': [LR_Log_Loss, '-', '-', LR_Log_Loss, '-']
}

# Create a DataFrame from the dictionary
df = pd.DataFrame(Report)

# Display the DataFrame
print(Report)


{'Model': ['Linear Regression', 'KNN', 'Decision Tree', 'Logistic Regression', 'SVM'], 'Accuracy': [0.8351145038167939, 0.8183206106870229, 0.766412213740458, 0.8351145038167939, 0.7221374045801526], 'Jaccard Index': [0.5045871559633027, 0.4251207729468599, 0.4226415094339623, 0.5045871559633027, 0.0], 'F1-Score': [0.6707317073170731, 0.5966101694915255, 0.5941644562334217, 0.6707317073170731, 0.0], 'Log Loss': [5.69498723077533, '-', '-', 5.69498723077533, '-']}
