### Import Required Libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler,LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import precision_score, recall_score, f1_score, jaccard_score, r2_score, mean_squared_error, mean_absolute_error

### Import DataSet

In [2]:
# Data source (download and save next to this notebook as Weather_Data.csv):
# https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBMDeveloperSkillsNetwork-ML0101EN-SkillUp/labs/ML-FinalAssignment/Weather_Data.csv

# Import Data
df = pd.read_csv('Weather_Data.csv')

# Check datatypes and non-null counts (info() prints its report directly).
df.info()

# A bare expression in the middle of a cell is never displayed, so report the
# missing-value total explicitly instead of silently discarding the result.
missing_per_column = df.isnull().sum()
print(f'Total missing values: {int(missing_per_column.sum())}')

# Preview the first rows (last expression of the cell, so it is rendered).
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3271 entries, 0 to 3270
Data columns (total 22 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Date           3271 non-null   object 
 1   MinTemp        3271 non-null   float64
 2   MaxTemp        3271 non-null   float64
 3   Rainfall       3271 non-null   float64
 4   Evaporation    3271 non-null   float64
 5   Sunshine       3271 non-null   float64
 6   WindGustDir    3271 non-null   object 
 7   WindGustSpeed  3271 non-null   int64  
 8   WindDir9am     3271 non-null   object 
 9   WindDir3pm     3271 non-null   object 
 10  WindSpeed9am   3271 non-null   int64  
 11  WindSpeed3pm   3271 non-null   int64  
 12  Humidity9am    3271 non-null   int64  
 13  Humidity3pm    3271 non-null   int64  
 14  Pressure9am    3271 non-null   float64
 15  Pressure3pm    3271 non-null   float64
 16  Cloud9am       3271 non-null   int64  
 17  Cloud3pm       3271 non-null   int64  
 18  Temp9am 

Unnamed: 0,Date,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,WindDir3pm,...,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RainTomorrow
0,2/1/2008,19.5,22.4,15.6,6.2,0.0,W,41,S,SSW,...,92,84,1017.6,1017.4,8,8,20.7,20.9,Yes,Yes
1,2/2/2008,19.5,25.6,6.0,3.4,2.7,W,41,W,E,...,83,73,1017.9,1016.4,7,7,22.4,24.8,Yes,Yes
2,2/3/2008,21.6,24.5,6.6,2.4,0.1,W,41,ESE,ESE,...,88,86,1016.7,1015.6,7,8,23.5,23.0,Yes,Yes
3,2/4/2008,20.2,22.8,18.8,2.2,0.0,W,41,NNE,E,...,83,90,1014.2,1011.8,8,8,21.4,20.9,Yes,Yes
4,2/5/2008,19.7,25.7,77.4,4.8,0.0,W,41,NNE,W,...,88,74,1008.3,1004.8,8,8,22.5,25.5,Yes,Yes


### Data Pre-Processing
* Use One-Hot Encoding to convert categorical variables to numeric variables.
* One-Hot Encoding notes:
    1. `sparse_output=False`: returns a dense array, which we convert to a DataFrame for readability.
    2. `drop=None`: no category is dropped during encoding, so every category gets its own column.

In [3]:
# Categorical feature columns to one-hot encode. Defined once so the encoder
# input, the generated column names and the dropped columns cannot drift apart.
categorical_cols = ['RainToday', 'WindGustDir', 'WindDir9am', 'WindDir3pm']
columns_to_drop = categorical_cols  # kept under its original name as well

# Initialize One Hot Encoder: dense output, no category dropped.
encoder = OneHotEncoder(sparse_output=False, drop=None)

# Fit and transform the categorical columns, then wrap the dense array in a
# DataFrame for readability. Reusing df's index guarantees that pd.concat
# below aligns the encoded rows with the original rows.
encoded_features = pd.DataFrame(
    encoder.fit_transform(df[categorical_cols]),
    columns=encoder.get_feature_names_out(categorical_cols),
    index=df.index,
)

# Build the encoded frame from a copy of df without the original categorical
# columns. `df` itself is deliberately NOT overwritten: the cell stays
# idempotent, so re-running it no longer fails because the categorical
# columns were already removed by a previous run.
df_encoded = pd.concat([df.drop(columns=categorical_cols), encoded_features], axis=1)

# RainTomorrow is the target value and must be numeric. One-hot encoding it
# would create two columns, which we do not want, so map No/Yes to 0/1.
# An unexpected label becomes NaN and makes astype(int) raise instead of
# slipping through unnoticed.
pd.set_option('future.no_silent_downcasting', True)  # retained so later cells see the same pandas configuration
df_encoded['RainTomorrow'] = df_encoded['RainTomorrow'].map({'No': 0, 'Yes': 1}).astype(int)

# Preview the encoded data set.
df_encoded.head()

Unnamed: 0,Date,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustSpeed,WindSpeed9am,WindSpeed3pm,Humidity9am,...,WindDir3pm_NNW,WindDir3pm_NW,WindDir3pm_S,WindDir3pm_SE,WindDir3pm_SSE,WindDir3pm_SSW,WindDir3pm_SW,WindDir3pm_W,WindDir3pm_WNW,WindDir3pm_WSW
0,2/1/2008,19.5,22.4,15.6,6.2,0.0,41,17,20,92,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1,2/2/2008,19.5,25.6,6.0,3.4,2.7,41,9,13,83,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2/3/2008,21.6,24.5,6.6,2.4,0.1,41,17,2,88,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2/4/2008,20.2,22.8,18.8,2.2,0.0,41,22,20,83,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,2/5/2008,19.7,25.7,77.4,4.8,0.0,41,11,6,88,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


### Train Data and Test Data
* Set X for features and y for Target Value

In [4]:
# Target vector: 1 if it rains tomorrow, 0 otherwise.
y = df_encoded['RainTomorrow']

# Feature matrix: everything except the target and the raw date string.
# astype() returns a new frame, so the result must be assigned; the original
# bare `X.astype(float)` call discarded the converted frame.
X = df_encoded.drop(['RainTomorrow', 'Date'], axis=1).astype(float)

# Split the data into train (80%) and test (20%); fixed seed for reproducibility.
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=10)

# Preview the feature matrix (last expression of the cell, so it is rendered).
X.head()

### Fit a Linear Regression Model and Evaluate Its Performance
* R2 = proportion of the variance in the actual values that is explained by the predictions
* MAE = average of the absolute differences between actual and predicted values
* MSE = average of the squared differences between actual and predicted values
    * MSE penalizes larger errors more heavily because the error terms are squared.


In [13]:
# Fit a linear regression on the training split (fit() returns the fitted
# estimator) and predict on the held-out test split.
LinearReg = LinearRegression().fit(x_train, y_train)
LinearReg_yhat = LinearReg.predict(x_test)

# Score the test-set predictions with each regression metric in turn.
LinearReg_R2, LinearReg_MAE, LinearReg_MSE = (
    metric(y_test, LinearReg_yhat)
    for metric in (r2_score, mean_absolute_error, mean_squared_error)
)

# Report the three scores, one per line.
print(
    f'Linear Regression R2 Score : {LinearReg_R2}',
    f'Linear Regression MAE : {LinearReg_MAE}',
    f'Linear Regression MSE : {LinearReg_MSE}',
    sep='\n',
)

Linear Regression R2 Score : 0.4271321073623009
Linear Regression MAE : 0.25631760994203784
Linear Regression MSE : 0.11572058282746588
