<a href="https://colab.research.google.com/github/LanceAlcala/CPEN-research/blob/main/Copy_of_Draft_codes_for_Lab_Project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#Setup

In [None]:
import sys
assert sys.version_info >= (3, 5)

import sklearn
assert sklearn.__version__ >= "0.20"

import numpy as np
import os
import pandas as pd

np.random.seed(42)

from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import VotingClassifier
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import BaggingClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

PROJECT_ROOT_DIR = "."
CHAPTER_ID = "ensembles"
IMAGES_PATH = os.path.join(PROJECT_ROOT_DIR, "images", CHAPTER_ID)
os.makedirs(IMAGES_PATH, exist_ok=True)

def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=300):
    path = os.path.join(IMAGES_PATH, fig_id + "." + fig_extension)
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(path, format=fig_extension, dpi=resolution)

##Data Preprocessing

In [None]:
dataset = pd.read_csv("/content/sample_data/seattle-weather.csv")
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1461 entries, 0 to 1460
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   date           1461 non-null   object 
 1   precipitation  1461 non-null   float64
 2   temp_max       1461 non-null   float64
 3   temp_min       1461 non-null   float64
 4   wind           1461 non-null   float64
 5   weather        1461 non-null   object 
dtypes: float64(4), object(2)
memory usage: 68.6+ KB


In [None]:
dataset.duplicated().sum()

0

In [None]:
dataset['weather'].unique()

array(['drizzle', 'rain', 'sun', 'snow', 'fog'], dtype=object)

In [None]:
from sklearn.preprocessing import LabelEncoder
LE = LabelEncoder()

dataset['weather_label'] = LE.fit_transform(dataset['weather'])
dataset.head(20)

Unnamed: 0,date,precipitation,temp_max,temp_min,wind,weather,weather_label
0,2012-01-01,0.0,12.8,5.0,4.7,drizzle,0
1,2012-01-02,10.9,10.6,2.8,4.5,rain,2
2,2012-01-03,0.8,11.7,7.2,2.3,rain,2
3,2012-01-04,20.3,12.2,5.6,4.7,rain,2
4,2012-01-05,1.3,8.9,2.8,6.1,rain,2
5,2012-01-06,2.5,4.4,2.2,2.2,rain,2
6,2012-01-07,0.0,7.2,2.8,2.3,rain,2
7,2012-01-08,0.0,10.0,2.8,2.0,sun,4
8,2012-01-09,4.3,9.4,5.0,3.4,rain,2
9,2012-01-10,1.0,6.1,0.6,3.4,rain,2


In [None]:
weather_dictionary = dict(zip(dataset['weather_label'], dataset['weather']))
weather_dictionary

{0: 'drizzle', 2: 'rain', 4: 'sun', 3: 'snow', 1: 'fog'}

In [None]:
def date_time(dataset) :

    dataset['date'] = pd.to_datetime(dataset['date'])
    dataset['year'] = dataset['date'].dt.year
    dataset['month'] = dataset['date'].dt.month
    dataset['day'] = dataset['date'].dt.day

    return dataset

dataset_final = date_time(dataset)
dataset_final

Unnamed: 0,date,precipitation,temp_max,temp_min,wind,weather,weather_label,year,month,day
0,2012-01-01,0.0,12.8,5.0,4.7,drizzle,0,2012,1,1
1,2012-01-02,10.9,10.6,2.8,4.5,rain,2,2012,1,2
2,2012-01-03,0.8,11.7,7.2,2.3,rain,2,2012,1,3
3,2012-01-04,20.3,12.2,5.6,4.7,rain,2,2012,1,4
4,2012-01-05,1.3,8.9,2.8,6.1,rain,2,2012,1,5
...,...,...,...,...,...,...,...,...,...,...
1456,2015-12-27,8.6,4.4,1.7,2.9,rain,2,2015,12,27
1457,2015-12-28,1.5,5.0,1.7,1.3,rain,2,2015,12,28
1458,2015-12-29,0.0,7.2,0.6,2.6,fog,1,2015,12,29
1459,2015-12-30,0.0,5.6,-1.0,3.4,sun,4,2015,12,30


In [None]:
dataset_final = dataset_final.drop(['weather'],axis=1).set_index('date')
dataset_final

Unnamed: 0_level_0,precipitation,temp_max,temp_min,wind,weather_label,year,month,day
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2012-01-01,0.0,12.8,5.0,4.7,0,2012,1,1
2012-01-02,10.9,10.6,2.8,4.5,2,2012,1,2
2012-01-03,0.8,11.7,7.2,2.3,2,2012,1,3
2012-01-04,20.3,12.2,5.6,4.7,2,2012,1,4
2012-01-05,1.3,8.9,2.8,6.1,2,2012,1,5
...,...,...,...,...,...,...,...,...
2015-12-27,8.6,4.4,1.7,2.9,2,2015,12,27
2015-12-28,1.5,5.0,1.7,1.3,2,2015,12,28
2015-12-29,0.0,7.2,0.6,2.6,1,2015,12,29
2015-12-30,0.0,5.6,-1.0,3.4,4,2015,12,30


In [None]:
X = dataset_final.drop(['weather_label'], axis = 1)
y = dataset_final['weather_label']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

k = 20
kfold = KFold(n_splits=k, shuffle=True, random_state=42)

scaler = StandardScaler()
accuracy_scores = []

for train_index, test_index in kfold.split(X):
  X_train, X_test = X.iloc[train_index], X.iloc[test_index]
  y_train, y_test = y.iloc[train_index], y.iloc[test_index]

X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

#Model

In [None]:
nb = GaussianNB(var_smoothing = 1e-09)
svm = SVC(C=100, kernel='linear', gamma='scale')

NBModel = BaggingClassifier(estimator=nb, n_estimators=10, random_state=42)
SVMModel = BaggingClassifier(estimator=svm, n_estimators=1, random_state=42)
HardVoting = VotingClassifier(estimators=[('NB Model', NBModel),('SVM Model', SVMModel)], voting='hard')
HardVoting.fit(X_train, y_train)

In [None]:
NBModel.fit(X_train, y_train)
y_pred_NB = NBModel.predict(X_test)
print("NB Model:", accuracy_score(y_test, y_pred_NB))

SVMModel.fit(X_train, y_train)
y_pred_SVM =SVMModel.predict(X_test)
print("SVM Model:", accuracy_score(y_test, y_pred_SVM))

y_pred_HV = HardVoting.predict(X_test)
print('Hard Voting Score:',accuracy_score(y_test, y_pred_HV))

NB Model: 0.9178082191780822
SVM Model: 0.9041095890410958
Hard Voting Score: 0.9315068493150684


#Evaluation(???)

In [None]:
print("Naive Bayes Model")
print("Accuracy:", accuracy_score(y_test, y_pred_NB))
print("Precision:", precision_score(y_test, y_pred_NB, average= 'weighted'))
print("Recall:", recall_score(y_test, y_pred_NB, average= 'weighted'))
print("F1 Score:", f1_score(y_test, y_pred_NB, average= 'weighted'))

Naive Bayes Model
Accuracy: 0.9178082191780822
Precision: 0.9244979623901408
Recall: 0.9178082191780822
F1 Score: 0.9106099444715088


In [None]:
print("Support Vector Machine")
print("Accuracy:", precision_score(y_test, y_pred_SVM, average= 'weighted', labels=np.unique(y_pred_SVM)))
print("Precision:", recall_score(y_test, y_pred_SVM, average= 'weighted'))
print("Recall:", recall_score(y_test, y_pred_SVM, average= 'weighted'))
print("F1 Score:", f1_score(y_test, y_pred_SVM, average= 'weighted'))

Support Vector Machine
Accuracy: 0.9131016042780749
Precision: 0.9041095890410958
Recall: 0.9041095890410958
F1 Score: 0.8716894977168951


In [None]:
print("Hard Voting Ensemble Model")
print("Accuracy:", accuracy_score(y_test, y_pred_HV))
print("Precision:", precision_score(y_test, y_pred_HV, average= 'weighted', labels=np.unique(y_pred_HV)))
print("Recall:", recall_score(y_test, y_pred_HV, average= 'weighted'))
print("F1 Score:", f1_score(y_test, y_pred_HV, average= 'weighted'))

Hard Voting Ensemble Model
Accuracy: 0.9315068493150684
Precision: 0.9385771100309324
Recall: 0.9315068493150684
F1 Score: 0.9223069482871089
