## Goals
- Combine Preprocessor Pipeline with a model
- Select a model

## Imports

In [1]:
# Built-in
import os
import pickle

In [14]:
# Exploratory Data Analysis (Visualizing the data)
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tensorflow.data import Dataset

# Pipelines (Building the model pipelines)
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from scikeras.wrappers import KerasClassifier

# Transformers (Transforming the data)
from sklearn.preprocessing import FunctionTransformer
from tensorflow.keras.preprocessing.image import ImageDataGenerator
import cv2

# Scalers (Scaling the data)

# Selection (Selecting the data/feature)
from sklearn.model_selection import train_test_split

# Models (Building the models)
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import MaxPooling2D, ReLU, Conv2D, Dense, Dropout, Flatten
from tensorflow.keras.applications import VGG16, ResNet50

# Parameter Tuning (Testing various parameters)
from sklearn.model_selection import GridSearchCV

# Metrics (Evaluating the data)
from sklearn.metrics import f1_score, accuracy_score, confusion_matrix, classification_report

In [6]:
# Local
from packages.preprocessing import Preprocessing

# Notebook Utilities
%matplotlib inline

## Functions

## Reading Data

In [7]:
# Read the training and validation CSV tables (one row per image) in a
# single unpacking assignment; the paths are relative to the notebook.
data, v_data = map(
    pd.read_csv,
    ('../data/processed/train.csv', '../data/processed/validation.csv'),
)

In [8]:
# Preview the first rows of the training table read from train.csv.
data.head()

Unnamed: 0,Image,Width,Height,Label
0,../data/processed/train/Apple Red 1/0_100.jpg,100,100,Apple Red 1
1,../data/processed/train/Apple Red 1/100_100.jpg,100,100,Apple Red 1
2,../data/processed/train/Apple Red 1/101_100.jpg,100,100,Apple Red 1
3,../data/processed/train/Apple Red 1/102_100.jpg,100,100,Apple Red 1
4,../data/processed/train/Apple Red 1/103_100.jpg,100,100,Apple Red 1


In [9]:
# Preview the first rows of the validation table read from validation.csv.
v_data.head()

Unnamed: 0,Image,Width,Height,Label
0,../data/processed/validation/Apple Red 1/321_1...,100,100,Apple Red 1
1,../data/processed/validation/Apple Red 1/322_1...,100,100,Apple Red 1
2,../data/processed/validation/Apple Red 1/323_1...,100,100,Apple Red 1
3,../data/processed/validation/Apple Red 1/324_1...,100,100,Apple Red 1
4,../data/processed/validation/Apple Red 1/325_1...,100,100,Apple Red 1


## Preprocessing Pipelines

In [10]:
# Disabled alternative: restore a previously pickled preprocessor from disk.
# (Only unpickle files you trust: pickle.load can execute arbitrary code.)
# with open('../models/preprocessor.pkl', 'rb') as f:
#     preprocessor = pickle.load(f)
# preprocessor
# Active path: build a fresh instance of the project-local Preprocessing class.
# NOTE(review): `preprocessor` is not referenced by any later cell visible
# here; confirm whether it is still needed.
preprocessor = Preprocessing()

## Model Pipeline

In [25]:
# Baseline CNN for 100x100 RGB images: four Conv2D(5x5) + MaxPooling2D(2x2)
# stages with 4, 16, 32 and 64 filters, followed by a dense classifier head.
model = Sequential()

model.add(Conv2D(4, (5, 5), activation='relu', input_shape=(100, 100, 3)))
model.add(MaxPooling2D(2, 2))
model.add(Conv2D(16, (5, 5), activation='relu'))
model.add(MaxPooling2D(2, 2))
model.add(Conv2D(32, (5, 5), activation='relu'))
model.add(MaxPooling2D(2, 2))
model.add(Conv2D(64, (5, 5), activation='relu'))
model.add(MaxPooling2D(2, 2))
model.add(Flatten())
model.add(Dense(64, activation='relu'))
# The output layer must be softmax, not ReLU: 'sparse_categorical_crossentropy'
# (from_logits=False by default) expects one probability per class. ReLU
# outputs are unnormalised and can be all zero, which breaks the loss.
model.add(Dense(3, activation='softmax'))

# Integer class labels -> sparse categorical cross-entropy; report accuracy.
model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

In [28]:
# Transfer-learning model: headless VGG16 feature extractor plus a 3-way
# softmax classification head.
# - The original call was a syntax error (argument truncated to `input_`).
#   NOTE(review): input_shape=(100, 100, 3) is reconstructed from the CNN
#   cell's input_shape; confirm it matches the saved datasets.
# - With include_top=False and pooling='max', VGG16 returns a pooled feature
#   vector rather than class scores, and its `classes` argument only applies
#   when the top is included. The Dense layer supplies the 3 class outputs.
vgg_model = Sequential([
    VGG16(include_top=False, input_shape=(100, 100, 3), pooling='max'),
    Dense(3, activation='softmax'),
])

In [29]:
# Configure vgg_model for training: Adam optimizer, sparse categorical
# cross-entropy loss (integer class labels), accuracy as the reported metric.
# NOTE(review): with its defaults this loss expects per-class probabilities;
# confirm that vgg_model's final layer produces them.
vgg_model.compile(optimizer='adam', 
              loss='sparse_categorical_crossentropy', 
              metrics=['accuracy'])

## Train-Test Split

In [26]:
# Restore the previously saved tf.data datasets for training and validation
# from their directories, using one unpacking assignment.
train, validation = (
    Dataset.load(saved_dir)
    for saved_dir in (
        '../data/processed/train_data/',
        '../data/processed/validation_data/',
    )
)

## Fitting the training values

In [27]:
# Train the baseline CNN for 3 epochs on `train`, passing `validation` as the
# validation data.
model.fit(train, validation_data=validation, epochs=3)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x220072d7190>

In [None]:
# Train the VGG16-based model with the same datasets and epoch count as the
# baseline CNN.
vgg_model.fit(train, validation_data=validation, epochs=3)

Epoch 1/3

## Evaluating the Model

In [None]:
# Build the true labels and predicted class ids for evaluation.
# The original `pipe.predict(x_test)` used names that are never assigned in
# this notebook; the trained `model` and the `validation` dataset are used
# instead.
# Both arrays are collected in ONE pass so labels and predictions stay aligned
# even if the dataset reshuffles between iterations.
# Assumes `validation` yields batched (images, integer_labels) pairs, as
# model.fit with the sparse categorical loss requires. TODO confirm.
true_batches, pred_batches = [], []
for images, labels in validation:
    true_batches.append(np.asarray(labels).ravel())
    # Predicted class = index of the largest of the model's 3 outputs.
    pred_batches.append(np.argmax(model.predict_on_batch(images), axis=1))
# y_test is a Series so the later `.to_csv` call on it keeps working.
y_test = pd.Series(np.concatenate(true_batches), name='Label')
y_pred = np.concatenate(pred_batches)

In [None]:
# Fraction of predictions in y_pred that equal the true label in y_test.
accuracy_score(y_test, y_pred)

In [None]:
# F1 for a multiclass target (the models emit 3 classes). The default
# average='binary' raises a ValueError on multiclass labels, so use the
# unweighted mean of the per-class F1 scores.
f1_score(y_test, y_pred, average='macro')

In [None]:
# Confusion matrix of true vs. predicted labels, drawn as an annotated heatmap
# through the explicit figure/axes interface.
cm = confusion_matrix(y_test, y_pred)
fig, ax = plt.subplots(figsize=(4, 2))
ax.set_title("Confusion Matrix")
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax)
ax.set_ylabel("Actual Values")
ax.set_xlabel("Predicted Values")

In [None]:
# Per-class precision, recall, F1 and support. The report is a pre-formatted
# string, so it is printed rather than displayed.
print(classification_report(y_test, y_pred))

## Model and Parameters Selection

In [None]:
# List the tunable parameter names of the pipeline, for use as grid keys.
# NOTE(review): `pipe` is not assigned in any cell visible in this notebook;
# confirm where it is meant to be built before running this.
pipe.get_params()

In [None]:
# Pipeline Parameters
parameters = [
    {
        'classifier': [model],
        
        'preprocessor__Continuous__Scaler': [MinMaxScaler(), StandardScaler()],
        
        'preprocessor__Categorical__Encoder': [OneHotEncoder(drop='first', handle_unknown='ignore'),
                                               OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=10)],
    }, 
]

In [None]:
# Exhaustive search over `parameters` with 5-fold cross-validation, scored by
# accuracy; verbose=2 prints progress for each fit.
# NOTE(review): `pipe` is not assigned in any cell visible in this notebook.
grid_search = GridSearchCV(pipe, parameters, cv=5, scoring='accuracy', verbose=2)

In [None]:
# Run the grid search on the training data.
# NOTE(review): x_train / y_train are not assigned in any cell visible here.
grid_search.fit(x_train, y_train)

In [None]:
# Parameter combination with the best mean cross-validated score.
grid_search.best_params_

In [None]:
# Estimator refit with the best parameters (available because GridSearchCV's
# refit option defaults to True).
grid_search.best_estimator_

In [None]:
# Predict with the best estimator found by the search; overwrites the earlier
# y_pred.
# NOTE(review): x_test is not assigned in any cell visible in this notebook.
y_pred = grid_search.predict(x_test)

## Evaluating New Model

In [None]:
# Score of the tuned model on the training data, using the search's scorer
# ('accuracy'); compare with the test accuracy to spot overfitting.
# NOTE(review): x_train / y_train are not assigned in any cell visible here.
grid_search.score(x_train, y_train)

In [None]:
# Accuracy of the tuned model's predictions against the true labels.
accuracy_score(y_test, y_pred)

In [None]:
# F1 of the tuned model for a multiclass target (the models emit 3 classes).
# The default average='binary' raises a ValueError on multiclass labels, so
# use the unweighted mean of the per-class F1 scores.
f1_score(y_test, y_pred, average='macro')

In [None]:
# Confusion matrix for the tuned model: draw the heatmap on a 4x2-inch figure,
# then label the axes it returns.
cm = confusion_matrix(y_test, y_pred)
plt.figure(figsize=(4, 2))
heatmap_ax = sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
heatmap_ax.set_title("Confusion Matrix")
heatmap_ax.set_ylabel("Actual Values")
heatmap_ax.set_xlabel("Predicted Values")

In [None]:
# Per-class precision, recall, F1 and support for the tuned model. The report
# is a pre-formatted string, so it is printed rather than displayed.
print(classification_report(y_test, y_pred))

## Saving the Model

In [None]:
# Persist the best estimator found by the grid search.
# The path is written relative to the notebook ('../'), like every other path
# in this notebook; the original 'models/model.pkl' pointed inside the
# notebook's own directory. The target directory is created if missing.
os.makedirs('../models', exist_ok=True)
with open('../models/model.pkl', 'wb') as f:
    pickle.dump(grid_search.best_estimator_, f)

## Saving the Data

In [None]:
# Save the feature/label splits next to the other processed data.
# Paths use the same '../data/processed/' prefix that the inputs were read
# from; the original 'data/processed/...' pointed inside the notebook's own
# directory.
# NOTE(review): x_train, x_test and y_train are not assigned in any cell
# visible in this notebook; confirm where the splits come from.
x_train.to_csv('../data/processed/x_train.csv', index=False)
x_test.to_csv('../data/processed/x_test.csv', index=False)
y_train.to_csv('../data/processed/y_train.csv', index=False)
y_test.to_csv('../data/processed/y_test.csv', index=False)

## Saving the Predicted Values

In [None]:
# Save the predicted classes as a one-column table and preview it.
# The column is named 'Label' to match the label column of the train and
# validation tables; the original 'Exited' does not exist in this data.
# The path is relative to the notebook ('../'), like the input paths, and the
# output directory is created if missing.
results = pd.DataFrame(y_pred, columns=['Label'])
os.makedirs('../data/predicted', exist_ok=True)
results.to_csv('../data/predicted/predicted.csv', index=False)
results.head()

## Conclusions
- Added Model to the pipeline
- Tried various parameters using GridSearchCV
- Got the model with best performance using GridSearchCV
- Evaluated the Model's Performance