<a href="https://colab.research.google.com/github/KevinHern/SP1-Educational-Material/blob/main/examples/SP1_Example_ML_Pipeline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Preparations

In [36]:
# ----- Libraries ----- #

# This is the main Library that allows us to work with Neural Networks
import tensorflow as tf


# For graph plotting
import matplotlib.pyplot as plt
from tensorflow.math import confusion_matrix

# For dataset manipulation
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np

# For visualizing more complex graphs
import seaborn as sns

# Miscellaneous Libraries
import os

# Global constant for training acceleration
AUTOTUNE = tf.data.AUTOTUNE

# 1) Dataset Preparations

In [None]:
'''
The dataset you are going to use is the following:
https://archive.ics.uci.edu/ml/datasets/Wine+Quality
'''

# Load the training split. The path is relative, so the CSV has to be present
# in the notebook's working directory.
raw_train_dataset = pd.read_csv("wine_quality_training.csv")
# Bare expression: as the last line of the cell it makes the notebook render the DataFrame
raw_train_dataset

## Glimpse

In [None]:
# Brief Statistical Summary of the dataset
raw_train_dataset.describe()

In [None]:
# Check all columns' datatypes
raw_train_dataset.dtypes

In [None]:
# checking shape
raw_train_dataset.shape

# 2) Pre-processing

## Missing Values

In [41]:
# Check for missing values and drop the rows that contain them
def missing_values_cleanup(dataset):
  """Return a copy of ``dataset`` without the rows that contain missing values.

  The number of missing values per column is printed first, so it is visible
  how much data is affected. The DataFrame passed in is not modified.
  """
  new_dataset = dataset.copy()

  # Let's check for null values
  print(new_dataset.isna().sum())

  # Drop every row with at least one missing value and rebuild the index so
  # the removed rows leave no gaps behind
  new_dataset = new_dataset.dropna().reset_index(drop=True)

  return new_dataset

## Encoding

In [None]:
# Check all possible unique values of the string column in the train dataset
# TODO: Check the values!

In [None]:
# Check for Test too!
# Load the evaluation split. The path is relative, so the CSV has to be
# present in the notebook's working directory.
raw_test_dataset = pd.read_csv("wine_quality_evaluation.csv")
raw_test_dataset

# TODO: Check test dataset values!

In [49]:
# With this information, we can perform the mapping: every spelled-out number
# of the string column is translated into its integer value.
# The spellings are matched exactly as written here (including 'Cero'), so
# they have to agree with the values found in the datasets.
# 'Sixteen' is listed for completeness even if it may not occur in the data.
_FIXED_ACIDITY_VALUES = {
  'Cero': 0,
  'One': 1,
  'Two': 2,
  'Three': 3,
  'Four': 4,
  'Five': 5,
  'Six': 6,
  'Seven': 7,
  'Eight': 8,
  'Nine': 9,
  'Ten': 10,
  'Eleven': 11,
  'Twelve': 12,
  'Thirteen': 13,
  'Fourteen': 14,
  'Fifteen': 15,
  'Sixteen': 16,
  'Seventeen': 17,
}


def map_fixed_acidity(x):
  """Translate a spelled-out number into its integer value.

  Returns ``None`` for any value that is not in the lookup table, so values
  that were missed show up as missing data instead of raising an error.
  """
  return _FIXED_ACIDITY_VALUES.get(x)

In [73]:
def encoding_cleanup(dataset):
  """Return a copy of ``dataset`` for the encoding step.

  NOTE: the mapping is not applied yet, so the copy comes back unchanged.
  The DataFrame passed in is never modified.
  """
  encoded_dataset = dataset.copy()

  # TODO: Execute the mapping here!

  return encoded_dataset

## Normalization

## Normalization Functions

In [51]:
def normalize_min_max(column):
  """Rescale ``column`` linearly so its minimum becomes 0 and its maximum 1.

  A column whose values are all equal has no range to scale by. It is
  returned as all zeros instead of dividing by zero, which would fill the
  column with NaN.
  """
  min_value = np.min(column)
  value_range = np.max(column) - min_value

  # Only a scalar range is guarded; any non-scalar reduction result is passed
  # through to the division untouched.
  if np.ndim(value_range) == 0 and value_range == 0:
    # The shifted values are already all 0, so dividing by 1 keeps them at 0
    value_range = 1

  return (column - min_value) / value_range

def z_normalization(column):
  """Standardize ``column`` by subtracting its mean and dividing by its std.

  Mean and standard deviation are whatever ``column.mean()`` and
  ``column.std()`` return (for a pandas Series that is the sample standard
  deviation). A column with a standard deviation of exactly 0 is returned as
  all zeros instead of dividing by zero.
  """
  centered = column - column.mean()
  std_value = column.std()

  # Only a scalar std is guarded; any non-scalar result is passed through to
  # the division untouched.
  if np.ndim(std_value) == 0 and std_value == 0:
    # The centered values are already all 0, so dividing by 1 keeps them at 0
    std_value = 1

  return centered / std_value

In [77]:
def normalize_columns(dataset, columns, normalizer=None):
  """Return a copy of ``dataset`` in which every listed column is normalized.

  Args:
    dataset: DataFrame to normalize; it is not modified.
    columns: Names of the columns to normalize. All other columns are copied
      over unchanged.
    normalizer: Function that receives one column and returns its normalized
      version. Defaults to ``normalize_min_max``; pass ``z_normalization`` to
      standardize instead.

  Returns:
    The normalized copy of ``dataset``.
  """
  if normalizer is None:
    normalizer = normalize_min_max

  new_dataset = dataset.copy()

  for column in columns:
    new_dataset[column] = normalizer(new_dataset[column])

  return new_dataset

In [None]:
# Let's recall the column names and their datatypes
print(raw_train_dataset.columns)
raw_train_dataset.dtypes

In [None]:
# TODO: Fill the array
columns_to_normalize = []
columns_to_normalize

## Balancing Classes

In [None]:
# Report how many rows belong to each value of the "quality" column
quality_column = raw_train_dataset["quality"]
for quality_value in quality_column.unique():
  row_count = len(raw_train_dataset[quality_column == quality_value])
  print("Class", quality_value, ": ", row_count)

# Show the same distribution as a histogram drawn on a dark grid
sns.set(style="darkgrid")
sns.histplot(data=raw_train_dataset, x="quality")
plt.show()

In [57]:
# Mapping quality values
# TODO: Complete the function!
def map_quality(x):
  """Placeholder for the mapping of a quality value ``x`` to its class.

  No mapping is implemented yet, so every input yields ``None``.
  """
  return None

In [None]:
# Work on a copy so the raw training data keeps its original quality values
dummy_dataset = raw_train_dataset.copy()
dummy_dataset['quality'] = dummy_dataset['quality'].apply(map_quality)

# Report how many rows ended up in each mapped class
mapped_quality = dummy_dataset["quality"]
for mapped_class in mapped_quality.unique():
  class_size = len(dummy_dataset[mapped_quality == mapped_class])
  print("Class", mapped_class, ": ", class_size)

# Show the mapped distribution as a histogram drawn on a dark grid
sns.set(style="darkgrid")
sns.histplot(data=dummy_dataset, x="quality")
plt.show()

In [125]:
def undersample_classes(dataset, target_column, target_class, random_state=None):
  """Balance ``dataset`` by cutting every class down to the size of ``target_class``.

  Args:
    dataset: DataFrame to balance; it is not modified.
    target_column: Name of the column that holds the class of each row.
    target_class: Class whose row count is the reference size. All of its
      rows are kept.
    random_state: Optional seed forwarded to ``DataFrame.sample`` to make the
      sampling and the final shuffle reproducible.

  Returns:
    A shuffled DataFrame with a fresh index. It contains every row of
    ``target_class`` and, for each other class, a random sample of at most
    that many rows.

  Raises:
    KeyError: If ``target_class`` does not occur in ``target_column``.
  """
  new_dataset = dataset.copy()

  # Number of rows of the reference class: the size the other classes are cut down to
  values = new_dataset[target_column].value_counts()
  reference_class_count = int(values[target_class])

  # Start with all the rows of the reference class
  sampled_parts = [new_dataset[new_dataset[target_column] == target_class]]

  classes = list(new_dataset[target_column].unique())
  classes.remove(target_class)

  for data_class in classes:
    class_rows = new_dataset[new_dataset[target_column] == data_class]
    # Sampling is done without replacement, so a class that is already
    # smaller than the reference is kept whole
    sample_size = min(len(class_rows), reference_class_count)
    sampled_parts.append(class_rows.sample(n=sample_size, random_state=random_state))

  undersampled_dataset = pd.concat(sampled_parts)

  # Shuffling, so the rows are no longer grouped by class
  undersampled_dataset = undersampled_dataset.sample(frac=1, random_state=random_state).reset_index(drop=True)

  return undersampled_dataset

## Executing Pre-Processing

In [126]:
def pre_process_dataset(dataset, class_column, perform_undersample=False):
  """Run the pre-processing steps on ``dataset`` and split features from the target.

  The steps run in this order: missing values cleanup, encoding,
  normalization of the columns listed in the global ``columns_to_normalize``,
  mapping of ``class_column`` with ``map_quality`` and, optionally,
  undersampling.

  Args:
    dataset: Raw DataFrame to pre-process.
    class_column: Name of the column that holds the class to predict.
    perform_undersample: When True, every class is cut down to the size of
      the least frequent class.

  Returns:
    A ``(values_set, values_target)`` tuple of DataFrames: all columns except
    ``class_column``, and ``class_column`` alone.
  """
  missingless_dataset = missing_values_cleanup(dataset=dataset)
  encoded_dataset = encoding_cleanup(dataset=missingless_dataset)
  normalized_dataset = normalize_columns(dataset=encoded_dataset, columns=columns_to_normalize)
  normalized_dataset[class_column] = normalized_dataset[class_column].apply(map_quality)

  if perform_undersample:
    # The least frequent class is the reference: it is the only size every
    # class can be cut down to without repeating rows.
    # NOTE: this needs map_quality to return real classes; a column made only
    # of None has no counts to pick the smallest class from.
    smallest_class = normalized_dataset[class_column].value_counts().idxmin()
    pre_processed_dataset = undersample_classes(
      dataset=normalized_dataset,
      target_column=class_column,
      target_class=smallest_class,
    )
  else:
    pre_processed_dataset = normalized_dataset

  # Separating dependent and independent variables
  independent_variables = list(pre_processed_dataset.columns)
  independent_variables.remove(class_column)
  dependent_variables = [class_column]

  values_set = pre_processed_dataset[independent_variables]
  values_target = pre_processed_dataset[dependent_variables]

  return values_set, values_target

# 3) AI Model

## AI Model

In [None]:
len(raw_train_dataset.columns)

In [136]:
# TODO: Build your own model!
# NOTE(review): `(0)` is just the integer 0, not a one-element tuple. The
# input shape presumably has to be `(number_of_feature_columns,)` — confirm
# against the columns of the pre-processed training set.
# NOTE(review): the network currently ends in the 32-unit hidden layer; an
# output layer matching the classes produced by map_quality is still missing.
model = tf.keras.models.Sequential([
  tf.keras.layers.InputLayer(input_shape=(0)),
  tf.keras.layers.Dense(units=32, activation='relu'),
  ])

# TODO: Add the finishing touches!
# NOTE(review): loss=None leaves the model without a loss function; one that
# matches the output layer and the label encoding has to be chosen here.
model.compile(loss=None, optimizer='adam', metrics=['accuracy'])

## Callbacks

In [137]:
# Stop the training once the validation loss has stopped improving.
# - min_delta: smallest decrease of val_loss that still counts as an improvement
# - patience: number of epochs without improvement tolerated before stopping
# - restore_best_weights: go back to the weights of the best epoch when stopping
# NOTE: these are starting values, tune them for your model.
earlystopping_callback = tf.keras.callbacks.EarlyStopping(
  monitor='val_loss',
  min_delta=0.001,
  patience=10,
  restore_best_weights=True,
)

In [None]:
# Setting up Tensorboard
%load_ext tensorboard
%mkdir logs & rm -rf ./logs/

import datetime
log_dir = "logs/fit/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=log_dir, histogram_freq=1)

### Architecture

### Training

In [None]:
# Pre-process the training split: the feature columns go to train_set and the
# "quality" column to train_target.
# perform_undersample is left at its default (False); enable it if the classes
# should be balanced before training.
train_set, train_target = pre_process_dataset(dataset=raw_train_dataset, class_column="quality")
train_set

In [None]:
train_set

In [None]:
# Train on the pre-processed training data. validation_split holds out 20% of
# it for validation, which produces the val_loss monitored by early stopping.
model.fit(train_set,
          train_target,
          epochs=150,
          batch_size=128,
          validation_split=0.2,
          callbacks=[earlystopping_callback, tensorboard_callback]
        )

### Result Visualization

In [None]:
# Let's open up the dashboard and check the training process
%tensorboard --logdir logs/fit

# 4) Evaluation

In [None]:
# Pre-process the evaluation split with the same pipeline as the training
# data. Undersampling stays off so every evaluation row is scored.
test_set, test_target = pre_process_dataset(dataset=raw_test_dataset, class_column="quality")

model.evaluate(test_set, test_target)