# Simple Neural Network in Python using TensorFlow

<center><span style=
  "
  font-size: 50px; 
  font-weight: bold;
  color: Yellow;
  text-decoration: underline;
  text-decoration-color: White;
  "
>
   Activation Functions <!--  paste your text -->
  </span></center>


---

| Activation Function | Common Use | Advantages | Disadvantages |
|---|---|---|---|
| Sigmoid / Logistic | Predicting the probability as output | Outputs between 0 and 1, useful for binary classification, differentiable and provides smooth gradient | Suffers from vanishing gradient problem, output not symmetric around zero |
| Tanh (Hyperbolic Tangent) | Hidden layers of neural network | Output is zero-centered, helps centering the data | Suffers from vanishing gradient problem, gradient is steeper compared to sigmoid |
| ReLU (Rectified Linear Unit) | Hidden layers of neural network | Computationally efficient, accelerates convergence of gradient descent | Suffers from "dying ReLU" problem, all negative input values become zero |
| Leaky ReLU | To avoid dying ReLU problem | Enables backpropagation for negative input values, avoids dead neurons | Predictions may not be consistent for negative input values, learning of model parameters is time-consuming |
| Parametric ReLU | When Leaky ReLU fails at solving dead neurons problem | Slope of the negative part can be learnt during backpropagation | Performance varies depending on the value of slope parameter 'a' |
| Exponential Linear Units (ELUs) | Modifies slope of the negative part of the function | Smoothly approaches the value of -α for large negative inputs, avoids the dying ReLU problem | Increases computational time, no learning of the α value, suffers from exploding gradient problem |
| Softmax | Output layer of classifier to represent probability distribution | Output is a probability distribution over 'n' classes | Limitations when dealing with non-exclusive classes |



In [None]:
# pip install pandas numpy matplotlib seaborn plotly scikit-learn scipy tensorflow

## Steps before Creating a Neural Network (Preprocess the Data)

In [None]:
# import libraries
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import tensorflow as tf

### Preprocessing

In [None]:
# Silence library warnings so the notebook output stays readable
import warnings
warnings.filterwarnings('ignore')

# Fetch the Titanic dataset through seaborn
titanic = sns.load_dataset('titanic')

# --- Cleaning ---
# Keep only the rows where both 'age' and 'embarked' are present
titanic = titanic.dropna(subset=['age', 'embarked'])

# --- Encoding ---
# Turn the categorical columns into dummy (indicator) columns;
# drop_first=True leaves out the first level of each column
categorical_columns = ['sex', 'embarked', 'class', 'who', 'deck']
titanic = pd.get_dummies(titanic, columns=categorical_columns, drop_first=True)

# --- Features and target ---
# Everything except the target and the excluded columns becomes a feature
excluded_columns = ['survived', 'alive', 'embark_town', 'adult_male', 'alone']
X = titanic.drop(columns=excluded_columns)
y = titanic['survived']

# --- Train / test split: 20% held out, fixed seed for reproducibility ---
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# --- Standardization ---
# The scaler is fitted on the training split only and then reused on the test split
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
# Display the shape of the standardized training features: (number of rows, number of features)
X_train.shape

### Building the model

In [None]:
# First layer of the model: a fully connected (Dense) layer that also declares the input size
input_layer = tf.keras.layers.Dense(
    units=10,                       # number of neurons (units) — this layer outputs a vector of size 10
    activation='relu',              # activation function, introduces non-linearity to the model
    input_shape=X_train.shape[1:],  # shape of one input vector, i.e. (number of features in X_train,)
)
# So this layer accepts an input vector of size 18 (i.e., number of features) and outputs a vector of size 10.

# Rule of thumb used in this notebook: "units" of the first layer is often set to the number of
# features, or to a multiple of it (2x, 3x, ...), depending on the complexity of the dataset.

############ ANOTHER WAY TO DEFINE INPUT LAYER ############
# input_layer = tf.keras.layers.Input(shape=(X_train.shape[1],)) # Input layer
# NOTE(review): Input(...) presumably only declares the input shape, so a Dense(10) layer would
# still have to follow it to keep the same network — confirm before switching.

In [None]:
# hidden_layer = tf.keras.layers.Dense(10, activation='relu') # hidden layer

# number of hidden layers and neurons also depends on number of features and complexity of the dataset.

# Mostly non-linear activation functions are used in hidden layers. The most commonly used is ReLU, followed by sigmoid and tanh.
# Usually, all hidden layers contain same activation function.
# Neural network types MLP and CNN make use of ReLU activation function while Recurrent networks still commonly use Tanh and Sigmoid activation functions

In [None]:
# Last layer of the model: a single sigmoid neuron for binary classification
output_layer = tf.keras.layers.Dense(
    units=1,               # one neuron (unit), because this is binary classification
    activation='sigmoid',  # activation suited to binary classification tasks (like 0 or 1)
)
# This layer takes the 10 values from the previous layer, combines them using weights and a bias,
# applies sigmoid, and gives an output between 0 and 1.

# Choosing "units" and activation for the output layer:
#   - Regression:                  usually 1 unit, linear activation (i.e., no activation function).
#   - Binary classification:       usually 1 unit, sigmoid activation.
#   - Multi-class classification:  one unit per class, softmax activation
#                                  (multi-class means more than 2 classes, like: 0, 1, 2, 3 or dog, cat, bird, etc.)
#   - Multi-label classification:  one unit per label, sigmoid activation.

In [None]:
# softmax is especially used in output layer of multi-class classification problems, where it converts the output into probabilities for each class.
# ReLU is most popularly used for hidden layers, as it helps the model learn complex patterns by introducing non-linearity.

In [None]:
# Stack the layers into a Sequential model, one at a time and in order
model = tf.keras.models.Sequential()
model.add(input_layer)
# model.add(hidden_layer)  # optional hidden layer
model.add(output_layer)

# Configure how the model is trained
model.compile(
    optimizer='adam',            # updates the weights during training; Adam is a common default choice
    loss='binary_crossentropy',  # measures prediction error; suitable for binary classification
    metrics=['accuracy'],        # reports the share of correct predictions
)

### Some other optimizers:
# tf.keras.optimizers.SGD(0.001,0.9) --> SGD (Stochastic Gradient Descent): a basic optimizer that updates
#                                        weights based on the gradient of the loss function. It can be slow to converge.

### Some other loss functions:
# 'mean_squared_error'              --> regression tasks; minimizes the squared difference between predicted and actual values.
# 'categorical_crossentropy'        --> multi-class classification (more than two classes) with one-hot encoded target labels.
# 'sparse_categorical_crossentropy' --> like categorical_crossentropy, but for integer target labels (i.e. 0, 1, 2, 3).

### Some other metrics:
# 'MAE' (Mean Absolute Error)       --> regression tasks; average absolute difference between predicted and actual values.

### Train Model

In [None]:
%%time
# Fit the model on the training split
model.fit(
    x=X_train,
    y=y_train,
    epochs=100,     # number of full passes over the entire training dataset
    batch_size=32,  # samples processed per iteration; smaller batches mean more updates per epoch, but slower training
    verbose=1,      # logging during training: 0 = silent, 1 = progress bar, 2 = one line per epoch
    # validation_data=(X_test, y_test),  # data evaluated after each epoch; helps monitor overfitting
)

# Worked example: with a dataset of 3200 rows,
#   iterations per epoch = 3200 / 32 = 100
#   total iterations     = (no. of epochs) * (iterations per epoch) = 100 * 100 = 10000

| Term           | Meaning                                                         |
| -------------- | --------------------------------------------------------------- |
| **epoch**      | 1 full pass over the entire training dataset                    |
| **batch size** | How many samples the model looks at **before updating weights** |


- Let's say we have 3200 training examples in X_train. With a batch size of 32, one epoch then consists of 3200 / 32 = 100 iterations (weight updates).

<img src="../0_resources/images/epoch_batchsize.png" />

### Evaluating the model

In [None]:
# Compute the loss and the accuracy metric on the held-out test split
loss, accuracy = model.evaluate(X_test, y_test, verbose=1)
y_pred = model.predict(X_test)  # Predictions for every sample in the test set (not only the first 5)

print(f"Test Accuracy: {accuracy}")
print(f"Test Loss: {loss}")

## Let's see all the steps in action within one snippet of code

In [None]:
%%time
# --- Imports ---
import warnings

import pandas as pd
import seaborn as sns
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Silence library warnings
warnings.filterwarnings('ignore')

# --- Data loading ---
titanic = sns.load_dataset('titanic')

# --- Preprocessing ---
# Keep only the rows where both 'age' and 'embarked' are present
titanic = titanic.dropna(subset=['age', 'embarked'])

# Turn the categorical columns into dummy variables (first level of each column left out)
categorical_columns = ['sex', 'embarked', 'class', 'who', 'deck']
titanic = pd.get_dummies(titanic, columns=categorical_columns, drop_first=True)

# Features are everything except the target and the excluded columns
excluded_columns = ['survived', 'alive', 'embark_town', 'adult_male', 'alone']
X = titanic.drop(columns=excluded_columns)
y = titanic['survived']

# Hold out 20% of the rows for testing, with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Standardize: fit the scaler on the training split, apply it to both splits
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# --- Model definition ---
input_layer = tf.keras.layers.Dense(
    units=10, activation='relu', input_shape=X_train.shape[1:]
)  # input layer
# input_layer = tf.keras.layers.Input(shape=(X_train.shape[1],)) # another way to initialize input layer
# hidden_layer = tf.keras.layers.Dense(10, activation='relu') # hidden layer
output_layer = tf.keras.layers.Dense(units=1, activation='sigmoid')  # output layer

model = tf.keras.models.Sequential()
model.add(input_layer)
# model.add(hidden_layer)
model.add(output_layer)

######## Another way to Define the model ########
# model = tf.keras.models.Sequential([input_layer,
#                                     # hidden_layer,
#                                     output_layer])

# --- Compile ---
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

# --- Train (the test split is also passed as validation data for per-epoch monitoring) ---
r = model.fit(
    x=X_train,
    y=y_train,
    epochs=100,
    batch_size=32,
    verbose=1,
    validation_data=(X_test, y_test),
)

# --- Evaluate on the test split ---
loss, accuracy = model.evaluate(x=X_test, y=y_test, verbose=1)
print(f"Test Accuracy: {accuracy}")
print(f"Test Loss: {loss}")

# **Assignment:** Plot the Training and Validation Accuracy and Loss for each epoch

In [None]:
# Plot training and (when available) validation accuracy and loss for each epoch
import matplotlib.pyplot as plt

# Per-epoch metrics recorded by the last model.fit call
history = model.history.history # another way to get history --> r.history

print(history['accuracy']) # training accuracy
print(history['loss']) # training loss

# The validation entries exist only if "validation_data" was provided during model.fit,
# so check for them instead of failing with a KeyError.
has_validation = 'val_accuracy' in history and 'val_loss' in history
if has_validation:
    print(history['val_accuracy']) # validation accuracy
    print(history['val_loss']) # validation loss

# Draw every curve on the same graph (no subplots); validation curves are dashed
plt.figure(figsize=(12, 5))
plt.plot(history['accuracy'], label='Training Accuracy')
plt.plot(history['loss'], label='Training Loss')
if has_validation:
    plt.plot(history['val_accuracy'], linestyle='--', label='Validation Accuracy')
    plt.plot(history['val_loss'], linestyle='--', label='Validation Loss')
plt.title('Model Accuracy and Loss')
plt.xlabel('Epochs')
plt.ylabel('Value')
plt.legend()
plt.show()


# Check if GPU is available

In [None]:
import platform
import sys

import pandas as pd
import scipy as sp
import sklearn as sk
import tensorflow as tf
import tensorflow.keras

# Report the platform and the versions of the main libraries
print(f"Python Platform: {platform.platform()}")
print(f"Tensor Flow Version: {tf.__version__}")
print()
print(f"Python {sys.version}")
print(f"Pandas {pd.__version__}")
print(f"Scikit-Learn {sk.__version__}")
print(f"SciPy {sp.__version__}")

# A non-empty list of physical GPU devices means TensorFlow can see a GPU
gpu = bool(tf.config.list_physical_devices('GPU'))
gpu_status = "available" if gpu else "NOT AVAILABLE"
print("GPU is", gpu_status)

---