# **Interview Prep**
#### The following Python code is practice for my interview with Shipium. I am giving
#### myself 30 minutes to build a basic deep neural network with TensorFlow, with
#### some assistance from the library documentation.

In [None]:
import tensorflow as tf
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler

In [None]:
# Load the raw shipping dataset from the local CSV export and display it
csv_path = r"c:\Users\selle\Downloads\shipping.csv"
df = pd.read_csv(csv_path)
df

In [None]:
# Give the label column a readable name ("Reached.on.Time_Y.N" -> "on_time")
df.rename(columns={"Reached.on.Time_Y.N": "on_time"}, inplace=True)

# Normalise every header to lowercase by mapping str.lower over the names
df = df.rename(columns=str.lower)

In [None]:
# Preview the first five rows to confirm the renamed headers
df.head(n=5)

In [None]:
# Report how many distinct values each column holds, to get a feel for
# which fields are categorical, binary, or continuous
for column_name, distinct_count in df.nunique().items():
    print(f"Unique values in {column_name}: {distinct_count}")

In [None]:
# Count missing values per column; any non-zero count means we need to
# fill, replace, or drop before modelling
df.isna().sum()

In [None]:
# Plot one histogram (category counts) per non-numeric column on a grid.
# The grid is sized from the number of columns so the cell keeps working
# if the dataset gains or loses categorical fields (a fixed 2x2 grid
# raises as soon as there is a fifth column).
categorical_columns = df.select_dtypes(exclude=np.number).columns
n_cols = 2
n_rows = max(1, (len(categorical_columns) + n_cols - 1) // n_cols)  # ceiling division

# 2.5 inches of height per row reproduces the original 10x5 figure for two rows
plt.figure(figsize=(10, 2.5 * n_rows))

for i, column in enumerate(categorical_columns):
    ax = plt.subplot(n_rows, n_cols, i + 1)  # Subplot indices are 1-based
    sns.histplot(df[column], ax=ax)
    ax.set_xlabel(column)  # Set after plotting so the label is not overwritten

plt.tight_layout(pad=3.0)  # Adjust subplot layout to prevent overlap
plt.show()

In [None]:
# Plot one histogram per numeric column on a three-column grid.
# The number of rows is derived from the column count, so the cell does not
# fail when there are more than nine numeric fields (a fixed 3x3 grid would).
numeric_columns = df.select_dtypes(include=np.number).columns
n_cols = 3
n_rows = max(1, (len(numeric_columns) + n_cols - 1) // n_cols)  # ceiling division

# 2 inches of height per row reproduces the original 15x6 figure for three rows
plt.figure(figsize=(15, 2 * n_rows))

for i, column in enumerate(numeric_columns):
    ax = plt.subplot(n_rows, n_cols, i + 1)  # Subplot indices are 1-based
    sns.histplot(df[column], ax=ax)
    ax.set_xlabel(column)  # Set after plotting so the label is not overwritten

plt.tight_layout(pad=3.0)  # Adjust subplot layout to prevent overlap
plt.show()

In [None]:
# Rescale every numeric *feature* into the [0, 1] range.
scaler = MinMaxScaler()

# Pick the numeric columns, but leave the label ("on_time") untouched: the
# target should not pass through the feature scaler, otherwise its encoding
# silently depends on the scaler instead of the raw data.
numerical_features = (
    df.select_dtypes(include=np.number).columns.drop("on_time", errors="ignore")
)

# NOTE(review): the scaler is fitted on the full dataset before the
# train/validation/test split further down, so min/max statistics from the
# held-out rows leak into training. Consider fitting on the training rows only.
df[numerical_features] = scaler.fit_transform(df[numerical_features])

df

In [None]:
# Select non-numerical features (same dtype filter as the plotting cells)
non_numerical_features = df.select_dtypes(exclude=np.number).columns

# Keep one fitted encoder per column. Re-fitting a single shared encoder
# throws away the category -> integer mapping of every column except the
# last, which makes the encoding impossible to inspect or invert later.
label_encoders = {}

# Label encode non-numerical features
for feature in non_numerical_features:
    label_encoder = LabelEncoder()
    df[feature] = label_encoder.fit_transform(df[feature])
    label_encoders[feature] = label_encoder  # e.g. label_encoders[col].classes_

df

In [None]:
# Pearson correlation between every pair of columns (linear relationships)
correlation_matrix = df.corr()

# Boolean mask over the upper triangle: the matrix is symmetric, so the
# upper half only repeats the values shown in the lower half
mask = np.triu(np.ones(correlation_matrix.shape, dtype=bool))

# Draw the annotated heatmap on its own figure
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    correlation_matrix,
    mask=mask,
    annot=True,
    fmt=".2f",
    cmap='coolwarm',
    ax=ax,
)
ax.set_title('Correlation Matrix')
plt.show()

In [None]:
# Spearman (rank) correlation picks up monotonic relationships that are not
# necessarily linear
spearman_corr_matrix = df.corr(method='spearman')

# Same layout as the Pearson heatmap, reusing the upper-triangle `mask`
plt.figure(figsize=(8, 6))
spearman_ax = sns.heatmap(
    spearman_corr_matrix,
    mask=mask,
    annot=True,
    fmt=".2f",
    cmap='coolwarm',
)
spearman_ax.set_title('Spearman Correlation Matrix')
plt.show()

In [None]:
# Pairwise scatter/distribution view of the label against the discount feature
pair_columns = ["on_time", "discount_offered"]
sns.pairplot(df[pair_columns])
plt.show()

In [None]:
# Separate the features (X) from the label (y) so they can be split into
# training, validation, and testing sets
X = df.drop(columns=["on_time"])
y = df["on_time"]

# First split: 70% training, 30% held out ("remaining"), stratified on the
# label so both parts keep the same class balance
X_train, X_rem, y_train, y_rem = train_test_split(
    X, y, train_size=0.7, random_state=0, stratify=y
)

# Second split: divide the held-out 30% evenly into validation and test.
# This must split X_rem/y_rem (not the full X/y); otherwise the validation
# and test sets overlap the training rows and every score is inflated.
X_val, X_test, y_val, y_test = train_test_split(
    X_rem, y_rem, test_size=0.5, random_state=0, stratify=y_rem
)

In [None]:
# Sanity check: the row counts of each split should match the train_size
# and test_size proportions requested above

shape_report = [
    f"{X_train.shape = }",
    f"{y_train.shape = }",
    f"\n{X_val.shape = }",
    f"{y_val.shape = }",
    f"\n{X_test.shape = }",
    f"{y_test.shape = }",
]
print(*shape_report)

In [None]:
# Train a small fully connected network as a binary classifier for `on_time`.

# Architecture: three hidden layers of 10 units followed by ONE output unit.
# The original stack interleaved 1-unit Dense layers between the hidden
# layers, which squeezed everything through a single scalar and threw away
# almost all learned representation.
model1 = tf.keras.Sequential([
    tf.keras.layers.Dense(10, activation="relu"),
    tf.keras.layers.Dense(10, activation="relu"),
    tf.keras.layers.Dense(10, activation="tanh"),
    # Sigmoid keeps the prediction in [0, 1] so it can be read as a probability
    tf.keras.layers.Dense(1, activation="sigmoid"),
])

# Compile: 0.1 is far too aggressive for Adam and tends to diverge; use the
# library default of 1e-3.
learning_rate = 0.001
optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)

model1.compile(
    loss="binary_crossentropy",  # Proper loss for a two-class label
    optimizer=optimizer,
    metrics=["accuracy"],  # So evaluate() really reports accuracy
)

# Stop once the validation loss stops improving and roll back to the best
# weights seen; monitoring the training loss would not detect overfitting.
es = tf.keras.callbacks.EarlyStopping(
    monitor="val_loss", patience=10, restore_best_weights=True
)

# Fit with training and validation data; the callback must be passed
# explicitly, otherwise it is never used.
history = model1.fit(
    X_train,
    y_train,
    epochs=100,
    batch_size=50,
    validation_data=(X_val, y_val),
    callbacks=[es],
    verbose=2,
)

In [None]:
# Evaluate the model on the held-out test data.
# Ask for a dict so each value is reported under its real metric name;
# positional unpacking labelled the second value "accuracy" even when the
# compiled metric was something else (e.g. mean squared error).
evaluation = model1.evaluate(X_test, y_test, return_dict=True)

test_loss = evaluation["loss"]
# None when the model was not compiled with an accuracy metric
test_accuracy = evaluation.get("accuracy")

for metric_name, metric_value in evaluation.items():
    print(f"Test {metric_name}: {metric_value}")