## Prepare python environment


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

%matplotlib inline

In [None]:
# Notebook-wide seed: get_mlp passes it to MLPClassifier, and it is meant to be
# reused wherever randomness must be repeatable across runs (e.g., dataset partitioning).
random_state = 5

## Preparing the Credit Card Fraud Detection dataset (2 points)

The dataset contains transactions made by credit cards in September 2013 by European cardholders. This dataset presents transactions that occurred in two days, where we have 492 frauds out of 284,807 transactions. The dataset is highly unbalanced: the positive class (frauds) accounts for 0.172% of all transactions.
See [here](https://www.kaggle.com/mlg-ulb/creditcardfraud) for details of the dataset. We will post-process the data to balance both classes.

### Loading the dataset

In [None]:
# Download and load the dataset
import os
# Fetch and unpack the archive only when the CSV is not already in the working directory.
# The lines starting with `!` are IPython shell escapes, so this cell must run inside
# a notebook/IPython session rather than as a plain Python script.
if not os.path.exists('creditcard.csv'): 
    !wget https://raw.githubusercontent.com/JHA-Lab/ece364_2022/master/dataset/creditcard.zip
    !unzip creditcard.zip

# Read the CSV into the DataFrame used by the rest of the notebook and preview the first rows
df = pd.read_csv("creditcard.csv")
print(df.head())

In [None]:
# Check the datatype and the non-null count of each column
df.info()

#### There are a total of 284,807 entries in this dataset with no missing values. The first 30 columns are features and the last column indicates whether the transaction is fraud or not.

#### Use the `describe` function to display some statistics of the data. See [here](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.describe.html) for details about this function.

In [None]:
# Look at summary statistics of every column (count, mean, std, min, quartiles, max)
# using the 'describe' function
df.describe()

1. Count tells us the number of non-empty rows in a feature.

2. Mean tells us the mean value of that feature.

3. Std tells us the standard deviation of that feature.

4. Min tells us the minimum value of that feature.

5. 25%, 50%, and 75% are the percentiles/quartiles of each feature.

6. Max tells us the maximum value of that feature.

#### Visualize the distribution of fraudulent vs genuine transactions

In [None]:
# Pie chart of the share of each transaction type in the full dataset
fig, ax = plt.subplots(nrows=1, ncols=1)
ax.pie(
    df["Class"].value_counts(),
    labels=['Genuine', 'Fraud'],
    colors=['green', 'red'],
    autopct='%1.1f%%',
)
ax.axis('equal')    # equal axis scaling so the pie is drawn as a circle
ax.set_ylabel('');  # trailing ';' keeps the notebook from echoing the return value

In [None]:
# Compare fraudulent and genuine activity over time (note: total time is 48 hours)
df["Time_Hr"] = df["Time"].div(3600)  # convert to hours
fig, (ax1, ax2) = plt.subplots(nrows=2, ncols=1, sharex=True, figsize=(6, 3))

# Top panel: genuine transactions (Class == 0)
ax1.hist(df.loc[df.Class == 0, "Time_Hr"], bins=48, color='g', alpha=0.5)
ax1.set_title('Genuine')

# Bottom panel: fraudulent transactions (Class == 1)
ax2.hist(df.loc[df.Class == 1, "Time_Hr"], bins=48, color='r', alpha=0.5)
ax2.set_title('Fraud')

# The bottom panel is the current axes after plt.subplots, so the axis labels go there
ax2.set_xlabel('Time (hrs)')
ax2.set_ylabel('# transactions');

In [None]:
# 'Time_Hr' already carries the timing information, so drop the raw 'Time' column
df = df.drop(columns=['Time'])

#### Create a balanced dataset with 50% from each class

In [None]:
# Build a balanced subset: keep every fraud row and randomly undersample the same
# number of genuine rows, so that each class makes up 50% of `balanced_data`.
fraud_indices = np.array(df[df.Class == 1].index)  # index labels of the fraud transactions
genuine_ind = df[df.Class == 0].index  # index labels of the genuine transactions
total_fraud_transactions = len(fraud_indices)  # total transactions that were fraud
np.random.seed(0)  # fix the random seed generator for consistent results
# Sample without replacement so that no genuine transaction is selected twice
indices_genuine_transaction = np.random.choice(genuine_ind, total_fraud_transactions, replace=False)
selected_balanced_indices = np.concatenate([fraud_indices, indices_genuine_transaction])  # labels for balanced data
# The collected values are index *labels*, so select with the label-based .loc.
# The positional .iloc only returns the same rows while df keeps its default 0..n-1 index.
balanced_data = df.loc[selected_balanced_indices, :]

In [None]:
# Count the rows of each class in a fixed order (0 = genuine, 1 = fraud).
# value_counts() orders its result by frequency, so with equal counts the first
# entry is not guaranteed to be class 0; reindexing pins every count to the
# label and colour it is plotted with below.
class_counts = balanced_data.Class.value_counts().reindex([0, 1], fill_value=0)
# The printed labels say "%", so report percentages rather than fractions
class_pct = 100 * class_counts / len(balanced_data)
print("% genuine transactions: ", class_pct.loc[0])
print("% fraud transactions: ", class_pct.loc[1])

# Make a pie chart showing transaction type
fig, ax = plt.subplots(1, 1)
ax.pie(class_counts, autopct='%1.1f%%', labels=['Genuine', 'Fraud'], colors=['green', 'red'])
plt.axis('equal')
plt.ylabel('');

### Extract target and descriptive features (0.5 points)

In [None]:
# Store all the features from the data in X
# (every column except the 'Class' label; presumably taken from the balanced subset)
X= # insert your code here
# Store all the labels in y (the 'Class' column: 1 = fraud, 0 = genuine)
y= # insert your code here

In [None]:
# Convert data to numpy array (X and y are pandas objects at this point)
X = # insert your code here
y = # insert your code here

### Create training and validation datasets (0.5 points)

Split the data into training and validation sets using `train_test_split`.  See [here](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html) for details. To get consistent results while splitting, set `random_state` to the value defined earlier. We use 80% of the data for training and 20% of the data for validation. Complete the call in the cell below.

In [None]:
# Split X and y with train_test_split: hold out 20% of the rows for validation and
# pass the notebook-wide `random_state` so the partition is the same on every run.
X_train, X_val, y_train, y_val = # insert your code here # 80% training and 20% validation

### Preprocess the dataset (1 point)

#### Preprocess the data by normalizing each feature to have zero mean and unit standard deviation. This can be done using the `StandardScaler()` function. See [here](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html) for more details.


In [None]:
# Define the scaler for scaling the data (zero mean, unit standard deviation per feature)
scaler = # insert your code here

# Normalize the training data (the scaler's statistics are estimated from this split)
X_train = # insert your code here

# Use the scaler defined above to standardize the validation data by applying the same
# transformation to the validation data (reuse the fitted scaler; do not fit it again).
X_val = # insert your code here


## Training a Multi-Layer Perceptron (18 points)


#### We will use `sklearn's` neural network library to train a multi-layer perceptron for classification. The model is trained to optimize the cross-entropy loss using stochastic gradient descent. Review ch.8 and see [here](https://scikit-learn.org/stable/modules/neural_networks_supervised.html) for more details. 


#### NOTE: Training each network takes several seconds to minutes.

In [None]:
from sklearn.neural_network import MLPClassifier 
from sklearn.metrics import accuracy_score 
import matplotlib.pyplot as plt

In [None]:
"""
For info on the arguments and attributes, see here: 
(https://scikit-learn.org/stable/modules/generated/sklearn.neural_network.MLPClassifier.html#sklearn.neural_network.MLPClassifier)
"""

def get_mlp(hidden_layer_sizes=(100,),
            activation='relu',
            learning_rate_init=0.1,
            early_stopping=False,
            validation_fraction=0.15):
    """Create an unfitted ``MLPClassifier`` that is trained with stochastic gradient descent.

    The caller chooses the architecture and the early-stopping behaviour; the
    remaining optimiser settings are fixed here so that every experiment in the
    notebook is run under the same conditions.

    Args:
        hidden_layer_sizes: Tuple with the number of units in each hidden layer.
        activation: Name of the hidden-layer activation function.
        learning_rate_init: Initial learning rate.
        early_stopping: Forwarded to ``MLPClassifier(early_stopping=...)``.
        validation_fraction: Forwarded to ``MLPClassifier(validation_fraction=...)``.

    Returns:
        A new ``MLPClassifier`` instance; the caller is responsible for fitting it.

    For info on the arguments and attributes, see
    https://scikit-learn.org/stable/modules/generated/sklearn.neural_network.MLPClassifier.html
    """
    return MLPClassifier(
        # settings chosen by the caller
        hidden_layer_sizes=hidden_layer_sizes,
        activation=activation,
        learning_rate_init=learning_rate_init,
        early_stopping=early_stopping,
        validation_fraction=validation_fraction,
        # fixed settings: stochastic gradient descent with alpha and momentum set to 0
        solver='sgd',
        alpha=0,
        momentum=0,
        max_iter=20000,
        n_iter_no_change=100,
        tol=1e-5,
        # notebook-wide seed, read from the module-level `random_state`
        random_state=random_state,
    )

### Exercise 1: Warm up (2 points)

#### Use `get_mlp` defined above to create a multi-layer perceptron with 1 hidden layer consisting of 100 units and train the classifier on the training dataset. Keep all other parameters at their default values.
 

In [None]:
# insert your code here

#### Visualize the evolution of the training loss. Hint: use `loss_curve_` attribute of the classifier.





In [None]:
# insert your code here

#### Report the classifier's accuracies over the training and validation datasets. Hint: use `accuracy_score`

In [None]:
# insert your code here

#### Explain any performance difference observed between the training and validation datasets.

**ANS:**

#### We will next explore several strategies to improve the model's validation performance. 

### Exercise 2: Width vs Depth (12 points)

#### Exercise 2a (4 points)

#### Next, we will experiment with the width of the hidden layer, defined by the number of units in the hidden layer. 

#### Do this by using `get_mlp` to create a multi-layer perceptron with 1 hidden layer. Vary the number of hidden units among 1, 3, 7, 15, 25, by setting `hidden_layer_sizes`. Keep all other parameters at their default values.

#### Fit each classifier on the training dataset and report its training and validation accuracies.
 

  

In [None]:
# insert your code here

#### Provide a possible explanation for any effect observed upon increasing the number of hidden units on classifier performance.

**ANS:**

#### Exercise 2b (4 points)

#### Next, we will experiment with the depth of the MLP, by varying the number of hidden layers. 

#### Do this by using `get_mlp` to create a Multi-layer perceptron with 25 units per hidden layer. Vary the number of hidden layers from 1 through 4, by setting `hidden_layer_sizes`. Keep all other parameters at their default values.

#### Fit each classifier on the training dataset and report its training and validation accuracies.


In [None]:
# insert your code here

#### Provide a possible explanation for any change in performance upon increasing the model depth. 

**ANS:**

#### Exercise 2c (4 points)

#### Next, we'll explore the role of the hidden activation function when training a deeper network.

#### Do this by using `get_mlp` to create a multi-layer perceptron with 5 hidden layers, each with 15 hidden units. Vary the activation functions among identity, logistic, tanh, and relu. Keep all other parameters at their default values.

#### Fit each classifier on the training dataset and report its training accuracy.

#### Also, plot the training loss curves for each classifier on a single plot. 


In [None]:
# insert your code here

#### Explain any effect observed on the training loss trajectories and accuracies when varying the hidden activation function.

**ANS:**

### Exercise 3: Early stopping (4 points)

#### As we've seen from the above exercises, neural networks are prone to overfitting. To mitigate this, we can use a regularization method called early stopping. 

#### In this part, we will compare the performance of a model trained with early stopping against the same model trained without it. For a fair comparison, we use the validation dataset built earlier (20% of the data) as the test dataset, and we keep it unavailable to both models until the final evaluation. While training both models, we assume that only the training dataset built earlier (80% of the data) is available.

#### In early stopping, one monitors the performance of the model on a validation dataset (which is separated from the training dataset) throughout training. Then, the model with the lowest loss on the validation dataset, typically found in the earlier iterations of training, is selected, rather than the model with the lowest training loss. 




#### Do this by calling `get_mlp` and setting `early_stopping=True`, `validation_fraction=0.3`. Keep all other parameters at their default values. This will create a classifier that automatically splits the original training set into nonoverlapping training and validation splits, where the validation split is 30% of the original training set.    

#### Compare this classifier against the same model trained without early stopping.

#### Fit each classifier on the training dataset and report its training and test accuracies.

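#### Also, plot the training loss and validation curves for the classifier trained with early stopping. Hint: use `validation_scores_` (analogous to `loss_curve_`) for the validation curve. Note that `validation_scores_` records the validation accuracy at each iteration rather than a loss, so higher is better; plot it on its own axis (or plot `1 - validation_scores_`) when comparing it with the training loss.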

In [None]:
# insert your code here

#### Explain the plot and any change in the train and test performance compared to the classifier trained without early stopping.

**ANS:**