## Prepare python environment


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

%matplotlib inline

In [None]:
random_state = 5  # fixed seed to pass wherever a `random_state` is requested (e.g., dataset partitioning) so runs are reproducible

## Preparing the Credit Card Fraud Detection dataset (2 points)

The dataset contains transactions made by credit cards in September 2013 by European cardholders. This dataset presents transactions that occurred over two days, where we have 492 frauds out of 284,807 transactions. The dataset is highly unbalanced: the positive class (frauds) accounts for 0.172% of all transactions.
See [here](https://www.kaggle.com/mlg-ulb/creditcardfraud) for details of the dataset. We will post process the data to balance both the classes indicating whether the transaction is fraud or not.

### Loading the dataset

In [None]:
# Download and load the dataset.
# The archive is fetched and unpacked only when 'creditcard.csv' is not already
# present in the current working directory, so re-running this cell does not
# download it again.
import os
if not os.path.exists('creditcard.csv'): 
    # Lines starting with `!` are IPython shell escapes, so this cell must run
    # inside a notebook/IPython session with `wget` and `unzip` available.
    # NOTE(review): assumes the archive unpacks 'creditcard.csv' into the working directory - confirm.
    !wget https://raw.githubusercontent.com/JHA-Lab/ece364/main/dataset/creditcard.zip
    !unzip creditcard.zip

# Read the CSV into a DataFrame and print its first rows as a quick sanity check.
df = pd.read_csv("creditcard.csv")
print(df.head())

In [None]:
# Check the datatype of each column.
# `info()` prints a per-column summary (non-null count and dtype), which also
# shows whether any column has missing values.
df.info()

#### There are a total of 284807 entries in this dataset with no missing values. First 30 columns are features and the last column indicates whether the transaction is fraud or not

In [None]:
##### Look at some statistics of the data using the `describe` function
# This is the last expression of the cell, so the returned summary table is
# shown as the cell output.
df.describe()

#### Visualize the distribution of fraudulent vs genuine transactions

In [None]:
# Make a pie chart showing the share of each transaction type.
# Order the counts by class label (0 = genuine, 1 = fraud) so the slices always
# line up with the hard-coded labels and colours below, instead of relying on
# value_counts() happening to return the majority class first.
class_counts = df.Class.value_counts().sort_index()
fig, ax = plt.subplots(1, 1)
ax.pie(class_counts, autopct='%1.1f%%', labels=['Genuine', 'Fraud'], colors=['green', 'red'])
plt.axis('equal')  # equal aspect ratio keeps the pie circular
plt.ylabel('')

In [None]:
## Check fraudulent activity over time (note: total time is 48 hours)
df["Time_Hr"] = df["Time"] / 3600  # convert to hours
fig, (ax1, ax2) = plt.subplots(2, 1, sharex=True, figsize=(6, 3))
# One histogram per class, stacked vertically on a shared x-axis:
# (axes, class label, colour, title)
panels = [(ax1, 0, 'g', 'Genuine'), (ax2, 1, 'r', 'Fraud')]
for axis, label, colour, title in panels:
    hours = df.Time_Hr[df.Class == label]
    axis.hist(hours, bins=48, color=colour, alpha=0.5)
    axis.set_title(title)
# The axis labels are applied to the current axes.
plt.xlabel('Time (hrs)')
plt.ylabel('# transactions')

In [None]:
# The raw 'Time' column is redundant now that the same information is stored
# in hours (see the 'Time_Hr' column), so drop it from the feature set.
df = df.drop(columns=['Time'])

#### Create a balanced dataset with 50% from each class

In [None]:
# Build a balanced subset: keep every fraud row and draw an equally sized
# random sample of genuine rows.
fraud_indices = np.array(df[df.Class == 1].index)  # index labels of the fraud transactions
genuine_ind = df[df.Class == 0].index  # index labels of the genuine transactions
total_fraud_transactions = len(fraud_indices)  # total transactions that were fraud
np.random.seed(0)  # fix the random seed generator for consistent results
# Sample without replacement so that no genuine transaction is picked twice.
indices_genuine_transaction = np.array(
    np.random.choice(genuine_ind, total_fraud_transactions, replace=False)
)
selected_balanced_indices = np.concatenate([fraud_indices, indices_genuine_transaction])  # labels for balanced data
# The values collected above come from `df.index`, i.e. they are index *labels*,
# so the rows must be selected with label-based `.loc`. Positional `.iloc` only
# returns the same rows while the index is the default 0..n-1 range, and would
# pick the wrong rows (or fail) once the frame has been filtered or re-indexed.
balanced_data = df.loc[selected_balanced_indices, :]

In [None]:
# Report the class shares of the balanced data. The messages are labelled "%",
# so the fractions are scaled to percentages before printing.
n_balanced = len(balanced_data)
print("% genuine transactions: ", 100 * len(balanced_data[balanced_data.Class == 0]) / n_balanced)
print("% fraud transactions: ", 100 * len(balanced_data[balanced_data.Class == 1]) / n_balanced)

# Make a pie chart showing transaction type.
# Both classes have the same count here, so the order returned by
# value_counts() is not guaranteed to put class 0 first. Sort by class label
# (0 = genuine, 1 = fraud) so each slice gets the matching label and colour.
class_counts = balanced_data.Class.value_counts().sort_index()
fig, ax = plt.subplots(1, 1)
ax.pie(class_counts, autopct='%1.1f%%', labels=['Genuine', 'Fraud'], colors=['green', 'red'])
plt.axis('equal')  # equal aspect ratio keeps the pie circular
plt.ylabel('')

### Extract target and descriptive features (0.5 points)

In [None]:
# Store all the features from the data in X
X= # TODO 
# Store all the labels in y
y= # TODO

In [None]:
# Convert data to numpy array
X = # TODO 
y = # TODO

### Create training and validation datasets (0.5 points)


We will split the dataset into a training set and a validation set. Generally in machine learning, we split the data into training,
validation and test sets (this will be covered in later chapters). The model with the best performance on the validation set is used to evaluate performance on 
the test set, which is the unseen data. In this assignment, we will be using the `train set` for training and will evaluate the performance on the `test set` (the held-out split, named `X_test`/`y_test` in the code, which plays the role of the validation set here) for various 
model configurations to determine the best hyperparameters (parameter setting yielding the best performance).

Split the data into training and validation sets using `train_test_split`.  See [here](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html) for details. To get consistent results while splitting, set `random_state` to the value defined earlier. We use 80% of the data for training and 20% of the data for validation. Complete the call in the cell below.

In [None]:
X_train,X_test,y_train,y_test = # TODO 

### Preprocess the dataset (1 point)

#### Preprocess by normalizing each feature to have zero mean and unit standard deviation. This can be done using `StandardScaler()` function. See [here](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html) for more details.


In [None]:
# Define the scaler for scaling the data
scaler = # TODO

# Normalize the training data
X_train = # TODO

# Use the scaler defined above to standardize the validation data by applying the same transformation to the validation data.
X_test = # TODO


## Training error-based models (18 points)


#### We will use the `sklearn` library to train a Multinomial Logistic Regression classifier and Support Vector Machines. 


### Exercise 1:  Learning a Multinomial Logistic Regression classifier (4 points)

#### Use `sklearn`'s `SGDClassifier` to train a multinomial logistic regression classifier (i.e., using a one-versus-rest scheme) with Stochastic Gradient Descent. Review ch.7 and see [here](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.SGDClassifier.html#sklearn.linear_model.SGDClassifier) for more details. 

#### Set the `random_state` as defined above,  increase the `n_iter_no_change` to 1000 and `max_iter` to 10000 to facilitate better convergence.  

#### Report the model's accuracy over the training and test sets.
 

In [None]:
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score 

In [None]:
# TODO

#### Explain any performance difference observed between the training and test datasets.

TODO

### Exercise 2: Learning a Support Vector Machine (SVM) (14 points)

#### Use `sklearn`'s `SVC` class to train an SVM (i.e., using a [one-versus-one scheme](https://en.wikipedia.org/wiki/Multiclass_classification#One-vs.-one)). Review ch.7 and see [here](https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html) for more details. 
 

In [None]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

#### Exercise 2a: Warm up (2 points)

#### Train an SVM with a linear kernel. Set the  random_state to the value defined above. Keep all other parameters at their defaults.

#### Report the model's accuracy over the training and test sets.

In [None]:
# TODO 


#### Exercise 2b: Evaluate a polynomial kernel function (4 points)

#### Try fitting an SVM with a polynomial kernel function and vary the degree among {1, 2, 3, 4}. Note that degree=1 yields a linear kernel. 

#### For each fitted classifier, report its accuracy over the training and test sets. 

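#### As before, set the random_state to the value defined above. Set the regularization parameter `C=100` (in `sklearn`, the strength of the regularization is inversely proportional to `C`, so a large `C` means weak regularization).  When the data is not linearly separable, this encourages the model to fit the training data. Keep all other parameters at their default values.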

In [None]:
# TODO 

#### Explain the effect of increasing the degree of the polynomial.

TO DO

#### Exercise 2c: Evaluate the radial basis kernel function (6 points)

#### Try fitting an SVM with a radial basis kernel function and vary the length-scale parameter given by $\gamma$ among {0.01, 0.1, 1, 10, 100}. 

#### For each fitted classifier, report its accuracy over the training and test sets. 

#### As before, set the random_state to the value defined above. Set the regularization parameter `C=100` (in `sklearn`, the strength of the regularization is inversely proportional to `C`, so a large `C` means weak regularization).  When the data is not linearly separable, this encourages the model to fit the training data (read more [here](https://scikit-learn.org/stable/auto_examples/svm/plot_rbf_parameters.html)). Keep all other parameters at their default values.

In [None]:
# TODO

#### Comment on the effect of increasing/reducing the length-scale parameter $\gamma$. Also, compare the performance of the classifiers trained with RBF kernel function against those trained with the polynomial and linear kernel functions (i.e., Ex. 2b). 

TO DO

#### Exercise 2d: Briefly state the main difference between the logistic regression classifier and the SVM. (2 points)

TO DO