# Anomaly Detection in R

# Mary Donovan Martello

## The goal of this project was to use R to design unsupervised binary classification models that predict whether credit card transactions are fraudulent. This file contains the exploratory data analysis.

## Part 1:  Exploratory Data Analysis

# NOTE:  FILE WAS TOO BIG TO UPLOAD TO GITHUB WITH OUTPUTS INCLUDED.  THEREFORE OUTPUTS WERE CLEARED.

In [None]:
# Importing required libraries
library(dplyr)
library(caret)
library(ggplot2)
library(caTools)
library(ROSE)
library(smotefamily)
library(rpart)
library(rpart.plot)
library(psych)
library(ltm)
library(corrplot)
library(e1071)

suppressMessages(library(dplyr))
suppressMessages(library(caTools))
suppressMessages(library(ROSE))
suppressMessages(library(smotefamily))
suppressMessages(library(rpart.plot))
suppressMessages(library(psych))
suppressMessages(library(ltm))
suppressMessages(library(corrplot))
suppressMessages(library(e1071))

In [None]:
# Turns off scientific notation: a large scipen penalty makes R prefer
# fixed notation when printing numbers
options(scipen = 999)

### The dataset contains transactions made by credit cards in September 2013 by European cardholders. For privacy purposes, 28 of the 31 features are the result of a PCA transformation. The three non-PCA features are 'Time', 'Amount', and the target feature 'Class' (1 in case of fraud and 0 for non-fraud). 'Time' contains the seconds elapsed between each transaction and the first transaction in the dataset, and 'Amount' is the transaction amount. As a result of the PCA transformation of 28 features, the actual nature of these features cannot be understood, but these features are linearly uncorrelated with each other over the full dataset due to the PCA transformation (uncorrelated, not necessarily independent).

In [None]:
# Load the dataset from the CSV file in the working directory
fulldf <- read.csv(file = "creditcard.csv")

In [None]:
# Viewing dataset: compact structure listing (columns, types, first values)
str(fulldf)

In [None]:
# Recode the Class column as a factor with levels 0 and 1 (in that order)
fulldf[["Class"]] <- factor(fulldf[["Class"]], levels = c(0, 1))

In [None]:
# Show the first rows of the dataset
head(fulldf)

In [None]:
# Row count, then column count, of the full dataset
dim(fulldf)[1L]
dim(fulldf)[2L]

In [None]:
# Get the summary of the data: per-column summary statistics
# (level counts for the factor column Class)
summary(fulldf)

In [None]:
# Descriptive statistics per column. psych::describe() keeps the variable
# names as row names, which as_tibble() drops by default, so they are
# preserved in an explicit "feature" column.
psych::describe(fulldf) %>%
  as_tibble(rownames = "feature")

In [None]:
# Count the missing values across every cell of the data frame.
# is.na() tests each cell; is.null() would test the data frame object
# itself and always yield a single FALSE (sum 0).
sum(is.na(fulldf))

In [None]:
# Frequency of each Class level (fraud vs. non-fraud) in the dataset
table(fulldf[["Class"]])

In [None]:
# Share of each Class level, as a proportion of all rows
prop.table(table(fulldf[["Class"]]))

### Each record in the dataset includes a label of whether the transaction was classified as fraudulent (Class = 1) or as non-fraudulent (Class = 0). Create separate subsets of the dataset so that one subset only has records without fraud and another subset only has records with fraud. Use these separate subsets for EDA.

In [None]:
# Split the data by label: rows with Class == 0 and rows with Class == 1.
# which() drops rows where the comparison is NA, as subset() does.
noFraud <- fulldf[which(fulldf$Class == 0), ]
yesFraud <- fulldf[which(fulldf$Class == 1), ]



In [None]:
dim(noFraud) # rows x columns of the non-fraud subset. NOTE(review): original note read "284807 .99827" - presumably the total row count and the non-fraud share; confirm

In [None]:
dim(yesFraud) # rows x columns of the fraud subset. NOTE(review): original note read ".00172749" - presumably the fraud share of all rows; confirm

### Examine Feature Counts

In [None]:
# Histogram of Time (25 bins) with one free-scaled panel per Class level.
# The title names both classes because the full dataset is plotted.
ggplot(fulldf, aes(x = Time, fill = Class)) +
  geom_histogram(bins = 25) +
  facet_wrap(~Class, scales = "free", ncol = 2) +
  labs(title = "Time Counts by Class")

In [None]:
# Histogram of Time (100 bins) with one free-scaled panel per Class level.
# The title names both classes because the full dataset is plotted.
ggplot(fulldf, aes(x = Time, fill = Class)) +
  geom_histogram(bins = 100) +
  facet_wrap(~Class, scales = "free", ncol = 2) +
  labs(title = "Time Counts by Class")

In [None]:
# Number of fraud-subset rows for each distinct Time value
count(yesFraud, Time)

In [None]:
# Number of non-fraud-subset rows for each distinct Time value
count(noFraud, Time)

In [None]:
# Histogram of Amount (25 bins) with one free-scaled panel per Class level.
# The title names both classes because the full dataset is plotted.
ggplot(fulldf, aes(x = Amount, fill = Class)) +
  geom_histogram(bins = 25) +
  facet_wrap(~Class, scales = "free", ncol = 2) +
  labs(title = "Amount Counts by Class")

In [None]:
# Histogram of Amount (100 bins) with one free-scaled panel per Class level.
# The title names both classes because the full dataset is plotted.
ggplot(fulldf, aes(x = Amount, fill = Class)) +
  geom_histogram(bins = 100) +
  facet_wrap(~Class, scales = "free", ncol = 2) +
  labs(title = "Amount Counts by Class")

In [None]:
# Number of fraud-subset rows for each distinct Amount value
count(yesFraud, Amount)

In [None]:
# Number of non-fraud-subset rows for each distinct Amount value
count(noFraud, Amount)

In [None]:
# Frequency of each distinct Time value in the full dataset
table(fulldf[["Time"]])

In [None]:
# Frequency of each distinct Amount value in the full dataset
table(fulldf[["Amount"]])

### Visualizations

**Boxplots**

In [None]:
# One boxplot per feature, split by Class.
# Iterates over column *names* (a single loop) so each plot is drawn once
# and titled with the feature it shows; .data[[j]] looks the column up by
# name inside aes(). Class is skipped because it is the grouping factor,
# not a numeric feature.
for (j in setdiff(colnames(fulldf), "Class")) {
  print(
    ggplot(fulldf, aes(x = "", y = .data[[j]], fill = Class)) +
      geom_boxplot() +
      labs(title = j, x = NULL, y = j)
  )
}

**The boxplots show that the Time feature may have outliers. If the zscore is > 3 or < -3 then remove as an outlier. Need to remove from the full df and the two fraud subsets.**

**However, the z-score calculation shows that there are no z-scores > 2 or < -2.**

In [None]:
# Z-score of Time over the full dataset: (value - mean) / standard deviation.
# Computed from fulldf, the data frame loaded by this notebook.
zscoreTimetest <- (fulldf$Time - mean(fulldf$Time)) / sd(fulldf$Time)
# Summarise the range rather than printing one value per row
summary(zscoreTimetest)

In [None]:
# Z-scores below -2 (which() skips NA comparisons, as subset() does)
zScoreNeg2 <- zscoreTimetest[which(zscoreTimetest < -2)]

In [None]:
# Display the z-scores below -2
zScoreNeg2

In [None]:
# Z-scores above 2 (which() skips NA comparisons, as subset() does)
zScorePos2 <- zscoreTimetest[which(zscoreTimetest > 2)]

In [None]:
# Display the z-scores above 2
zScorePos2

### Visualizations continued.

In [None]:
# Histogram (25 bins) of every feature, one free-scaled panel per Class.
# Loops over column names so each plot can be titled and its x axis
# labelled with the feature name; title is set through labs() because
# it is not an aesthetic. Class is skipped: it is the facet/fill
# variable and a factor, which geom_histogram() cannot bin.
for (nm in setdiff(colnames(fulldf), "Class")) {
  print(
    ggplot(fulldf, aes(x = .data[[nm]], fill = Class)) +
      geom_histogram(bins = 25) +
      facet_wrap(~Class, scales = "free", ncol = 2) +
      labs(title = nm, x = nm)
  )
}

In [None]:
# Density-scaled histogram of every feature with kernel density curves
# overlaid, outlined by Class.
# - Loops over column names so title and x label are the feature name
#   (a single string) rather than the column's values.
# - Uses a bin count instead of a fixed bin width of 100, which would
#   put a narrow-range feature into a single bin.
# - The y axis is labelled "Density" because the histogram is drawn on
#   the density scale.
# - Class is skipped because it is the grouping factor.
for (nm in setdiff(colnames(fulldf), "Class")) {
  print(
    ggplot(fulldf, aes(x = .data[[nm]], col = Class)) +
      geom_histogram(
        aes(y = after_stat(density)),
        bins = 25, colour = "black", fill = "white"
      ) +
      geom_density(alpha = .2, fill = "yellow") +
      labs(title = nm, x = nm, y = "Density")
  )
}

In [None]:
# Overlaid kernel density curves of every feature, filled by Class.
# Loops over column names so each plot is titled and labelled with the
# feature name; title is set through labs() because it is not an
# aesthetic. Class is skipped because it is the fill variable.
for (nm in setdiff(colnames(fulldf), "Class")) {
  print(
    ggplot(fulldf, aes(x = .data[[nm]], fill = Class)) +
      geom_density(alpha = 1 / 3) +
      scale_fill_hue() +
      labs(title = nm, x = nm)
  )
}

### Correlation

In [None]:
# Point-biserial correlation between each continuous feature and the
# dichotomous Class label (ltm::biserial.cor).
# pbCor() fixes the arguments shared by every call: Class as the
# dichotomous variable, complete observations only, and level = 2.
pbCor <- function(x) {
  biserial.cor(x, fulldf$Class, use = "complete.obs", level = 2)
}

timeCorr <- pbCor(fulldf$Time)
amtCorr <- pbCor(fulldf$Amount)
V1Corr <- pbCor(fulldf$V1)
V2Corr <- pbCor(fulldf$V2)
V3Corr <- pbCor(fulldf$V3)
V4Corr <- pbCor(fulldf$V4)
V5Corr <- pbCor(fulldf$V5)
V6Corr <- pbCor(fulldf$V6)
V7Corr <- pbCor(fulldf$V7)
V8Corr <- pbCor(fulldf$V8)
V9Corr <- pbCor(fulldf$V9)
V10Corr <- pbCor(fulldf$V10)
V11Corr <- pbCor(fulldf$V11)
V12Corr <- pbCor(fulldf$V12)
V13Corr <- pbCor(fulldf$V13)
V14Corr <- pbCor(fulldf$V14)
V15Corr <- pbCor(fulldf$V15)
V16Corr <- pbCor(fulldf$V16)
V17Corr <- pbCor(fulldf$V17)
V18Corr <- pbCor(fulldf$V18)
V19Corr <- pbCor(fulldf$V19)
V20Corr <- pbCor(fulldf$V20)
V21Corr <- pbCor(fulldf$V21)
V22Corr <- pbCor(fulldf$V22)
V23Corr <- pbCor(fulldf$V23)
V24Corr <- pbCor(fulldf$V24)
V25Corr <- pbCor(fulldf$V25)
V26Corr <- pbCor(fulldf$V26)
V27Corr <- pbCor(fulldf$V27)
V28Corr <- pbCor(fulldf$V28)



In [None]:
# Feature names, in the same order as the correlations collected below
features <- c("Time", "Amount", paste0("V", 1:28))

# Point-biserial correlations, one per entry of `features`
pbCorr <- c(
  timeCorr, amtCorr,
  V1Corr, V2Corr, V3Corr, V4Corr, V5Corr, V6Corr, V7Corr,
  V8Corr, V9Corr, V10Corr, V11Corr, V12Corr, V13Corr, V14Corr,
  V15Corr, V16Corr, V17Corr, V18Corr, V19Corr, V20Corr, V21Corr,
  V22Corr, V23Corr, V24Corr, V25Corr, V26Corr, V27Corr, V28Corr
)

In [None]:
# Table of point-biserial correlations for the full dataset:
# one row per feature, with its correlation against Class
corrDf <- data.frame(features, pbCorr, stringsAsFactors = FALSE)
names(corrDf) <- c("features", "pointBiseralCorr")
corrDf

In [None]:
# Keep the first 30 columns of each data frame (everything before Class,
# given that Class is the 31st column)
numericCols <- seq_len(30)
numDf <- fulldf[numericCols]
numDfYes <- yesFraud[numericCols]
numDfNo <- noFraud[numericCols]

In [None]:
# Pearson correlation matrices (complete observations only) for the full
# dataset, the fraud subset, and the non-fraud subset
# Reference: http://www.sthda.com/english/wiki/correlation-matrix-a-quick-start-guide-to-analyze-format-and-visualize-a-correlation-matrix-using-r-software
res <- cor(x = numDf, use = "complete.obs", method = "pearson")
resYes <- cor(x = numDfYes, use = "complete.obs", method = "pearson")
resNo <- cor(x = numDfNo, use = "complete.obs", method = "pearson")

In [None]:
# Save the fraud-subset correlation matrix (with row names) as a CSV.
# Written relative to the working directory, like the input CSV, so the
# notebook does not depend on one user's absolute Windows path.
write.csv(resYes, file = "corrFraudYes.csv", row.names = TRUE)

In [None]:
# Correlation plot, full dataset: upper triangle only, variables ordered
# by hierarchical clustering, black labels rotated 45 degrees
corrplot(
  corr = res,
  type = "upper",
  order = "hclust",
  tl.col = "black",
  tl.srt = 45
)

In [None]:
# Correlation plot, fraud subset: upper triangle only, variables ordered
# by hierarchical clustering, black labels rotated 45 degrees
corrplot(
  corr = resYes,
  type = "upper",
  order = "hclust",
  tl.col = "black",
  tl.srt = 45
)

In [None]:
# Correlation plot, non-fraud subset: upper triangle only, variables
# ordered by hierarchical clustering, black labels rotated 45 degrees
corrplot(
  corr = resNo,
  type = "upper",
  order = "hclust",
  tl.col = "black",
  tl.srt = 45
)

In [None]:
# Heatmap of the full-dataset correlation matrix,
# using a 20-step blue -> white -> red palette
col <- colorRampPalette(colors = c("blue", "white", "red"))(20)
heatmap(res, col = col, symm = TRUE)

In [None]:
# Heatmap of the fraud-subset correlation matrix,
# using a 20-step blue -> white -> red palette
col <- colorRampPalette(colors = c("blue", "white", "red"))(20)
heatmap(resYes, col = col, symm = TRUE)

In [None]:
# Heatmap of the non-fraud-subset correlation matrix,
# using a 20-step blue -> white -> red palette
col <- colorRampPalette(colors = c("blue", "white", "red"))(20)
heatmap(resNo, col = col, symm = TRUE)

In [None]:
# Fraud-subset correlation matrix, rounded to two decimals
round(resYes, digits = 2)

In [None]:
# Full dataset: column indices that caret::findCorrelation() flags in the
# correlation matrix `res` at a 0.7 cutoff
highlyCorrelated <- findCorrelation(res, cutoff = 0.7)
# Translate the flagged indices into feature names
highlyCorColumns <- names(numDf)[highlyCorrelated]
# Show the flagged feature names
highlyCorColumns


In [None]:
# Fraud subset: column indices that caret::findCorrelation() flags in the
# correlation matrix `resYes` at a 0.7 cutoff
highlyCorrelatedYes <- findCorrelation(resYes, cutoff = 0.7)
# Translate the flagged indices into feature names
highlyCorYesColumns <- names(numDfYes)[highlyCorrelatedYes]
# Show the flagged feature names
highlyCorYesColumns

In [None]:
# Remove the highly correlated features (full-dataset result) from the
# numeric data frame and report the size of what remains.
# Uses the objects this notebook defines (numDf, highlyCorColumns).
# A logical mask is used instead of -which(): when nothing is flagged,
# -which() yields an empty index and would drop every column.
reducedDf <- numDf[, !(colnames(numDf) %in% highlyCorColumns), drop = FALSE]
dim(reducedDf)
