# Part 4: Anomaly Detection
You have also been requested to check whether there are any anomalies in the given sales dataset. The objective of this task being fraud detection.

In [32]:
# loading the dataset
df <- read.csv("Supermarket 4.csv")
head(df)

Date,Sales
1/5/2019,548.9715
3/8/2019,80.22
3/3/2019,340.5255
1/27/2019,489.048
2/8/2019,634.3785
3/25/2019,627.6165


In [33]:
# checking the shape of the dataset
dim(df)

In [34]:
# previewing class types
str(df)

'data.frame':	1000 obs. of  2 variables:
 $ Date : Factor w/ 89 levels "1/1/2019","1/10/2019",..: 27 88 82 20 58 77 49 48 2 44 ...
 $ Sales: num  549 80.2 340.5 489 634.4 ...


In [35]:
# totalling the sales based on their common shared dates
sales_agg <- aggregate(df$Sales, by = list(Date = df$Date), FUN = sum)

head(sales_agg)

Date,x
1/1/2019,4745.181
1/10/2019,3560.949
1/11/2019,2114.963
1/12/2019,5184.764
1/13/2019,2451.204
1/14/2019,3966.617


In [36]:
# getting a data frame of the frequency table of Date
date_tab <- data.frame(table(df$Date))
head(date_tab)

Var1,Freq
1/1/2019,12
1/10/2019,9
1/11/2019,8
1/12/2019,11
1/13/2019,10
1/14/2019,13


In [37]:
# combining both data frames
fin_df <- merge(sales_agg, date_tab, by.x = "Date", by.y = "Var1")

# renaming the columns
names(fin_df) <- c("Date", "Total.Sales", "count")
head(fin_df)

Date,Total.Sales,count
1/1/2019,4745.181,12
1/10/2019,3560.949,9
1/11/2019,2114.963,8
1/12/2019,5184.764,11
1/13/2019,2451.204,10
1/14/2019,3966.617,13


In [54]:
library(lubridate)
library(tibbletime)


Attaching package: ‘tibbletime’

The following object is masked from ‘package:stats’:

    filter



In [47]:
# changing the Date column to Date format
fin_df$Date <- mdy(fin_df$Date)
str(fin_df)

'data.frame':	89 obs. of  3 variables:
 $ Date       : Date, format: "2019-01-01" "2019-01-10" ...
 $ Total.Sales: num  4745 3561 2115 5185 2451 ...
 $ count      : int  12 9 8 11 10 13 13 10 11 9 ...


In [60]:
fin_df$Date <- as_tbl_time(fin_df, index = 'Date')
str(fin_df$Date)

Classes ‘tbl_time’, ‘tbl_df’, ‘tbl’ and 'data.frame':	89 obs. of  3 variables:
 $ Date       : Date, format: "2019-01-01" "2019-01-10" ...
 $ Total.Sales: num  4745 3561 2115 5185 2451 ...
 $ count      : int  12 9 8 11 10 13 13 10 11 9 ...
 - attr(*, "index_quo")= language ~"Date"
  ..- attr(*, ".Environment")=<environment: R_EmptyEnv> 
 - attr(*, "index_time_zone")= chr "UTC"


In [63]:
class(fin_df)

In [48]:
# check for duplicates
anyDuplicated(fin_df)

In [49]:
# check for missing values
colSums(is.na(fin_df))

In [42]:
# loading relevant libraries to aid in anomaly detection
library(tidyverse)
library(anomalize)

In [62]:
fin_df %>%
    time_decompose(count) %>%
    anomalize(remainder) %>%
    time_recompose() %>%
    plot_anomalies(time_recomposed = TRUE, ncol = 3, alpha_dots = 0.5)

ERROR: Error: Error time_decompose(): Object is not of class `tbl_df` or `tbl_time`.
