# Anomaly Detection - Detect Timeseries Anomalies
- Time Series Anomaly Detector

Go through the lesson commenting code, adding docstrings, and adding markdown to support what is happening.

In [1]:
import warnings
warnings.filterwarnings("ignore")
import matplotlib.pyplot as plt

import numpy as np
import pandas as pd
import os

# from matplotlib import style
import seaborn as sns
%matplotlib inline

# pd.plotting.register_matplotlib_converters()

## 1. Acquire

In [2]:
def prepare_logs(use_cache=True):
    """Return the prepared curriculum-access log DataFrame.

    When ``use_cache`` is true and 'codeup_logs.csv' exists in the working
    directory, the prepared data is read back from that file. Otherwise the
    raw data is fetched with ``acquire.get_log_data()``, prepared, written to
    'codeup_logs.csv' (without the index) and returned.

    Args:
        use_cache (bool): read the local CSV cache when it exists
            (default True). Pass False to rebuild and overwrite the cache.

    Preparation steps:
        Columns dropped: 'slack', 'id', 'deleted_at', and 'Unnamed: 0'
            (each only when present).
        Columns remapped: numeric 'program_id' codes are mapped to program
            name strings.
        Columns converted: 'start_date', 'end_date', 'created_at',
            'updated_at' are converted to datetimes.
        Columns lower-cased: 'name'.
        Columns combined: 'date' and 'time' are joined into a datetime
            'date_time' column, then dropped.

    Returns:
        pandas.DataFrame: the prepared logs. The datetime columns are parsed
        on both the cached and the freshly built path.
    """
    filename = "codeup_logs.csv"
    date_cols = ['start_date', 'end_date', 'created_at', 'updated_at']

    # use local cache from CSV if available
    if use_cache and os.path.isfile(filename):
        cached = pd.read_csv(filename)
        # CSV stores datetimes as text; re-parse them so the cached result has
        # the same dtypes as a freshly prepared frame.
        for col in date_cols + ['date_time']:
            if col in cached.columns:
                cached[col] = pd.to_datetime(cached[col])
        return cached

    # Imported here because the notebook's import cell does not import it and
    # it is only needed when the cache is bypassed.
    import acquire

    # acquire the data
    df = acquire.get_log_data()

    # drop unnecessary columns; 'Unnamed: 0' only exists when the source was
    # saved with its index, so missing labels are tolerated
    df = df.drop(columns=['slack', 'id', 'deleted_at', 'Unnamed: 0'],
                 errors='ignore')

    # map program ids to program names
    df['program_id'] = df['program_id'].map({
        1.0: 'full_stack_php',
        2.0: 'full_stack_java',
        3.0: 'data_science',
        4.0: 'front_end_programming',
    })

    # convert dates to datetimes
    for col in date_cols:
        df[col] = pd.to_datetime(df[col])

    # change cohort names to lower case
    df['name'] = df['name'].str.lower()

    # combine the separate date and time strings into one datetime column
    df['date_time'] = pd.to_datetime(df['date'] + " " + df['time'])
    df = df.drop(columns=['date', 'time'])

    # refresh the cache for later calls
    df.to_csv(filename, index=False)
    return df

In [3]:
# Run the preparation step (reads the cached CSV when it exists) and display the result.
prepare_logs()

Unnamed: 0,path,user_id,cohort_id,ip,name,start_date,end_date,created_at,updated_at,program_id,date_time
0,/,1,8.0,97.105.19.61,hampton,2015-09-22 00:00:00,2016-02-06 00:00:00,2016-06-14 19:52:26,2016-06-14 19:52:26,full_stack_php,2018-01-26 09:55:03
1,java-ii,1,8.0,97.105.19.61,hampton,2015-09-22 00:00:00,2016-02-06 00:00:00,2016-06-14 19:52:26,2016-06-14 19:52:26,full_stack_php,2018-01-26 09:56:02
2,java-ii/object-oriented-programming,1,8.0,97.105.19.61,hampton,2015-09-22 00:00:00,2016-02-06 00:00:00,2016-06-14 19:52:26,2016-06-14 19:52:26,full_stack_php,2018-01-26 09:56:05
3,slides/object_oriented_programming,1,8.0,97.105.19.61,hampton,2015-09-22 00:00:00,2016-02-06 00:00:00,2016-06-14 19:52:26,2016-06-14 19:52:26,full_stack_php,2018-01-26 09:56:06
4,javascript-i/conditionals,2,22.0,97.105.19.61,teddy,2018-01-08 00:00:00,2018-05-17 00:00:00,2018-01-08 13:59:10,2018-01-08 13:59:10,full_stack_java,2018-01-26 09:56:24
...,...,...,...,...,...,...,...,...,...,...,...
900218,jquery/personal-site,64,28.0,71.150.217.33,staff,2014-02-04 00:00:00,2014-02-04 00:00:00,2018-12-06 17:04:19,2018-12-06 17:04:19,full_stack_java,2021-04-21 16:41:51
900219,jquery/mapbox-api,64,28.0,71.150.217.33,staff,2014-02-04 00:00:00,2014-02-04 00:00:00,2018-12-06 17:04:19,2018-12-06 17:04:19,full_stack_java,2021-04-21 16:42:02
900220,jquery/ajax/weather-map,64,28.0,71.150.217.33,staff,2014-02-04 00:00:00,2014-02-04 00:00:00,2018-12-06 17:04:19,2018-12-06 17:04:19,full_stack_java,2021-04-21 16:42:09
900221,anomaly-detection/discrete-probabilistic-methods,744,28.0,24.160.137.86,staff,2014-02-04 00:00:00,2014-02-04 00:00:00,2018-12-06 17:04:19,2018-12-06 17:04:19,full_stack_java,2021-04-21 16:44:37


In [8]:
# Creates dataframe and names columns with appropriate parsed field names
colnames = ['date', 'endpoint', 'user_id', 'cohort_id', 'source_ip']
df = pd.read_csv("curriculum_logs.csv",
                 # raw string: "\s" in a normal literal is an invalid escape sequence
                 sep=r"\s",
                 # regex separators are only handled by the python parser; say so
                 # explicitly instead of relying on the silent fallback
                 engine="python",
                 header=None,
                 names=colnames,
                 # NOTE(review): pandas ignores the order of usecols, so position 11
                 # is not guaranteed to line up with 'date' -- confirm the mapping
                 usecols=[11, 1, 2, 3, 4])
# NOTE(review): the values recorded for this cell contain commas inside single
# fields, which suggests the file may be comma-delimited rather than
# whitespace-delimited -- TODO confirm the separator and column positions.
df

Unnamed: 0,endpoint,user_id,cohort_id,source_ip
0,,,,
1,"19:52:26,2016-06-14","19:52:26,,1.0",,
2,"19:52:26,2016-06-14","19:52:26,,1.0",,
3,"19:52:26,2016-06-14","19:52:26,,1.0",,
4,"19:52:26,2016-06-14","19:52:26,,1.0",,
...,...,...,...,...
900219,"17:04:19,2018-12-06","17:04:19,,2.0",,
900220,"17:04:19,2018-12-06","17:04:19,,2.0",,
900221,"17:04:19,2018-12-06","17:04:19,,2.0",,
900222,"17:04:19,2018-12-06","17:04:19,,2.0",,


## 2. Prepare

3. make the analysis process (which we will discuss later) work over all users.
    1. compute necessary metrics to arrive at the final metric, %b (percent-b).
    2. add user id to the dataframe that contains all the metrics, including %b.
    3. filter to rows where %b indicates anomaly (i.e. > 1)
    4. append rows of anomalies of new user to previous users' anomalous activity.

4. Turn the analysis process and calculations into a function that can be called for each user in a loop.

5. Test the function on a single user.

6. Analyze by looping over all users.

Bonus:

- Discover users who are accessing our curriculum pages well beyond the end of their time at Codeup. What would the dataframe look like? Use a time series method for detecting anomalies, such as an exponential moving average with %b.

- Can you label students who are viewing both the web dev and data science curriculum? Can you label students by the program they are in? Can you label users by student vs. staff?