### Anomoly Detection: Group Project

    Created By: Mijail Mariano & Chen Chen Feng

    23AUGUST2022

In [23]:
# notebook dependencies
%matplotlib inline
import matplotlib as mlp
mlp.rcParams['figure.dpi'] = 300

import pandas as pd
import numpy as np
import os

# visualization libraries/modules
import matplotlib.pyplot as plt
import seaborn as sns

# pycaret import
from pycaret.anomaly import *

# plotly import
# import plotly.express as px
# import plotly.io as pio
# pio.renderers.default = "notebook_connected"

# created module
import mm_prepare
from mm_prepare import get_logs_dataset

# skimpy module to clean column names
from skimpy import clean_columns

# regular expression module
import re

import env_mm
from env_mm import user, password, host, get_connection

In [24]:
# importing the data

df = get_logs_dataset()
df.head()

df shape: (900223, 11)


Unnamed: 0,date,time,endpoint,user_id,cohort_id,ip,name,slack,start_date,end_date,program_id
0,2018-01-26,09:55:03,/,1,8.0,97.105.19.61,Hampton,#hampton,2015-09-22,2016-02-06,1.0
1,2018-01-26,09:56:02,java-ii,1,8.0,97.105.19.61,Hampton,#hampton,2015-09-22,2016-02-06,1.0
2,2018-01-26,09:56:05,java-ii/object-oriented-programming,1,8.0,97.105.19.61,Hampton,#hampton,2015-09-22,2016-02-06,1.0
3,2018-01-26,09:56:06,slides/object_oriented_programming,1,8.0,97.105.19.61,Hampton,#hampton,2015-09-22,2016-02-06,1.0
4,2018-01-26,09:56:24,javascript-i/conditionals,2,22.0,97.105.19.61,Teddy,#teddy,2018-01-08,2018-05-17,2.0


In [25]:
# dataframe information

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 900223 entries, 0 to 900222
Data columns (total 11 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   date        900223 non-null  object 
 1   time        900223 non-null  object 
 2   endpoint    900222 non-null  object 
 3   user_id     900223 non-null  int64  
 4   cohort_id   847330 non-null  float64
 5   ip          900223 non-null  object 
 6   name        847330 non-null  object 
 7   slack       847330 non-null  object 
 8   start_date  847330 non-null  object 
 9   end_date    847330 non-null  object 
 10  program_id  847330 non-null  float64
dtypes: float64(2), int64(1), object(8)
memory usage: 75.5+ MB


In [26]:
'''function that returns the endpoint class and topic'''
def get_endpoint_targets(df):

    topics = df["endpoint"]. \
        str. \
        split(
                "/", 
                n = 1, 
                expand = True). \
                rename(columns = {0: "class", 1: "topic"})
    
    new_df = pd.concat([df, topics], axis = 1)

    # returns the new df w/endpoint class and topics
    return new_df

In [27]:
df["endpoint"].str.split("/", n = 1, expand = True).rename(columns = {0: "class", 1: "topic"})

Unnamed: 0,class,topic
0,,
1,java-ii,
2,java-ii,object-oriented-programming
3,slides,object_oriented_programming
4,javascript-i,conditionals
...,...,...
900218,jquery,personal-site
900219,jquery,mapbox-api
900220,jquery,ajax/weather-map
900221,anomaly-detection,discrete-probabilistic-methods


In [28]:
# testing out the function 

df = mm_prepare.get_endpoint_targets(df)
df.head()

df shape: (900223, 13)


Unnamed: 0,date,time,endpoint,user_id,cohort_id,ip,name,slack,start_date,end_date,program_id,class,topic
0,2018-01-26,09:55:03,/,1,8.0,97.105.19.61,Hampton,#hampton,2015-09-22,2016-02-06,1.0,,
1,2018-01-26,09:56:02,java-ii,1,8.0,97.105.19.61,Hampton,#hampton,2015-09-22,2016-02-06,1.0,java-ii,
2,2018-01-26,09:56:05,java-ii/object-oriented-programming,1,8.0,97.105.19.61,Hampton,#hampton,2015-09-22,2016-02-06,1.0,java-ii,object-oriented-programming
3,2018-01-26,09:56:06,slides/object_oriented_programming,1,8.0,97.105.19.61,Hampton,#hampton,2015-09-22,2016-02-06,1.0,slides,object_oriented_programming
4,2018-01-26,09:56:24,javascript-i/conditionals,2,22.0,97.105.19.61,Teddy,#teddy,2018-01-08,2018-05-17,2.0,javascript-i,conditionals


In [29]:
# expressed another way
# str.split() method to expand endpoints on "/"

df_test = pd.DataFrame()

df_test[["class", "topic"]] = df["endpoint"].str.split("/", n = 1, expand = True)
df_test.head()

Unnamed: 0,class,topic
0,,
1,java-ii,
2,java-ii,object-oriented-programming
3,slides,object_oriented_programming
4,javascript-i,conditionals


In [30]:
# using the clean date function

df = mm_prepare.clean_dates(df)
df.head()

new df shape: (900223, 12)


Unnamed: 0,endpoint,user_id,ip,name,slack,start_date,end_date,program_id,class,topic,day,month
2018-01-26 09:55:03,/,1,97.105.19.61,Hampton,#hampton,2015-09-22,2016-02-06,1.0,,,Friday,January
2018-01-26 09:56:02,java-ii,1,97.105.19.61,Hampton,#hampton,2015-09-22,2016-02-06,1.0,java-ii,,Friday,January
2018-01-26 09:56:05,java-ii/object-oriented-programming,1,97.105.19.61,Hampton,#hampton,2015-09-22,2016-02-06,1.0,java-ii,object-oriented-programming,Friday,January
2018-01-26 09:56:06,slides/object_oriented_programming,1,97.105.19.61,Hampton,#hampton,2015-09-22,2016-02-06,1.0,slides,object_oriented_programming,Friday,January
2018-01-26 09:56:24,javascript-i/conditionals,2,97.105.19.61,Teddy,#teddy,2018-01-08,2018-05-17,2.0,javascript-i,conditionals,Friday,January


In [31]:
# initial data familiarization

df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 900223 entries, 2018-01-26 09:55:03 to 2021-04-21 16:44:39
Data columns (total 12 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   endpoint    900222 non-null  object 
 1   user_id     900223 non-null  int64  
 2   ip          900223 non-null  object 
 3   name        847330 non-null  object 
 4   slack       847330 non-null  object 
 5   start_date  847330 non-null  object 
 6   end_date    847330 non-null  object 
 7   program_id  847330 non-null  float64
 8   class       900222 non-null  object 
 9   topic       731934 non-null  object 
 10  day         900223 non-null  object 
 11  month       900223 non-null  object 
dtypes: float64(1), int64(1), object(10)
memory usage: 89.3+ MB


In [32]:
# setting the program_id to object type

df[["user_id", "program_id"]] = df[["user_id", "program_id"]].astype(object)
df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 900223 entries, 2018-01-26 09:55:03 to 2021-04-21 16:44:39
Data columns (total 12 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   endpoint    900222 non-null  object
 1   user_id     900223 non-null  object
 2   ip          900223 non-null  object
 3   name        847330 non-null  object
 4   slack       847330 non-null  object
 5   start_date  847330 non-null  object
 6   end_date    847330 non-null  object
 7   program_id  847330 non-null  object
 8   class       900222 non-null  object
 9   topic       731934 non-null  object
 10  day         900223 non-null  object
 11  month       900223 non-null  object
dtypes: object(12)
memory usage: 89.3+ MB


In [33]:
# mapping the codeup program type by program_id

df = mm_prepare.map_program_id(df)
df.head()

Unnamed: 0,endpoint,user_id,ip,name,slack,start_date,end_date,class,topic,day,month,program_type
2018-01-26 09:55:03,/,1,97.105.19.61,Hampton,#hampton,2015-09-22,2016-02-06,,,Friday,January,FS_PHP_program
2018-01-26 09:56:02,java-ii,1,97.105.19.61,Hampton,#hampton,2015-09-22,2016-02-06,java-ii,,Friday,January,FS_PHP_program
2018-01-26 09:56:05,java-ii/object-oriented-programming,1,97.105.19.61,Hampton,#hampton,2015-09-22,2016-02-06,java-ii,object-oriented-programming,Friday,January,FS_PHP_program
2018-01-26 09:56:06,slides/object_oriented_programming,1,97.105.19.61,Hampton,#hampton,2015-09-22,2016-02-06,slides,object_oriented_programming,Friday,January,FS_PHP_program
2018-01-26 09:56:24,javascript-i/conditionals,2,97.105.19.61,Teddy,#teddy,2018-01-08,2018-05-17,javascript-i,conditionals,Friday,January,FS_JAVA_program


In [34]:
# checking the number of missing values per column 

df.isnull().sum().sort_values(ascending = False)

topic           168289
name             52893
slack            52893
start_date       52893
end_date         52893
program_type     52893
endpoint             1
class                1
user_id              0
ip                   0
day                  0
month                0
dtype: int64

In [35]:
# let's view the only (1) missing values in endpoint and class

df[df["endpoint"].isnull()]

Unnamed: 0,endpoint,user_id,ip,name,slack,start_date,end_date,class,topic,day,month,program_type
2020-04-08 09:25:18,,586,72.177.240.51,Curie,#curie,2020-02-03,2020-07-07,,,Wednesday,April,DS_program


In [36]:
# let's view the only (1) missing values in endpoint and class

df[df["class"].isnull()]

Unnamed: 0,endpoint,user_id,ip,name,slack,start_date,end_date,class,topic,day,month,program_type
2020-04-08 09:25:18,,586,72.177.240.51,Curie,#curie,2020-02-03,2020-07-07,,,Wednesday,April,DS_program


In [37]:
# let's examing user_id 586

df_586 = df[df["user_id"] == 586]
df_586.shape

(983, 12)

In [38]:
# how many ip addresses has this user_id used?

df_586["ip"].nunique()

40

In [39]:
# what unique endpoint-classes have they looked at?

pd.Series(df_586["class"].unique().tolist())

0                         
1                 appendix
2             6-regression
3           1-fundamentals
4           2-storytelling
5                    3-sql
6                   search
7                 4-python
8                  5-stats
9                      NaN
10        7-classification
11    10-anomaly-detection
12      13-advanced-topics
13            8-clustering
14       12-distributed-ml
15            9-timeseries
16                  11-nlp
17            fundamentals
18                     sql
19                  python
20          classification
21              timeseries
22       anomaly-detection
23                     nlp
24              regression
25              clustering
26         advanced-topics
27          distributed-ml
dtype: object

In [40]:
# cleaning columns with empty class and topic observations

df["class"] = df["class"].map({"": np.nan, "": np.nan})
df["topic"] = df["topic"].map({"": np.nan, "": np.nan})

df[df["class"].isnull()]

Unnamed: 0,endpoint,user_id,ip,name,slack,start_date,end_date,class,topic,day,month,program_type
2018-01-26 09:55:03,/,1,97.105.19.61,Hampton,#hampton,2015-09-22,2016-02-06,,,Friday,January,FS_PHP_program
2018-01-26 09:56:02,java-ii,1,97.105.19.61,Hampton,#hampton,2015-09-22,2016-02-06,,,Friday,January,FS_PHP_program
2018-01-26 09:56:05,java-ii/object-oriented-programming,1,97.105.19.61,Hampton,#hampton,2015-09-22,2016-02-06,,,Friday,January,FS_PHP_program
2018-01-26 09:56:06,slides/object_oriented_programming,1,97.105.19.61,Hampton,#hampton,2015-09-22,2016-02-06,,,Friday,January,FS_PHP_program
2018-01-26 09:56:24,javascript-i/conditionals,2,97.105.19.61,Teddy,#teddy,2018-01-08,2018-05-17,,,Friday,January,FS_JAVA_program
...,...,...,...,...,...,...,...,...,...,...,...,...
2021-04-21 16:41:51,jquery/personal-site,64,71.150.217.33,Staff,#,2014-02-04,2014-02-04,,,Wednesday,April,FS_JAVA_program
2021-04-21 16:42:02,jquery/mapbox-api,64,71.150.217.33,Staff,#,2014-02-04,2014-02-04,,,Wednesday,April,FS_JAVA_program
2021-04-21 16:42:09,jquery/ajax/weather-map,64,71.150.217.33,Staff,#,2014-02-04,2014-02-04,,,Wednesday,April,FS_JAVA_program
2021-04-21 16:44:37,anomaly-detection/discrete-probabilistic-methods,744,24.160.137.86,Staff,#,2014-02-04,2014-02-04,,,Wednesday,April,FS_JAVA_program


In [41]:
df.shape

(900223, 12)

In [54]:
# drop single missing values in endpoint and class

mask = df[(df["endpoint"].isnull()) & (df["class"].isnull())]

In [55]:
# what percentage of missing values makeup ea. column/feature?

df.isnull().mean().sort_values(ascending = False).round(3)

class           1.000
topic           1.000
name            0.059
slack           0.059
start_date      0.059
end_date        0.059
program_type    0.059
endpoint        0.000
user_id         0.000
ip              0.000
day             0.000
month           0.000
dtype: float64

----

### ``Analyzing Missing Values``

In [None]:
# let's examine missing values 

col_lst = [col for col in df.columns if "topic" not in col]

null_df = df[col_lst].loc[df[col_lst].isnull().any(axis = 1)]
null_df.head()

Unnamed: 0,endpoint,user_id,ip,name,slack,start_date,end_date,class,day,month,program_type
2018-01-26 16:46:16,/,48,97.105.19.61,,,,,,Friday,January,
2018-01-26 16:46:24,spring/extra-features/form-validation,48,97.105.19.61,,,,,spring,Friday,January,
2018-01-26 17:54:24,/,48,97.105.19.61,,,,,,Friday,January,
2018-01-26 18:32:03,/,48,97.105.19.61,,,,,,Friday,January,
2018-01-26 18:32:17,mysql/relationships/joins,48,97.105.19.61,,,,,mysql,Friday,January,


In [None]:
# let's examine unique values in ea. feature/vairable

mm_prepare.print_variable_info(null_df)

feature: endpoint
feature type: object
number of unique values: 1112
unique values: ['/' 'spring/extra-features/form-validation' 'mysql/relationships/joins'
 ... 'classification/classical_programming_vs_machine_learning.jpeg'
 'distributed-ml/explore' 'appendix/further-reading/jquery/effects']
value counts: /                                                                                           4459
search/search_index.json                                                                    1985
javascript-i                                                                                 780
toc                                                                                          706
spring                                                                                       641
java-iii                                                                                     567
html-css                                                                                     508
java-ii     

In [None]:
# creating a dataframe to capture the highest feature frequency

container = []

for col in null_df.columns:
    
    if col == "endpoint":
        
        metric = {  
            "feature": col,
            "data_type": null_df[col].dtype,
            "unique_values": null_df[col].nunique(),
            "most_freq_observation": null_df[col].value_counts().index[1],
            "total_observations": null_df[col].value_counts()[1].max()
        }

        container.append(metric)

    else:
        metric = {  
            "feature": col,
            "data_type": null_df[col].dtype,
            "unique_values": null_df[col].nunique(),
            "most_freq_observation": null_df[col].value_counts().idxmax(),
            "total_observations": null_df[col].value_counts().max()
        }

        container.append(metric)

freq_df = pd.DataFrame(container).sort_values("total_observations", ascending = False).reset_index(drop = True)
freq_df

ValueError: attempt to get argmax of an empty sequence

In [None]:
# Who is using ip addres "97.105.19.58" ?

null_df[null_df["ip"] == "97.105.19.58"]["user_id"].unique()

array([58, 349, 350, 351, 352, 353, 354, 355, 356, 357, 358, 359, 360,
       361, 362, 363, 364, 365, 366, 367, 368, 369, 372, 375, 403, 406,
       429], dtype=object)

In [None]:
# who is the most frequent user of this ip address?

null_df[null_df["ip"] == "97.105.19.58"]["user_id"].value_counts()

354    2065
363    1876
368    1261
355    1208
349    1162
366     895
367     852
353     791
362     735
351     671
352     671
361     590
357     556
372     485
364     465
359     454
369     353
365     195
360     194
403     183
358      78
375      65
350      60
58       23
356      21
429      13
406       9
Name: user_id, dtype: int64

----
#### 1. Which lesson appears to attract the most traffic consistently across cohorts (per program)?