In [1]:
# from __future__ import division
import itertools
import warnings
warnings.filterwarnings("ignore")

import matplotlib.pyplot as plt
import matplotlib.dates as mdates

import numpy as np
import pandas as pd
import math
from sklearn import metrics
from random import randint
from matplotlib import style
import seaborn as sns

from sklearn.cluster import DBSCAN
from sklearn.preprocessing import MinMaxScaler

## Anomaly Detection Project

### Acquire

In [2]:
## getting the dataframe from csv using seperators and the column specified form the list

colnames = ['date', 'page', 'user_id', 'cohort_id', 'source_ip']
df = pd.read_csv("anonymized-curriculum-access-07-2021.txt", 
                 sep="\s", 
                 header=None, 
                 names = colnames, 
                 usecols=[0, 2, 3, 4, 5])
df.head()

Unnamed: 0,date,page,user_id,cohort_id,source_ip
0,2018-01-26,/,1,8.0,97.105.19.61
1,2018-01-26,java-ii,1,8.0,97.105.19.61
2,2018-01-26,java-ii/object-oriented-programming,1,8.0,97.105.19.61
3,2018-01-26,slides/object_oriented_programming,1,8.0,97.105.19.61
4,2018-01-26,javascript-i/conditionals,2,22.0,97.105.19.61


In [3]:
df.shape ## <-- looking at our dataframe shape

(1018810, 5)

In [4]:
df.info() ## looking at our df columns and datatypes

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1018810 entries, 0 to 1018809
Data columns (total 5 columns):
 #   Column     Non-Null Count    Dtype  
---  ------     --------------    -----  
 0   date       1018810 non-null  object 
 1   page       1018809 non-null  object 
 2   user_id    1018810 non-null  int64  
 3   cohort_id  965313 non-null   float64
 4   source_ip  1018810 non-null  object 
dtypes: float64(1), int64(1), object(3)
memory usage: 38.9+ MB


In [5]:
for col in df.columns:  ## <-- using list comprehension to look at our column values
    print(f'Value Counts For {col} Column:\n')
    print(df[col].value_counts())
    print('-------------------------------\n')

Value Counts For date Column:

2021-06-15    3357
2021-06-21    3272
2021-03-19    3104
2021-06-18    3026
2021-06-16    2562
              ... 
2018-12-29      32
2018-12-22      30
2018-12-30      21
2019-07-04      16
2018-12-23      10
Name: date, Length: 1267, dtype: int64
-------------------------------

Value Counts For page Column:

/                                          55544
search/search_index.json                   22341
javascript-i                               21330
toc                                        20543
html-css                                   15334
                                           ...  
examples/twitter.html                          1
javascript-i/math                              1
13-storytelling/2.2-create                     1
12-advanced-topics/3.3-building-a-model        1
creating-charts                                1
Name: page, Length: 2406, dtype: int64
-------------------------------

Value Counts For user_id Column:

11      1959

### Prepare

In [6]:
## preparing the dataframe by setting the date column to the index and converting 
## to date time object

df.date = pd.to_datetime(df.date)
df = df.set_index(df.date, drop = True)
df = df.drop(columns = {'date'})

df.head(3) ## <-- looking at our df (sample)

Unnamed: 0_level_0,page,user_id,cohort_id,source_ip
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2018-01-26,/,1,8.0,97.105.19.61
2018-01-26,java-ii,1,8.0,97.105.19.61
2018-01-26,java-ii/object-oriented-programming,1,8.0,97.105.19.61


In [7]:
df.isna().sum() ## <-- looking for null values

page             1
user_id          0
cohort_id    53497
source_ip        0
dtype: int64

We have over 1 million access logs in our dataframe. To save time given the project spec we are going to remove our null values because it is a small percentage of the data.

In [8]:
df = df.dropna()
df.isna().sum() ## <-- quality assurance check

page         0
user_id      0
cohort_id    0
source_ip    0
dtype: int64

### Exploring Important Questions

#### Which Lesson appears to attract the most traffic consistently across cohorts?

In [39]:
## narrowing down dataframe to look at pages with /'s because those are most likely to be 
## lessons within the curriculumn 
df_lesson = df[df.page.str.contains('/')]
df_lesson

Unnamed: 0_level_0,page,user_id,cohort_id,source_ip
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2018-01-26,/,1,8.0,97.105.19.61
2018-01-26,java-ii/object-oriented-programming,1,8.0,97.105.19.61
2018-01-26,slides/object_oriented_programming,1,8.0,97.105.19.61
2018-01-26,javascript-i/conditionals,2,22.0,97.105.19.61
2018-01-26,javascript-i/loops,2,22.0,97.105.19.61
...,...,...,...,...
2021-07-15,python/intro-to-matplotlib,11,28.0,97.105.19.60
2021-07-15,java-iii/finish-the-adlister,925,138.0,24.26.246.133
2021-07-15,java-ii/arrays,933,138.0,72.190.28.51
2021-07-15,java-ii/object-oriented-programming,933,138.0,72.190.28.51


In [40]:
## grouping by page and doind an overall count of occurences
## per page to figure which lesson has the most overall traffic
page_views = df_lesson.groupby(['page'])['user_id'].agg(['count','nunique'])
observed = page_views.sort_values(by = 'count', ascending = False)
observed.head(15)

Unnamed: 0_level_0,count,nunique
page,Unnamed: 1_level_1,Unnamed: 2_level_1
/,51017,993
search/search_index.json,20323,744
javascript-i/introduction/working-with-data-types-operators-and-variables,8302,659
javascript-i/javascript-with-html,8199,680
mysql/tables,7922,544
javascript-i/functions,7901,680
html-css/elements,7444,676
java-iii/jsp-and-jstl,7320,517
javascript-i/loops,7313,664
java-iii/servlets,7283,526


After narrowing down the dataframe to look at pages with only /'s because those are most likely to be lessons within the curriculumn. We can see that top most accessed lessons per program at Codeup:
 - Data Science
     - MySQL: Tables Lesson: 7922 pings
 - Software Development
     - Javascript I: Introduction Working With Data Types Operators and Variables Lesson: 8302 pings
 - Web Development
     - Html-css: Elements Lesson

#### Which Lessons Appears To Be Accessed The Least?

In [52]:
observed[observed['count'] == 3]

Unnamed: 0_level_0,count,nunique
page,Unnamed: 1_level_1,Unnamed: 2_level_1
html/css,3,1
spring/fundamentals,3,1
content/examples/git,3,1
extra-features/error-pages,3,1
6-classification/6.3-random-forests,3,2
...,...,...
content/jquery/effects/show-hide-toggle.html,3,3
content/jquery/events,3,2
10-anomaly-detection/3-discrete-probabilistic-methods.ipynb,3,3
arrays/manipulating,3,3
