In [1]:
# from __future__ import division
import itertools
import warnings
warnings.filterwarnings("ignore")

import matplotlib.pyplot as plt
import matplotlib.dates as mdates

import numpy as np
import pandas as pd
import math
from sklearn import metrics
from random import randint
from matplotlib import style
import seaborn as sns

from sklearn.cluster import DBSCAN
from sklearn.preprocessing import MinMaxScaler

from env import host, user, password, get_connection

### Functions To Be Moved to .py Files Later

In [2]:
def prep(df, user):
    df = df[df.user_id == user]
    df.date = pd.to_datetime(df.date)
    df = df.set_index(df.date)
    pages = df['page'].resample('d').count()
    return pages

def compute_pct_b(pages, span, weight, user):
    midband = pages.ewm(span=span).mean()
    stdev = pages.ewm(span=span).std()
    ub = midband + stdev*weight
    lb = midband - stdev*weight
    bb = pd.concat([ub, lb], axis=1)
    my_df = pd.concat([pages, midband, bb], axis=1)
    my_df.columns = ['pages', 'midband', 'ub', 'lb']
    my_df['pct_b'] = (my_df['pages'] - my_df['lb'])/(my_df['ub'] - my_df['lb'])
    my_df['user_id'] = user
    return my_df

def plt_bands(my_df, user):
    fig, ax = plt.subplots(figsize=(12,8))
    ax.plot(my_df.index, my_df.pages, label='Number of Pages, User: '+str(user))
    ax.plot(my_df.index, my_df.midband, label = 'EMA/midband')
    ax.plot(my_df.index, my_df.ub, label = 'Upper Band')
    ax.plot(my_df.index, my_df.lb, label = 'Lower Band')
    ax.legend(loc='best')
    ax.set_ylabel('Number of Pages')
    plt.show()

def find_anomalies(df, user, span, weight):
    pages = prep(df, user)
    my_df = compute_pct_b(pages, span, weight, user)
    # plt_bands(my_df, user)
    return my_df[my_df.pct_b>1]

## Anomaly Detection Project

### Acquire

In [3]:
## getting the dataframe from csv using seperators and the column specified form the list

colnames = ['date', 'time', 'page', 'user_id', 'cohort_id', 'source_ip']
df = pd.read_csv("anonymized-curriculum-access-07-2021.txt", 
                 sep="\s", 
                 header=None, 
                 names = colnames, 
                 usecols=[0, 1, 2, 3, 4, 5])
df.head()

Unnamed: 0,date,time,page,user_id,cohort_id,source_ip
0,2018-01-26,09:55:03,/,1,8.0,97.105.19.61
1,2018-01-26,09:56:02,java-ii,1,8.0,97.105.19.61
2,2018-01-26,09:56:05,java-ii/object-oriented-programming,1,8.0,97.105.19.61
3,2018-01-26,09:56:06,slides/object_oriented_programming,1,8.0,97.105.19.61
4,2018-01-26,09:56:24,javascript-i/conditionals,2,22.0,97.105.19.61


In [4]:
df.shape ## <-- looking at our dataframe shape

(1018810, 6)

In [5]:
df.info() ## looking at our df columns and datatypes

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1018810 entries, 0 to 1018809
Data columns (total 6 columns):
 #   Column     Non-Null Count    Dtype  
---  ------     --------------    -----  
 0   date       1018810 non-null  object 
 1   time       1018810 non-null  object 
 2   page       1018809 non-null  object 
 3   user_id    1018810 non-null  int64  
 4   cohort_id  965313 non-null   float64
 5   source_ip  1018810 non-null  object 
dtypes: float64(1), int64(1), object(4)
memory usage: 46.6+ MB


In [6]:
for col in df.columns:  ## <-- using list comprehension to look at our column value counts
    print(f'Value Counts For {col} Column:\n')
    print(df[col].value_counts())
    print('-------------------------------\n')

Value Counts For date Column:

2021-06-15    3357
2021-06-21    3272
2021-03-19    3104
2021-06-18    3026
2021-06-16    2562
              ... 
2018-12-29      32
2018-12-22      30
2018-12-30      21
2019-07-04      16
2018-12-23      10
Name: date, Length: 1267, dtype: int64
-------------------------------

Value Counts For time Column:

09:03:45    85
09:03:00    84
09:01:59    82
09:05:45    82
09:02:45    81
            ..
01:23:40     1
00:22:18     1
06:01:01     1
02:11:12     1
06:32:06     1
Name: time, Length: 74592, dtype: int64
-------------------------------

Value Counts For page Column:

/                                                        55544
search/search_index.json                                 22341
javascript-i                                             21330
toc                                                      20543
html-css                                                 15334
                                                         ...  
4-stats/2.

### Prepare

In [7]:
## preparing the dataframe by setting the date column to the index and converting 
## to date time object

df.date = pd.to_datetime(df.date)



df.head(3) ## <-- looking at our df (sample)

Unnamed: 0,date,time,page,user_id,cohort_id,source_ip
0,2018-01-26,09:55:03,/,1,8.0,97.105.19.61
1,2018-01-26,09:56:02,java-ii,1,8.0,97.105.19.61
2,2018-01-26,09:56:05,java-ii/object-oriented-programming,1,8.0,97.105.19.61


In [8]:
df.isna().sum() ## <-- looking for null values

date             0
time             0
page             1
user_id          0
cohort_id    53497
source_ip        0
dtype: int64

We have over 1 million access logs in our dataframe. To save time given the project spec we are going to remove our null values because it is a small percentage of the data.

In [9]:
df = df.dropna()
df.isna().sum() ## <-- quality assurance check

date         0
time         0
page         0
user_id      0
cohort_id    0
source_ip    0
dtype: int64

### Exploring Important Questions

#### Which Lesson appears to attract the most traffic consistently across cohorts?

In [10]:
## narrowing down dataframe to look at pages with /'s because those are most likely to be 
## lessons within the curriculumn 
df_lesson = df[df.page.str.contains('/')]
df_lesson

Unnamed: 0,date,time,page,user_id,cohort_id,source_ip
0,2018-01-26,09:55:03,/,1,8.0,97.105.19.61
2,2018-01-26,09:56:05,java-ii/object-oriented-programming,1,8.0,97.105.19.61
3,2018-01-26,09:56:06,slides/object_oriented_programming,1,8.0,97.105.19.61
4,2018-01-26,09:56:24,javascript-i/conditionals,2,22.0,97.105.19.61
5,2018-01-26,09:56:41,javascript-i/loops,2,22.0,97.105.19.61
...,...,...,...,...,...,...
1018804,2021-07-15,13:48:38,python/intro-to-matplotlib,11,28.0,97.105.19.60
1018806,2021-07-15,13:49:06,java-iii/finish-the-adlister,925,138.0,24.26.246.133
1018807,2021-07-15,13:51:23,java-ii/arrays,933,138.0,72.190.28.51
1018808,2021-07-15,13:53:06,java-ii/object-oriented-programming,933,138.0,72.190.28.51


In [11]:
## grouping by page and doing an overall count of occurences
## per page to figure which lesson has the most overall traffic
page_views = df_lesson.groupby(['page'])['user_id'].agg(['count','nunique'])
observed = page_views.sort_values(by = 'count', ascending = False)
observed.head(15)

Unnamed: 0_level_0,count,nunique
page,Unnamed: 1_level_1,Unnamed: 2_level_1
/,51017,993
search/search_index.json,20323,744
javascript-i/introduction/working-with-data-types-operators-and-variables,8302,659
javascript-i/javascript-with-html,8199,680
mysql/tables,7922,544
javascript-i/functions,7901,680
html-css/elements,7444,676
java-iii/jsp-and-jstl,7320,517
javascript-i/loops,7313,664
java-iii/servlets,7283,526


#### High Traffic Lesson Per Program Takeaways

After narrowing down the dataframe to look at pages with only /'s because those are most likely to be lessons within the curriculumn. We can see that top most accessed lessons per program at Codeup:
 - Data Science
     - MySQL: Tables Lesson: 7922 pings
 - Software Development
     - Javascript I: Introduction Working With Data Types Operators and Variables Lesson: 8302 pings
 - Web Development
     - Javasvript I: Javascrip-with-html : 8199 pings

#### Is there any suspicious activity, such as users/machines/etc accessing the curriculum who shouldn’t be? Does it appear that any web-scraping is happening? Are there any suspicious IP addresses?

In [12]:
df.head(3) ## <-- looking at our dataframe

Unnamed: 0,date,time,page,user_id,cohort_id,source_ip
0,2018-01-26,09:55:03,/,1,8.0,97.105.19.61
1,2018-01-26,09:56:02,java-ii,1,8.0,97.105.19.61
2,2018-01-26,09:56:05,java-ii/object-oriented-programming,1,8.0,97.105.19.61


In [13]:
## Testing on a single user

user = 1
span = 30
weight = 6
user_df = find_anomalies(df, user, span, weight)

anomalies = pd.DataFrame()
user_df = find_anomalies(df, user, span, weight)
anomalies = pd.concat([anomalies, user_df], axis=0)

In [14]:
## looping through all the users 

span = 30
weight = 3.5

anomalies = pd.DataFrame()
for u in list(df.user_id.unique()):
    user_df = find_anomalies(df, u, span, weight)
    anomalies = pd.concat([anomalies, user_df], axis=0)

In [15]:
## this value counts shows us the number of users on the right that have accessed 
## the value of pages on the left in total

anomalies.pages.value_counts(sort=False) 

1      50
2      38
3      77
4      84
5      56
       ..
179     1
192     1
198     1
272     1
343     1
Name: pages, Length: 103, dtype: int64

In [16]:
anomalies[anomalies.pages==179] ## finding the specific user id for who accessed 179 pages

Unnamed: 0_level_0,pages,midband,ub,lb,pct_b,user_id
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2020-10-16,179,20.158062,173.720239,-133.404115,1.017191,658


In [17]:
df[df.user_id==658] ## looking at the data frame for user 658

Unnamed: 0,date,time,page,user_id,cohort_id,source_ip
553669,2020-05-27,09:13:05,/,658,58.0,100.19.73.35
553677,2020-05-27,09:13:10,toc,658,58.0,100.19.73.35
553756,2020-05-27,09:17:13,html-css,658,58.0,100.19.73.35
554330,2020-05-27,13:19:31,html-css/introduction,658,58.0,100.19.73.35
554468,2020-05-27,14:17:07,html-css/elements,658,58.0,100.19.73.35
...,...,...,...,...,...,...
980731,2021-06-17,12:25:04,appendix,658,58.0,172.58.96.174
980732,2021-06-17,12:25:09,appendix/professional-development/cover-letter,658,58.0,172.58.96.174
985401,2021-06-20,23:22:36,/,658,58.0,24.243.74.160
985402,2021-06-20,23:27:53,javascript-i,658,58.0,24.243.74.160


In [18]:
anomalies[anomalies.pages==192] ## finding the specific user id for who accessed 192 pages

Unnamed: 0_level_0,pages,midband,ub,lb,pct_b,user_id
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2019-12-19,192,19.601776,187.231096,-148.027544,1.014225,526


In [19]:
df[df.user_id==526] ## looking at the data frame for user 526

Unnamed: 0,date,time,page,user_id,cohort_id,source_ip
388205,2019-11-04,16:03:38,/,526,52.0,97.105.19.58
388389,2019-11-04,16:53:30,html-css,526,52.0,97.105.19.58
389238,2019-11-05,14:18:00,html-css/introduction,526,52.0,97.105.19.58
389350,2019-11-05,15:04:01,html-css/elements,526,52.0,97.105.19.58
389577,2019-11-05,21:47:17,toc,526,52.0,172.124.70.146
...,...,...,...,...,...,...
507944,2020-04-09,13:33:40,spring/fundamentals/views,526,52.0,172.124.70.146
510647,2020-04-13,18:04:02,/,526,52.0,172.124.70.146
510648,2020-04-13,18:04:08,spring,526,52.0,172.124.70.146
510649,2020-04-13,18:04:16,spring/extra-features/file-upload,526,52.0,172.124.70.146


In [20]:
anomalies[anomalies.pages==198] ## finding the specific user id for who accessed 198 pages

Unnamed: 0_level_0,pages,midband,ub,lb,pct_b,user_id
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2018-06-02,198,16.484863,188.766965,-155.797239,1.026796,138


In [21]:
df[df.user_id==138] ## looking at the data frame for user 138

Unnamed: 0,date,time,page,user_id,cohort_id,source_ip
21660,2018-03-09,09:33:19,slides/annotations,138,22.0,97.105.19.61
21666,2018-03-09,09:36:13,java-ii/annotations,138,22.0,97.105.19.61
21677,2018-03-09,09:45:18,mkdocs/search_index.json,138,22.0,97.105.19.61
21678,2018-03-09,09:45:33,appendix,138,22.0,97.105.19.61
21679,2018-03-09,09:45:39,appendix/git/intellij,138,22.0,97.105.19.61
...,...,...,...,...,...,...
350035,2019-09-16,20:57:51,/,138,22.0,108.65.244.91
350036,2019-09-16,20:58:02,/,138,22.0,108.65.244.91
350037,2019-09-16,20:58:37,1-fundamentals/1.1-intro-to-data-science,138,22.0,108.65.244.91
350038,2019-09-16,20:58:37,1-fundamentals/AI-ML-DL-timeline.jpg,138,22.0,108.65.244.91


In [22]:
anomalies[anomalies.pages==272] ## finding the specific user id for who accessed 272 pages

Unnamed: 0_level_0,pages,midband,ub,lb,pct_b,user_id
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2019-03-03,272,24.721632,266.780128,-217.336864,1.010782,341


In [23]:
df[df.user_id==341] ## looking at the data frame for user 341

Unnamed: 0,date,time,page,user_id,cohort_id,source_ip
181808,2019-01-22,15:23:24,/,341,29.0,97.105.19.58
181826,2019-01-22,15:25:51,toc,341,29.0,97.105.19.58
181840,2019-01-22,15:26:44,html-css,341,29.0,97.105.19.58
181862,2019-01-22,15:28:33,html-css/introduction,341,29.0,97.105.19.58
181870,2019-01-22,15:29:29,html-css/elements,341,29.0,97.105.19.58
...,...,...,...,...,...,...
817183,2021-02-09,21:02:55,search/search_index.json,341,29.0,172.124.70.146
817184,2021-02-09,21:03:20,appendix/further-reading/spring/seeder,341,29.0,172.124.70.146
817268,2021-02-10,08:31:18,appendix/code-standards/mysql,341,29.0,172.124.70.146
817269,2021-02-10,08:31:29,appendix/further-reading/spring/pagination,341,29.0,172.124.70.146


In [24]:
anomalies[anomalies.pages==343] #3 finding the specific user id for who accessed 343 pages

Unnamed: 0_level_0,pages,midband,ub,lb,pct_b,user_id
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2021-06-21,343,22.370564,322.155049,-277.413921,1.034767,804


In [25]:
df[df.user_id==804] ## looking at the data frame for user 804

Unnamed: 0,date,time,page,user_id,cohort_id,source_ip
719859,2020-11-03,10:15:51,javascript-i,804,132.0,69.91.64.132
720925,2020-11-04,09:00:55,javascript-i,804,132.0,69.91.64.132
721318,2020-11-04,11:40:16,javascript-i/javascript-with-html,804,132.0,69.91.64.132
721321,2020-11-04,11:40:28,javascript-i,804,132.0,69.91.64.132
721324,2020-11-04,11:40:33,javascript-i/introduction/primitive-types,804,132.0,69.91.64.132
...,...,...,...,...,...,...
987643,2021-06-21,14:17:37,appendix/further-reading/pagination,804,132.0,66.69.1.31
987644,2021-06-21,14:17:37,appendix/further-reading/authorization,804,132.0,66.69.1.31
987645,2021-06-21,14:17:58,appendix/further-reading/security-use-cases,804,132.0,66.69.1.31
987646,2021-06-21,14:17:58,appendix/further-reading/spring,804,132.0,66.69.1.31


All of these users have a b percentage that is greater than 1 for their page accesses. Although most of the specific user dataframes that correspond with these anamolies appear to have a human readable anmount of curriculumn access per day let's explore the 2 highest page access users that break the 200 mark further.

#### User 341 Access

In [26]:
user_341 = df[df.user_id==341] ## making our dataframe for user 341
user_804 = df[df.user_id==804] ## making our dataframe for user 804

In [27]:
user_341.head(10) ## previewing our user dataframe

Unnamed: 0,date,time,page,user_id,cohort_id,source_ip
181808,2019-01-22,15:23:24,/,341,29.0,97.105.19.58
181826,2019-01-22,15:25:51,toc,341,29.0,97.105.19.58
181840,2019-01-22,15:26:44,html-css,341,29.0,97.105.19.58
181862,2019-01-22,15:28:33,html-css/introduction,341,29.0,97.105.19.58
181870,2019-01-22,15:29:29,html-css/elements,341,29.0,97.105.19.58
181915,2019-01-22,15:31:24,html-css/introduction,341,29.0,97.105.19.58
181921,2019-01-22,15:31:32,html-css/forms,341,29.0,97.105.19.58
181949,2019-01-22,15:33:40,html-css/css-i,341,29.0,97.105.19.58
181951,2019-01-22,15:33:43,html-css/css-i/positioning,341,29.0,97.105.19.58
181953,2019-01-22,15:33:49,html-css/introduction,341,29.0,97.105.19.58


In [28]:
## makeing a page_views dataframe that counts the number of pages viewed by date 
## for user 341 than creating a sorted observed dataframe for easier reading

page_views = user_341.groupby(['date'])['page'].agg(['count','nunique'])
observed = page_views.sort_values(by = 'count', ascending = False)
observed.head(15)

Unnamed: 0_level_0,count,nunique
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2019-03-03,272,170
2020-04-21,109,13
2020-07-13,52,37
2019-04-12,46,35
2019-03-20,39,32
2019-02-14,35,30
2019-03-11,31,12
2019-03-18,30,18
2019-02-20,26,12
2019-02-06,23,10


In [29]:
high_pages = user_341[user_341['date'] == '2019-03-03'] ## Exploring the high access day further
observed = high_pages.sort_values(by = 'time', ascending = False)
high_pages.head(30)

## we sorted by time to see if the user is accessing the curriculumn in a human readable
## amount of time

Unnamed: 0,date,time,page,user_id,cohort_id,source_ip
211312,2019-03-03,21:14:08,/,341,29.0,173.174.243.231
211313,2019-03-03,21:14:14,jquery,341,29.0,173.174.243.231
211314,2019-03-03,21:14:27,jquery/essential-methods/traversing,341,29.0,173.174.243.231
211315,2019-03-03,21:15:25,jquery/effects,341,29.0,173.174.243.231
211340,2019-03-03,22:52:05,html-css,341,29.0,204.44.112.76
211341,2019-03-03,22:52:06,javascript-i,341,29.0,204.44.112.76
211342,2019-03-03,22:52:06,java-i,341,29.0,204.44.112.76
211343,2019-03-03,22:52:06,java-ii,341,29.0,204.44.112.76
211344,2019-03-03,22:52:06,javascript-ii,341,29.0,204.44.112.76
211345,2019-03-03,22:52:06,jquery,341,29.0,204.44.112.76


In [30]:
high_pages.tail(30) ## looking at the end of the df to compare timestamp access

Unnamed: 0,date,time,page,user_id,cohort_id,source_ip
211588,2019-03-03,23:20:32,java-i/methods,341,29.0,173.174.243.231
211589,2019-03-03,23:20:38,java-ii,341,29.0,173.174.243.231
211590,2019-03-03,23:20:49,java-ii/object-oriented-programming,341,29.0,173.174.243.231
211591,2019-03-03,23:20:54,java-ii/arrays,341,29.0,173.174.243.231
211592,2019-03-03,23:20:57,java-ii/inheritance-and-polymorphism,341,29.0,173.174.243.231
211593,2019-03-03,23:21:01,java-ii/interfaces-and-abstract-classes,341,29.0,173.174.243.231
211594,2019-03-03,23:21:05,java-ii/collections,341,29.0,173.174.243.231
211595,2019-03-03,23:21:08,java-ii/annotations,341,29.0,173.174.243.231
211596,2019-03-03,23:21:12,java-ii/exceptions-and-error-handling,341,29.0,173.174.243.231
211597,2019-03-03,23:21:15,java-ii/file-io,341,29.0,173.174.243.231


In [31]:
observed.source_ip.value_counts() ## looking at out suspicious IP adresses

204.44.112.76      180
173.174.243.231     92
Name: source_ip, dtype: int64

#### User 804 Access

In [32]:
user_804.head(3) ## previewing our user dataframe

Unnamed: 0,date,time,page,user_id,cohort_id,source_ip
719859,2020-11-03,10:15:51,javascript-i,804,132.0,69.91.64.132
720925,2020-11-04,09:00:55,javascript-i,804,132.0,69.91.64.132
721318,2020-11-04,11:40:16,javascript-i/javascript-with-html,804,132.0,69.91.64.132


In [33]:
## makeing a page_views dataframe that counts the number of pages viewed by date 
## for user 804 than creating a sorted observed dataframe for easier reading

page_views = user_804.groupby(['date'])['page'].agg(['count','nunique'])
observed = page_views.sort_values(by = 'count', ascending = False)
observed.head(15)

Unnamed: 0_level_0,count,nunique
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2021-06-21,343,334
2021-01-20,74,39
2020-11-23,63,32
2020-11-17,57,37
2020-11-20,50,21
2021-03-26,48,20
2020-11-24,47,27
2020-11-05,45,18
2021-03-01,39,24
2020-11-19,33,19


In [34]:
high_pages = user_804[user_804['date'] == '2021-06-21'] ## Exploring the high access day further
observed = high_pages.sort_values(by = 'time', ascending = False)
high_pages.head(30)

Unnamed: 0,date,time,page,user_id,cohort_id,source_ip
986984,2021-06-21,13:16:13,/,804,132.0,66.69.1.31
986985,2021-06-21,13:16:15,html-css,804,132.0,66.69.1.31
987108,2021-06-21,13:48:49,/,804,132.0,66.69.1.31
987110,2021-06-21,13:49:06,toc,804,132.0,66.69.1.31
987129,2021-06-21,13:51:13,main-pages_xXxXx.html,804,132.0,66.69.1.31
987155,2021-06-21,13:57:16,/,804,132.0,66.69.1.31
987178,2021-06-21,13:58:39,/,804,132.0,66.69.1.31
987179,2021-06-21,13:58:41,.,804,132.0,66.69.1.31
987180,2021-06-21,13:58:42,html-css,804,132.0,66.69.1.31
987181,2021-06-21,13:58:42,javascript-i,804,132.0,66.69.1.31


In [35]:
high_pages.tail(30) ## looking at the end to compare time stamps of access

Unnamed: 0,date,time,page,user_id,cohort_id,source_ip
987602,2021-06-21,14:15:26,further-reading/mysql/host-wildcards,804,132.0,66.69.1.31
987603,2021-06-21,14:15:26,extra-challenges/mysql/mysql-extra-exercises,804,132.0,66.69.1.31
987604,2021-06-21,14:15:28,further-reading/java/intellij-tomcat-configura...,804,132.0,66.69.1.31
987606,2021-06-21,14:15:47,further-reading/spring/pagination,804,132.0,66.69.1.31
987607,2021-06-21,14:15:48,further-reading/spring/authorization,804,132.0,66.69.1.31
987608,2021-06-21,14:15:48,further-reading/spring/security-use-cases,804,132.0,66.69.1.31
987609,2021-06-21,14:15:50,further-reading/spring/seeder,804,132.0,66.69.1.31
987610,2021-06-21,14:16:09,further-reading/spring/devtools-configuration,804,132.0,66.69.1.31
987611,2021-06-21,14:16:10,slides,804,132.0,66.69.1.31
987612,2021-06-21,14:16:10,pair-programming,804,132.0,66.69.1.31


User 804 seems to be accessing the curriculumn at a normal human readable rate (within multiple seconds and minutes between page) this is not deemed suspicious, possibly just a dedicated student that wanted to view the whole curriculumn to see what they are diving into.

#### Supscious Activity Takeaways

Looking at user 341 there is a suspicious IP address (204.44.112.76) that accesses 180 pages. It is suspicious because it is accessing pages at machine level speeds an example would be 15 pages in one second at the timestamp of 22:52:06 on March 3rd, 2019.

This could be evidence of web scraping and the IP address is suspicous because the 204 IP address was not the user's only IP address on March 3rd, the other IP address seemed to accessing pages at a human readable pace, but when the IP address switched to the one beggining with 204 the access speed per page was ramped up to machine like speeds.


#### Is there a cohort that referred to a lesson significantly more than other cohorts seemed to gloss over?

Were going to look at specifically data science cohorts for this question because Carl and I are in the data science program.

Using our domain expertise we know that these cohorts are Ada, Bayes, Curie, Darden, Easley, and Florence

And there corresponding cohort ID's are: 30, 34, 55, 59, 133, 137 we can obtain from the SQL database

In [36]:
df_lesson.head(3) ## <-- looking at lesson dataframe that is filtered for /'s

Unnamed: 0,date,time,page,user_id,cohort_id,source_ip
0,2018-01-26,09:55:03,/,1,8.0,97.105.19.61
2,2018-01-26,09:56:05,java-ii/object-oriented-programming,1,8.0,97.105.19.61
3,2018-01-26,09:56:06,slides/object_oriented_programming,1,8.0,97.105.19.61


In [37]:
df_lesson.cohort_id.value_counts()

28.0     77054
33.0     33392
59.0     32657
62.0     30356
29.0     30265
53.0     29644
24.0     28809
51.0     28390
57.0     27904
135.0    27510
56.0     27318
34.0     27096
22.0     25495
32.0     24631
23.0     24530
52.0     23441
58.0     23232
26.0     22179
55.0     22022
137.0    21215
31.0     20982
25.0     20912
132.0    19836
138.0    19474
133.0    18308
134.0    17010
27.0     16903
1.0      15520
61.0     14856
139.0    14209
165.0     9123
14.0      7036
21.0      5776
17.0      4311
13.0      2666
18.0      1687
166.0     1410
8.0       1289
19.0      1276
7.0        764
15.0       593
16.0       455
12.0       282
11.0       186
2.0         89
6.0         39
9.0          5
4.0          4
5.0          1
Name: cohort_id, dtype: int64

In [38]:
ds_df = df_lesson[(df_lesson.cohort_id == 30.0) | (df_lesson.cohort_id == 34.0)
                           | (df_lesson.cohort_id == 55.0) | (df_lesson.cohort_id == 59.0)
                           | (df_lesson.cohort_id == 133.0) | (df_lesson.cohort_id == 137.0)]

ds_df.cohort_id.value_counts()

## unfortunately the data doesn't have the Ada cohort but that is okay. We have plenty 
## of data from the other cohorts we can look at.

59.0     32657
34.0     27096
55.0     22022
137.0    21215
133.0    18308
Name: cohort_id, dtype: int64

In [39]:
## filtering down for more things that don't seem like lessons within the curriculumn 

ds_df = ds_df[~(ds_df['page'].str.contains('appendix|cohorts|examples|caps|github|coding-challenges \
                                                            |advanced-topics|extra|jpeg|ico|csv|project'))]

In [40]:
ds_df.cohort_id.value_counts() ## <-- making sure things were filtered down

59.0     29288
34.0     23443
137.0    19051
55.0     18927
133.0    15819
Name: cohort_id, dtype: int64

##### Bayes Cohort ID 34

In [41]:
bayes_df = ds_df[ds_df.cohort_id == 34.0] ## making bayes dataframe
bayes_df.cohort_id.value_counts() ## <-- quality assurance check

34.0    23443
Name: cohort_id, dtype: int64

In [42]:
bayes_df.head(3) ## looking at our dataframe

Unnamed: 0,date,time,page,user_id,cohort_id,source_ip
326053,2019-08-20,09:39:58,/,466,34.0,97.105.19.58
326054,2019-08-20,09:39:59,/,467,34.0,97.105.19.58
326055,2019-08-20,09:39:59,/,468,34.0,97.105.19.58


In [43]:
## grouping by page and doing an overall count of occurences
## per page to figure which lesson has the most overall traffic in bayes
page_views = bayes_df.groupby(['page'])['user_id'].agg(['count','nunique'])
observed = page_views.sort_values(by = 'count', ascending = False)
observed.head(15)

Unnamed: 0_level_0,count,nunique
page,Unnamed: 1_level_1,Unnamed: 2_level_1
/,2051,23
1-fundamentals/modern-data-scientist.jpg,653,21
1-fundamentals/AI-ML-DL-timeline.jpg,651,21
1-fundamentals/1.1-intro-to-data-science,643,21
search/search_index.json,608,19
6-regression/1-overview,521,21
10-anomaly-detection/1-overview,384,21
6-regression/5.0-evaluate,333,21
5-stats/3-probability-distributions,320,21
5-stats/4.2-compare-means,316,21


In [44]:
observed.describe() 

Unnamed: 0,count,nunique
count,398.0,398.0
mean,58.90201,8.007538
std,138.53835,8.003617
min,1.0,1.0
25%,4.0,2.0
50%,12.0,4.0
75%,61.25,19.0
max,2051.0,23.0


We are going to look at lessons clicked more than 12 times to view the low traffic (glossed over lessons) after looking at the numerical statistics for Bayes page view count

In [45]:
observed[observed['count'] > 12].tail(15)

Unnamed: 0_level_0,count,nunique
page,Unnamed: 1_level_1,Unnamed: 2_level_1
fundamentals/spreadsheets-overview,14,2
10-anomaly-detection/2-detecting-through-probability,14,10
timeseries/prep,14,4
sql/functions,14,1
clustering/Hospital-Distance-Clusters.jpg,14,5
11-nlp/exercises,14,9
fundamentals/git,13,6
sql/indexes,13,2
timeseries/acquire,13,3
6-regression/built-in-datasets,13,8


##### Curie Cohort ID 55

In [46]:
curie_df = ds_df[ds_df.cohort_id == 55.0] ## making bayes dataframe
curie_df.cohort_id.value_counts() ## <-- quality assurance check

55.0    18927
Name: cohort_id, dtype: int64

In [47]:
curie_df.head(3) ## looking at our dataframe

Unnamed: 0,date,time,page,user_id,cohort_id,source_ip
445314,2020-02-03,15:39:35,/,576,55.0,97.105.19.58
445315,2020-02-03,15:39:37,/,577,55.0,97.105.19.58
445317,2020-02-03,15:39:43,/,578,55.0,97.105.19.58


In [48]:
## grouping by page and doing an overall count of occurences
## per page to figure which lesson has the most overall traffic in curie
page_views = curie_df.groupby(['page'])['user_id'].agg(['count','nunique'])
observed = page_views.sort_values(by = 'count', ascending = False)
observed.head(15)

Unnamed: 0_level_0,count,nunique
page,Unnamed: 1_level_1,Unnamed: 2_level_1
/,1779,21
6-regression/1-overview,595,19
search/search_index.json,584,19
1-fundamentals/modern-data-scientist.jpg,467,19
1-fundamentals/AI-ML-DL-timeline.jpg,465,19
1-fundamentals/1.1-intro-to-data-science,461,19
3-sql/1-mysql-overview,441,19
10-anomaly-detection/1-overview,345,19
4-python/8.4.3-dataframes,260,19
4-python/8.4.4-advanced-dataframes,246,19


In [49]:
observed.describe()

Unnamed: 0,count,nunique
count,319.0,319.0
mean,59.332288,9.275862
std,127.600605,6.631288
min,1.0,1.0
25%,4.0,3.0
50%,20.0,8.0
75%,83.0,17.0
max,1779.0,21.0


We are going to look at lessons clicked more than 20 times to view the low traffic (glossed over lessons) after looking at the numerical statistics for Curie page view count

In [50]:
observed[observed['count'] > 20].tail(15)

Unnamed: 0_level_0,count,nunique
page,Unnamed: 1_level_1,Unnamed: 2_level_1
python/introduction-to-python,23,9
stats/compare-means,22,9
timeseries/modeling-lesson1,22,4
stats/probability-distributions,22,7
sql/databases,21,5
2-storytelling/misleading1_baseball.jpg,21,9
2-storytelling/misleading1_fox.jpg,21,9
sql/temporary-tables,21,6
2-storytelling/misleading3_deaths.jpg,21,9
nlp/prepare,21,8


##### Darden Cohort ID 59

In [51]:
darden_df = ds_df[ds_df.cohort_id == 59.0] ## making bayes dataframe
darden_df.cohort_id.value_counts() ## <-- quality assurance check

59.0    29288
Name: cohort_id, dtype: int64

In [52]:
darden_df.head(3) ## <-- looking at darden dataframe

Unnamed: 0,date,time,page,user_id,cohort_id,source_ip
597038,2020-07-13,14:34:44,/,678,59.0,76.201.20.193
597043,2020-07-13,14:37:22,/,679,59.0,24.28.146.155
597056,2020-07-13,14:38:28,/,680,59.0,136.50.56.155


In [53]:
## grouping by page and doing an overall count of occurences
## per page to figure which lesson has the most overall traffic in curie
page_views = curie_df.groupby(['page'])['user_id'].agg(['count','nunique'])
observed = page_views.sort_values(by = 'count', ascending = False)
observed.head(15)

Unnamed: 0_level_0,count,nunique
page,Unnamed: 1_level_1,Unnamed: 2_level_1
/,1779,21
6-regression/1-overview,595,19
search/search_index.json,584,19
1-fundamentals/modern-data-scientist.jpg,467,19
1-fundamentals/AI-ML-DL-timeline.jpg,465,19
1-fundamentals/1.1-intro-to-data-science,461,19
3-sql/1-mysql-overview,441,19
10-anomaly-detection/1-overview,345,19
4-python/8.4.3-dataframes,260,19
4-python/8.4.4-advanced-dataframes,246,19


In [54]:
observed.describe()

Unnamed: 0,count,nunique
count,319.0,319.0
mean,59.332288,9.275862
std,127.600605,6.631288
min,1.0,1.0
25%,4.0,3.0
50%,20.0,8.0
75%,83.0,17.0
max,1779.0,21.0


We are going to look at lessons clicked more than 66 times to view the low traffic (glossed over lessons) after looking at the numerical statistics for Bayes page view count

In [55]:
observed[observed['count'] > 66].tail(15)

Unnamed: 0_level_0,count,nunique
page,Unnamed: 1_level_1,Unnamed: 2_level_1
8-clustering/Hospital-Distance-Clusters.jpg,72,17
9-timeseries/1-overview,71,17
2-storytelling/2.1-understand,71,19
5-stats/Selecting_a_hypothesis_test.svg,69,16
10-anomaly-detection/3-discrete-probabilistic-methods,68,16
10-anomaly-detection/2-continuous-probabilistic-methods,68,16
8-clustering/2-about,68,17
4-python/1-overview,68,18
6-regression/4-explore,67,14
2-storytelling/3-tableau,67,17


##### Easley Cohort ID 133

In [56]:
easley_df = ds_df[ds_df.cohort_id == 133.0] ## making bayes dataframe
easley_df.cohort_id.value_counts() ## <-- quality assurance check

133.0    15819
Name: cohort_id, dtype: int64

In [57]:
easley_df.head(3) ## looking at easley dataframe

Unnamed: 0,date,time,page,user_id,cohort_id,source_ip
755214,2020-12-08,10:49:37,/,835,133.0,173.173.109.5
755216,2020-12-08,10:49:39,/,836,133.0,99.43.137.186
755217,2020-12-08,10:49:40,/,837,133.0,66.69.79.82


In [58]:
## grouping by page and doing an overall count of occurences
## per page to figure which lesson has the most overall traffic in easley
page_views = easley_df.groupby(['page'])['user_id'].agg(['count','nunique'])
observed = page_views.sort_values(by = 'count', ascending = False)
observed.head(15)

Unnamed: 0_level_0,count,nunique
page,Unnamed: 1_level_1,Unnamed: 2_level_1
/,1543,17
classification/scale_features_or_not.svg,561,17
classification/overview,540,17
fundamentals/AI-ML-DL-timeline.jpg,409,17
fundamentals/modern-data-scientist.jpg,408,17
fundamentals/intro-to-data-science,401,17
search/search_index.json,359,15
sql/mysql-overview,338,15
anomaly-detection/overview,258,16
stats/compare-means,226,16


In [59]:
observed.describe()

Unnamed: 0,count,nunique
count,179.0,179.0
mean,88.374302,12.089385
std,142.732009,6.031085
min,1.0,1.0
25%,9.5,7.0
50%,63.0,16.0
75%,114.5,16.0
max,1543.0,17.0


We are going to look at lessons clicked more than 63 times to view the low traffic (glossed over lessons) after looking at the numerical statistics for Easly page view count

In [60]:
observed[observed['count'] > 63].tail(15)

Unnamed: 0_level_0,count,nunique
page,Unnamed: 1_level_1,Unnamed: 2_level_1
storytelling/overview,77,15
storytelling/tableau,76,15
anomaly-detection/detecting-timeseries-anomalies,74,14
timeseries/overview,72,17
sql/indexes,72,16
sql/subqueries,70,16
sql/clauses-overview,70,16
nlp/prepare,68,16
timeseries/modeling-lesson1,67,16
timeseries/prep,66,15


##### Florence Cohort ID 137

In [61]:
florence_df = ds_df[ds_df.cohort_id == 137.0] ## making bayes dataframe
florence_df.cohort_id.value_counts() ## <-- quality assurance check

137.0    19051
Name: cohort_id, dtype: int64

In [62]:
florence_df.head(3) ## looking at florence dataframe

Unnamed: 0,date,time,page,user_id,cohort_id,source_ip
847090,2021-03-15,15:22:48,/,896,137.0,162.205.226.39
847092,2021-03-15,15:23:17,/,897,137.0,72.181.115.96
847094,2021-03-15,15:23:33,/,898,137.0,208.84.155.68


In [63]:
## grouping by page and doing an overall count of occurences
## per page to figure which lesson has the most overall traffic in florence
page_views = florence_df.groupby(['page'])['user_id'].agg(['count','nunique'])
observed = page_views.sort_values(by = 'count', ascending = False)
observed.head(15)

Unnamed: 0_level_0,count,nunique
page,Unnamed: 1_level_1,Unnamed: 2_level_1
/,1406,22
fundamentals/modern-data-scientist.jpg,758,21
fundamentals/intro-to-data-science,755,21
fundamentals/AI-ML-DL-timeline.jpg,752,21
search/search_index.json,684,20
classification/scale_features_or_not.svg,584,22
classification/overview,549,22
sql/mysql-overview,400,22
python/data-types-and-variables,271,21
classification/evaluation,263,22


In [64]:
observed.describe()

Unnamed: 0,count,nunique
count,187.0,187.0
mean,101.877005,13.176471
std,163.647802,8.700876
min,1.0,1.0
25%,3.5,3.0
50%,66.0,19.0
75%,146.0,21.0
max,1406.0,22.0


We are going to look at lessons clicked more than 66 times to view the low traffic (glossed over lessons) after looking at the numerical statistics for Florence page view count

In [65]:
observed[observed['count'] > 66].tail(15)

Unnamed: 0_level_0,count,nunique
page,Unnamed: 1_level_1,Unnamed: 2_level_1
clustering/overview,82,20
sql/relationships-overview,80,19
fundamentals/cli/overview,77,21
storytelling/tableau,71,20
fundamentals/cli/intro,71,20
sql/clauses-overview,70,19
classification/user-defined-functions,70,20
fundamentals/cli/listing-files,69,21
storytelling/refine-present,68,19
1-fundamentals/1.1-intro-to-data-science,67,21


#### Cohort Takeaways
 - Bayes Most Common Viewed Lessons:
     - 1-fundamentals/modern-data-scientist.jpg
     - 1-fundamentals/1.1-intro-to-data-science
 - Curie Most Common Viewed Lessons: 
     - 6-regression/1-overview
     - 1-fundamentals/modern-data-scientist.jpg
 - Darden Most Common Viewed Lessons: 
     - 6-regression/1-overview
     - 1-fundamentals/modern-data-scientist.jpg	
 - Easley Most Common Viewed Lessons:
     - classification/scale_features_or_not.svg
     - classification/overview
 - Florence Most Common Viewed Lessons:
     - fundamentals/modern-data-scientist.jpg
     - classification/scale_features_or_not.svg
     

#### Significance in Data Science Cohort Lesson Activity Difference

Based on the filtering for lessons that was done and dividing each cohort into their own respective dataframe.

We can see that 