# **Imports**

In [5]:
import itertools
import warnings
warnings.filterwarnings("ignore")
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import numpy as np
import pandas as pd
import math
import prepare
import acquire
import env
from sklearn import metrics
from random import randint
from matplotlib import style
import seaborn as sns
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import MinMaxScaler

%matplotlib inline

# Questions to answer:

1. Which lesson appears to attract the most traffic consistently across cohorts (per program)?
2. Is there a cohort that referred to a lesson significantly more than other cohorts seemed to gloss over?
3. Are there students who, when active, hardly access the curriculum? If so, what information do you have about these students?
4. Is there any suspicious activity, such as users/machines/etc accessing the curriculum who shouldn’t be? Does it appear that any web-scraping is happening? Are there any suspicious IP addresses?
5. At some point in 2019, the ability for students and alumni to access both curriculums (web dev to ds, ds to web dev) should have been shut off. Do you see any evidence of that happening? Did it happen before?
6. What topics are grads continuing to reference after graduation and into their jobs (for each program)?
7. Which lessons are least accessed?
8. Anything else I should be aware of?

In [6]:
df = acquire.get_cohort_curr_data()

In [8]:
df.head(2)

Unnamed: 0,date,time,path,user_id,cohort_id,ip,id,name,slack,start_date,end_date,created_at,updated_at,deleted_at,program_id
0,2018-01-26,09:55:03,/,1,8.0,97.105.19.61,8,Hampton,#hampton,2015-09-22,2016-02-06,2016-06-14 19:52:26,2016-06-14 19:52:26,,1
1,2018-01-26,09:56:02,java-ii,1,8.0,97.105.19.61,8,Hampton,#hampton,2015-09-22,2016-02-06,2016-06-14 19:52:26,2016-06-14 19:52:26,,1


In [31]:
df.rename(columns = {'name': 'cohort', 'path':'endpoint'}, inplace = True)

#### 1. Which lesson appears to attract the most traffic consistently across cohorts (per program)?

In [32]:
#creating new feature which gives name instead of number.
df['program'] = df.program_id.replace([1,2,3,4], ['full_stack_php', 'java', 'data_science', 'front_end_program'])

In [101]:
# lesson which atracts most attention for program
fsp = pd.DataFrame(df[df.program_id == 1]).set_index('date')
fsp.head(5)

Unnamed: 0_level_0,time,endpoint,user_id,cohort_id,ip,id,cohort,slack,start_date,end_date,created_at,updated_at,deleted_at,program_id,program
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
2018-01-26,09:55:03,/,1,8.0,97.105.19.61,8,Hampton,#hampton,2015-09-22,2016-02-06,2016-06-14 19:52:26,2016-06-14 19:52:26,,1,full_stack_php
2018-01-26,09:56:02,java-ii,1,8.0,97.105.19.61,8,Hampton,#hampton,2015-09-22,2016-02-06,2016-06-14 19:52:26,2016-06-14 19:52:26,,1,full_stack_php
2018-01-26,09:56:05,java-ii/object-oriented-programming,1,8.0,97.105.19.61,8,Hampton,#hampton,2015-09-22,2016-02-06,2016-06-14 19:52:26,2016-06-14 19:52:26,,1,full_stack_php
2018-01-26,09:56:06,slides/object_oriented_programming,1,8.0,97.105.19.61,8,Hampton,#hampton,2015-09-22,2016-02-06,2016-06-14 19:52:26,2016-06-14 19:52:26,,1,full_stack_php
2018-01-26,10:14:47,/,11,1.0,97.105.19.61,1,Arches,#arches,2014-02-04,2014-04-22,2016-06-14 19:52:26,2016-06-14 19:52:26,,1,full_stack_php


In [39]:
fsp[['cohort', 'endpoint']].groupby('cohort').endpoint.count()

cohort
Arches        8890
Badlands        93
Denali           4
Everglades       1
Franklin        72
Glacier        598
Hampton       1712
Ike            253
Joshua         302
Kings         2845
Lassen        9587
Olympic       4954
Quincy        1237
Name: endpoint, dtype: int64

In [102]:
jv = pd.DataFrame(df[df.program_id == 2]).set_index('date')
jv.head(10)

Unnamed: 0_level_0,time,endpoint,user_id,cohort_id,ip,id,cohort,slack,start_date,end_date,created_at,updated_at,deleted_at,program_id,program
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
2018-01-26,09:56:24,javascript-i/conditionals,2,22.0,97.105.19.61,22,Teddy,#teddy,2018-01-08,2018-05-17,2018-01-08 13:59:10,2018-01-08 13:59:10,,2,java
2018-01-26,09:56:41,javascript-i/loops,2,22.0,97.105.19.61,22,Teddy,#teddy,2018-01-08,2018-05-17,2018-01-08 13:59:10,2018-01-08 13:59:10,,2,java
2018-01-26,09:56:46,javascript-i/conditionals,3,22.0,97.105.19.61,22,Teddy,#teddy,2018-01-08,2018-05-17,2018-01-08 13:59:10,2018-01-08 13:59:10,,2,java
2018-01-26,09:56:48,javascript-i/functions,3,22.0,97.105.19.61,22,Teddy,#teddy,2018-01-08,2018-05-17,2018-01-08 13:59:10,2018-01-08 13:59:10,,2,java
2018-01-26,09:56:59,javascript-i/loops,2,22.0,97.105.19.61,22,Teddy,#teddy,2018-01-08,2018-05-17,2018-01-08 13:59:10,2018-01-08 13:59:10,,2,java
2018-01-26,09:58:26,javascript-i/functions,4,22.0,97.105.19.61,22,Teddy,#teddy,2018-01-08,2018-05-17,2018-01-08 13:59:10,2018-01-08 13:59:10,,2,java
2018-01-26,09:59:22,mkdocs/search_index.json,5,22.0,97.105.19.61,22,Teddy,#teddy,2018-01-08,2018-05-17,2018-01-08 13:59:10,2018-01-08 13:59:10,,2,java
2018-01-26,10:00:02,javascript-i/introduction/working-with-data-ty...,6,22.0,97.105.19.61,22,Teddy,#teddy,2018-01-08,2018-05-17,2018-01-08 13:59:10,2018-01-08 13:59:10,,2,java
2018-01-26,10:00:37,/,6,22.0,97.105.19.61,22,Teddy,#teddy,2018-01-08,2018-05-17,2018-01-08 13:59:10,2018-01-08 13:59:10,,2,java
2018-01-26,10:00:39,javascript-i,6,22.0,97.105.19.61,22,Teddy,#teddy,2018-01-08,2018-05-17,2018-01-08 13:59:10,2018-01-08 13:59:10,,2,java


In [103]:
ds = pd.DataFrame(df[df.program_id == 3]).set_index('date')
ds.head()

Unnamed: 0_level_0,time,endpoint,user_id,cohort_id,ip,id,cohort,slack,start_date,end_date,created_at,updated_at,deleted_at,program_id,program
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
2019-08-20,09:39:58,/,466,34.0,97.105.19.58,34,Bayes,#,2019-08-19,2020-01-30,2019-08-20 14:38:55,2019-08-20 14:38:55,,3,data_science
2019-08-20,09:39:59,/,467,34.0,97.105.19.58,34,Bayes,#,2019-08-19,2020-01-30,2019-08-20 14:38:55,2019-08-20 14:38:55,,3,data_science
2019-08-20,09:39:59,/,468,34.0,97.105.19.58,34,Bayes,#,2019-08-19,2020-01-30,2019-08-20 14:38:55,2019-08-20 14:38:55,,3,data_science
2019-08-20,09:40:02,/,469,34.0,97.105.19.58,34,Bayes,#,2019-08-19,2020-01-30,2019-08-20 14:38:55,2019-08-20 14:38:55,,3,data_science
2019-08-20,09:40:08,/,470,34.0,97.105.19.58,34,Bayes,#,2019-08-19,2020-01-30,2019-08-20 14:38:55,2019-08-20 14:38:55,,3,data_science


In [104]:
fep = pd.DataFrame(df[df.program_id == 4]).set_index('date')
fep

Unnamed: 0_level_0,time,endpoint,user_id,cohort_id,ip,id,cohort,slack,start_date,end_date,created_at,updated_at,deleted_at,program_id,program
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
2018-03-22,19:01:49,/,152,9.0,207.68.209.17,9,Apollo,#apollo,2015-03-30,2015-07-29,2016-06-14 19:52:26,2016-06-14 19:52:26,,4,front_end_program
2018-03-22,19:01:54,content/html-css,152,9.0,207.68.209.17,9,Apollo,#apollo,2015-03-30,2015-07-29,2016-06-14 19:52:26,2016-06-14 19:52:26,,4,front_end_program
2018-03-22,19:01:54,content/html-css/gitbook/images/favicon.ico,152,9.0,207.68.209.17,9,Apollo,#apollo,2015-03-30,2015-07-29,2016-06-14 19:52:26,2016-06-14 19:52:26,,4,front_end_program
2018-03-22,19:02:47,content/html-css,152,9.0,207.68.209.17,9,Apollo,#apollo,2015-03-30,2015-07-29,2016-06-14 19:52:26,2016-06-14 19:52:26,,4,front_end_program
2018-03-22,19:02:52,content/html-css/introduction.html,152,9.0,207.68.209.17,9,Apollo,#apollo,2015-03-30,2015-07-29,2016-06-14 19:52:26,2016-06-14 19:52:26,,4,front_end_program


In [105]:
ds_cohort = pd.DataFrame(ds[ds.endpoint != '/'].groupby('cohort')['endpoint'].value_counts())

In [81]:
df[df.cohort != 'Everglades']

Unnamed: 0,date,time,endpoint,user_id,cohort_id,ip,id,cohort,slack,start_date,end_date,created_at,updated_at,deleted_at,program_id,program
507761,2020-05-06,19:06:32,/,649,5.0,70.125.150.41,5,Everglades,#everglades,2014-11-18,2015-02-24,2016-06-14 19:52:26,2016-06-14 19:52:26,,1,full_stack_php


In [106]:
df_cohort = pd.DataFrame(df[df.endpoint != '/'].groupby('cohort')['endpoint'].value_counts())

In [107]:
df_cohort_list = list(df.cohort.value_counts().index)

In [97]:
ds_df = pd.DataFrame()
for cohort in df_cohort_list:
    if cohort != 'Everglades':
        df_cohort = pd.DataFrame(df[df.endpoint != '/'].groupby('cohort')['endpoint'].value_counts())
        cep_df = df_cohort.loc[[cohort]]
        cep_df = cep_df.endpoint.nlargest(3)
        print('-----------------')
        print(cep_df)
        

-----------------
cohort  endpoint    
Staff   javascript-i    1817
        spring          1403
        java-iii        1393
Name: endpoint, dtype: int64
-----------------
cohort  endpoint                
Ceres   search/search_index.json    1380
        javascript-i                1003
        toc                          911
Name: endpoint, dtype: int64
-----------------
cohort  endpoint    
Zion    toc             1465
        javascript-i     897
        java-iii         753
Name: endpoint, dtype: int64
-----------------
cohort   endpoint                
Jupiter  toc                         1866
         search/search_index.json     998
         javascript-i                 926
Name: endpoint, dtype: int64
-----------------
cohort   endpoint                
Fortuna  toc                         1293
         search/search_index.json    1020
         java-iii                     786
Name: endpoint, dtype: int64
-----------------
cohort     endpoint    
Voyageurs  javascript-i    884


### Full Stack PHP Program

In [108]:
fsp_cohort = pd.DataFrame(fsp[fsp.endpoint != '/'].groupby('cohort')['endpoint'].value_counts())

In [109]:
fsp_cohort_list = list(fsp.cohort.value_counts().index)

In [112]:
import statistics
from statistics import mode

In [163]:
most_frequent = max(fsp_cohort.loc[[cohort]].endpoint)


1

In [168]:
for cohort in fsp_cohort_list:
    if cohort != 'Everglades':
        max_list = []
        df_cohort = pd.DataFrame(fsp[fsp.endpoint != '/'].groupby('cohort')['endpoint'].value_counts())
        cep_df = fsp_cohort.loc[[cohort]]
        #most_frequent = list(set(fsp_cohort.loc[[cohort]])
        cep_df = cep_df.endpoint.nlargest(3)
        print('-----------------')
        print(cep_df)
        print('-----------------')
    
        
        


-----------------
cohort  endpoint    
Lassen  index.html      877
        javascript-i    233
        java-iii        224
Name: endpoint, dtype: int64
-----------------
-----------------
cohort  endpoint     
Arches  javascript-i     294
        html-css         215
        javascript-ii    204
Name: endpoint, dtype: int64
-----------------
-----------------
cohort   endpoint    
Olympic  javascript-i    128
         java-i           76
         jquery           71
Name: endpoint, dtype: int64
-----------------
-----------------
cohort  endpoint                                        
Kings   index.html                                          84
        content/laravel/intro                               83
        content/laravel/intro/application-structure.html    63
Name: endpoint, dtype: int64
-----------------
-----------------
cohort   endpoint
Hampton  java-iii    57
         appendix    55
         java-i      46
Name: endpoint, dtype: int64
-----------------
----------------

#### java-i appears to be the most frequent lesson among the cohorts in the full-stack-php program

### Java Program

In [254]:
jv_cohort = pd.DataFrame(jv[jv.endpoint != '/'].groupby('cohort')['endpoint'].value_counts())


TypeError: Index(...) must be called with a collection of some kind, 'count' was passed

In [252]:
jv_cohort = pd.DataFrame(jv_cohort.loc[:,pd.IndexSlice['endpoint']])

In [253]:
jv_cohort.reset_index()

ValueError: cannot insert endpoint, already exists

In [170]:
jv_cohort_list = list(jv.cohort.value_counts().index)

In [178]:
for cohort in fsp_cohort_list:
    if cohort != 'Everglades' and cohort != 'Lassen' and cohort != 'Arches':
        max_list = []
        df_cohort = pd.DataFrame(jv[jv.endpoint != '/'].groupby('cohort')['endpoint'].value_counts())
        cep_df = jv_cohort.loc[[cohort]]
        #most_frequent = list(set(fsp_cohort.loc[[cohort]])
        cep_df = cep_df.endpoint.nlargest(3)
        print('-----------------')
        print(cep_df)
        print('-----------------')
    
        
        


KeyError: "['Olympic'] not in index"

#### java-i appears to be the most frequent lesson among the cohorts in the full-stack-php program