# **Imports**

In [2]:
import itertools
import warnings
warnings.filterwarnings("ignore")
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import numpy as np
import pandas as pd
import math
import prepare
import acquire
import env
from sklearn import metrics
from random import randint
from matplotlib import style
import seaborn as sns
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import MinMaxScaler

%matplotlib inline

# Questions to answer:

1. Which lesson appears to attract the most traffic consistently across cohorts (per program)?
2. Is there a cohort that referred to a lesson significantly more than other cohorts seemed to gloss over?
3. Are there students who, when active, hardly access the curriculum? If so, what information do you have about these students?
4. Is there any suspicious activity, such as users/machines/etc accessing the curriculum who shouldn’t be? Does it appear that any web-scraping is happening? Are there any suspicious IP addresses?
5. At some point in 2019, the ability for students and alumni to access both curriculums (web dev to ds, ds to web dev) should have been shut off. Do you see any evidence of that happening? Did it happen before?
6. What topics are grads continuing to reference after graduation and into their jobs (for each program)?
7. Which lessons are least accessed?
8. Anything else I should be aware of?

In [3]:
df = acquire.get_cohort_curr_data()

In [4]:
df.head(2)

Unnamed: 0,date,time,path,user_id,cohort_id,ip,id,name,slack,start_date,end_date,created_at,updated_at,deleted_at,program_id
0,2018-01-26,09:55:03,/,1,8.0,97.105.19.61,8,Hampton,#hampton,2015-09-22,2016-02-06,2016-06-14 19:52:26,2016-06-14 19:52:26,,1
1,2018-01-26,09:56:02,java-ii,1,8.0,97.105.19.61,8,Hampton,#hampton,2015-09-22,2016-02-06,2016-06-14 19:52:26,2016-06-14 19:52:26,,1


In [5]:
df.rename(columns = {'name': 'cohort', 'path':'endpoint'}, inplace = True)

# 1. Which lesson appears to attract the most traffic consistently across cohorts (per program)?

In [6]:
#creating new feature which gives name instead of number.
df['program'] = df.program_id.replace([1,2,3,4], ['full_stack_php', 'java', 'data_science', 'front_end_program'])

In [7]:
# lesson which atracts most attention for program
fsp = pd.DataFrame(df[df.program_id == 1]).set_index('date')
fsp.head(5)

Unnamed: 0_level_0,time,endpoint,user_id,cohort_id,ip,id,cohort,slack,start_date,end_date,created_at,updated_at,deleted_at,program_id,program
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
2018-01-26,09:55:03,/,1,8.0,97.105.19.61,8,Hampton,#hampton,2015-09-22,2016-02-06,2016-06-14 19:52:26,2016-06-14 19:52:26,,1,full_stack_php
2018-01-26,09:56:02,java-ii,1,8.0,97.105.19.61,8,Hampton,#hampton,2015-09-22,2016-02-06,2016-06-14 19:52:26,2016-06-14 19:52:26,,1,full_stack_php
2018-01-26,09:56:05,java-ii/object-oriented-programming,1,8.0,97.105.19.61,8,Hampton,#hampton,2015-09-22,2016-02-06,2016-06-14 19:52:26,2016-06-14 19:52:26,,1,full_stack_php
2018-01-26,09:56:06,slides/object_oriented_programming,1,8.0,97.105.19.61,8,Hampton,#hampton,2015-09-22,2016-02-06,2016-06-14 19:52:26,2016-06-14 19:52:26,,1,full_stack_php
2018-01-26,10:14:47,/,11,1.0,97.105.19.61,1,Arches,#arches,2014-02-04,2014-04-22,2016-06-14 19:52:26,2016-06-14 19:52:26,,1,full_stack_php


In [8]:
fsp[['cohort', 'endpoint']].groupby('cohort').endpoint.count()

cohort
Arches        8890
Badlands        93
Denali           4
Everglades       1
Franklin        72
Glacier        598
Hampton       1712
Ike            253
Joshua         302
Kings         2845
Lassen        9587
Olympic       4954
Quincy        1237
Name: endpoint, dtype: int64

In [9]:
jv = pd.DataFrame(df[df.program_id == 2]).set_index('date')
jv.head(10)

Unnamed: 0_level_0,time,endpoint,user_id,cohort_id,ip,id,cohort,slack,start_date,end_date,created_at,updated_at,deleted_at,program_id,program
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
2018-01-26,09:56:24,javascript-i/conditionals,2,22.0,97.105.19.61,22,Teddy,#teddy,2018-01-08,2018-05-17,2018-01-08 13:59:10,2018-01-08 13:59:10,,2,java
2018-01-26,09:56:41,javascript-i/loops,2,22.0,97.105.19.61,22,Teddy,#teddy,2018-01-08,2018-05-17,2018-01-08 13:59:10,2018-01-08 13:59:10,,2,java
2018-01-26,09:56:46,javascript-i/conditionals,3,22.0,97.105.19.61,22,Teddy,#teddy,2018-01-08,2018-05-17,2018-01-08 13:59:10,2018-01-08 13:59:10,,2,java
2018-01-26,09:56:48,javascript-i/functions,3,22.0,97.105.19.61,22,Teddy,#teddy,2018-01-08,2018-05-17,2018-01-08 13:59:10,2018-01-08 13:59:10,,2,java
2018-01-26,09:56:59,javascript-i/loops,2,22.0,97.105.19.61,22,Teddy,#teddy,2018-01-08,2018-05-17,2018-01-08 13:59:10,2018-01-08 13:59:10,,2,java
2018-01-26,09:58:26,javascript-i/functions,4,22.0,97.105.19.61,22,Teddy,#teddy,2018-01-08,2018-05-17,2018-01-08 13:59:10,2018-01-08 13:59:10,,2,java
2018-01-26,09:59:22,mkdocs/search_index.json,5,22.0,97.105.19.61,22,Teddy,#teddy,2018-01-08,2018-05-17,2018-01-08 13:59:10,2018-01-08 13:59:10,,2,java
2018-01-26,10:00:02,javascript-i/introduction/working-with-data-ty...,6,22.0,97.105.19.61,22,Teddy,#teddy,2018-01-08,2018-05-17,2018-01-08 13:59:10,2018-01-08 13:59:10,,2,java
2018-01-26,10:00:37,/,6,22.0,97.105.19.61,22,Teddy,#teddy,2018-01-08,2018-05-17,2018-01-08 13:59:10,2018-01-08 13:59:10,,2,java
2018-01-26,10:00:39,javascript-i,6,22.0,97.105.19.61,22,Teddy,#teddy,2018-01-08,2018-05-17,2018-01-08 13:59:10,2018-01-08 13:59:10,,2,java


In [10]:
ds = pd.DataFrame(df[df.program_id == 3]).set_index('date')
ds.head()

Unnamed: 0_level_0,time,endpoint,user_id,cohort_id,ip,id,cohort,slack,start_date,end_date,created_at,updated_at,deleted_at,program_id,program
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
2019-08-20,09:39:58,/,466,34.0,97.105.19.58,34,Bayes,#,2019-08-19,2020-01-30,2019-08-20 14:38:55,2019-08-20 14:38:55,,3,data_science
2019-08-20,09:39:59,/,467,34.0,97.105.19.58,34,Bayes,#,2019-08-19,2020-01-30,2019-08-20 14:38:55,2019-08-20 14:38:55,,3,data_science
2019-08-20,09:39:59,/,468,34.0,97.105.19.58,34,Bayes,#,2019-08-19,2020-01-30,2019-08-20 14:38:55,2019-08-20 14:38:55,,3,data_science
2019-08-20,09:40:02,/,469,34.0,97.105.19.58,34,Bayes,#,2019-08-19,2020-01-30,2019-08-20 14:38:55,2019-08-20 14:38:55,,3,data_science
2019-08-20,09:40:08,/,470,34.0,97.105.19.58,34,Bayes,#,2019-08-19,2020-01-30,2019-08-20 14:38:55,2019-08-20 14:38:55,,3,data_science


In [11]:
fep = pd.DataFrame(df[df.program_id == 4]).set_index('date')
fep

Unnamed: 0_level_0,time,endpoint,user_id,cohort_id,ip,id,cohort,slack,start_date,end_date,created_at,updated_at,deleted_at,program_id,program
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
2018-03-22,19:01:49,/,152,9.0,207.68.209.17,9,Apollo,#apollo,2015-03-30,2015-07-29,2016-06-14 19:52:26,2016-06-14 19:52:26,,4,front_end_program
2018-03-22,19:01:54,content/html-css,152,9.0,207.68.209.17,9,Apollo,#apollo,2015-03-30,2015-07-29,2016-06-14 19:52:26,2016-06-14 19:52:26,,4,front_end_program
2018-03-22,19:01:54,content/html-css/gitbook/images/favicon.ico,152,9.0,207.68.209.17,9,Apollo,#apollo,2015-03-30,2015-07-29,2016-06-14 19:52:26,2016-06-14 19:52:26,,4,front_end_program
2018-03-22,19:02:47,content/html-css,152,9.0,207.68.209.17,9,Apollo,#apollo,2015-03-30,2015-07-29,2016-06-14 19:52:26,2016-06-14 19:52:26,,4,front_end_program
2018-03-22,19:02:52,content/html-css/introduction.html,152,9.0,207.68.209.17,9,Apollo,#apollo,2015-03-30,2015-07-29,2016-06-14 19:52:26,2016-06-14 19:52:26,,4,front_end_program


In [12]:
ds_cohort = pd.DataFrame(ds[ds.endpoint != '/'].groupby('cohort')['endpoint'].value_counts())

In [13]:
df[df.cohort != 'Everglades']

Unnamed: 0,date,time,endpoint,user_id,cohort_id,ip,id,cohort,slack,start_date,end_date,created_at,updated_at,deleted_at,program_id,program
0,2018-01-26,09:55:03,/,1,8.0,97.105.19.61,8,Hampton,#hampton,2015-09-22,2016-02-06,2016-06-14 19:52:26,2016-06-14 19:52:26,,1,full_stack_php
1,2018-01-26,09:56:02,java-ii,1,8.0,97.105.19.61,8,Hampton,#hampton,2015-09-22,2016-02-06,2016-06-14 19:52:26,2016-06-14 19:52:26,,1,full_stack_php
2,2018-01-26,09:56:05,java-ii/object-oriented-programming,1,8.0,97.105.19.61,8,Hampton,#hampton,2015-09-22,2016-02-06,2016-06-14 19:52:26,2016-06-14 19:52:26,,1,full_stack_php
3,2018-01-26,09:56:06,slides/object_oriented_programming,1,8.0,97.105.19.61,8,Hampton,#hampton,2015-09-22,2016-02-06,2016-06-14 19:52:26,2016-06-14 19:52:26,,1,full_stack_php
4,2018-01-26,09:56:24,javascript-i/conditionals,2,22.0,97.105.19.61,22,Teddy,#teddy,2018-01-08,2018-05-17,2018-01-08 13:59:10,2018-01-08 13:59:10,,2,java
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
847325,2021-04-21,16:41:51,jquery/personal-site,64,28.0,71.150.217.33,28,Staff,#,2014-02-04,2014-02-04,2018-12-06 17:04:19,2018-12-06 17:04:19,,2,java
847326,2021-04-21,16:42:02,jquery/mapbox-api,64,28.0,71.150.217.33,28,Staff,#,2014-02-04,2014-02-04,2018-12-06 17:04:19,2018-12-06 17:04:19,,2,java
847327,2021-04-21,16:42:09,jquery/ajax/weather-map,64,28.0,71.150.217.33,28,Staff,#,2014-02-04,2014-02-04,2018-12-06 17:04:19,2018-12-06 17:04:19,,2,java
847328,2021-04-21,16:44:37,anomaly-detection/discrete-probabilistic-methods,744,28.0,24.160.137.86,28,Staff,#,2014-02-04,2014-02-04,2018-12-06 17:04:19,2018-12-06 17:04:19,,2,java


In [14]:
df_cohort = pd.DataFrame(df[df.endpoint != '/'].groupby('cohort')['endpoint'].value_counts())
df_cohort.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,endpoint
cohort,endpoint,Unnamed: 2_level_1
Andromeda,toc,638
Andromeda,javascript-i,516
Andromeda,spring,487
Andromeda,java-iii,433
Andromeda,java-ii,398


In [15]:
df_cohort_list = list(df.cohort.value_counts().index)

In [16]:
ds_df = pd.DataFrame()
for cohort in df_cohort_list:
    if cohort != 'Everglades':
        df_cohort = pd.DataFrame(df[df.endpoint != '/'].groupby('cohort')['endpoint'].value_counts())
        cep_df = df_cohort.loc[[cohort]]
        cep_df = cep_df.endpoint.nlargest(3)
        print('-----------------')
        print(cep_df)
        

-----------------
cohort  endpoint    
Staff   javascript-i    1817
        spring          1403
        java-iii        1393
Name: endpoint, dtype: int64
-----------------
cohort  endpoint                
Ceres   search/search_index.json    1380
        javascript-i                1003
        toc                          911
Name: endpoint, dtype: int64
-----------------
cohort  endpoint    
Zion    toc             1465
        javascript-i     897
        java-iii         753
Name: endpoint, dtype: int64
-----------------
cohort   endpoint                
Jupiter  toc                         1866
         search/search_index.json     998
         javascript-i                 926
Name: endpoint, dtype: int64
-----------------
cohort   endpoint                
Fortuna  toc                         1293
         search/search_index.json    1020
         java-iii                     786
Name: endpoint, dtype: int64
-----------------
cohort     endpoint    
Voyageurs  javascript-i    884


----

## Full Stack PHP Program

In [17]:
fsp_cohort = pd.DataFrame(fsp[fsp.endpoint != '/'].groupby('cohort')['endpoint'].value_counts())

In [18]:
fsp_cohort_list = list(fsp.cohort.value_counts().index)

In [19]:
import statistics
from statistics import mode

In [20]:
for cohort in fsp_cohort_list:
    if cohort != 'Everglades':
        max_list = []
        df_cohort = pd.DataFrame(fsp[fsp.endpoint != '/'].groupby('cohort')['endpoint'].value_counts())
        cep_df = fsp_cohort.loc[[cohort]]
        #most_frequent = list(set(fsp_cohort.loc[[cohort]])
        cep_df = cep_df.endpoint.nlargest(3)
        print('-----------------')
        print(cep_df)
        print('-----------------')
    
        
        


-----------------
cohort  endpoint    
Lassen  index.html      877
        javascript-i    233
        java-iii        224
Name: endpoint, dtype: int64
-----------------
-----------------
cohort  endpoint     
Arches  javascript-i     294
        html-css         215
        javascript-ii    204
Name: endpoint, dtype: int64
-----------------
-----------------
cohort   endpoint    
Olympic  javascript-i    128
         java-i           76
         jquery           71
Name: endpoint, dtype: int64
-----------------
-----------------
cohort  endpoint                                        
Kings   index.html                                          84
        content/laravel/intro                               83
        content/laravel/intro/application-structure.html    63
Name: endpoint, dtype: int64
-----------------
-----------------
cohort   endpoint
Hampton  java-iii    57
         appendix    55
         java-i      46
Name: endpoint, dtype: int64
-----------------
----------------

#### java-i appears to be the most frequent lesson among the cohorts in the full-stack-php program

------

## Java Program

In [21]:
jv_cohort = pd.DataFrame(jv[jv.endpoint != '/'].groupby('cohort')['endpoint'].value_counts())


In [22]:
jv_cohort_list = list(jv.cohort.value_counts().index)
jv_cohort_list

['Staff',
 'Ceres',
 'Zion',
 'Jupiter',
 'Fortuna',
 'Voyageurs',
 'Ganymede',
 'Apex',
 'Deimos',
 'Teddy',
 'Hyperion',
 'Betelgeuse',
 'Ulysses',
 'Europa',
 'Xanadu',
 'Wrangell',
 'Andromeda',
 'Kalypso',
 'Yosemite',
 'Bash',
 'Luna',
 'Marco',
 'Sequoia',
 'Neptune',
 'Pinnacles',
 'Oberon',
 'Niagara',
 'Mammoth']

In [23]:
for cohort in jv_cohort_list:
    #if cohort:
        jv_cohort = pd.DataFrame(jv[jv.endpoint != '/'].groupby('cohort')['endpoint'].value_counts())
        cep_df = jv_cohort.loc[[cohort]]
        #most_frequent = list(set(fsp_cohort.loc[[cohort]])
        cep_df = cep_df.endpoint.nlargest(3)
        print('-----------------')
        print(cep_df)
        print('-----------------')
    
        
        


-----------------
cohort  endpoint    
Staff   javascript-i    1817
        spring          1403
        java-iii        1393
Name: endpoint, dtype: int64
-----------------
-----------------
cohort  endpoint                
Ceres   search/search_index.json    1380
        javascript-i                1003
        toc                          911
Name: endpoint, dtype: int64
-----------------
-----------------
cohort  endpoint    
Zion    toc             1465
        javascript-i     897
        java-iii         753
Name: endpoint, dtype: int64
-----------------
-----------------
cohort   endpoint                
Jupiter  toc                         1866
         search/search_index.json     998
         javascript-i                 926
Name: endpoint, dtype: int64
-----------------
-----------------
cohort   endpoint                
Fortuna  toc                         1293
         search/search_index.json    1020
         java-iii                     786
Name: endpoint, dtype: int64
-

#### javascript - i appears to be the most frequent lesson among the cohorts in the java program

------

## Data Science

In [24]:
ds_cohort = pd.DataFrame(ds[ds.endpoint != '/'].groupby('cohort')['endpoint'].value_counts())

In [25]:
ds_cohort_list = list(ds.cohort.value_counts().index)

In [26]:
for cohort in ds_cohort_list:
    #if cohort != 'Everglades':

    ds_cohort = pd.DataFrame(ds[ds.endpoint != '/'].groupby('cohort')['endpoint'].value_counts())
    cep_df = ds_cohort.loc[[cohort]]
    #most_frequent = list(set(fsp_cohort.loc[[cohort]])
    cep_df = cep_df.endpoint.nlargest(3)
    print('-----------------')
    print(cep_df)
    print('-----------------')

        
        


-----------------
cohort  endpoint                                
Darden  classification/overview                     1109
        classification/scale_features_or_not.svg     943
        sql/mysql-overview                           774
Name: endpoint, dtype: int64
-----------------
-----------------
cohort  endpoint                                
Bayes   1-fundamentals/modern-data-scientist.jpg    650
        1-fundamentals/AI-ML-DL-timeline.jpg        648
        1-fundamentals/1.1-intro-to-data-science    640
Name: endpoint, dtype: int64
-----------------
-----------------
cohort  endpoint                                
Curie   6-regression/1-overview                     595
        search/search_index.json                    538
        1-fundamentals/modern-data-scientist.jpg    467
Name: endpoint, dtype: int64
-----------------
-----------------
cohort  endpoint                                                     
Easley  classification/scale_features_or_not.svg               

#### Fundamentals/modern-data-scientist was the most occuring lesson among the cohorts in ds program

-----

# Front End Program

In [27]:
fep_cohort = pd.DataFrame(fep[fep.endpoint != '/'].groupby('cohort')['endpoint'].value_counts())

In [28]:
fep_cohort_list = list(fep.cohort.value_counts().index)

In [29]:
for cohort in fep_cohort_list:
    #if cohort != 'Everglades':

    fep_cohort = pd.DataFrame(fep[fep.endpoint != '/'].groupby('cohort')['endpoint'].value_counts())
    cep_df = fep_cohort.loc[[cohort]]
    #most_frequent = list(set(fsp_cohort.loc[[cohort]])
    cep_df = cep_df.endpoint.nlargest(3)
    print('-----------------')
    print(cep_df)
    print('-----------------')

        
        


-----------------
cohort  endpoint                                   
Apollo  content/html-css                               2
        content/html-css/gitbook/images/favicon.ico    1
        content/html-css/introduction.html             1
Name: endpoint, dtype: int64
-----------------


#### html-css is the most occuring lesson among fep cohort

----

----

# 2.) Is there a cohort that referred to a lesson significantly more than other cohorts seemed to gloss over?

In [30]:
ds = ds[ds.endpoint != '/']
ds.head()

Unnamed: 0_level_0,time,endpoint,user_id,cohort_id,ip,id,cohort,slack,start_date,end_date,created_at,updated_at,deleted_at,program_id,program
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
2019-08-20,09:40:28,3-sql/1-mysql-overview,470,34.0,97.105.19.58,34,Bayes,#,2019-08-19,2020-01-30,2019-08-20 14:38:55,2019-08-20 14:38:55,,3,data_science
2019-08-20,09:40:30,2-storytelling/bad-charts,467,34.0,97.105.19.58,34,Bayes,#,2019-08-19,2020-01-30,2019-08-20 14:38:55,2019-08-20 14:38:55,,3,data_science
2019-08-20,09:40:30,2-storytelling/misleading1_baseball.jpg,467,34.0,97.105.19.58,34,Bayes,#,2019-08-19,2020-01-30,2019-08-20 14:38:55,2019-08-20 14:38:55,,3,data_science
2019-08-20,09:40:30,2-storytelling/misleading1_fox.jpg,467,34.0,97.105.19.58,34,Bayes,#,2019-08-19,2020-01-30,2019-08-20 14:38:55,2019-08-20 14:38:55,,3,data_science
2019-08-20,09:40:30,2-storytelling/misleading3_deaths.jpg,467,34.0,97.105.19.58,34,Bayes,#,2019-08-19,2020-01-30,2019-08-20 14:38:55,2019-08-20 14:38:55,,3,data_science


In [31]:
#want to compare the count of each endpoint to see which cohort had higher in one than other.
df_o = pd.DataFrame(ds.groupby(['cohort'])['endpoint'].value_counts()).rename(columns = {'endpoint':'count'})

df_o.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,count
cohort,endpoint,Unnamed: 2_level_1
Bayes,1-fundamentals/modern-data-scientist.jpg,650
Bayes,1-fundamentals/AI-ML-DL-timeline.jpg,648
Bayes,1-fundamentals/1.1-intro-to-data-science,640
Bayes,search/search_index.json,588
Bayes,6-regression/1-overview,521


In [35]:
df_o = df_o.reset_index(level=['cohort','endpoint'])


KeyError: 'Requested level (cohort) does not match index name (None)'

In [40]:
df_o.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1758 entries, 0 to 1757
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   cohort    1758 non-null   object
 1   endpoint  1758 non-null   object
 2   count     1758 non-null   int64 
dtypes: int64(1), object(2)
memory usage: 41.3+ KB


# 3.) Are there students who, when active, hardly access the curriculum? If so, what information do you have about these students?

In [42]:
ds_cohort = pd.DataFrame(ds[ds.endpoint != '/'])
ds_cohort.head()

Unnamed: 0_level_0,time,endpoint,user_id,cohort_id,ip,id,cohort,slack,start_date,end_date,created_at,updated_at,deleted_at,program_id,program
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
2019-08-20,09:40:28,3-sql/1-mysql-overview,470,34.0,97.105.19.58,34,Bayes,#,2019-08-19,2020-01-30,2019-08-20 14:38:55,2019-08-20 14:38:55,,3,data_science
2019-08-20,09:40:30,2-storytelling/bad-charts,467,34.0,97.105.19.58,34,Bayes,#,2019-08-19,2020-01-30,2019-08-20 14:38:55,2019-08-20 14:38:55,,3,data_science
2019-08-20,09:40:30,2-storytelling/misleading1_baseball.jpg,467,34.0,97.105.19.58,34,Bayes,#,2019-08-19,2020-01-30,2019-08-20 14:38:55,2019-08-20 14:38:55,,3,data_science
2019-08-20,09:40:30,2-storytelling/misleading1_fox.jpg,467,34.0,97.105.19.58,34,Bayes,#,2019-08-19,2020-01-30,2019-08-20 14:38:55,2019-08-20 14:38:55,,3,data_science
2019-08-20,09:40:30,2-storytelling/misleading3_deaths.jpg,467,34.0,97.105.19.58,34,Bayes,#,2019-08-19,2020-01-30,2019-08-20 14:38:55,2019-08-20 14:38:55,,3,data_science


In [43]:
ds_user_list = list(ds.user_id.value_counts().index)

In [44]:
ds1 = pd.DataFrame(ds_cohort.groupby(['user_id', 'endpoint'])['endpoint'].count())
ds1.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,endpoint
user_id,endpoint,Unnamed: 2_level_1
143,AI-ML-DL-timeline.jpg,2
143,anomaly-detection/AnomalyDetectionCartoon.jpeg,7
143,anomaly-detection/continuous-probabilistic-methods,1
143,anomaly-detection/overview,7
143,appendix/data_science_community,1


In [45]:
for user_id in ds_user_list:
    ds_users = pd.DataFrame(ds_cohort.groupby(['user_id', 'endpoint'])['endpoint'].count())
    cep_df = ds_users.loc[[user_id]]
    cep_df = cep_df.endpoint.sum()
    print('-----------------')
    print(f'user id: {user_id}')
    print(f'endpoint total:{cep_df}')
    print('-----------------')



-----------------
user id: 581
endpoint total:3278
-----------------
-----------------
user id: 685
endpoint total:3201
-----------------
-----------------
user id: 485
endpoint total:3077
-----------------
-----------------
user id: 692
endpoint total:2012
-----------------
-----------------
user id: 475
endpoint total:2002
-----------------
-----------------
user id: 476
endpoint total:1886
-----------------
-----------------
user id: 698
endpoint total:1857
-----------------
-----------------
user id: 479
endpoint total:1847
-----------------
-----------------
user id: 845
endpoint total:1835
-----------------
-----------------
user id: 590
endpoint total:1527
-----------------
-----------------
user id: 689
endpoint total:1517
-----------------
-----------------
user id: 576
endpoint total:1515
-----------------
-----------------
user id: 688
endpoint total:1504
-----------------
-----------------
user id: 478
endpoint total:1477
-----------------
-----------------
user id: 584
end

User 679 has the lowest count of endpoints total compared to the other users.

We know this user is in the Darden cohort and only utilized the fundamentals lesson and advanced topics. 

In [46]:
ds_cohort[ds_cohort.user_id == 679]

Unnamed: 0_level_0,time,endpoint,user_id,cohort_id,ip,id,cohort,slack,start_date,end_date,created_at,updated_at,deleted_at,program_id,program
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
2020-07-13,14:39:21,13-advanced-topics/1-tidy-data,679,59.0,24.28.146.155,59,Darden,#darden,2020-07-13,2021-01-12,2020-07-13 18:32:19,2020-07-13 18:32:19,,3,data_science
2020-07-13,14:39:36,1-fundamentals/1.1-intro-to-data-science,679,59.0,24.28.146.155,59,Darden,#darden,2020-07-13,2021-01-12,2020-07-13 18:32:19,2020-07-13 18:32:19,,3,data_science
2020-07-13,14:39:37,1-fundamentals/AI-ML-DL-timeline.jpg,679,59.0,24.28.146.155,59,Darden,#darden,2020-07-13,2021-01-12,2020-07-13 18:32:19,2020-07-13 18:32:19,,3,data_science
2020-07-13,14:39:37,1-fundamentals/modern-data-scientist.jpg,679,59.0,24.28.146.155,59,Darden,#darden,2020-07-13,2021-01-12,2020-07-13 18:32:19,2020-07-13 18:32:19,,3,data_science
2020-07-13,15:49:31,1-fundamentals/1.1-intro-to-data-science,679,59.0,24.28.146.155,59,Darden,#darden,2020-07-13,2021-01-12,2020-07-13 18:32:19,2020-07-13 18:32:19,,3,data_science
2020-07-13,15:49:32,1-fundamentals/modern-data-scientist.jpg,679,59.0,24.28.146.155,59,Darden,#darden,2020-07-13,2021-01-12,2020-07-13 18:32:19,2020-07-13 18:32:19,,3,data_science
2020-07-13,15:49:32,1-fundamentals/AI-ML-DL-timeline.jpg,679,59.0,24.28.146.155,59,Darden,#darden,2020-07-13,2021-01-12,2020-07-13 18:32:19,2020-07-13 18:32:19,,3,data_science
2020-07-14,08:05:15,1-fundamentals/1.1-intro-to-data-science,679,59.0,24.28.146.155,59,Darden,#darden,2020-07-13,2021-01-12,2020-07-13 18:32:19,2020-07-13 18:32:19,,3,data_science
2020-07-14,08:05:15,1-fundamentals/AI-ML-DL-timeline.jpg,679,59.0,24.28.146.155,59,Darden,#darden,2020-07-13,2021-01-12,2020-07-13 18:32:19,2020-07-13 18:32:19,,3,data_science
2020-07-14,08:05:15,1-fundamentals/modern-data-scientist.jpg,679,59.0,24.28.146.155,59,Darden,#darden,2020-07-13,2021-01-12,2020-07-13 18:32:19,2020-07-13 18:32:19,,3,data_science


In [47]:
ds_cohort[ds_cohort.user_id == 650]

Unnamed: 0_level_0,time,endpoint,user_id,cohort_id,ip,id,cohort,slack,start_date,end_date,created_at,updated_at,deleted_at,program_id,program
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
2020-05-07,10:35:52,appendix/professional-development/virtual-meet...,650,34.0,70.123.225.30,34,Bayes,#,2019-08-19,2020-01-30,2019-08-20 14:38:55,2019-08-20 14:38:55,,3,data_science
2020-06-25,10:19:01,appendix/cli-git-overview,650,34.0,70.123.225.30,34,Bayes,#,2019-08-19,2020-01-30,2019-08-20 14:38:55,2019-08-20 14:38:55,,3,data_science
2020-06-25,10:20:00,appendix/ds-environment-setup,650,34.0,70.123.225.30,34,Bayes,#,2019-08-19,2020-01-30,2019-08-20 14:38:55,2019-08-20 14:38:55,,3,data_science
2020-07-03,15:15:21,11-nlp/3-acquire,650,34.0,70.123.225.30,34,Bayes,#,2019-08-19,2020-01-30,2019-08-20 14:38:55,2019-08-20 14:38:55,,3,data_science
2020-07-06,11:41:32,appendix/ds-environment-setup,650,34.0,70.123.225.30,34,Bayes,#,2019-08-19,2020-01-30,2019-08-20 14:38:55,2019-08-20 14:38:55,,3,data_science
2020-08-09,13:18:23,python/intro-to-matplotlib,650,34.0,70.123.225.30,34,Bayes,#,2019-08-19,2020-01-30,2019-08-20 14:38:55,2019-08-20 14:38:55,,3,data_science
2020-08-09,14:06:52,python/intro-to-matplotlib,650,34.0,70.123.225.30,34,Bayes,#,2019-08-19,2020-01-30,2019-08-20 14:38:55,2019-08-20 14:38:55,,3,data_science
2021-02-12,06:38:46,stats/probability-distributions,650,34.0,70.123.225.30,34,Bayes,#,2019-08-19,2020-01-30,2019-08-20 14:38:55,2019-08-20 14:38:55,,3,data_science
2021-02-12,06:44:56,stats/compare-means,650,34.0,70.123.225.30,34,Bayes,#,2019-08-19,2020-01-30,2019-08-20 14:38:55,2019-08-20 14:38:55,,3,data_science
2021-02-12,06:57:04,stats/correlation,650,34.0,70.123.225.30,34,Bayes,#,2019-08-19,2020-01-30,2019-08-20 14:38:55,2019-08-20 14:38:55,,3,data_science
