In [1]:
import warnings
warnings.filterwarnings("ignore")

import matplotlib.pyplot as plt
import seaborn as sns

import numpy as np
import pandas as pd
import env

import Naomi_wrangle as w

## Questions Answered:
  - 1. Which lesson appears to attract the most traffic consistently across cohorts (per program)?
  - 2. Which lessons are least accessed?
  - 3.  At some point in 2019, the ability for students and alumni to access both curriculums (web dev to ds, ds to web dev) should have been shut off. Do you see any evidence of that happening? Did it happen before?
  - 4. Is there any suspicious activity, such as users/machines/etc accessing the curriculum who shouldn’t be? Does it appear that any web-scraping is happening? Are there any suspicious IP addresses?
  - 5. Are there students who, when active, hardly access the curriculum? If so, what information do you have about these students?



## Executive Summary:
<div class="alert alert-block alert-warning">

* **Question One; Lessons most trafficked**: 
    * Data Science - classification 
    * Full Stack Front End - HTML-CSS
    * Full Stack Java - javascript-i
    * Full Stack PHP - index.html

* **Question Two; Lessons least trafficked**:
    * Data Science Program - NLP
    * Full Stack Front End - All except HTML-CSS
    * Full Stack Java - Professional Development
    * Full Stack PHP - HTML-CSS

* **Question Three; Cross curriculum access in 2019**:
    * The year began with access being restricted, but begining in August restriciton was lifted and students could access pages cross-curriculum
    
* **Question Four; Suspicious Activity**:
</div>

# Acquire Data

In [2]:
# acquire data from SQL and save to dataframe
df = w.wrangle_curriculum_data()
df.head(3)

Unnamed: 0_level_0,time,user_id,cohort_id,ip,id,name,slack,start_date,end_date,created_at,updated_at,program_name,end_page
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
2018-01-26,09:55:03,1,8.0,97.105.19.61,8.0,Hampton,#hampton,2015-09-22 00:00:00,2016-02-06 00:00:00,2016-06-14 19:52:26,2016-06-14 19:52:26,full_stack_php,/
2018-01-26,09:56:02,1,8.0,97.105.19.61,8.0,Hampton,#hampton,2015-09-22 00:00:00,2016-02-06 00:00:00,2016-06-14 19:52:26,2016-06-14 19:52:26,full_stack_php,java-ii
2018-01-26,09:56:05,1,8.0,97.105.19.61,8.0,Hampton,#hampton,2015-09-22 00:00:00,2016-02-06 00:00:00,2016-06-14 19:52:26,2016-06-14 19:52:26,full_stack_php,java-ii/object-oriented-programming


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 816192 entries, 2018-01-26 to 2021-04-21
Data columns (total 13 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   time          816192 non-null  object
 1   user_id       816192 non-null  int64 
 2   cohort_id     816192 non-null  object
 3   ip            816192 non-null  object
 4   id            816192 non-null  object
 5   name          816192 non-null  object
 6   slack         816192 non-null  object
 7   start_date    816192 non-null  object
 8   end_date      816192 non-null  object
 9   created_at    816192 non-null  object
 10  updated_at    816192 non-null  object
 11  program_name  816192 non-null  object
 12  end_page      816192 non-null  object
dtypes: int64(1), object(12)
memory usage: 87.2+ MB


# Question 1: Which lesson appears to attract the most traffic consistently across cohorts (per program)?

#### Plan to Answer: 
- Create dataframes of students grouped by program
- Use `value_counts` and `nlargest` methods over the `end_page` column to determine which has the highest number of "hits"


In [4]:
# For Data Science:
ds_cohorts = df.loc[df["program_name"] == "data_science"]
ds_cohorts.end_page.value_counts().nlargest(3)

/                           8358
search/search_index.json    2203
classification/overview     1785
Name: end_page, dtype: int64

In [5]:
# For Full-Stack PHP
full_stack_php_cohorts = df.loc[df["program_name"] == "full_stack_php"]
full_stack_php_cohorts.end_page.value_counts().nlargest(3)

/               1681
index.html      1011
javascript-i     736
Name: end_page, dtype: int64

In [6]:
# For Full-Stack Java
full_stack_java_cohorts = df.loc[df["program_name"] == "full_stack_java"]
full_stack_java_cohorts.end_page.value_counts().nlargest(3)

/               29474
toc             16517
javascript-i    15640
Name: end_page, dtype: int64

In [7]:
# For Front End
front_end_cohorts = df.loc[df["program_name"] == "front_end"]
front_end_cohorts.end_page.value_counts().nlargest(3)

content/html-css                               2
/                                              1
content/html-css/gitbook/images/favicon.ico    1
Name: end_page, dtype: int64

<div class="alert alert-block alert-success">

**Answer**

|Program| Most Trafficked|
|:-------|:---------------|
|**Data Science**| Classification Overview|
|**Full Stack PHP** | Java Script-i|
|**Full Stack Java** | Java Script-i|
|**Front End** | HTML-css|
</div>

# Question 2: Which lessons are least accessed?

#### Plan to Answer: 
- Use previous question's dataframes of students grouped by program
- Use `value_counts` and `nsmallest` methods over the `end_page` column to determine which has the lowest number of "hits"


In [8]:
# For Data Science:
ds_cohorts.end_page.value_counts().nsmallest(105)


nlp                                             1
regression/feature_engineering_into_modeling    1
json-responses                                  1
2.1-spreadsheets-overview                       1
introduction-to-python                          1
                                               ..
imports                                         1
java-i/console-io                               1
appendix/univariate_regression_in_excel         1
6-regression/8-Project                          1
timeseries/E2E_example                          2
Name: end_page, Length: 105, dtype: int64

In [9]:
# 105 is the first instance of two hits for the page

In [10]:
# For Full-Stack PHP
full_stack_php_cohorts.end_page.value_counts().nsmallest(210)


slides/threads                                1
2-storytelling/project                        1
appendix/data-viz-references                  1
4-python/6-imports                            1
4-python/project                              1
                                             ..
content/primitive-types.html                  2
login                                         3
content/examples/git                          3
content/examples/php/if_else.php              3
content/mysql/basic-statements/delete.html    3
Name: end_page, Length: 210, dtype: int64

In [11]:
# 206 is the first instance of two hits for the page

In [12]:
# For Full-Stack Java
full_stack_java_cohorts.end_page.value_counts().nsmallest()


professional-development/professional-resume    1
javascript-i/intruduction/operators             1
mini_exercises                                  1
teams                                           1
sgithubtudents/1215                             1
Name: end_page, dtype: int64

In [13]:
# 344 is the first instance of two pages being the lowest value_count which is mapbox_api

In [14]:
# For Front End
front_end_cohorts.end_page.value_counts().nsmallest()


/                                              1
content/html-css/gitbook/images/favicon.ico    1
content/html-css/introduction.html             1
content/html-css                               2
Name: end_page, dtype: int64

<div class="alert alert-block alert-success">

**Answer**

|Program| Least Trafficked| Number of Pages Before Two Hits is the Lowest Number per Page|
|:-------|:---------------|:-----|
|**Data Science**| NLP|105 pages|
|**Full Stack PHP** | Slides/Threads|206 pages|
|**Full Stack Java** |Professional Development/Professional Resume| 344 pages|
|**Front End** | all pages except HTM-CSS| 3 pages *(represents 25% of total)*|
</div>

# Question 3: At some point in 2019, the ability for students and alumni to access both curriculums (web dev to ds, ds to web dev) should have been shut off. Do you see any evidence of that happening? Did it happen before?

In [15]:
# creating dataframes for the programs

ds_df = df[(df.program_name == 'data_science')]

web_df = df.copy()
web_df.program_name = web_df.program_name.replace({'full_stack_php':'web_dev','full_stack_java':'web_dev','front_end': 'web_dev'})

wd_df = web_df[(web_df.program_name == 'web_dev')]

In [16]:
# zeroing in on 2019
yr_2019=df[df.index.year == 2019]

In [17]:
# creating data frames for the programs only of the year 2019
ds_df_19 = yr_2019[(yr_2019.program_name == 'data_science')]

wd_df_19 = web_df[(web_df.program_name == 'web_dev')]

In [19]:
# unique pages visited 
ds_page= list(ds_df_19.end_page.unique())

# list of unique pages in web dev
web_page= list(wd_df_19.end_page.unique())

# create a loop to cross check a page visited by ds students that matches with unique pages from web dev program

count= 0
for x in web_page:
    if x in ds_page:
        
        count+= 1       
print(count)

95


In [24]:
# Focusing on the month of August where there seems to be the lowest dip
Aug_2019=yr_2019[yr_2019.index.month == 8]
# df for data science of August 2019
ds_Aug = Aug_2019[(Aug_2019.program_name == 'data_science')]
# df for web development of August 2019
wd_Aug = Aug_2019[(Aug_2019.program_name == "web_dev")]

In [27]:
# checking for cross-curriculum page endpoints for August: there are none
ds_Aug_page = list(ds_Aug.end_page.unique())

web_Aug_page = list(wd_Aug.end_page.unique())

count = 0
for x in ds_Aug_page:
    if x in web_Aug_page:
        count += 1
        
print(count)

0


In [28]:
# Focusing on the month of September where there seems to be the lowest dip
Sep_2019=yr_2019[yr_2019.index.month == 8]
# df for data science of August 2019
ds_Sep = Sep_2019[(Sep_2019.program_name == 'data_science')]
# df for web development of August 2019
wd_Sep = Sep_2019[(Sep_2019.program_name == "web_dev")]

In [29]:
# checking for cross-curriculum page endpoints for September: there are none
ds_Sep_page = list(ds_Sep.end_page.unique())

web_Sep_page = list(wd_Sep.end_page.unique())

count = 0
for x in ds_Sep_page:
    if x in web_Sep_page:
        count += 1
        
print(count)

0
