In [1]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns

import acquire
import prepare
import wrangle

### Acquire

In [2]:
# Created an acquire.py file that runs a SQL query to join the databases from the codeup library, and created a
# pandas dataframe with that query, after that I saved it to a csv, and have to function check for the csv.
df = acquire.get_cohort_data()

In [3]:
info_df = acquire.get_cohort_information_data()

In [4]:
df.shape

(847330, 15)

In [5]:
df = df.astype({"cohort_id": int})

In [6]:
info_df

Unnamed: 0_level_0,name,start_date,end_date,program_id
cohort_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,Arches,2014-02-04,2014-04-22,1
2,Badlands,2014-06-04,2014-08-22,1
3,Carlsbad,2014-09-04,2014-11-05,1
4,Denali,2014-10-20,2015-01-18,1
5,Everglades,2014-11-18,2015-02-24,1
6,Franklin,2015-02-03,2015-05-26,1
7,Glacier,2015-06-05,2015-10-06,1
8,Hampton,2015-09-22,2016-02-06,1
9,Apollo,2015-03-30,2015-07-29,4
10,Balboa,2015-11-03,2016-03-11,4


In [7]:
new_df = df.join(info_df,on = 'cohort_id',how = 'outer',lsuffix = 'str')

In [8]:
new_df.isnull().sum()

date                  5
time                  5
path                  6
user_id               5
cohort_id             0
ip                    5
id               774930
namestr          774930
slack            774930
start_datestr    774930
end_datestr      774930
created_at       774930
updated_at       774930
deleted_at       847335
program_idstr    774930
name                  0
start_date            0
end_date              0
program_id            0
dtype: int64

In [9]:
unknown_cohorts = df[df.start_date.isnull()]

In [10]:
unknown_cohorts.cohort_id.unique()

array([ 22,  21,  19,  16,  13,   1,  18,  14,  15,   7,   4,  12,  17,
         8,  23,   2,   9,  11,  24,  25,  26,   6,  27,  28,  29,  31,
        32,  33,  34,  51,  52,  53,  55,  56,  57,   5,  58,  59,  61,
        62, 132, 134, 133, 135, 138, 137, 139])

In [11]:
df.isnull().sum()

date               0
time               0
path               1
user_id            0
cohort_id          0
ip                 0
id            774925
name          774925
slack         774925
start_date    774925
end_date      774925
created_at    774925
updated_at    774925
deleted_at    847330
program_id    774925
dtype: int64

In [12]:
df.shape

(847330, 15)

### Prepare

**Prepare summary**

##### List of things my clean_cohort_data function does.
- Combined the date and time column into a datetime, and converted the datatype to datetime.
- Placed my new datetime column to the index for later use.
- Replaced the program_id with its corresponding program name and subdomain.
- Dropped unnecessary columns 'date','time','deleted_at','program_id','id'.
- Renamed 'name' to 'cohort_name' for my personal readability.
- Created a dictionary to map cohort name to cohort id.
- Filled nulls in cohort_id.

In [2]:
df = wrangle.wrangle_cohort_data()

In [3]:
df.isnull().sum()

date                 0
time                 0
path                 0
user_id              0
cohort_id            0
ip                   0
name                 0
start_date           0
end_date             0
program_id           0
program_name         0
program_subdomain    0
dtype: int64

In [5]:
df.head()

Unnamed: 0,date,time,path,user_id,cohort_id,ip,name,start_date,end_date,program_id,program_name,program_subdomain
0.0,2018-01-26,09:55:03,/,1.0,8.0,97.105.19.61,Hampton,2015-09-22,2016-02-06,1,PHP Full Stack Web Development,php
1.0,2018-01-26,09:56:02,java-ii,1.0,8.0,97.105.19.61,Hampton,2015-09-22,2016-02-06,1,PHP Full Stack Web Development,php
2.0,2018-01-26,09:56:05,java-ii/object-oriented-programming,1.0,8.0,97.105.19.61,Hampton,2015-09-22,2016-02-06,1,PHP Full Stack Web Development,php
3.0,2018-01-26,09:56:06,slides/object_oriented_programming,1.0,8.0,97.105.19.61,Hampton,2015-09-22,2016-02-06,1,PHP Full Stack Web Development,php
58.0,2018-01-26,10:40:15,javascript-i/functions,1.0,8.0,97.105.19.61,Hampton,2015-09-22,2016-02-06,1,PHP Full Stack Web Development,php


In [4]:
df.shape

(847329, 12)

In [13]:
df = prepare.clean_cohort_data(df)

In [14]:
df.isnull().sum()

path                      1
user_id                   0
cohort_id                 0
ip                        0
cohort_name          774925
slack                774925
start_date           774925
end_date             774925
created_at           774925
updated_at           774925
program_name         774925
program_subdomain    774925
count_helper              0
split_path                1
dtype: int64

In [15]:
df.shape

(847330, 14)

In [16]:
df.head()

Unnamed: 0_level_0,path,user_id,cohort_id,ip,cohort_name,slack,start_date,end_date,created_at,updated_at,program_name,program_subdomain,count_helper,split_path
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
2018-01-26 09:55:03,/,1,8,97.105.19.61,Arches,#arches,2014-02-04,2014-04-22,2016-06-14 19:52:26,2016-06-14 19:52:26,PHP Full Stack Web Development,php,1,"[, ]"
2018-01-26 09:56:02,java-ii,1,8,97.105.19.61,Arches,#arches,2014-02-04,2014-04-22,2016-06-14 19:52:26,2016-06-14 19:52:26,PHP Full Stack Web Development,php,1,[java-ii]
2018-01-26 09:56:05,java-ii/object-oriented-programming,1,8,97.105.19.61,Arches,#arches,2014-02-04,2014-04-22,2016-06-14 19:52:26,2016-06-14 19:52:26,PHP Full Stack Web Development,php,1,"[java-ii, object-oriented-programming]"
2018-01-26 09:56:06,slides/object_oriented_programming,1,8,97.105.19.61,Arches,#arches,2014-02-04,2014-04-22,2016-06-14 19:52:26,2016-06-14 19:52:26,PHP Full Stack Web Development,php,1,"[slides, object_oriented_programming]"
2018-01-26 09:56:24,javascript-i/conditionals,2,22,97.105.19.61,Badlands,#badlands,2014-06-04,2014-08-22,2016-06-14 19:52:26,2016-06-14 19:52:26,PHP Full Stack Web Development,php,1,"[javascript-i, conditionals]"


In [17]:
php_df = df[df.program_subdomain == 'php']

In [18]:
php_df.groupby('path').count_helper.value_counts(normalize = True)

path                      count_helper
/                         1               1.0
00_                       1               1.0
00_index                  1               1.0
01_intro                  1               1.0
02_listing_files          1               1.0
                                         ... 
web-design                1               1.0
web-design/intro          1               1.0
web-design/ui/color       1               1.0
web-design/ui/typography  1               1.0
web-design/ux/layout      1               1.0
Name: count_helper, Length: 1265, dtype: float64

In [19]:
java_df = df[df.program_subdomain == 'java']

In [20]:
java_df.groupby('path').count_helper.value_counts(normalize = True)

path                                      count_helper
/                                         1               1.0
1-fundamentals/1-fundamentals-overview    1               1.0
1-fundamentals/1.1-intro-to-data-science  1               1.0
1-fundamentals/1.2-data-science-pipeline  1               1.0
1-fundamentals/1.3-pipeline-demo          1               1.0
                                                         ... 
web-design/ui/color                       1               1.0
web-design/ui/typography                  1               1.0
web-design/ui/visuals                     1               1.0
web-design/ux/layout                      1               1.0
web-design/ux/purpose                     1               1.0
Name: count_helper, Length: 614, dtype: float64

In [21]:
ds_df = df[df.program_subdomain == 'ds']

In [22]:
ds_df.path.groupby().value_counts(normalize = True)

TypeError: You have to supply one of 'by' and 'level'

In [None]:
php_df.groupby('path').sort_values(ascending = False)

In [None]:
df.program_subdomain.value_counts()

In [None]:
df.describe().T

In [None]:
df.isnull().sum()

In [None]:
df.shape

Dataframe has 13 columns and 73739 rows after cleaning.

## Questions from zach

### 1. Which lesson appears to attract the most traffic consistently across cohorts (per program)?

In [None]:
df.groupby('program_subdomain').count_helper.sum()

In [None]:
plt.figure(figsize = (14,8))
df.groupby('program_subdomain').count_helper.sum().plot(color = 'r')
plt.show()

In [None]:
# Looks like the PHP Full Stack Web Development has the most traffic.

In [None]:
sample_df = df[:20]

In [None]:
df.split_path.str[0]

### 2. Is there a cohort that referred to a lesson significantly more than other cohorts seemed to gloss over?

In [None]:
def path_value_counts(df):
    for i in range(1,8):
        lesson_df = df[df.split_path.str.len() == i]
        print(lesson_df.groupby('cohort_name').path.value_counts(normalize=True).sort_values())
        print('\n')
        print(f'{i}')
        print('--------------------------')
        print('\n')

In [None]:
path_value_counts(df)

In [None]:
lesson_df = df[df.split_path.str.len() == 3]

In [None]:
lesson_df.shape

In [None]:
lesson_df.groupby('cohort_name').path.value_counts(normalize=True).sort_values()

# Not Finished

### 3. Are there students who, when active, hardly access the curriculum? If so, what information do you have about these students?

In [None]:
df.user_id.nunique()

In [None]:
df.groupby('user_id').count_helper.sum().sort_values()

In [None]:
df.groupby('user_id').count_helper.sum().value_counts(normalize = True)

### 4. Is there any suspicious activity, such as users/machines/etc accessing the curriculum who shouldn’t be? Does it appear that any web-scraping is happening? Are there any suspicious IP addresses?

### 5. At some point in 2019, the ability for students and alumni to access both curriculums (web dev to ds, ds to web dev) should have been shut off. Do you see any evidence of that happening? Did it happen before?

### 6. What topics are grads continuing to reference after graduation and into their jobs (for each program)?

### 7. Which lessons are least accessed?

### 8. Anything else I should be aware of?