In [1]:
import numpy as np
import pandas as pd
import math
from sklearn import metrics

from scipy.stats import entropy

import warnings
warnings.filterwarnings("ignore")
import matplotlib.pyplot as plt
import matplotlib.dates as mdates #to format dates on our plots
%matplotlib inline
import seaborn as sns

import acquire
import prepare

# This is to make sure matplotlib doesn't throw the following error:
# The next line fixes "TypeError: float() argument must be a string or a number, not 'Timestamp' matplotlib"
pd.plotting.register_matplotlib_converters()

In [2]:
df = pd.read_csv('anonymized-curriculum-access-07-2021.txt',
                      engine='python',
                     header=None,
                     index_col=False,
                     sep=r'\s(?=(?:[^"]*"[^"]*")*[^"]*$)(?![^\[]*\])',
                     na_values='"-"',
                     )

In [3]:
df.columns = ['date','time','page_viewed','user_id','cohort_id','ip']

In [4]:
df['cohort_id'] = df.cohort_id.fillna(0)

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1018810 entries, 0 to 1018809
Data columns (total 6 columns):
 #   Column       Non-Null Count    Dtype  
---  ------       --------------    -----  
 0   date         1018810 non-null  object 
 1   time         1018810 non-null  object 
 2   page_viewed  1018809 non-null  object 
 3   user_id      1018810 non-null  object 
 4   cohort_id    1018810 non-null  float64
 5   ip           1018809 non-null  object 
dtypes: float64(1), object(5)
memory usage: 46.6+ MB


In [6]:
# Drop the nulls
df = df.dropna()

In [7]:
# Create a datetime column by concatenating date and time
#df.index = pd.to_datetime(df.date + " " + df.time)

In [8]:
df.head()

Unnamed: 0,date,time,page_viewed,user_id,cohort_id,ip
0,2018-01-26,09:55:03,/,1,8.0,97.105.19.61
1,2018-01-26,09:56:02,java-ii,1,8.0,97.105.19.61
2,2018-01-26,09:56:05,java-ii/object-oriented-programming,1,8.0,97.105.19.61
3,2018-01-26,09:56:06,slides/object_oriented_programming,1,8.0,97.105.19.61
4,2018-01-26,09:56:24,javascript-i/conditionals,2,22.0,97.105.19.61


In [9]:
#df = df.drop(['date', 'time'], axis = 1)

In [10]:
df.head()

Unnamed: 0,date,time,page_viewed,user_id,cohort_id,ip
0,2018-01-26,09:55:03,/,1,8.0,97.105.19.61
1,2018-01-26,09:56:02,java-ii,1,8.0,97.105.19.61
2,2018-01-26,09:56:05,java-ii/object-oriented-programming,1,8.0,97.105.19.61
3,2018-01-26,09:56:06,slides/object_oriented_programming,1,8.0,97.105.19.61
4,2018-01-26,09:56:24,javascript-i/conditionals,2,22.0,97.105.19.61


In [11]:
id_counts = df.groupby(['user_id'])['date', 'cohort_id','page_viewed'].nunique()
cohort_counts = df.groupby(['cohort_id'])['page_viewed','user_id', 'date'].nunique()
first_access = df.groupby(['user_id'])['date'].min()
id_by_first_access_date = pd.DataFrame({'first_access_date': first_access}).reset_index().groupby('first_access_date').count()

In [12]:
id_by_first_access_date

Unnamed: 0_level_0,user_id
first_access_date,Unnamed: 1_level_1
2018-01-26,50
2018-01-27,2
2018-01-28,2
2018-01-29,12
2018-01-30,7
...,...
2021-06-28,23
2021-07-04,2
2021-07-06,2
2021-07-12,1


In [77]:
page_views = df.groupby(['page_viewed'])['user_id'].agg(['count','nunique'])
observed = page_views.sort_values(by = 'count', ascending = True)
observed.head(15)

Unnamed: 0_level_0,count,nunique
page_viewed,Unnamed: 1_level_1,Unnamed: 2_level_1
%20https://github.com/RaulCPena,1,1
cohorts/24/capstone-all,1,1
cohorts/24/grades,1,1
cohorts/27/quizzes,1,1
content/appendix/control-structures.html,1,1
content/appendix/javascript/functions/controllers.html,1,1
content/appendix/javascript/functions/models.html,1,1
content/appendix/javascript/functions/scope.html,1,1
content/appendix/javascript/functions/templating.html,1,1
wp-login,1,1


In [78]:
observed = observed.reset_index()

In [79]:
observed = observed[~(observed['page_viewed'].str.contains('appendix|cohorts|examples|caps|github|coding-challenges \
                                                            |advanced-topics|extra|jpeg|ico|csv|project'))]

In [80]:
observed = observed[observed['page_viewed'].str.contains('/')]

In [87]:
def outlier_calculation(df, variable):
    '''
    calcualtes the lower and upper bound to locate outliers in variables
    '''
    quartile1, quartile3 = np.percentile(df[variable], [25,75])
    IQR_value = quartile3 - quartile1
    lower = quartile1 - (3 * IQR_value)
    upper = quartile3 + (3 * IQR_value)
    '''
    returns the lowerbound and upperbound values
    '''
    print(f'For {variable} the lower bound is {lower} and  upper bound is {upper}')
    df = df[(df[variable] > lower) & (df[variable] < upper)]
    return df

In [88]:
outlier_calculation(observed, 'count')

For count the lower bound is -840.0 and  upper bound is 1127.0


Unnamed: 0,page_viewed,count,nunique
10,content/conditionals.html,1,1
11,content/control-structures-ii,1,1
26,coding-challenges/professional,1,1
47,classification/knn.md,1,1
48,cli/4-navigating-the-filesystem,1,1
...,...,...,...
2239,java-iii/mvc/accessing-data,1031,146
2243,3-sql/1-mysql-overview,1100,88
2244,stats/compare-means,1110,82
2245,fundamentals/data-science-modules.jpg,1116,94


In [71]:
observed.loc[observed['count'] > 217]

Unnamed: 0,page_viewed,count,nunique
1944,2-storytelling/2.2-create,219,50
1945,4-python/8.2-intro-to-matplotlib,219,32
1948,10-anomaly-detection/3-discrete-probabilistic-...,223,42
1949,4-python/7.4.2-series,224,26
1951,8-clustering/5.1-kmeans-part-1,227,34
...,...,...,...
2389,javascript-i/functions,8179,712
2390,javascript-i/javascript-with-html,8501,713
2391,javascript-i/introduction/working-with-data-ty...,8619,688
2403,search/search_index.json,22341,794


In [66]:
observed.page_viewed.value_counts().reset_index(name='count').query('count > 217')['index'].tolist()[-6:]

[]