In [18]:
import pandas as pd
import numpy as np
from scipy.stats import pearsonr, ttest_ind

In [3]:
df = pd.read_csv("traffic.csv")

In [4]:
df.head()

Unnamed: 0,event,date,country,city,artist,album,track,isrc,linkid
0,click,2021-08-21,Saudi Arabia,Jeddah,Tesher,Jalebi Baby,Jalebi Baby,QZNWQ2070741,2d896d31-97b6-4869-967b-1c5fb9cd4bb8
1,click,2021-08-21,Saudi Arabia,Jeddah,Tesher,Jalebi Baby,Jalebi Baby,QZNWQ2070741,2d896d31-97b6-4869-967b-1c5fb9cd4bb8
2,click,2021-08-21,India,Ludhiana,Reyanna Maria,So Pretty,So Pretty,USUM72100871,23199824-9cf5-4b98-942a-34965c3b0cc2
3,click,2021-08-21,France,Unknown,"Simone & Simaria, Sebastian Yatra",No Llores Más,No Llores Más,BRUM72003904,35573248-4e49-47c7-af80-08a960fa74cd
4,click,2021-08-21,Maldives,Malé,Tesher,Jalebi Baby,Jalebi Baby,QZNWQ2070741,2d896d31-97b6-4869-967b-1c5fb9cd4bb8


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 226278 entries, 0 to 226277
Data columns (total 9 columns):
 #   Column   Non-Null Count   Dtype 
---  ------   --------------   ----- 
 0   event    226278 non-null  object
 1   date     226278 non-null  object
 2   country  226267 non-null  object
 3   city     226267 non-null  object
 4   artist   226241 non-null  object
 5   album    226273 non-null  object
 6   track    226273 non-null  object
 7   isrc     219157 non-null  object
 8   linkid   226278 non-null  object
dtypes: object(9)
memory usage: 15.5+ MB


In [6]:
df['date'] = pd.to_datetime(df['date'])
df = df.sort_values(by='date', ascending=True)

In [7]:
df.event.unique()

array(['pageview', 'click', 'preview'], dtype=object)

#### How many total pageview events did the links in the provided dataset receive in the full period, how many per day?

Total per day

In [8]:
df.query("event == 'pageview'").groupby(df.date.dt.date).size()

date
2021-08-19    22366
2021-08-20    21382
2021-08-21    21349
2021-08-22    20430
2021-08-23    18646
2021-08-24    18693
2021-08-25    19149
dtype: int64

#### What about the other recorded events?

In [9]:
df.query("event in ['click', 'preview'] ").groupby(df.date.dt.date).size()

date
2021-08-19    12995
2021-08-20    12730
2021-08-21    12734
2021-08-22    12203
2021-08-23    11162
2021-08-24    11141
2021-08-25    11298
dtype: int64

#### Which countries did the pageviews come from?



In [12]:
country_page_views = df['country'].value_counts().reset_index()
country_page_views.columns = ['country', 'page_views']
country_page_views

Unnamed: 0,country,page_views
0,Saudi Arabia,47334
1,India,42992
2,United States,32558
3,France,15661
4,Iraq,8260
...,...,...
206,Wallis and Futuna,2
207,Saint Martin,2
208,Solomon Islands,2
209,Sint Maarten,1


#### What was the overall click rate (clicks/pageviews)?

In [15]:
total_pageviews = df.shape[0]

click_rate = total_clicks / total_pageviews
click_rate

1.0

#### How does the clickrate distribute across different links?

In [16]:
link_clicks = df['linkid'].value_counts().reset_index()
link_clicks.columns = ['linkid', 'clicks']

total_pageviews = df.shape[0]
link_clicks['clickrate'] = link_clicks['clicks'] / total_pageviews

link_clicks.head()

Unnamed: 0,linkid,clicks,clickrate
0,2d896d31-97b6-4869-967b-1c5fb9cd4bb8,40841,0.18049
1,522da5cc-8177-4140-97a7-a84fdb4caf1c,10314,0.045581
2,e849515b-929d-44c8-a505-e7622f1827e9,9750,0.043089
3,c2c876ab-b093-4750-9449-6b4913da6af3,6733,0.029755
4,681d888a-59ce-4acb-b7c5-95dab0c063d9,5512,0.024359


#### Is there any correlation between clicks and previews on a link? Is it significant? How large is the effect? Make sure to at least test for potential linear as well as categorical (think binary) relationships between both variables.

In [19]:
link_clicks = df['linkid'].value_counts().reset_index()
link_clicks.columns = ['linkid', 'clicks']

link_clicks['previews'] = link_clicks['clicks']

correlation, p_value = pearsonr(link_clicks['clicks'], link_clicks['previews'])

median_previews = link_clicks['previews'].median()
link_clicks['high_previews'] = np.where(link_clicks['previews'] > median_previews, 1, 0)

high_previews_clicks = link_clicks[link_clicks['high_previews'] == 1]['clicks']
low_previews_clicks = link_clicks[link_clicks['high_previews'] == 0]['clicks']

t_stat, p_value_cat = ttest_ind(high_previews_clicks, low_previews_clicks)

correlation, p_value, t_stat, p_value_cat, high_previews_clicks.mean(), low_previews_clicks.mean()

(0.9999999999999695,
 0.0,
 6.513935663231871,
 8.269689666893291e-11,
 166.9707426856714,
 1.4788507581803672)