In [1]:
import pandas as pd

# Load and Clean Data

In [2]:
# Load exploratory data
# `fname` must be bound before read_csv; with both assignments commented out this
# cell raised NameError on a fresh kernel. Swap the two lines below to load the
# alternative packages file instead.
fname = '../osfstorage-archive/upworthy-archive-datasets/upworthy-archive-exploratory-packages-03.12.2020.csv'
# fname = '../materials/packages.csv'
# The first CSV column is used as the row index.
df = pd.read_csv(fname, index_col=0)

In [3]:
# Shorten the two verbose ID column names, then keep only the columns used below.
column_aliases = {'clickability_test_id': 'test_id', 'eyecatcher_id': 'image_id'}
kept_columns = [
    'test_id',
    'headline',
    'image_id',
    'impressions',
    'clicks',
    'first_place',
    'winner',
    'test_week',
]
df = df.rename(columns=column_aliases)[kept_columns]

# Preview five random rows (unseeded, so the rows differ between runs)
df.sample(5)

Unnamed: 0,test_id,headline,image_id,impressions,clicks,first_place,winner,test_week
27231,541b2c1ce906a0d2090000a3,If Everyone In The World Had As Big A Heart As...,541b415ae906a00f4a0000a5,4889,33,False,False,201440
96007,5420b4e2ac4191ecb200000c,Watch Her Tell Congress That She’s Not A Welfa...,542093ef02c5bfd15c000035,7020,28,False,False,201438
1296,54909d1e6264660012340000,The First 26 Seconds Are Just Upsetting. But A...,54909e656264660021300000,3625,24,True,False,201450
88316,538547030e2422c037000007,Watch Strangers Step In When A Man Physically ...,53850cbb8d1e6ea0fc000052,4171,65,False,False,201421
101629,53c468ac76a1348576000063,The Most Christian Way To Respond When Your Fr...,53c4675b76a134f85900004d,3122,17,False,False,201428


In [4]:
# Description: print the index range, per-column non-null counts and dtypes
# of the trimmed frame before any type conversion.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 22666 entries, 0 to 150816
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   test_id      22666 non-null  object
 1   headline     22666 non-null  object
 2   image_id     22644 non-null  object
 3   impressions  22666 non-null  int64 
 4   clicks       22666 non-null  int64 
 5   first_place  22666 non-null  bool  
 6   winner       22666 non-null  bool  
 7   test_week    22666 non-null  int64 
dtypes: bool(2), int64(3), object(3)
memory usage: 1.3+ MB


In [5]:
# Cast each family of columns to its proper dtype
categories = ['test_id', 'image_id']
binaries = ['first_place', 'winner']
strings = ['headline']

for columns, dtype in ((categories, 'category'), (binaries, 'int'), (strings, 'string')):
    df[columns] = df[columns].astype(dtype)

# Collapse rows describing the same package (same test, image and headline):
# clicks and impressions are summed, each flag is kept if any duplicate had it set,
# and the week of the first row in the group is retained.
# NOTE(review): the df.info() output above shows missing values in image_id, and
# groupby drops rows whose key is NaN by default -- confirm that discarding those
# packages is intended.
aggregation_functions = {
    'clicks': 'sum',
    'impressions': 'sum',
    'first_place': 'max',
    'winner': 'max',
    'test_week': 'first',
}
packages = df.groupby(['test_id', 'image_id', 'headline'], as_index=False, observed=True)
df = packages.agg(aggregation_functions)

# Swap the raw counts for a single click-through-rate column (clicks per impression)
df = df.assign(ctr=df['clicks'] / df['impressions']).drop(columns=['clicks', 'impressions'])

# Description of the aggregated frame
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19161 entries, 0 to 19160
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   test_id      19161 non-null  category
 1   image_id     19161 non-null  category
 2   headline     19161 non-null  object  
 3   first_place  19161 non-null  int64   
 4   winner       19161 non-null  int64   
 5   test_week    19161 non-null  int64   
 6   ctr          19161 non-null  float64 
dtypes: category(2), float64(1), int64(3), object(1)
memory usage: 1.3+ MB


In [6]:
# Summary: count, mean, std, min, quartiles and max for each numeric column
# (first_place, winner, test_week, ctr).
df.describe()

Unnamed: 0,first_place,winner,test_week,ctr
count,19161.0,19161.0,19161.0,19161.0
mean,0.251448,0.06007,201416.072282,0.01504
std,0.433857,0.237623,55.969493,0.01201
min,0.0,0.0,201303.0,0.0
25%,0.0,0.0,201401.0,0.007022
50%,0.0,0.0,201428.0,0.011828
75%,1.0,0.0,201443.0,0.019385
max,1.0,1.0,201517.0,0.136063


In [7]:
# Unique packages: one row per (test_id, image_id, headline) combination
df

Unnamed: 0,test_id,image_id,headline,first_place,winner,test_week,ctr
0,546d88fb84ad38b2ce000024,546d6fa19ad54eec8d00002d,They're Being Called 'Walmart's Worst Nightmar...,1,1,201446,0.042148
1,546d902c26714c6c44000039,546bc55335992b86c8000043,This Is What Sexism Against Men Sounds Like,0,0,201446,0.025525
2,546d902c26714c6c44000039,546d900426714cd2dd00002e,This Is What Sexism Against Men Sounds Like,1,0,201446,0.034227
3,546d902c26714c6c44000039,546d900426714c6c44000038,This Is What Sexism Against Men Sounds Like,0,0,201446,0.028994
4,546d902c26714c6c44000039,546d900426714c1ad900001e,This Is What Sexism Against Men Sounds Like,0,0,201446,0.021645
...,...,...,...,...,...,...,...
19156,546d082afd36176c99000049,546bc55335992b614b000046,"What Sexism Against Men Looks Like, With Your ...",0,0,201446,0.017954
19157,546c503abadeb51585000003,546c567f545240b352000009,"A Pen, Paper, And Watercolors Tell One Of The ...",0,0,201446,0.010920
19158,546c503abadeb51585000003,546c567f545240b352000009,An Artist Animates The Ugly Truth We Should Al...,0,0,201446,0.009219
19159,546c503abadeb51585000003,546c567f545240b352000009,Ferguson Has Taught Us Many Things. Here's The...,0,0,201446,0.007845


In [8]:
# How many distinct tests the packages belong to (number of test_id groups)
df.groupby('test_id').ngroups

4873

In [9]:
# Destination for the cleaned packages; the commented path is the alternative target.
output = '../output/exploratory.csv'
# output = '../output/packages.csv'

# Write the frame (row index included) as CSV
df.to_csv(path_or_buf=output)