In [2]:
import pandas as pd

# Load and Clean Data

In [30]:
# Read the input table into `df`.
# The active source is the CSV under ../output; judging by its file name it
# already carries the extra feature columns (TODO confirm with its producer).
# Alternative source kept for reference (the exploratory archive):
#   '../osfstorage-archive/upworthy-archive-datasets/upworthy-archive-exploratory-packages-03.12.2020.csv'
fname = '../output/armando_with_all_features.csv'
df = pd.read_csv(filepath_or_buffer=fname)

In [31]:
# Entity-label names used to build the per-label feature column names.
# NOTE(review): these look like named-entity-recognition labels — confirm
# which tagger produced them.
ner_keys = [
    'LOC', 'PERSON', 'WORK_OF_ART', 'LANGUAGE', 'PRODUCT', 'QUANTITY',
    'DATE', 'CARDINAL', 'GPE', 'TIME', 'ORG', 'LAW',
    'MONEY', 'FAC', 'EVENT', 'PERCENT', 'NORP', 'ORDINAL',
]

# Two columns per label. The suffix is the outer loop, so the list holds every
# "<label>_start" name first and every "<label>_end" name after that.
ner_cols = [
    f'{label}{suffix}'
    for suffix in ('_start', '_end')
    for label in ner_keys
]

['LOC_start',
 'PERSON_start',
 'WORK_OF_ART_start',
 'LANGUAGE_start',
 'PRODUCT_start',
 'QUANTITY_start',
 'DATE_start',
 'CARDINAL_start',
 'GPE_start',
 'TIME_start',
 'ORG_start',
 'LAW_start',
 'MONEY_start',
 'FAC_start',
 'EVENT_start',
 'PERCENT_start',
 'NORP_start',
 'ORDINAL_start',
 'LOC_end',
 'PERSON_end',
 'WORK_OF_ART_end',
 'LANGUAGE_end',
 'PRODUCT_end',
 'QUANTITY_end',
 'DATE_end',
 'CARDINAL_end',
 'GPE_end',
 'TIME_end',
 'ORG_end',
 'LAW_end',
 'MONEY_end',
 'FAC_end',
 'EVENT_end',
 'PERCENT_end',
 'NORP_end',
 'ORDINAL_end']

In [32]:
# Remove useless columns: keep the headline text, the raw impression/click
# counts, the test week, and the feature columns listed in `other_features`.
other_features = ['hl_len', 'neg', 'neu', 'pos'] + ner_cols
df = df[['headline', 'impressions', 'clicks', 'test_week'] + other_features]

# Preview a few random rows. The seed is fixed so the same rows are shown
# every time the notebook is re-run from a fresh kernel.
df.sample(5, random_state=0)

Unnamed: 0,headline,impressions,clicks,test_week,hl_len,neg,neu,pos,LOC_start,PERSON_start,...,GPE_end,TIME_end,ORG_end,LAW_end,MONEY_end,FAC_end,EVENT_end,PERCENT_end,NORP_end,ORDINAL_end
38987,The Episode Of 'Orange Is The New Black' That ...,5172,77,2014-06-16,81,0.925863,0.07051,0.003628,0,0,...,0,0,41,0,0,0,0,0,0,0
40966,I Hope You Don't Make One Of These Silly Mista...,4099,39,2014-06-30,85,0.887503,0.107867,0.00463,0,0,...,0,0,0,0,0,0,0,0,0,0
180,The National Frozen Pizza Institute exists. An...,3605,6,2015-02-23,75,0.881179,0.114523,0.004299,0,0,...,0,0,36,0,0,0,0,0,0,0
56024,Attention: Women Were Never 'Given' The Right ...,3201,20,2014-11-03,74,0.506935,0.48578,0.007285,0,0,...,0,0,0,0,0,0,0,0,0,0
26226,We’re Living With Something That’s All Amped U...,6129,22,2014-09-01,88,0.979228,0.017553,0.003219,0,0,...,0,0,0,0,0,0,0,0,0,0


In [33]:
# Description: print the frame's index range, column names, non-null counts
# and dtypes as a quick schema / missing-value check after column selection.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 66972 entries, 0 to 66971
Data columns (total 44 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   headline           66972 non-null  object 
 1   impressions        66972 non-null  int64  
 2   clicks             66972 non-null  int64  
 3   test_week          66972 non-null  object 
 4   hl_len             66972 non-null  int64  
 5   neg                66972 non-null  float64
 6   neu                66972 non-null  float64
 7   pos                66972 non-null  float64
 8   LOC_start          66972 non-null  int64  
 9   PERSON_start       66972 non-null  int64  
 10  WORK_OF_ART_start  66972 non-null  int64  
 11  LANGUAGE_start     66972 non-null  int64  
 12  PRODUCT_start      66972 non-null  int64  
 13  QUANTITY_start     66972 non-null  int64  
 14  DATE_start         66972 non-null  int64  
 15  CARDINAL_start     66972 non-null  int64  
 16  GPE_start          669

In [36]:
# Correctly type the data
strings = ['headline']
datetimes = ['test_week']

# Every column in both lists is stored with pandas' 'string' dtype.
# NOTE(review): test_week is listed under `datetimes` but is kept as text, not
# converted to datetime64 — confirm this is intended.
text_dtypes = dict.fromkeys(strings + datetimes, 'string')

# Aggregate the same packages: rows sharing (headline, test_week) are merged.
# Clicks and impressions are summed; every other feature keeps the value from
# the first row of its group (presumably these features depend only on the
# headline, so all rows in a group agree — TODO confirm).
# A 'significance' aggregate was sketched here earlier but is not computed.
aggregation_functions = {'clicks': 'sum', 'impressions': 'sum'}
aggregation_functions.update({col: 'first' for col in other_features})

# Built as one chain so the incoming frame is never modified in place:
# astype() returns a new frame instead of assigning columns on the existing one.
df = (
    df.astype(text_dtypes)
    .groupby(['headline', 'test_week'], as_index=False, observed=True)
    .agg(aggregation_functions)
    # Replace clicks and impressions with ctr (click-through-rate).
    # A group whose impressions sum to 0 produces inf/NaN here, not an error.
    .assign(ctr=lambda packages: packages['clicks'] / packages['impressions'])
    .drop(columns=['clicks', 'impressions'])
)

# Description
df.info()

{'clicks': 'sum', 'impressions': 'sum', 'hl_len': 'first', 'neg': 'first', 'neu': 'first', 'pos': 'first', 'LOC_start': 'first', 'PERSON_start': 'first', 'WORK_OF_ART_start': 'first', 'LANGUAGE_start': 'first', 'PRODUCT_start': 'first', 'QUANTITY_start': 'first', 'DATE_start': 'first', 'CARDINAL_start': 'first', 'GPE_start': 'first', 'TIME_start': 'first', 'ORG_start': 'first', 'LAW_start': 'first', 'MONEY_start': 'first', 'FAC_start': 'first', 'EVENT_start': 'first', 'PERCENT_start': 'first', 'NORP_start': 'first', 'ORDINAL_start': 'first', 'LOC_end': 'first', 'PERSON_end': 'first', 'WORK_OF_ART_end': 'first', 'LANGUAGE_end': 'first', 'PRODUCT_end': 'first', 'QUANTITY_end': 'first', 'DATE_end': 'first', 'CARDINAL_end': 'first', 'GPE_end': 'first', 'TIME_end': 'first', 'ORG_end': 'first', 'LAW_end': 'first', 'MONEY_end': 'first', 'FAC_end': 'first', 'EVENT_end': 'first', 'PERCENT_end': 'first', 'NORP_end': 'first', 'ORDINAL_end': 'first'}
<class 'pandas.core.frame.DataFrame'>
RangeInde

In [37]:
# Summary: count, mean, std, min, quartiles and max for each numeric column
# of the aggregated frame (feature columns plus ctr).
df.describe()

Unnamed: 0,hl_len,neg,neu,pos,LOC_start,PERSON_start,WORK_OF_ART_start,LANGUAGE_start,PRODUCT_start,QUANTITY_start,...,TIME_end,ORG_end,LAW_end,MONEY_end,FAC_end,EVENT_end,PERCENT_end,NORP_end,ORDINAL_end,ctr
count,60000.0,60000.0,60000.0,60000.0,60000.0,60000.0,60000.0,60000.0,60000.0,60000.0,...,60000.0,60000.0,60000.0,60000.0,60000.0,60000.0,60000.0,60000.0,60000.0,60000.0
mean,80.731483,0.388961,0.424157,0.186882,0.259483,4.2603,3.779433,0.025683,0.1085,0.040333,...,0.61565,5.161067,0.220483,0.346967,0.17165,0.367533,0.251683,0.9383,0.46765,0.013374
std,15.111054,0.39691,0.350055,0.31853,4.052521,14.780461,13.624609,1.193921,2.665992,1.461167,...,6.265196,17.59176,3.905107,4.462579,3.44209,4.799413,3.735292,7.33142,5.266287,0.011263
min,4.0,0.000644,0.0051,0.002589,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,71.0,0.005941,0.072735,0.004743,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.005997
50%,83.0,0.206177,0.349617,0.015562,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.010165
75%,94.0,0.846044,0.785455,0.175372,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.017009
max,140.0,0.985721,0.983769,0.992851,96.0,109.0,109.0,93.0,95.0,88.0,...,101.0,112.0,101.0,101.0,101.0,101.0,101.0,107.0,101.0,0.148913


In [38]:
# Show the aggregated frame: one row per (headline, test_week) group.
# pandas elides the middle rows when rendering a frame this long.
df # Unique Packages

Unnamed: 0,headline,test_week,hl_len,neg,neu,pos,LOC_start,PERSON_start,WORK_OF_ART_start,LANGUAGE_start,...,TIME_end,ORG_end,LAW_end,MONEY_end,FAC_end,EVENT_end,PERCENT_end,NORP_end,ORDINAL_end,ctr
0,"\nLike Downton Abbey? Well, There’s Another Dr...",2013-06-03,98,0.911010,0.085331,0.003659,0,7,81,0,...,0,77,0,0,0,0,0,0,0,0.016213
1,"\nWell I Gotta Give It To Him, That's A Pretty...",2013-06-17,86,0.474779,0.414655,0.110566,0,0,38,0,...,0,0,0,0,0,0,0,0,0,0.019236
2,\nYet Another Anti-Gay Guy Saying Gays Should…...,2013-06-17,78,0.925382,0.068099,0.006519,0,6,0,0,...,0,77,0,0,0,0,0,0,0,0.019155
3,'Be The Change You Want To See.' It's An Insp...,2013-06-17,94,0.001423,0.246190,0.752387,0,0,40,0,...,0,0,0,0,0,0,0,0,0,0.014886
4,'This Land Was Made For You And Me' — Or Was It?,2013-08-05,49,0.001603,0.827009,0.171388,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0.012906
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59995,“Success Means Winning First Place” And Other ...,2014-04-28,65,0.737020,0.240421,0.022559,0,0,2,0,...,0,0,0,0,0,0,0,0,0,0.010306
59996,“There Is Nothing More Dangerous Than A Govern...,2014-06-30,86,0.971427,0.025523,0.003051,0,0,2,0,...,0,0,0,0,0,0,0,0,0,0.007223
59997,"“You Make A Dime, They Take A Dollar.” Here's ...",2014-09-22,98,0.155437,0.836000,0.008563,0,0,2,0,...,0,98,0,0,0,0,0,0,0,0.004309
59998,“You’re Rude. I Save Lives.”,2014-04-28,28,0.809876,0.177341,0.012783,0,0,2,0,...,0,0,0,0,0,0,0,0,0,0.030717


In [39]:
# Write the aggregated frame to CSV for later use.
# The row index is left out of the file (index=False).
# Alternative destination kept for reference: '../output/packages.csv'
output = '../output/armando_all_features_global_prediction.csv'
df.to_csv(path_or_buf=output, index=False)