In [1]:
import os
import numpy as np
import pandas as pd
import json
import itertools
import pickle
import matplotlib.pyplot as plt
import torch
from sentence_transformers import SentenceTransformer, util

## Prepare data

### Select data relevant to federal races

In [4]:
# Relative path to the directory holding the fb_2022 input files; it is joined
# with the individual file names in the read_csv calls below.
source_path = '../../../../fb_2022'

In [5]:
# Load the ad-level variable table.
# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk, so columns cannot end up holding a mix of types
# (the recorded run of this cell emitted a warning pointing at this line).
var = pd.read_csv(
    os.path.join(source_path, 'fb_2022_adid_var_sets_091224.csv.gz'),
    low_memory=False,
)

  var = pd.read_csv(os.path.join(source_path, 'fb_2022_adid_var_sets_091224.csv.gz'))


In [6]:
var.columns

Index(['ad_id', 'page_id', 'ad_creation_time', 'ad_delivery_start_time',
       'ad_delivery_stop_time', 'spend', 'impressions', 'region_distribution',
       'demographic_distribution', 'publisher_platforms', 'potential_reach',
       'from_pd_id', 'wmp_media_type', 'aws_face_vid', 'aws_face_img',
       'sub_bucket', 'race_of_focus', 'race_of_focus_region_pct',
       'ad_tone_constructed', 'ad_tone_mentionbased', 'goal_DONATE_prediction',
       'goal_DONATE_predicted_prob', 'goal_CONTACT_prediction',
       'goal_CONTACT_predicted_prob', 'goal_PURCHASE_prediction',
       'goal_PURCHASE_predicted_prob', 'goal_GOTV_prediction',
       'goal_GOTV_predicted_prob', 'goal_EVENT_prediction',
       'goal_EVENT_predicted_prob', 'goal_POLL_prediction',
       'goal_POLL_predicted_prob', 'goal_GATHERINFO_prediction',
       'goal_GATHERINFO_predicted_prob', 'goal_LEARNMORE_prediction',
       'goal_LEARNMORE_predicted_prob', 'goal_PRIMARY_PERSUADE_prediction',
       'goal_PRIMARY_PERSUADE_

In [7]:
# Keep only the rows flagged as belonging to Set 3 (set3 == 1).
var = var.loc[var['set3'] == 1]

In [8]:
var.shape

(123997, 73)

In [9]:
var[~var.race_of_focus.isin(['No race of focus', 'Downballot'])].sub_bucket.unique()

array(['3.1', '3.2.2.2', '3.2.2.1', '1'], dtype=object)

In [10]:
# Only House/Senate races: keep a row when wmp_office is 'us senate' or
# 'us house', OR when its race_of_focus is not one of the two non-federal labels.
# Note: isin() is False for missing values, so rows with a missing
# race_of_focus also satisfy the second clause.
condition = (
    var['wmp_office'].isin(['us senate', 'us house'])
    | ~var['race_of_focus'].isin(['No race of focus', 'Downballot'])
)

In [11]:
# Apply the House/Senate mask built in the previous cell.
var = var.loc[condition]

In [12]:
var.shape

(96535, 73)

In [13]:
var.groupby(['wmp_spontype']).size()

wmp_spontype
campaign               54766
coordinated              343
down ballot                1
government agency        358
government official      847
group                  28497
party                   3109
party national          3613
dtype: int64

In [14]:
# Load the cid mapping table and the ad text table; both are merged onto
# `var` by ad_id in the following cells.
cid, text = (
    pd.read_csv(os.path.join(source_path, file_name))
    for file_name in ('cid_fb2022.csv', 'fb_2022_adid_text.csv.gz')
)

In [15]:
# Left join: every row of `var` is kept, with text columns attached by ad_id.
var = pd.merge(var, text, on='ad_id', how='left')

In [16]:
# Left join: every row of `var` is kept, with the cid mapping attached by ad_id.
var = pd.merge(var, cid, on='ad_id', how='left')

In [17]:
var[pd.isna(var.federal_cd)].shape

(355, 94)

In [18]:
# Drop clearly non-federal race sponsors.
# ne() is True for missing values, so rows without a wmp_spontype are kept.
var = var.loc[var['wmp_spontype'].ne('down ballot')]

### Aggregate data at sponsor-media level

#### Federal candidates

In [19]:
# Rows flagged federal_cd == 1 (rows with a missing flag are excluded).
var_cand = var.loc[var['federal_cd'].eq(1)]

In [20]:
# Columns used to de-duplicate the candidate rows before aggregating
# (see the groupby in the next cell).
cols = ['pd_id', 'wmp_media_type', 'cid']

In [21]:
# Aggregate by pd_id AND media type.
# Step 1: grouping on every column in `cols` and taking first() leaves one row
#         per distinct (pd_id, wmp_media_type, cid) combination.
# Step 2: regroup by (pd_id, wmp_media_type) and space-join the cids of each group.
cand_sponsor_agg = (
    var_cand[cols]
    .groupby(cols)
    .first()
    .reset_index()
    .groupby(['pd_id', 'wmp_media_type'])
    .agg(lambda cids: ' '.join(cids))
    .reset_index()
)

In [22]:
cand_sponsor_agg.head(2)

Unnamed: 0,pd_id,wmp_media_type,cid
0,pd-1000253160054572-3,image,cid_106784 cid_108932 cid_108933 cid_85818 cid...
1,pd-1000253160054572-3,video,cid_108054 cid_108055 cid_85825


In [23]:
# Number of cids in each space-joined cid string.
cand_sponsor_agg['num_unique'] = cand_sponsor_agg['cid'].map(
    lambda cid_str: len(cid_str.split())
)

In [24]:
# Keep only groups with more than one cid, since a pair is needed to compute a
# similarity. .copy() makes the result an independent frame, so the columns
# assigned to it later are written unambiguously and do not trigger pandas'
# SettingWithCopyWarning.
cand_sponsor_agg = cand_sponsor_agg[cand_sponsor_agg.num_unique > 1].copy()

In [25]:
var_cand.wmp_spontype.unique()

array(['campaign'], dtype=object)

#### Non-campaign sponsors: advertiser-media-race_of_focus-level aggregation

In [26]:
var[var.federal_cd == 0].wmp_spontype.unique()

array(['government agency', 'group', nan, 'campaign',
       'government official', 'party national', 'party', 'coordinated'],
      dtype=object)

In [27]:
var[var.federal_cd == 0].wmp_office.unique()

array([nan, 'down ballot', 'us house', 'ballot measure',
       'us senate - other cycle', 'us senate',
       'us house - not in our 2022 FEC cd file',
       'president - other cycle', 'us house - other cycle', 'governor',
       'election outside US or in US comm/territory', 'group'],
      dtype=object)

In [28]:
# Filter out irrelevant offices among non-campaign sponsors: keep rows with
# federal_cd == 0 whose wmp_office is not in the list below. Rows with a
# missing wmp_office are kept, because isin() is False for missing values.
irrelevant_offices = [
    'us house - other cycle',
    'us senate - other cycle',
    'down ballot',
    'president - other cycle',
    'ballot measure',
    'election outside US or in US comm/territory',
    'governor',
]
condition3 = var['federal_cd'].eq(0) & ~var['wmp_office'].isin(irrelevant_offices)

var_noncand = var.loc[condition3]

In [None]:
# var_noncand[var.wmp_spontype == 'government agency'][['race_of_focus', 'advertiser_name', 'ad_type']]

In [29]:
# Remove rows with no race of focus identified: both missing values and the
# explicit 'No race of focus' label.
var_noncand = var_noncand.loc[
    var_noncand['race_of_focus'].notna()
    & var_noncand['race_of_focus'].ne('No race of focus')
]

In [30]:
var_noncand.shape

(39415, 94)

In [31]:
# Columns carried into the sponsor-media-race aggregation in the next cell.
cols2 = ['pd_id', 'cid', 'race_of_focus', 'wmp_media_type']

In [32]:
# Aggregate by pd_id AND race of focus AND media type.
# Step 1: de-duplicate on ALL of cols2, so a creative that appears under more
#         than one media type is kept once per media type (the same approach
#         as the candidate aggregation). Grouping only on
#         (pd_id, cid, race_of_focus) and taking first() would keep a single
#         media type per creative and drop it from the other media groups.
# Step 2: regroup by (pd_id, race_of_focus, wmp_media_type) and space-join the
#         cids of each group.
noncand_sponsor_agg = (
    var_noncand[cols2]
    .groupby(cols2)
    .first()
    .reset_index()
    .groupby(['pd_id', 'race_of_focus', 'wmp_media_type'])
    .agg(lambda cids: ' '.join(cids))
    .reset_index()
)

In [33]:
noncand_sponsor_agg.head(2)

Unnamed: 0,pd_id,race_of_focus,wmp_media_type,cid
0,pd-100053898949-2,CA11,image,cid_127823 cid_66514
1,pd-100053898949-2,IN08,image,cid_130676


In [34]:
# Number of cids in each space-joined cid string.
noncand_sponsor_agg['num_unique'] = noncand_sponsor_agg['cid'].map(
    lambda cid_str: len(cid_str.split())
)

# Filter for sponsors with at least a pair of unique creatives. .copy() makes
# the result an independent frame, so the columns assigned to it later are
# written unambiguously and do not trigger pandas' SettingWithCopyWarning.
noncand_sponsor_agg = noncand_sponsor_agg[noncand_sponsor_agg.num_unique > 1].copy()

In [35]:
noncand_sponsor_agg.shape

(3270, 5)

### Import trained corpus embeddings and compute average similarity

In [36]:
# Load the saved corpus embeddings, mapping their storage onto the CPU.
# NOTE(review): torch.load unpickles the file, so only load artifacts that come
# from a trusted source.
corpus_embeddings = torch.load(
    '../input_data/corpus_embedding_fb2022_uni.pt',
    map_location=torch.device('cpu'),
)

In [37]:
corpus_embeddings.shape
# corpus embedding index aligned with ascending cid index starting from 0 

torch.Size([153952, 384])

In [38]:
def compute_avg_similarity(cid_lst, embeddings=None):
    """Mean and standard deviation of pairwise cosine similarity between creatives.

    Parameters
    ----------
    cid_lst : str
        Whitespace-separated creative ids of the form ``cid_<n>``. ``<n>`` is
        used directly as the row index into the embedding matrix.
    embeddings : torch.Tensor, optional
        2-D matrix with one embedding per row. Defaults to the notebook-level
        ``corpus_embeddings``.

    Returns
    -------
    tuple or None
        ``(mean, std)`` over all unordered pairs of ids (``std`` is numpy's
        default population standard deviation), or ``None`` when fewer than
        two ids are given and no pair can be formed.

    Raises
    ------
    ValueError
        If the part of an id after the ``cid_`` prefix is not an integer.
    """
    if embeddings is None:
        embeddings = corpus_embeddings

    prefix = 'cid_'
    # Remove the literal prefix. str.lstrip('cid_') instead strips any leading
    # run of the characters c, i, d and _, which is only right by accident and
    # would silently turn a malformed id such as 'cid_d12' into row 12.
    row_ids = [
        int(token[len(prefix):] if token.startswith(prefix) else token)
        for token in cid_lst.split()
    ]
    if len(row_ids) < 2:
        return None

    # A single product of the L2-normalised rows yields every pairwise cosine
    # similarity at once, replacing one tiny tensor op per pair in a Python
    # loop. The intermediate matrix is len(row_ids) x len(row_ids).
    vectors = torch.nn.functional.normalize(
        torch.as_tensor(embeddings)[row_ids], p=2, dim=1
    )
    sim_matrix = vectors @ vectors.T

    # Entries strictly above the diagonal are exactly the unordered pairs
    # (i < j), in the same order itertools.combinations would produce.
    upper = torch.triu_indices(len(row_ids), len(row_ids), offset=1)
    scores = sim_matrix[upper[0], upper[1]].detach().cpu().numpy()
    return scores.mean(), scores.std()

### Federal candidates: by advertiser and media type

In [39]:
# For each row, compute the mean and standard deviation of the pairwise
# similarity among its cids and store them in the 'avg' and 'std' columns.
# A None result (no pair available) becomes NaN in both columns.
cand_sponsor_agg.loc[:, ['avg', 'std']] = cand_sponsor_agg['cid'].apply(
    lambda cid_str: pd.Series(
        compute_avg_similarity(cid_str), index=['avg', 'std'], dtype="float"
    )
)

In [40]:
cand_sponsor_agg.head(4)

Unnamed: 0,pd_id,wmp_media_type,cid,num_unique,avg,std
0,pd-1000253160054572-3,image,cid_106784 cid_108932 cid_108933 cid_85818 cid...,7,0.679787,0.221322
1,pd-1000253160054572-3,video,cid_108054 cid_108055 cid_85825,3,0.939069,0.043084
2,pd-100112769260413-1,image,cid_118994 cid_29989 cid_61946 cid_67361 cid_7...,7,0.563807,0.086565
3,pd-100112769260413-1,video,cid_122019 cid_81062,2,0.584833,0.0


In [41]:
# Candidates only: write the per-(pd_id, media type) similarity table to disk.
cand_sponsor_agg.to_csv(
    '../output_data/fb_set3_cand_pdid_media_average_pairwise_similarity.csv',
    index=False,
)

### Non-candidate sponsors: by sponsor, media type, and race of focus


In [42]:
# For each row, compute the mean and standard deviation of the pairwise
# similarity among its cids and store them in the 'avg' and 'std' columns.
# A None result (no pair available) becomes NaN in both columns.
noncand_sponsor_agg.loc[:, ['avg', 'std']] = noncand_sponsor_agg['cid'].apply(
    lambda cid_str: pd.Series(
        compute_avg_similarity(cid_str), index=['avg', 'std'], dtype="float"
    )
)

In [43]:
# Final output for non-candidates: average pairwise text similarity per
# (pd_id, race of focus, media type), written to disk.
noncand_sponsor_agg.to_csv(
    '../output_data/fb_set3_noncandidate_pdid_racefocus_media_average_pairwise_similarity.csv',
    index=False,
)