In [99]:
import pandas as pd
import re 

In [100]:

# Functions for removing contractions 
contraction_dict = { "ain't": "are not","'s":" is","aren't": "are not",
                     "can't": "cannot","can't've": "cannot have",
                     "'cause": "because","could've": "could have","couldn't": "could not",
                     "couldn't've": "could not have", "didn't": "did not","doesn't": "does not",
                     "don't": "do not","hadn't": "had not","hadn't've": "had not have",
                     "hasn't": "has not","haven't": "have not","he'd": "he would",
                     "he'd've": "he would have","he'll": "he will", "he'll've": "he will have",
                     "how'd": "how did","how'd'y": "how do you","how'll": "how will",
                     "I'd": "I would", "I'd've": "I would have","I'll": "I will",
                     "I'll've": "I will have","I'm": "I am","I've": "I have", "isn't": "is not",
                     "it'd": "it would","it'd've": "it would have","it'll": "it will",
                     "it'll've": "it will have", "let's": "let us","ma'am": "madam",
                     "mayn't": "may not","might've": "might have","mightn't": "might not", 
                     "mightn't've": "might not have","must've": "must have","mustn't": "must not",
                     "mustn't've": "must not have", "needn't": "need not",
                     "needn't've": "need not have","o'clock": "of the clock","oughtn't": "ought not",
                     "oughtn't've": "ought not have","shan't": "shall not","sha'n't": "shall not",
                     "shan't've": "shall not have","she'd": "she would","she'd've": "she would have",
                     "she'll": "she will", "she'll've": "she will have","should've": "should have",
                     "shouldn't": "should not", "shouldn't've": "should not have","so've": "so have",
                     "that'd": "that would","that'd've": "that would have", "there'd": "there would",
                     "there'd've": "there would have", "they'd": "they would",
                     "they'd've": "they would have","they'll": "they will",
                     "they'll've": "they will have", "they're": "they are","they've": "they have",
                     "to've": "to have","wasn't": "was not","we'd": "we would",
                     "we'd've": "we would have","we'll": "we will","we'll've": "we will have",
                     "we're": "we are","we've": "we have", "weren't": "were not","what'll": "what will",
                     "what'll've": "what will have","what're": "what are", "what've": "what have",
                     "when've": "when have","where'd": "where did", "where've": "where have",
                     "who'll": "who will","who'll've": "who will have","who've": "who have",
                     "why've": "why have","will've": "will have","won't": "will not",
                     "won't've": "will not have", "would've": "would have","wouldn't": "would not",
                     "wouldn't've": "would not have","y'all": "you all", "y'all'd": "you all would",
                     "y'all'd've": "you all would have","y'all're": "you all are",
                     "y'all've": "you all have", "you'd": "you would","you'd've": "you would have",
                     "you'll": "you will","you'll've": "you will have", "you're": "you are",
                     "you've": "you have"}
def expand_contractions(data,contractions_dict = contraction_dict):
  '''
    Expanding Contractions
    Arguments:
      data: textual dataset 
      contractions_dict : dictionanary containing the contractions and their replacements 
    Returns :
      clean_data : textual dataset where contractions are expanded
  '''
  # Regular expression for finding contractions
  contractions_re = re.compile('(%s)' % '|'.join(contractions_dict.keys()))
  def replace(match):
      return contractions_dict[match.group(0)]
  return contractions_re.sub(replace, data)

In [101]:
df_ad_analyst = pd.read_csv('../ad_analyst_dataset.csv')
df_ad_analyst

Unnamed: 0,ads
0,back to school a real opportunity register joi...
1,every day children across yemen and syria go t...
2,in our february issue mineral carbonation dam ...
3,become a specialist in energy renovation in bu...
4,if you think like joseph his story should spea...
...,...
6340,identity v coa iv naeu online qualifier is liv...
6341,moms everywhere are taking this simple step to...
6342,my medication wasnt covered by my insurance
6343,realm logistics pressure washing licensed insu...


In [102]:
df_counterpublics = pd.read_csv('../counterpublics_dataset.csv')
df_counterpublics 

Unnamed: 0,ads
0,we're stepping up our vital democratic engagem...
1,we know not everyone is in a position to give ...
2,our progress for canadians is powered by grass...
3,sign here if you think justin trudeau and his ...
4,instead of being focused on rebuilding our eco...
...,...
11108,an efficient mayor is not only a good manager ...
11109,unforgettable walks accessible to all\nembark ...
11110,we're at 25000 followers and still growing str...
11111,as chief meteorologist at miami's nbc 6 john m...


In [103]:
df_propublica = pd.read_csv('../propublica_dataset.csv')
df_propublica

Unnamed: 0,ads
0,Access to safe water improves health and saves...
1,"Spring its just around the corner, and at Nort..."
2,Your exceptional career in teaching starts here.
3,As COVID-19 puts online shopping into overdriv...
4,Donate now to help bring relief to those suffe...
...,...
67666,Now online! Learn more about the complexities ...
67667,"AFSC is offering a global, interconnected resp..."
67668,It’s time to end Big Tobacco’s strategic killi...
67669,Public media stations are providing enhanced s...


In [104]:
# Merge all datasets
df_all = pd.concat([df_ad_analyst,df_counterpublics,df_propublica],ignore_index=True)
df_all

Unnamed: 0,ads
0,back to school a real opportunity register joi...
1,every day children across yemen and syria go t...
2,in our february issue mineral carbonation dam ...
3,become a specialist in energy renovation in bu...
4,if you think like joseph his story should spea...
...,...
85124,Now online! Learn more about the complexities ...
85125,"AFSC is offering a global, interconnected resp..."
85126,It’s time to end Big Tobacco’s strategic killi...
85127,Public media stations are providing enhanced s...


In [105]:
df_clean = df_all.copy()

# Put text in lowercase
df_clean ['ads'] = df_clean['ads'].str.lower() 

# Expand contractions 
df_clean['ads']= df_clean['ads'].apply(lambda x:expand_contractions(x))

# Remove ponctuation 
df_clean['ads'] = df_clean['ads'].str.replace('[^\w\s]','', regex= True)

# Remove extra whitespace 
df_clean['ads'] = df_clean['ads'].str.strip()

# Remove numbers 
df_clean['ads'] = df_clean['ads'].str.replace('\d+', '', regex= True)

df_clean

Unnamed: 0,ads
0,back to school a real opportunity register joi...
1,every day children across yemen and syria go t...
2,in our february issue mineral carbonation dam ...
3,become a specialist in energy renovation in bu...
4,if you think like joseph his story should spea...
...,...
85124,now online learn more about the complexities o...
85125,afsc is offering a global interconnected respo...
85126,its time to end big tobaccos strategic killing...
85127,public media stations are providing enhanced s...


In [130]:
df_clean.to_csv('../clean_dataset.csv',index=False)

In [106]:
# Words and sequences to filter authority based ads 
authority_filters = [ 'expert', 'president','founder','ceo','teacher','police', 'college', 'universit', 'journal','state official',
                  'conclusive', 'decisive', 'approved', 'official', 'sanctioned', 'accurate', 'comprehensive', 'exhaustive', 
                  'predominant','proven', 'skilful', 'proficient', 'adept', 'qualified', 'expertise','dr', 'phd', 
                  'research','professional','scientist','academic','authorities', 'precise','reliable','professor','company',
                  'administration', 'institution', 'association' , 'commission',  'recognized',  'known', 'well known',  'well established',
                  'senator', 'governor', 'elected', 'officials', 'officer', 'mayor', 'sheriff', 'leader', 'judge', 'attorney', 'senior']

def contains_word(s, w):
  if w in s.lower() :
    return True
  else: 
    return False

In [107]:
# create a new dataframe 
df_authority = df_clean.copy()
df_authority['authority'] = 0
df_authority

Unnamed: 0,ads,authority
0,back to school a real opportunity register joi...,0
1,every day children across yemen and syria go t...,0
2,in our february issue mineral carbonation dam ...,0
3,become a specialist in energy renovation in bu...,0
4,if you think like joseph his story should spea...,0
...,...,...
85124,now online learn more about the complexities o...,0
85125,afsc is offering a global interconnected respo...,0
85126,its time to end big tobaccos strategic killing...,0
85127,public media stations are providing enhanced s...,0


In [108]:
index, authority_true = 0 , set()

# filter the ads 
for sequence in df_authority['ads']:
  for filter in authority_filters:                     
    if (contains_word(sequence, filter)):
      authority_true.add(sequence)
  index += 1 
len(authority_true)

30962

In [123]:
df_authority_true = pd.DataFrame(list(authority_true), columns=['ads']).sample(n = 25000).reset_index(drop = True)
df_authority_true.drop_duplicates(subset = ['ads'], keep = 'first', inplace = True)
df_authority_true['authority'] = 1
df_authority_true 

Unnamed: 0,ads,authority
0,for thousands of families across the south bro...,1
1,add your name now to the growing chorus of ame...,1
2,my opponent ron estes is back up on tv but his...,1
3,everyone is affected in a serious and ongoing ...,1
4,donald trumps newly emboldened cruel deportati...,1
...,...,...
24995,the texas legislature is debating how to fund ...,1
24996,yes on e amp g helps students prepare for tran...,1
24997,rsvp and share chikesia was choked and had her...,1
24998,breaking the latest poll shows that our specia...,1


In [124]:
# get ads that are more likely to not contain the authority bias 
larger_filter = [ 'expert', 'president','founder','ceo','teacher','police', 'school', 'college', 'universitie', 'journal',
                'companies','conclusive', 'decisive', 'approved', 'official', 'sanction', 'accurate', 'predominant','proven','practiced', 
                'technical', 'expert', 'skilful', 'proficient', 'adept', 'skillful', 'dr.', 'phd','state','researcher','professional',
                'scientist','practitioner','academic','authorities','precise','reliable','detail','consistent','correct','union','nation',
                'federal','department','scientist','professor',"company","campaign","league","society", "administration","court","organization",
                "university", "institution","bureau","center","agency","association","foundation" ,"commission", 'recognize', 'national',  
                'known', 'endorse', 'owner', 'mayor',  'unicef', 'sheriff', 'leader', 'judge', 'attorney',  'senior','cargiver', 'student', 
                'company', 'companies', 'indutry', 'industries', 'donate', 'activist','america', 'senator', 'sponsored', 'sponsor', 'community',
                'communities', 'governor', 'governors', 'organization', 'organisation', 'elect', 'owner', 'buisness', 'farmer', 'farmers', 'owners', 
                'families', 'family', 'worker', 'brand', 'research',  'press', 'republicain', 'republican', 'party', 'parties', 'democratic', 
                'republican',  'security', 'officer', 'canadian', 'america', "people", 'washington', 'stand', 'gov', 'govt', 'government']

In [125]:
# Filter ads that are more likely to not contain the authority bias
index, authority_true, authority_false = 0 , set(), set()

# filter the ads 
for sequence in df_authority['ads']:
  for filter in larger_filter:                     
    if (contains_word(sequence, filter)):
      authority_true.add(sequence)
  index += 1 

for sequence in df_authority['ads']:
  if sequence not in authority_true:
    authority_false.add(sequence)

In [127]:
df_authority_false = pd.DataFrame(list(authority_false), columns=['ads'])
df_authority_false.drop_duplicates(subset = ['ads'], keep = 'first', inplace = True)
df_authority_false['authority'] = 0
df_authority_false

Unnamed: 0,ads,authority
0,,0
1,vote to stop chevron is takeover of new mexico,0
2,ready to learn more about dupixent full prescr...,0
3,this is not their connecticut this is your con...,0
4,it took thousands of hours of work by dedicate...,0
...,...,...
23866,do not miss out on important news if you are i...,0
23867,michigan gubernatorial candidate shri thanedar...,0
23868,in emergency situations female hygiene is ofte...,0
23869,today is womensequalityday use your voice an...,0


In [128]:
authority_dataset = pd.concat([df_authority_false, df_authority_true], ignore_index = True)
authority_dataset

Unnamed: 0,ads,authority
0,,0
1,vote to stop chevron is takeover of new mexico,0
2,ready to learn more about dupixent full prescr...,0
3,this is not their connecticut this is your con...,0
4,it took thousands of hours of work by dedicate...,0
...,...,...
48866,the texas legislature is debating how to fund ...,1
48867,yes on e amp g helps students prepare for tran...,1
48868,rsvp and share chikesia was choked and had her...,1
48869,breaking the latest poll shows that our specia...,1


In [129]:
authority_dataset.to_csv('../../code/authority/data/authority_dataset_corrected.csv', index = False)