In [18]:
import pandas as pd
import re 

In [22]:
import spacy 
# Only lemmas are consumed later in this notebook, so the dependency parser
# and the named-entity recogniser are switched off when loading the pipeline.
nlp = spacy.load(
    name='en_core_web_sm',
    disable=['parser', 'ner'],
)

In [23]:

# Functions for removing contractions
# Mapping from contraction to its expanded form.
# NOTE: "'s" is always expanded to " is", so possessives ("miami's") are
# expanded as well; this is a known limitation of a purely lexical approach.
contraction_dict = { "ain't": "are not","'s":" is","aren't": "are not",
                     "can't": "cannot","can't've": "cannot have",
                     "'cause": "because","could've": "could have","couldn't": "could not",
                     "couldn't've": "could not have", "didn't": "did not","doesn't": "does not",
                     "don't": "do not","hadn't": "had not","hadn't've": "had not have",
                     "hasn't": "has not","haven't": "have not","he'd": "he would",
                     "he'd've": "he would have","he'll": "he will", "he'll've": "he will have",
                     "how'd": "how did","how'd'y": "how do you","how'll": "how will",
                     "I'd": "I would", "I'd've": "I would have","I'll": "I will",
                     "I'll've": "I will have","I'm": "I am","I've": "I have", "isn't": "is not",
                     "it'd": "it would","it'd've": "it would have","it'll": "it will",
                     "it'll've": "it will have", "let's": "let us","ma'am": "madam",
                     "mayn't": "may not","might've": "might have","mightn't": "might not", 
                     "mightn't've": "might not have","must've": "must have","mustn't": "must not",
                     "mustn't've": "must not have", "needn't": "need not",
                     "needn't've": "need not have","o'clock": "of the clock","oughtn't": "ought not",
                     "oughtn't've": "ought not have","shan't": "shall not","sha'n't": "shall not",
                     "shan't've": "shall not have","she'd": "she would","she'd've": "she would have",
                     "she'll": "she will", "she'll've": "she will have","should've": "should have",
                     "shouldn't": "should not", "shouldn't've": "should not have","so've": "so have",
                     "that'd": "that would","that'd've": "that would have", "there'd": "there would",
                     "there'd've": "there would have", "they'd": "they would",
                     "they'd've": "they would have","they'll": "they will",
                     "they'll've": "they will have", "they're": "they are","they've": "they have",
                     "to've": "to have","wasn't": "was not","we'd": "we would",
                     "we'd've": "we would have","we'll": "we will","we'll've": "we will have",
                     "we're": "we are","we've": "we have", "weren't": "were not","what'll": "what will",
                     "what'll've": "what will have","what're": "what are", "what've": "what have",
                     "when've": "when have","where'd": "where did", "where've": "where have",
                     "who'll": "who will","who'll've": "who will have","who've": "who have",
                     "why've": "why have","will've": "will have","won't": "will not",
                     "won't've": "will not have", "would've": "would have","wouldn't": "would not",
                     "wouldn't've": "would not have","y'all": "you all", "y'all'd": "you all would",
                     "y'all'd've": "you all would have","y'all're": "you all are",
                     "y'all've": "you all have", "you'd": "you would","you'd've": "you would have",
                     "you'll": "you will","you'll've": "you will have", "you're": "you are",
                     "you've": "you have"}

# Compiled matchers keyed by the dictionary's contents, so the regex is built
# once per distinct dictionary instead of once per text (and a later edit to
# the dictionary is still picked up, because the key changes with it).
_CONTRACTION_MATCHERS = {}


def _build_contraction_matcher(contraction_items):
  '''
    Compile contraction/replacement pairs into a regex and a lookup table
    Arguments:
      contraction_items : iterable of (contraction, replacement) pairs
    Returns :
      (pattern, lookup) : compiled case-insensitive regex matching any of the
                          contractions, and a dictionary from the normalised
                          (lower-cased, straight-apostrophe) contraction to
                          its replacement
  '''
  lookup = {}
  for key, value in contraction_items:
    normalised = key.replace('’', "'").lower()
    # An empty key would match at every position of the text
    if normalised:
      lookup[normalised] = value
  # Longest keys first: regex alternation takes the first alternative that
  # matches, so "can't" must not be tried before "can't've".
  ordered_keys = sorted(lookup, key=len, reverse=True)
  # Keys are escaped piecewise and every apostrophe accepts both the straight
  # (') and the typographic (’) form.
  alternatives = ["['’]".join(re.escape(part) for part in key.split("'"))
                  for key in ordered_keys]
  # (?!\w) keeps a contraction from matching the beginning of a longer word,
  # e.g. "'s" inside the quoted word 'stop'.
  pattern = re.compile(r'(?:%s)(?!\w)' % '|'.join(alternatives), re.IGNORECASE)
  return pattern, lookup


def expand_contractions(data,contractions_dict = contraction_dict):
  '''
    Expanding Contractions
    Arguments:
      data: text in which contractions are expanded; values that are not
            strings (e.g. NaN for a missing ad) are returned unchanged
      contractions_dict : dictionary containing the contractions and their replacements 
    Returns :
      clean_data : text where contractions are expanded. Matching ignores case
                   and apostrophe style; when the matched contraction is
                   entirely lower case the replacement is lower-cased too, so
                   already lower-cased text stays lower case ("i'm" -> "i am")
  '''
  if not isinstance(data, str):
    return data

  cache_key = tuple(contractions_dict.items())
  if cache_key not in _CONTRACTION_MATCHERS:
    _CONTRACTION_MATCHERS[cache_key] = _build_contraction_matcher(cache_key)
  contractions_re, lookup = _CONTRACTION_MATCHERS[cache_key]

  # Nothing to expand; also avoids running a pattern with no alternatives
  if not lookup:
    return data

  def replace(match):
    found = match.group(0)
    expansion = lookup[found.replace('’', "'").lower()]
    return expansion.lower() if found == found.lower() else expansion

  return contractions_re.sub(replace, data)

In [12]:
# Ad Analyst ads (the path is resolved against the current working directory)
df_ad_analyst = pd.read_csv(filepath_or_buffer='../ad_analyst_dataset.csv')
df_ad_analyst

Unnamed: 0,ads
0,back to school a real opportunity register joi...
1,every day children across yemen and syria go t...
2,in our february issue mineral carbonation dam ...
3,become a specialist in energy renovation in bu...
4,if you think like joseph his story should spea...
...,...
6340,identity v coa iv naeu online qualifier is liv...
6341,moms everywhere are taking this simple step to...
6342,my medication wasnt covered by my insurance
6343,realm logistics pressure washing licensed insu...


In [4]:
# Counterpublics ads (the path is resolved against the current working directory)
df_counterpublics = pd.read_csv(filepath_or_buffer='../counterpublics_dataset.csv')
df_counterpublics

Unnamed: 0,ads
0,we're stepping up our vital democratic engagem...
1,we know not everyone is in a position to give ...
2,our progress for canadians is powered by grass...
3,sign here if you think justin trudeau and his ...
4,instead of being focused on rebuilding our eco...
...,...
11108,an efficient mayor is not only a good manager ...
11109,unforgettable walks accessible to all\r\nembar...
11110,we're at 25000 followers and still growing str...
11111,as chief meteorologist at miami's nbc 6 john m...


In [5]:
# ProPublica ads (the path is resolved against the current working directory)
df_propublica = pd.read_csv(filepath_or_buffer='../propublica_dataset.csv')
df_propublica

Unnamed: 0,ads
0,Access to safe water improves health and saves...
1,"Spring its just around the corner, and at Nort..."
2,Your exceptional career in teaching starts here.
3,As COVID-19 puts online shopping into overdriv...
4,Donate now to help bring relief to those suffe...
...,...
67666,Now online! Learn more about the complexities ...
67667,"AFSC is offering a global, interconnected resp..."
67668,It’s time to end Big Tobacco’s strategic killi...
67669,Public media stations are providing enhanced s...


In [6]:
# Stack the three ad datasets on top of each other; ignore_index=True
# discards the per-dataset row labels and numbers the rows 0..n-1.
df_all = pd.concat(
    objs=[df_ad_analyst, df_counterpublics, df_propublica],
    ignore_index=True,
)
df_all

Unnamed: 0,ads
0,back to school a real opportunity register joi...
1,every day children across yemen and syria go t...
2,in our february issue mineral carbonation dam ...
3,become a specialist in energy renovation in bu...
4,if you think like joseph his story should spea...
...,...
85124,Now online! Learn more about the complexities ...
85125,"AFSC is offering a global, interconnected resp..."
85126,It’s time to end Big Tobacco’s strategic killi...
85127,Public media stations are providing enhanced s...


In [None]:
# Cleaning the data 
# The loaded datasets expose their text in the 'ads' column (see the frames
# displayed above); the previous 'ad' name raised a KeyError.
TEXT_COL = 'ads'

df_clean  = df_all.copy()

# Missing ads become empty strings so that every step below receives a str
ads = df_clean[TEXT_COL].fillna('').astype(str)

# Lower case ads 
ads = ads.str.lower()

# Expanding Contractions
ads = ads.apply(expand_contractions)

# Removing punctuations
# regex=True is explicit because pandas >= 2.0 treats the pattern as literal
# text by default, which would silently leave the punctuation in place.
ads = ads.str.replace(r'[^\w\s]', '', regex=True)

# Removing numbers
ads = ads.str.replace(r'\d+', '', regex=True)

# Removing extra space (runs of whitespace, plus leading/trailing blanks
# left behind by the removals above)
ads = ads.str.replace(r'\s+', ' ', regex=True).str.strip()

df_clean[TEXT_COL] = ads

# Lemmatization 
# nlp.pipe processes the texts in batches, which is considerably faster than
# calling nlp(text) once per row; only alphabetic tokens are kept.
df_clean['lemmatized'] = [
    ' '.join(token.lemma_ for token in doc if token.is_alpha)
    for doc in nlp.pipe(ads)
]



In [None]:
# Persist the cleaned frame; index=False leaves the row labels out of the file
df_clean.to_csv(path_or_buf='../clean_dataset.csv', index=False)