In [3]:
import pandas as pd
import sys
from urllib.parse import urlparse
import re



############################################################
#helper functions for text cleaning
############################################################

#borrowed from jabryden's code
# Tokens that carry no content for this analysis: English function words
# plus the token 'rt'. Kept as a set so membership tests are O(1).
stop_words = set("""
    i me my myself we our ours ourselves
    you your yours yourself yourselves
    he him his himself she her hers herself it its itself
    they them their theirs themselves
    what which who whom this that these those
    am is are was were be been being have has had having do does did doing
    a an the and but if or because as until while
    of at by for with about against between into through during
    before after above below to from up down in out on off over under
    again further then once here there when where why how
    all any both each few more most other some such
    no nor not only own same so than too very
    s t can will just don should now
    rt
""".split())

# Characters stripped from both ends of every token (note: includes the space
# character and excludes '@').
# Raw string: the original non-raw literal contained `\]`, an invalid escape
# sequence that emits a SyntaxWarning on recent Python versions. The value is
# unchanged (backslash followed by ']').
# NOTE(review): the '?' characters around '£' look like non-ASCII punctuation
# that was lost to a bad encoding round-trip - confirm against the original.
punct = r"""!"#$%&'()*+, -./:;<=>?[\]^_`{|}~??£??????"""

def exclusions(x):
    """Return the exclusion flags for a single token.

    Parameters
    ----------
    x : str
        A token (already stripped and lowercased by the caller).

    Returns
    -------
    list of bool
        ``[x is a stop word, x starts with a numeric character]``.
    """
    return [x in stop_words,      # drop stopwords
            x[:1].isnumeric()]    # drop numbers; x[:1] avoids IndexError on ''

# Code-point ranges removed from texts before tokenising.
_emoji_ranges = (
    "\U0001F600-\U0001F64F",  # emoticons
    "\U0001F300-\U0001F5FF",  # symbols & pictographs
    "\U0001F680-\U0001F6FF",  # transport & map symbols
    "\U0001F1E0-\U0001F1FF",  # flags (iOS)
)

# One character class matching a run of one or more of the code points above.
deEmojify_pattern = re.compile("[" + "".join(_emoji_ranges) + "]+", flags=re.UNICODE)

def clean_text(text):
    """Reduce a raw text to a lowercase, space-joined string of kept tokens.

    Steps, in order: remove characters matched by ``deEmojify_pattern``;
    remove ``#hashtags``, ``@mentions``, the ``RT :`` residue and every
    whitespace-delimited run starting at ``http``; split on whitespace; strip
    ``punct`` characters from both ends of each token and lowercase it; drop
    empty tokens and tokens for which any ``exclusions`` flag is true.

    Parameters
    ----------
    text : str
        Raw text. Any non-string value (e.g. None or NaN from a missing cell)
        is treated as having no content.

    Returns
    -------
    str or None
        The cleaned text, or None when the input is not a string or no token
        survives cleaning.
    """
    # re.sub raises TypeError on non-strings; returning None instead lets the
    # caller discard missing texts together with texts that clean to nothing.
    if not isinstance(text, str):
        return None

    text = deEmojify_pattern.sub(r'', text)  # remove emojis, etc.
    text = re.sub(r'#\w+', r'', text)        # remove hashtags
    text = re.sub(r'@\w+', r'', text)        # remove username mentions
    # Once the mention is gone, 'RT @user: ...' reads 'RT : ...'.
    text = re.sub(r'(RT :)', r'', text)      # remove RTs
    text = re.sub(r'http\S+', '', text)      # remove URLs

    tokens = (raw.strip(punct).lower() for raw in text.split())
    # Empty tokens are filtered first, so exclusions() never sees ''.
    words = [tok for tok in tokens if tok != '' and not any(exclusions(tok))]
    if not words:
        return None

    return ' '.join(words)

############################################################
# load data
############################################################



print('loading data')
#load metadata files
# data_path = '/geode2/home/u040/jmbollen/Carbonate/parliament/data/'
data_path = './data/'  # kept as a str: paths are built with `data_path + name`


def _url_domain(url):
    """Return the network location of `url` with a leading 'www.' removed."""
    host = urlparse(url).netloc
    # Strip the prefix only: str.replace('www.', '') would also delete a
    # 'www.' occurring inside the host (e.g. 'awww.example' -> 'aexample').
    return host[len('www.'):] if host.startswith('www.') else host


commons_speeches = pd.read_parquet(data_path+'commons_speeches.parquet')
mp_tweets = pd.read_parquet(data_path+'mp_tweets.parquet')
urls = pd.read_parquet(data_path+'urls.parquet')
urls['domain'] = urls['clean_url'].apply(_url_domain)

# Tag every row with the source it came from before the frames are combined.
mp_tweets['text_type']='tweet'
urls['text_type']='url'
commons_speeches['text_type']='commons_speech'

# Give the three sources a common identifier column, 'text_id'.
commons_speeches = commons_speeches.rename(columns={'commons_speech_id':'text_id'})
urls = urls.rename(columns={'url_rid':'text_id'})
mp_tweets = mp_tweets.rename(columns={'tweet_id':'text_id'})

all_texts = pd.concat([mp_tweets,commons_speeches,urls])

# Drop the per-source frames; everything needed is now in all_texts.
del commons_speeches; del urls; del mp_tweets


print('cleaning texts')
all_texts['text'] = all_texts['text'].apply(clean_text)

# clean_text returns None for texts with nothing left, so dropna removes them.
# NOTE(review): duplicates are detected on 'text_id' alone; if ids from
# different sources can coincide, a row of another text_type would be dropped
# too. Consider subset=['text_type', 'text_id'] - confirm id uniqueness first.
all_texts = all_texts.dropna(subset=['text']
                    ).drop_duplicates(subset=['text_id']
                    ).reset_index(drop=True)


loading data
cleaning texts


In [4]:
# Preview the first rows of all_texts.
all_texts.head()

Unnamed: 0,mp_name,text_id,time,text,user_id,n_likes,n_replies,n_retweets,n_quotes,party,...,sum_shares,sum_likes,sum_loves,sum_hahas,sum_wows,sum_sorrys,sum_angers,sum_comments,sum_share_without_clicks,domain
0,gary streeter,881832084603633664,2017-07-03 11:08:47,compelling case pay increase nurses made conse...,4785228995,0.0,0.0,13.0,0.0,Con,...,,,,,,,,,,
1,gary streeter,881845410364628993,2017-07-03 12:01:44,glad see selected best ever photo could auditi...,4785228995,5.0,5.0,2.0,2.0,Con,...,,,,,,,,,,
2,gary streeter,881925006112018433,2017-07-03 17:18:01,assured intruders removed asap excuse breaking...,4785228995,4.0,0.0,0.0,1.0,Con,...,,,,,,,,,,
3,gary streeter,881925078564392961,2017-07-03 17:18:18,need help us create list buildings places tell...,4785228995,0.0,0.0,260.0,0.0,Con,...,,,,,,,,,,
4,gary streeter,881946858318422018,2017-07-03 18:44:51,proud species champion horrid ground weaver sp...,4785228995,32.0,4.0,10.0,3.0,Con,...,,,,,,,,,,


In [5]:
# Print the index range, column names and dtypes of all_texts.
all_texts.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3481563 entries, 0 to 3481562
Data columns (total 27 columns):
 #   Column                    Dtype         
---  ------                    -----         
 0   mp_name                   object        
 1   text_id                   object        
 2   time                      datetime64[ns]
 3   text                      object        
 4   user_id                   object        
 5   n_likes                   float64       
 6   n_replies                 float64       
 7   n_retweets                float64       
 8   n_quotes                  float64       
 9   party                     object        
 10  text_type                 object        
 11  constituency              object        
 12  job                       object        
 13  clean_url                 object        
 14  count                     float64       
 15  sum_views                 float64       
 16  sum_clicks                float64       
 17  sum_shar

In [6]:
# Write all_texts to 'all_texts_cleaned.parquet' inside data_path.
all_texts.to_parquet(data_path + 'all_texts_cleaned.parquet')