## Preprocessing text data

In [7]:
import re
import html
import pandas as pd

from emot import EMOJI_UNICODE

In [8]:
btc_news = pd.read_parquet('../../1_data_acquisition/google_news/btc_news_data.parquet.gzip')
eth_news = pd.read_parquet('../../1_data_acquisition/google_news/eth_news_data.parquet.gzip')
reddit_r_bitcoin = pd.read_parquet('../../1_data_acquisition/reddit/reddit_r_bitcoin.parquet.gzip')
reddit_r_ethereum = pd.read_parquet('../../1_data_acquisition/reddit/reddit_r_ethereum.parquet.gzip')
btc_tweets = pd.read_parquet('../../1_data_acquisition/twitter/btc_tweets.parquet.gzip')
eth_tweets = pd.read_parquet('../../1_data_acquisition/twitter/eth_tweets.parquet.gzip')

### Add day, month and year for grouping

In [9]:
for df in [btc_news, eth_news]:
    df['day'] = [i.day for i in df.datetime]
    df['month'] = [i.month for i in df.datetime]
    df['year'] = [i.year for i in df.datetime]
    
for df in [reddit_r_bitcoin, reddit_r_ethereum]:
    df['day'] = [i.day for i in df.time]
    df['month'] = [i.month for i in df.time]
    df['year'] = [i.year for i in df.time]

### Remove deleted or empty reddit posts

In [10]:
reddit_r_bitcoin = reddit_r_bitcoin[~reddit_r_bitcoin.selftext.isin(['', '[deleted]', '[removed]'])]
reddit_r_ethereum = reddit_r_ethereum[~reddit_r_ethereum.selftext.isin(['', '[deleted]', '[removed]'])]

### Remove tweets regarding Ethiopia

...and particularly their performance in the Summer Olympics 2016 in Rio from the Ethereum tweet dataset

In [13]:
rm = {'Rio2016', 'ethiopia', 'Olympics'}
l = []
for i in eth_tweets.hashtags.values:
    try:
        l.append(set(i).isdisjoint(rm))
    except:
        l.append(True)

In [14]:
print(f'Removing {len(eth_tweets)-len(eth_tweets[l])} tweets.')
eth_tweets = eth_tweets[l]

Removing 193 tweets.


### Clean tweets and reddit posts for analysis

In [15]:
def remove_whitespace(string):
    ''' Function to remove tabs, line delimiters etc. from a string of text. '''
    return ' '.join(string.split())

def remove_repeated_chars(string):
    ''' Function to remove any instance of the same character being repeated
    more than 10 times. (e.g. lines to separate paragraphs)'''
    pattern = r'(.)\1{9,}'
    return re.sub(pattern, '', string)

def remove_urls(string):
    ''' Function to remove any kind of URLs from a string of text. '''
    pattern = r'\[?\(?(https?://\S+|www\.\S+|\w+\.\w+/\S+)\]?\)?'
    return re.sub(pattern, '', string)

def convert_emojis(string):
    ''' Function to convert emojis into a text that reflects their meaning. '''
    for i in EMOJI_UNICODE:
        replacement = i.translate(str.maketrans('', '', ':')).replace(r'_', r' ')
        string = string.replace(EMOJI_UNICODE[i], f'[{replacement} emoji]')
    return string

def get_raw_text_utf8(string):
    ''' Returns raw text of the input string in utf-8 encoding,
    i.e. containing all unicode elements in unaltered form. '''
    string = html.unescape(string) # unescape string to e.g. convert "&amp;" to "&" or "&#xB200" to " "
    string = remove_repeated_chars(string) # remove characters repeated more than 10 times
    string = remove_urls(string) # remove URLs
    return remove_whitespace(string) # return string without tabs and line delimiters

def get_raw_text_ascii(string):
    ''' Returns raw text of the input string in ASCII encoding,
    i.e. with all unicode elements removed. '''
    string = html.unescape(string) # unescape string to e.g. convert "&amp;" to "&" or "&#xB200" to " "
    string = remove_repeated_chars(string) # remove characters repeated more than 10 times
    string = remove_urls(string) # remove URLs
    string = string.encode('ascii', 'ignore').decode('utf-8') # remove all non-ASCII characters
    return remove_whitespace(string) # return string without tabs and line delimiters

def get_raw_text_ascii_emoji(string):
    ''' Returns raw text of the input string in ASCII encoding,
    i.e. without unicode elements, but with emojis converted to their
    textual description in the following form: "🔥" to "[fire emoji]". '''
    string = html.unescape(string) # unescape string to e.g. convert "&amp;" to "&" or "&#xB200" to " "
    string = remove_repeated_chars(string) # remove characters repeated more than 10 times
    string = remove_urls(string) # remove URLs
    string = convert_emojis(string) # convert emojis 
    string = string.encode('ascii', 'ignore').decode('utf-8') # remove all non-ASCII characters
    return remove_whitespace(string) # return string without tabs and line delimiters

In [16]:
reddit_r_bitcoin['selftext_cleaned'] = reddit_r_bitcoin.selftext.apply(get_raw_text_utf8)
reddit_r_ethereum['selftext_cleaned'] = reddit_r_ethereum.selftext.apply(get_raw_text_utf8)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  reddit_r_ethereum['selftext_cleaned'] = reddit_r_ethereum.selftext.apply(get_raw_text_utf8)


In [17]:
btc_tweets['content_cleaned'] = btc_tweets.content.apply(get_raw_text_utf8)
eth_tweets['content_cleaned'] = eth_tweets.content.apply(get_raw_text_utf8)

### Remove empty reddit posts (again)

Again, since some posts may have consisted only of an URL (e.g. picture posts), which has now been removed.

In [18]:
reddit_r_bitcoin = reddit_r_bitcoin[reddit_r_bitcoin.selftext_cleaned.str.len() > 2]
reddit_r_ethereum = reddit_r_ethereum[reddit_r_ethereum.selftext_cleaned.str.len() > 2]

### Combine title and body of reddits posts

In [19]:
for df in [reddit_r_bitcoin, reddit_r_ethereum]:
    df['content'] = df.title + ' | ' + df.selftext_cleaned

### Remove unnecessary columns

In [20]:
for df in [btc_news, eth_news]:
    df.drop(columns=[
        'url',
        'datetime'],
        inplace=True)

for df in [reddit_r_bitcoin, reddit_r_ethereum]:
    df.drop(columns=[
        'url',
        'title',
        'selftext',
        'selftext_cleaned',
        'utc_datetime_str',
        'author',
        'time'],
        inplace=True)

for df in [btc_tweets, eth_tweets]:
    df.drop(columns=[
        'datetime',
        'retweeted_tweet',
        'hashtags',
        'content',
        'username',
        'user_displayname',
        'user_description',
        'user_verified'],
        inplace=True)

### Drop duplicates

In [21]:
reddit_r_bitcoin = reddit_r_bitcoin.drop_duplicates(subset=['content', 'year', 'month', 'day'])
reddit_r_ethereum = reddit_r_ethereum.drop_duplicates(subset=['content', 'year', 'month', 'day'])
btc_tweets = btc_tweets.drop_duplicates(subset=['content_cleaned', 'year', 'month', 'day'])
eth_tweets = eth_tweets.drop_duplicates(subset=['content_cleaned', 'year', 'month', 'day'])

### Add / rename unique identifier

In [22]:
btc_news = btc_news.reset_index()
btc_news.index.names = ['news_id']
eth_news = eth_news.reset_index()
eth_news.index.names = ['news_id']

In [23]:
reddit_r_bitcoin = reddit_r_bitcoin.reset_index()
reddit_r_bitcoin.index.names = ['post_id']
reddit_r_ethereum = reddit_r_ethereum.reset_index()
reddit_r_ethereum.index.names = ['post_id']

In [24]:
btc_tweets = btc_tweets.rename(columns={'tweet ID': 'tweet_id'})
eth_tweets = eth_tweets.rename(columns={'tweet ID': 'tweet_id'})

### Save to parquet

In [25]:
btc_news.to_parquet('btc_news_processed.parquet.gzip', compression='gzip')
eth_news.to_parquet('eth_news_processed.parquet.gzip', compression='gzip')
reddit_r_bitcoin.to_parquet('reddit_r_bitcoin_processed.parquet.gzip', compression='gzip')
reddit_r_ethereum.to_parquet('reddit_r_ethereum_processed.parquet.gzip', compression='gzip')
btc_tweets.to_parquet('btc_tweets_processed.parquet.gzip', compression='gzip')
eth_tweets.to_parquet('eth_tweets_processed.parquet.gzip', compression='gzip')