In [None]:
# basic utilities
import pandas as pd
import numpy as np

# regex cleaning
import re

# progress bar
from tqdm.auto import tqdm

In [None]:
# The dataset is available at:
# https://www.kaggle.com/datasets/komalkhetlani/tweets-about-covid19-all-over-the-world
# The 'id' column of the CSV file is used as the index of the DataFrame.
df = pd.read_csv(
    'TweetsAboutCovid-19.csv',
    index_col='id',
    low_memory=False,
)

In [None]:
# Drop the columns that are not useful for the analysis:
#
# * 'video', 'thumbnail': the analysis looks at text content only.
# * 'place': less than 1% of the entries carry a value.
# * 'timezone': always 0, so the column carries no information.
# * 'retweet': always False, so it is unnecessary as well.
# * 'cashtags': only set for 1'424 Tweets, and the analysis does not
#   revolve around finances.
#
# Afterwards every row that still contains an NA value is dropped
# (dropna() without arguments removes rows, not columns).
df = (
    df
    .drop(columns=['video', 'thumbnail', 'place', 'timezone', 'cashtags', 'retweet'])
    .dropna()
)

In [None]:
# 'created_at' is redundant with the combination of 'date' and 'time'.
# Keep only 'created_at', parsed into datetime objects, and drop the other two columns.
df = (
    df
    .drop(columns=['date', 'time'])
    .assign(created_at=lambda frame: pd.to_datetime(frame['created_at']))
)

In [None]:
# Inspect the current DataFrame after the column clean-up above.
# Being the last expression of the cell, `df` is rendered as the cell output.
df

In [None]:
# The count columns (likes, replies, retweets) are stored as floats;
# convert each of them to 32-bit integers.
for count_column in ('likes_count', 'replies_count', 'retweets_count'):
    df[count_column] = df[count_column].astype(np.int32)

In [None]:
# Next is the language selection.

# First look at the distribution: unique language codes and how often each occurs.
language, counts = np.unique(df['language'], return_counts=True)

# Indices of the 10 most frequent languages, most frequent first.
# These are all languages with more than 10.000 Tweets.
order = np.argsort(counts)[::-1][:10]

# Print the language codes together with their counts.
#
# Legend of the language codes:
#   en:  English
#   es:  Spanish
#   in:  Indonesian
#   pt:  Portuguese
#   hi:  Hindi
#   fr:  French
#   de:  German
#   und: undetermined (will be ignored)
#   ja:  Japanese
#   tr:  Turkish
print(language[order])
print(counts[order])

In [None]:
# Filter the DataFrame so that only tweets in the selected, sufficiently large
# languages (English, Spanish, French, German) remain.
# This removes around 10% of the data (80.043 Tweets).
df = df.loc[
    df['language'].isin(['en', 'es', 'fr', 'de'])
]

In [None]:
# Next up we remove URLs, @mentions, #hashtags, standalone numbers, a few special
# characters and emojis from the tweet text. Regular expressions are used for this.

# URL starting with http:// or https://
_URL_PATTERN = re.compile(
    r'https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()@:%_\+.~#?&//=]*)'
)

# @mention or #hashtag preceded by any whitespace (space, tab, newline, ...)
_MENTION_OR_HASHTAG_PATTERN = re.compile(r'\s[@#]\w+')

# run of digits preceded by any whitespace
_NUMBER_PATTERN = re.compile(r'\s\d+')

# special characters that are stripped: '-' and ':'
_SPECIAL_CHAR_PATTERN = re.compile(r'[\-\:]')

# Emoji code point ranges. Inside a character class '|' is a literal pipe and not an
# alternation, so the ranges are simply listed one after another.
_EMOJI_PATTERN = re.compile(
    '['
    '\U0001F300-\U0001F5FF'  # Miscellaneous Symbols and Pictographs
    '\U0001F1E6-\U0001F1FF'  # regional indicator symbols (flags)
    '\U00002700-\U000027BF'  # Dingbats
    '\U0001F900-\U0001F9FF'  # Supplemental Symbols and Pictographs
    '\U0001F600-\U0001F64F'  # Emoticons
    '\U0001F680-\U0001F6FF'  # Transport and Map Symbols
    '\U00002600-\U000026FF'  # Miscellaneous Symbols
    ']'
)

# any run of whitespace
_WHITESPACE_PATTERN = re.compile(r'\s+')


def clean_text(text):
    """Normalize the raw text of a tweet.

    The following steps are applied in order:

    1. lowercase the text,
    2. remove URLs,
    3. remove @mentions and #hashtags that follow whitespace (or start the text),
    4. remove runs of digits that follow whitespace (or start the text),
    5. remove the special characters '-' and ':',
    6. remove emojis,
    7. collapse whitespace runs to a single space and strip both ends.

    Parameters
    ----------
    text : str
        Raw tweet text.

    Returns
    -------
    str
        The cleaned text; an empty string if nothing is left.

    Notes
    -----
    Steps 3 and 4 delete the preceding whitespace together with the match, so
    characters directly following a removed mention/number end up attached to
    the previous token (e.g. 'the 5g' becomes 'theg'). This mirrors the
    original behaviour and is left unchanged on purpose.
    """
    # A leading space lets the whitespace-prefixed patterns also match a
    # mention, hashtag or number at the very beginning of the tweet.
    text = ' ' + text.lower()

    text = _URL_PATTERN.sub('', text)
    text = _MENTION_OR_HASHTAG_PATTERN.sub('', text)
    text = _NUMBER_PATTERN.sub('', text)
    text = _SPECIAL_CHAR_PATTERN.sub('', text)
    text = _EMOJI_PATTERN.sub('', text)
    text = _WHITESPACE_PATTERN.sub(' ', text)

    # strip() instead of slicing off the first character: the padding space may
    # already have been consumed by one of the substitutions above, in which
    # case slicing would cut off a real character.
    return text.strip()

In [None]:
# Example: the URL, the @mention, the number and the emojis are removed and the text is
# lowercased. Expected output: 'this is a test for my project at university! woho'
clean_text('https://some-website.com/tweet-test.html ThIs is a TEST for my project at @aalto University 2022! Woho 😊👍')

In [None]:
# Apply the cleaning function to all tweets.
# Iterating over the 'tweet' column directly avoids copying the whole DataFrame
# and building a Series object for every single row.
cleaned_tweets = [
    clean_text(tweet)
    for tweet in tqdm(df['tweet'], total=len(df), desc='Cleaning tweets')
]

# filter that marks the tweets which still have content after cleaning
content_left = [cleaned != '' for cleaned in cleaned_tweets]

# assign() returns a new DataFrame. Setting the column in place on `df` can raise
# a SettingWithCopyWarning, because `df` is the result of a boolean filter.
df = df.assign(cleaned_tweets=cleaned_tweets)

print('Number of empty tweets after cleaning:', len(df) - sum(content_left))

# remove empty tweets
df = df[content_left]

df

In [None]:
# Count the characters of all cleaned non-English tweets
# (presumably to gauge the effort of translating them; nothing is translated here).
sum_of_characters = sum(
    len(tweet)
    for tweet in df.loc[df['language'] != 'en', 'cleaned_tweets']
)
print('Total number of characters in all non-english tweets:', sum_of_characters)

In [None]:
# For now only the English tweets are kept; all other languages are removed.
df = df.loc[df['language'].eq('en')]

In [None]:
# Inspect the DataFrame that remains after keeping only the English tweets.
df

In [None]:
# Save the pre-processed data set as a pickle file in the working directory.
# Note: pickle files should only be loaded from trusted sources.
df.to_pickle('pre-processed-data.pkl')