In [None]:
# basic utilities
import pandas as pd
import numpy as np

# regex cleaning
import re

# progress bar
from tqdm.auto import tqdm

In [None]:
# Load the raw tweets, using the 'id' column as the index.
# Dataset available at:
# https://www.kaggle.com/datasets/komalkhetlani/tweets-about-covid19-all-over-the-world
df = pd.read_csv('TweetsAboutCovid-19.csv', low_memory=False, index_col='id')

In [None]:
# Drop columns that are not useful for the analysis, then discard incomplete rows.
#
# Rationale given for each dropped column:
#   - 'video', 'thumbnail': the analysis looks at text content only.
#   - 'place': less than 1% of the entries carry a value.
#   - 'timezone': always 0, so the column carries no information.
#   - 'retweet': always False, so it is unnecessary for the same reason.
#   - 'cashtags': only 1,424 Tweets carry a value, and the analysis does not
#     revolve around finances.
df = (
    df
    .drop(columns=['video', 'thumbnail', 'place', 'timezone', 'cashtags', 'retweet'])
    # dropna() with default arguments removes every ROW that still contains
    # at least one NA value (it does not remove columns).
    .dropna()
)

In [None]:
# 'date' and 'time' together duplicate the information in 'created_at',
# so both are dropped and 'created_at' is parsed into datetime objects.
df = (
    df
    .drop(columns=['date', 'time'])
    .assign(created_at=lambda frame: pd.to_datetime(frame['created_at']))
)

In [None]:
# Inspect the current DataFrame (the bare expression is rendered as the cell output).
df

In [None]:
# Store the replies, retweets and likes count columns as 32-bit integers
# instead of floats.
df = df.astype({
    'likes_count': np.int32,
    'replies_count': np.int32,
    'retweets_count': np.int32,
})

In [None]:
# Next is the language selection.

# We will first look at the distribution: unique language codes and how often each occurs.
language, counts = np.unique(df['language'], return_counts=True)

# Indices of the 10 most frequent languages, in decreasing order of count:
# argsort is ascending, and the slice [:-11:-1] walks it backwards over its last 10 entries.
# These are all languages with more than 10,000 Tweets.
order = np.argsort(counts)[:-11:-1]

# Print the language codes together with their counts.

# en: English
# es: Spanish
# in: Indonesian
# pt: Portuguese
# hi: Hindi
# fr: French
# de: German
# und: undetermined (will be ignored)
# ja: Japanese
# tr: Turkish
print(language[order])
print(counts[order])

In [None]:
# Filter the DataFrame so that only tweets in the significantly large languages remain
# ('und' is deliberately excluded).
# This removes around 10% of data (80.043 Tweets).
#
# .copy() makes the result an independent frame: the boolean-mask selection is
# otherwise flagged as a potential view of the unfiltered data, and adding
# columns to it later can raise SettingWithCopyWarning on pandas versions
# without copy-on-write.
df = df[df['language'].isin(['en', 'es', 'in', 'pt', 'hi', 'fr', 'de', 'ja', 'tr'])].copy()

In [None]:
# Next up we will remove URLs and replace @mentions with a generic @user.
# To accomplish this, regular expressions are used.

def clean_text(text):
    """Normalise a raw tweet for text analysis.

    The following steps are applied in order:

    1. Lowercase the text.
    2. Remove URLs: whitespace-delimited tokens starting with ``http://`` or
       ``https://``.
    3. Replace every whitespace-delimited @mention with the generic token
       ``@user``.
    4. Remove whitespace-delimited tokens that consist only of digits.
    5. Delete ``-`` and ``:`` characters (``covid-19:`` becomes ``covid19``).
    6. Collapse runs of whitespace into one space and strip both ends.

    Parameters
    ----------
    text : str
        The raw tweet text.

    Returns
    -------
    str
        The cleaned, lowercase text without leading or trailing whitespace.
    """
    text = text.lower()

    # The patterns below use the lookarounds (?<!\S) / (?!\S) ("not next to a
    # non-whitespace character") instead of matching literal spaces around the
    # token. A literal leading/trailing space is consumed by the match, so the
    # next adjacent token would no longer have its leading space and would be
    # skipped. Lookarounds also cover the start/end of the string and
    # newlines/tabs, so no padding of the text is required.

    # Remove URLs. \S+ also covers characters such as '?', '=', '_' and '%'.
    text = re.sub(r'(?<!\S)https?://\S+', ' ', text)

    # Replace @mentions with a generic user token. Handles may contain
    # underscores. Note: the quantifier must be written without a space;
    # Python's re treats '{1, 400}' as literal text, so nothing would match.
    text = re.sub(r'(?<!\S)@[a-z0-9_]{1,400}(?!\S)', '@user', text)

    # Remove standalone numbers (same quantifier caveat as above).
    text = re.sub(r'(?<!\S)[0-9]{1,400}(?!\S)', ' ', text)

    # Remove '-' and ':' BEFORE collapsing whitespace; otherwise a standalone
    # '-' or ':' would leave a double space behind.
    text = re.sub(r'[\-\:]', '', text)

    # Collapse all whitespace runs and trim the ends.
    return re.sub(r'\s+', ' ', text).strip()

In [None]:
# Quick sanity check of clean_text on a sample containing a URL, an @mention,
# a standalone number, and '-' / ':' punctuation.
clean_text('https://t.co/4rdhsh3iyl prime minister @naren 124234 covid-19: wow')

In [None]:
# Clean every tweet and store the result in a new 'cleaned_tweets' column.
# Iterating over the 'tweet' column directly avoids DataFrame.iterrows(),
# which builds a full Series for every row just to read a single field.
cleaned_tweets = [
    clean_text(tweet)
    for tweet in tqdm(df['tweet'], total=len(df), desc='Cleaning tweets')
]

# assign() returns a new frame instead of writing into `df` in place, so no
# SettingWithCopyWarning is raised when `df` is the result of a filter.
df = df.assign(cleaned_tweets=cleaned_tweets)
df

In [None]:
# TODO: translate non-English tweets (not implemented in this cell yet)

In [None]:
# Save the pre-processed dataset to disk as a pickle file.
df.to_pickle('pre-processed-data.pkl')