In [1]:
# basic untilities
import pandas as pd
import numpy as np

# regex cleaning
import re

# translation
import json
import sys
from urllib import request, parse

# progress bar
from tqdm.auto import tqdm

In [2]:
# Dataset avalible at:
# https://www.kaggle.com/datasets/komalkhetlani/tweets-about-covid19-all-over-the-world
df = pd.read_csv('TweetsAboutCovid-19.csv', low_memory=False, index_col='id')

In [3]:
# Drop columns not usful for the analysis.

# We will be looking at text content so the 'thumbnail' and 'video' columns will be removed

# Further we remove the point column since less than 1% of the entries carry a value.

# 'timezone' is always 0. There for the column does not carry information and will be removed.

# Like 'timezone', 'retweet' is always False and therefore unneccessary

# Lastly 'cashtags' only carries values for 1'424 Tweets. My analysis will not revolve around finances
# so this columns will also be removed

df = df.drop(columns=['video', 'thumbnail', 'place', 'timezone', 'cashtags', 'retweet'])

# We will also drop all columns with NA values
df = df.dropna()

In [4]:
# The columns created_at and date combined with time are also redundant.
# We will transform the created_at column to datetime objects and remove date and time
df = df.drop(columns=['date', 'time'])
df['created_at'] = pd.to_datetime(df['created_at'])

In [5]:
# Looking at the current DataFrame
df

Unnamed: 0_level_0,created_at,tweet,language,replies_count,retweets_count,likes_count,hashtags
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1.385880e+18,2021-04-24 08:43:17+00:00,🇨🇺: ✍️ Covid-19 en Cuba: 1241 nuevos casos pos...,es,0.0,0.0,0.0,"['reportando', 'cuba']"
1.385880e+18,2021-04-24 08:43:17+00:00,The latest The Zika Advice Paper! https://t.c...,en,0.0,0.0,0.0,"['covid19', 'amr']"
1.385880e+18,2021-04-24 08:43:16+00:00,Tum karo toh mantra ..woh kare toh tantra ..ai...,tl,0.0,0.0,0.0,"['covidindia', 'covidvaccine', 'covidresources..."
1.385880e+18,2021-04-24 08:43:16+00:00,https://t.co/4rdhSH3IYl Prime Minister @Nare...,en,0.0,0.0,0.0,['covid_19']
1.385880e+18,2021-04-24 08:43:16+00:00,@bc_pt64 @KackCake @sherlockine1 @SternchenJvB...,de,0.0,0.0,0.0,[]
...,...,...,...,...,...,...,...
1.385530e+18,2021-04-23 10:01:36+00:00,@narendramodi @RahulGandhi Me aapke Gujrat se ...,hi,0.0,0.0,0.0,[]
1.385530e+18,2021-04-23 10:01:36+00:00,@vester71 Al die bedden op de gang ! Mensen di...,nl,1.0,0.0,2.0,"['corona', 'cynischetweet']"
1.385530e+18,2021-04-23 10:01:36+00:00,@Parames03367508 Dai mada koo🔥yane.. ithella e...,in,1.0,0.0,0.0,[]
1.385530e+18,2021-04-23 10:01:36+00:00,@aliabdollahzade @mohammad_d_d_65 I don't like...,en,0.0,0.0,0.0,[]


In [6]:
# Next up we will transform the count columns replies, retweets and likes to integers instead of floats
df['likes_count'] = df['likes_count'].astype(np.int32)
df['replies_count'] = df['replies_count'].astype(np.int32)
df['retweets_count'] = df['retweets_count'].astype(np.int32)

In [7]:
# Next is the language selection.

# We will first look at the distribuions
language, counts = np.unique(df['language'], return_counts=True)

# Order the top 11 decreasing. These are all languages with more than 10.000 Tweets.
order = np.argsort(counts)[:-11:-1]

# print the language codes together with their counts

# en: English
# es: Spanish
# in: Indonesian
# pt: Portugise
# hi: Hindi
# fr: French
# de: German
# und: undecicive (will ignore)
# ja: Japanise
# tr: Turkish
print(language[order])
print(counts[order])

['en' 'es' 'in' 'pt' 'hi' 'fr' 'de' 'und' 'ja' 'tr']
[412115 131519  40841  33253  28140  26546  22139  19622  17672  11376]


In [8]:
# Filter the array, so that only tweets of significantly big languages are present.
# This removes around 10% of data (80.043 Tweets).
df = df[df['language'].isin(['en', 'es', 'fr', 'de'])]

In [9]:
# Next up we will remove URL and replace @mentions with a generic @user.
# To accomplish this regex expressions are used.

def clean_text(text):
    
    # convert to lowercase
    text = ' ' + text.lower() + ' '
    
    # remove URLs
    text = re.sub(r'https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()@:%_\+.~#?&//=]*)', '', text)
    
    # replace @ mentions and hashtags
    text = re.sub(r'[@#]\w+', '', text)
    
    # remove digits
    text = re.sub(r'\d+', '', text)
    
    # remove special characters
    text = re.sub(r'[\-\:()]', '', text)
    
    # remove emojis
    text = re.sub(r'[\U0001F300-\U0001F5FF|\U0001F1E6-\U0001F1FF|\U00002700-\U000027BF|\U0001F900-\U0001F9FF|\U0001F600-\U0001F64F|\U0001F680-\U0001F6FF|\U00002600-\U000026FF]', '', text)
    
    # remove whitespaces
    text = re.sub(r'\s+', ' ', text)
    
    return text[1:-1]

In [10]:
# Example
clean_text('https://some-website.com/tweet-test.html ThIs #wow#dfs is a TEST for my project at @aalto University 2022! Woho 😊👍')

'this is a test for my project at university ! woho'

In [11]:
# Applying the cleaning funciton to all tweets. Takes ~30sec, 25k iterations per second
cleaned_tweets = []
content_left = []

for row in tqdm(df.copy().iterrows(), total=len(df), desc='Cleaning tweets'):
    data = row[1]

    cleaned = clean_text(data['tweet'])
    
    # create a filter to discard empty tweets after cleaning
    if cleaned == '':
        content_left.append(False)
    else:
        content_left.append(True)
    
    cleaned_tweets.append(cleaned)

df['cleaned_tweets'] = cleaned_tweets

print('Number of empty tweets after cleaning:', len(df) - sum(content_left))

# remove empty tweets
df = df[content_left]

df

Cleaning tweets:   0%|          | 0/592319 [00:00<?, ?it/s]

Number of empty tweets after cleaning: 42


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['cleaned_tweets'] = cleaned_tweets


Unnamed: 0_level_0,created_at,tweet,language,replies_count,retweets_count,likes_count,hashtags,cleaned_tweets
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1.385880e+18,2021-04-24 08:43:17+00:00,🇨🇺: ✍️ Covid-19 en Cuba: 1241 nuevos casos pos...,es,0,0,0,"['reportando', 'cuba']",️ covid en cuba nuevos casos positivos y falle...
1.385880e+18,2021-04-24 08:43:17+00:00,The latest The Zika Advice Paper! https://t.c...,en,0,0,0,"['covid19', 'amr']",the latest the zika advice paper! thanks to
1.385880e+18,2021-04-24 08:43:16+00:00,https://t.co/4rdhSH3IYl Prime Minister @Nare...,en,0,0,0,['covid_19'],prime minister on saturday said that like last...
1.385880e+18,2021-04-24 08:43:16+00:00,@bc_pt64 @KackCake @sherlockine1 @SternchenJvB...,de,0,0,0,[],benutze doch keine wörter deren bedeutung du n...
1.385880e+18,2021-04-24 08:43:16+00:00,Covid-19: India is going through very terrible...,en,0,0,0,"['presssangharsh', 'dailynews', 'news', 'india...",covid india is going through very terrible sit...
...,...,...,...,...,...,...,...,...
1.385530e+18,2021-04-23 10:01:40+00:00,📍 #Beratungsangebot (26.April - 30. April): N...,de,0,3,3,"['beratungsangebot', 'mode', 'kulturberatung']",.april . april nächste woche bieten wir verans...
1.385530e+18,2021-04-23 10:01:39+00:00,@aka_jmk Geht mir genauso. Und so langsam komm...,de,0,0,1,[],geht mir genauso. und so langsam kommt auch wi...
1.385530e+18,2021-04-23 10:01:37+00:00,No Indian allowed to travel abroad. No worries...,en,5,1,65,[],no indian allowed to travel abroad. no worries...
1.385530e+18,2021-04-23 10:01:36+00:00,@aliabdollahzade @mohammad_d_d_65 I don't like...,en,0,0,0,[],"i don't like corona, bring me"


In [12]:
class LibreTranslateAPI:
    """Connect to the LibreTranslate API"""

    """Example usage:
    from libretranslatepy import LibreTranslateAPI

    lt = LibreTranslateAPI("https://translate.argosopentech.com/")

    print(lt.translate("LibreTranslate is awesome!", "en", "es"))
    # LibreTranslate es impresionante!

    print(lt.detect("Hello World"))
    # [{"confidence": 0.6, "language": "en"}]
    
    print(lt.languages())
    # [{"code":"en", "name":"English"}]
    """

    DEFAULT_URL = "http://127.0.0.1:5000/"

    def __init__(self, url="http://localhost:5000/", api_key=None):
        """Create a LibreTranslate API connection.

        Args:
            url (str): The url of the LibreTranslate endpoint.
            api_key (str): The API key.
        """
        self.url = LibreTranslateAPI.DEFAULT_URL if url is None else url
        self.api_key = api_key

        # Add trailing slash
        assert len(self.url) > 0
        if self.url[-1] != "/":
            self.url += "/"

    def translate(self, q, source, target="en"):
        """Translate string

        Args:
            q (str): The text to translate
            source (str): The source language code (ISO 639)
            target (str): The target language code (ISO 639)

        Returns:
            str: The translated text
        """
        url = self.url + "translate"
        
        params = {"q": q, "source": source, "target": target}
        
        url_params = parse.urlencode(params)
        
        req = request.Request(url, data=url_params.encode())
        response = request.urlopen(req)
        response_str = response.read().decode()
        
        return json.loads(response_str)["translatedText"]

In [13]:
translator = LibreTranslateAPI()

In [14]:
with open('translated_tweets_1.json', 'r') as f:
    translated_tweets = json.load(f)

In [15]:
temp = list(translated_tweets)

In [20]:
c = 1
work = False

for entry in tqdm(df.iloc[:len(df)//4].iterrows(), total=len(df)//4, desc='Translating to english'):
    
    if not work:
        if c == len(translated_tweets):
            work = True
        c += 1
    else:
    
        data = entry[1]
        if len(data['cleaned_tweets']) < 5:
            translated_tweets.append('')
            continue

        if data['language'] == 'en':
            translated_tweets.append(data['cleaned_tweets'])
            continue

        translated_tweets.append(translator.translate(data['cleaned_tweets'], data['language']))

        if len(translated_tweets)%20 == 0:
            with open('translated_tweets_1.json', 'w') as f:
                json.dump(translated_tweets, f)

Translating to english:   0%|          | 0/148069 [00:00<?, ?it/s]

In [21]:
with open('translated_tweets_1.json', 'w') as f:
    json.dump(translated_tweets, f)

In [25]:
# combine output from all workers
translations = []
for i in range(1, 5):
    with open('translated_tweets_'+ str(i) +'.json', 'r') as f:
        translations = translations + json.load(f)

In [57]:
# saving pre-processed data set
df['translation'] = translations
df.to_pickle('pre-processed-data.pkl')

In [59]:
def pre_processing(text):
    
    """
    A function to clean the tweet text
    """
    #Remove hyper links
    text = re.sub(r'https?:\/\/\S+', '', text)
    
    #Remove @mentions, which is specific to twitter posts
    text = re.sub(r'@[A-Za-z0-9]+', '', text)
    
    # as tweets escape newlines with \n, we remove \n from string. We don't want to lose the meaning of the tweet
    text = re.sub(r'\n','', text) 

    # only keep letters and numbers
    text = re.sub(r"[^A-Za-z0-9#]+", ' ', text)
    
    #Remove extra spaces and the space at the end of the string
    text = re.sub(r'\s+', ' ', text).strip()
    
    return text

In [61]:
# final clean of translations
clean_translations = []
for entry in tqdm(df.iterrows(), total=len(df), desc='cleaning translations'):
    data = entry[1]
    clean_translations.append(pre_processing(data['translation']))

cleaning translations:   0%|          | 0/592277 [00:00<?, ?it/s]

In [62]:
df['translation'] = clean_translations

In [66]:
df = df[df['translation'].str.len() > 4]

In [67]:
df.to_pickle('pre-processed-data.pkl')