# Parsing tweets

In [None]:
# import relevant packages
import csv
import pandas as pd
import numpy as np
import os
import re
import datetime
import emoji

## Preparing data for further processing

This notebook has two main goals:

1. **Tweet text:** We extract the tweet text and some other basic information, such as the user tweeting, the date, and whether the tweet is an original or a retweet. This information is saved in a tabular format (here: a pandas dataframe) to make further preprocessing steps easier.
2. **Network edgelist:** We retrieve an edgelist of communicative interactions (retweets, quotes, mentions, replies). This information is also saved in tabular format (here: a pandas dataframe) in order to allow for network analysis with ``networkx``.

### Loading data
Loading the data from the three different dataset files we've previously created and removing duplicate tweets. Each tweet comes with a unique tweet ID and we can use this information to find and remove duplicates.

In [None]:
# NOTE(review): machine-specific absolute Windows path. All relative paths used further down
# (final_raw_data\..., sanity_check\..., final_data_prepare1\...) are resolved against this
# working directory, so this line has to be adapted before running the notebook elsewhere.
os.chdir(r'C:\Users\maril\Documents\20-21 KU\block 4\DM\twitter')

In [None]:
# reading in the tweet objects we've stored in a file before
# we define a function that we can use for all three datasets

def loading(file_tweets, file_no_rt, file_only_rt):

    """
    Returns the unique tweets inside the study period and a list of all rows that were read.

        Input: Three file names as strings in the following order:
                1. File containing mix of original and retweets.
                2. File containing just original tweets.
                3. File containing just retweets.

        Output: Tuple (tweets, count).
                tweets: List of unique tweet objects (each a dict of dicts) whose reference
                        date lies strictly between 2021-05-21 07:00 and 2021-05-28 07:00.
                        Uniqueness is decided on the tweet's 'id_str'; the first occurrence
                        (in file order) is kept.
                count:  List with one entry (the raw row string) for every row read from ALL
                        three files, i.e. including duplicates and tweets outside the time
                        window, so len(count) is the total number of collected tweets.
    """

    # format of the 'created_at' field in the stored tweet objects
    twitter_date_format = '%a %b %d %H:%M:%S +0000 %Y'

    # study period (both bounds exclusive)
    window_start = datetime.datetime(2021, 5, 21, 7, 0, 0)
    window_end = datetime.datetime(2021, 5, 28, 7, 0, 0)

    # list to count the total number of tweets collected (incl. duplicates)
    count = []

    # set to filter out duplicate tweets
    ids = set()

    # list to store the tweets (as dict of dicts) in
    tweets = []

    # the three files are processed in the same way, one after the other
    for file_name in (file_tweets, file_no_rt, file_only_rt):

        # open the file in the 'read' mode, specify encoding as 'utf8'
        with open(file_name, 'r', encoding='utf8') as infile:

            for row in csv.reader(infile):

                # the csv reader yields an empty list for blank lines; there is nothing to parse
                if not row:
                    continue

                # each row is a list in which the first element contains the tweet object as a
                # string resembling a dict; eval() turns this string back into a dict
                # NOTE(review): eval() executes whatever code the file contains. This is only
                # acceptable because the files were written by ourselves; consider
                # ast.literal_eval() if the stored strings are plain Python literals.
                tweet = eval(row[0])

                # keep track of every row read, from all three files (incl. duplicates)
                count.append(row[0])

                # restrict the timeframe: for retweets and quotes the creation date of the
                # embedded (retweeted / quoted) tweet is used, otherwise the tweet's own date
                if 'retweeted_status' in tweet:
                    created_at = tweet['retweeted_status']['created_at']
                elif 'quoted_status' in tweet:
                    created_at = tweet['quoted_status']['created_at']
                else:
                    created_at = tweet['created_at']

                date = datetime.datetime.strptime(created_at, twitter_date_format)

                # if the tweet is inside the time window and its id is not yet in the 'ids' set,
                # remember the id and add the entire dict to the 'tweets' list
                if window_start < date < window_end and tweet['id_str'] not in ids:
                    ids.add(tweet['id_str'])
                    tweets.append(tweet)

    return tweets, count

In [None]:
# run the loader once per country; each call gets the mixed file, the file with
# original tweets only and the file with retweets only

# German data
de_tweets_unique, de_count = loading(
    file_tweets=r'final_raw_data\de_tweets2.csv',
    file_no_rt=r'final_raw_data\de_tweets_no_rt2.csv',
    file_only_rt=r'final_raw_data\de_tweets_only_rt2.csv',
)

# Danish data
da_tweets_unique, da_count = loading(
    file_tweets=r'final_raw_data\tweets_da2.csv',
    file_no_rt=r'final_raw_data\tweets_no_rt_da2.csv',
    file_only_rt=r'final_raw_data\tweets_only_rt_da2.csv',
)

# Polish data
pl_tweets_unique, pl_count = loading(
    file_tweets=r'final_raw_data\tweets_pl_final3.csv',
    file_no_rt=r'final_raw_data\tweets_pl_final_no_rt3.csv',
    file_only_rt=r'final_raw_data\tweets_pl_final_rt3.csv',
)

In [None]:
### Count how many duplicates there are
# share of the counted rows that did not end up in the list of unique tweets
# NOTE(review): loading() also drops tweets outside the time window, so this share
# presumably contains those as well, not only true duplicates - confirm.
de_pct_duplicates = (len(de_count) - len(de_tweets_unique)) / len(de_count)
da_pct_duplicates = (len(da_count) - len(da_tweets_unique)) / len(da_count)
pl_pct_duplicates = (len(pl_count) - len(pl_tweets_unique)) / len(pl_count)

# print the results; the value is the share of duplicates (not of unique tweets), and the
# '.1%' format avoids float artefacts such as 12.300000000000001
print(f'German data: {de_pct_duplicates:.1%} of the tweets were duplicates.')
print(f'Danish data: {da_pct_duplicates:.1%} of the tweets were duplicates.')
print(f'Polish data: {pl_pct_duplicates:.1%} of the tweets were duplicates.')

### Filtering out irrelevant tweets

In [None]:
# function to filter out all irrelevant tweets

def filter_irrelevant(list_of_tweet_dicts, irrelevant_ids):

    """
    Returns the tweets that are not affected by any irrelevant tweet ID.

        Input: The list of unique tweets (each a dict of dicts) and a collection of tweet IDs
        that were marked as irrelevant.

        Output: List with the tweets to keep, in their original order. A tweet is kept only if
        its own ID and - for a retweet - the ID of the retweeted tweet or - for a quote that is
        not a retweet - the ID of the quoted tweet are all absent from 'irrelevant_ids'.
    """

    def _ids_to_check(tweet):
        # every ID we can find in the tweet object that must not be irrelevant
        own_id = tweet['id_str']

        # retweets take precedence over quotes, a plain tweet only has its own ID
        if 'retweeted_status' in tweet:
            return (own_id, tweet['retweeted_status']['id_str'])
        if 'quoted_status' in tweet:
            return (own_id, tweet['quoted_status']['id_str'])
        return (own_id,)

    return [
        tweet
        for tweet in list_of_tweet_dicts
        if not any(tweet_id in irrelevant_ids for tweet_id in _ids_to_check(tweet))
    ]

#### Loading irrelevant tweet IDs

We've manually gone through keywords which were problematic in the data collection process (such as the use of the keyword 'j&j' since Twitter apparently treats the ampersand symbol as an ``AND`` operator). We've noted the IDs of all irrelevant tweets and remove these tweets here.

In [None]:
# Germany: loading irrelevant tweets and saving them in a set
de_irr = set()

with open(r'sanity_check\remove_de.txt', 'r') as de_file:
    for line in de_file:
        # the first run of digits on a line is taken as the tweet ID; lines without any
        # digits (e.g. a blank line at the end of the file) are skipped instead of crashing
        id_match = re.search(r'\d+', line)
        if id_match:
            de_irr.add(id_match.group())

print(len(de_irr))
de_irr

In [None]:
# Denmark: loading irrelevant tweets and saving them in a set
da_irr = set()

# the IDs are spread over three files that all share the same layout
for file_name in (r'sanity_check\remove_da.txt',
                  r'sanity_check\remove_da2.txt',
                  r'sanity_check\remove_da3.txt'):

    with open(file_name, 'r') as id_file:
        for line in id_file:
            # the first run of digits on a line is taken as the tweet ID; lines without any
            # digits (e.g. a blank line at the end of the file) are skipped instead of crashing
            id_match = re.search(r'\d+', line)
            if id_match:
                da_irr.add(id_match.group())

print(len(da_irr))
da_irr

In [None]:
# Poland: loading irrelevant tweets and saving them in a set
pl_irr = set()

# the IDs are spread over three files that all share the same layout
for file_name in (r'sanity_check\remove_pl.txt',
                  r'sanity_check\remove_pl2.txt',
                  r'sanity_check\remove_pl3.txt'):

    with open(file_name, 'r') as id_file:
        for line in id_file:
            # the first run of digits on a line is taken as the tweet ID; lines without any
            # digits (e.g. a blank line at the end of the file) are skipped instead of crashing
            id_match = re.search(r'\d+', line)
            if id_match:
                pl_irr.add(id_match.group())

print(len(pl_irr))
pl_irr

In [None]:
# drop the manually flagged tweets from each country's list of unique tweets
de_relevant, da_relevant, pl_relevant = (
    filter_irrelevant(unique_tweets, flagged_ids)
    for unique_tweets, flagged_ids in (
        (de_tweets_unique, de_irr),
        (da_tweets_unique, da_irr),
        (pl_tweets_unique, pl_irr),
    )
)

### Filtering out all tweets that produce empty strings after preprocessing

We use a modified version of the preprocessing function that is later on used to conduct the actual preprocessing.

In this function, we conduct the following preprocessing steps:

* Remove **URLs**.
* Remove **emojis** using the ``emoji`` package.
* **Remove the mentions.**
* Remove ``&amp;`` (the HTML code for the **ampersand** symbol)
* **Replace '-' by an empty string:** This is important to keep words together that belong together. E.g. the German 'Impf-Reihenfolge' should be merged into one word 'ImpfReihenfolge' in order to not distort its meaning. This makes lemmatization more difficult, esp. because 'ImpfReihenfolge' is not the correct spelling of the word (it should be 'Impfreihenfolge'), but since this will only affect a very small number of words, we deem it acceptable.
* **Remove ':', '\*' and *_*:** Again, this is mostly relevant for German. In German, nouns describing people (e.g. the word for 'doctor') usually come in a male ('Doktor') and female form ('Doktorin'). In recent years, there has been a movement to include both spellings as either 'Doktor_in', 'Doktor:in' or 'Doktor\*in' in an attempt at more gender-neutral language. If we replaced these symbols by spaces, we would distort the word's meaning since 'Doktor in' is not the same as 'Doktorin'. We therefore just remove these symbols. This is not relevant for Polish or Danish, but since removing these symbols does not cause any other issue there, we do this for all three datasets.
* Only **keep the remaining alphanumeric characters** (incl. ``#`` for hashtags).
* Remove **numbers**.
* Remove **single characters.** They are usually not particularly meaningful: In Polish and German, there are no (meaningful) words that only consist of one character. In Danish, there is the 'I' (the plural 'you'; as in 'Hvordan har I det?'). But this character will be removed in the stopword list anyway, so we might as well already remove it here.
* Remove **double, triple etc. whitespaces**.
* Remove **leading and trailing whitespaces**.

In [None]:
# preprocess: remove @mentions

def preprocess_without_mentions(text):

    """
    Returns the cleaned tweet text; the result is an empty string if nothing meaningful is left.

        Input: The raw tweet text as a string.

        Output: The text after removing URLs, emojis, @mentions, '&amp;', the characters
        '-', '_', '*' and ':', all remaining non-word characters except '#', numbers,
        single ASCII letters and superfluous whitespace.
    """

    # remove URLs
    text = re.sub(r"(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'\".,<>?«»“”‘’]))",
                  ' ', text)

    # remove emojis with the 'emoji' package and replace them with a whitespace
    # get_emoji_regexp() no longer exists in emoji >= 2.0, so replace_emoji() is used whenever
    # the installed version offers it; older versions fall back to the regex pattern
    if hasattr(emoji, 'replace_emoji'):
        text = emoji.replace_emoji(text, replace=' ')
    else:
        text = re.sub(emoji.get_emoji_regexp(), ' ', text)

    # remove @mentions, also at the very end of the text or directly before punctuation
    # (the lookbehind leaves an '@' inside a word, e.g. in an e-mail address, untouched)
    text = re.sub(r'(?<!\w)@\w+', ' ', text)

    # remove all '&amp;' (the HTML code for the ampersand symbol)
    text = re.sub('&amp;', '', text)

    # replace '-', '_', '*' and ':' by an empty string so that the word parts stay together
    text = re.sub(r'[-_*:]', '', text)

    # keep only word characters and '#' (for hashtags)
    # that removes all weird/funny characters
    text = ' '.join(re.findall(r'[\w#]+', text))

    # remove numbers; note: this will remove the '19' in Covid19, but we do not see this as an issue
    text = re.sub(r'\d+', ' ', text)

    # remove single characters (because they are not particularly meaningful)
    text = re.sub(r'\b[a-zA-Z]\b', ' ', text)

    # collapse double, triple etc. whitespace
    text = re.sub(r'\s+', ' ', text)

    # remove leading and trailing whitespace
    return text.strip()

In [None]:
# function to remove tweets that are empty after preprocessing

def remove_empty_tweets(list_of_filtered_tweets):

    """
    Returns the tweets whose relevant texts are all non-empty after preprocessing.

        Input: List containing tweets (tweet format: dict of dicts).

        Output: List with the tweets to keep, in their original order. For a retweet the
        retweeted text is checked, for a quote both the user's comment and the quoted text,
        and for a plain tweet its own text.
    """

    def _texts_to_check(tweet):
        # retweets take precedence over quotes; only quotes carry two texts
        if 'retweeted_status' in tweet:
            return [tweet['retweeted_status']['full_text']]
        if 'quoted_status' in tweet:
            return [tweet['full_text'], tweet['quoted_status']['full_text']]
        return [tweet['full_text']]

    # list to store the final tweets in
    final_tweets = []

    for tweet in list_of_filtered_tweets:

        # apply the preprocessing function to every text that belongs to this tweet
        cleaned_texts = [preprocess_without_mentions(raw_text) for raw_text in _texts_to_check(tweet)]

        # keep the tweet only if none of the cleaned texts is an empty string
        if all(cleaned_texts):
            final_tweets.append(tweet)

    return final_tweets

In [None]:
# drop every tweet that is empty after preprocessing, one country at a time
de_final, da_final, pl_final = map(
    remove_empty_tweets,
    (de_relevant, da_relevant, pl_relevant),
)

## 1. Tweet text

### Retrieving the tweet text
Here we extract the tweet text together with some meta-data on the tweet ID, the user and the date it was created. In particular, we create two dataframes: The first one contains information on all tweets (original and retweet). The second dataframe contains information on all original tweets.

There are two tricky parts that need to be addressed:

1. **Retweets:** We create the ``original`` dataframe that only contains original tweet texts (this will be used for the topic modelling and the active learning classification (ASDS2) and the semantic network (DM)). For this dataframe, we discard all retweets. However, we still want to retrieve the original text that was retweeted since that constitutes an original tweet. This information is hidden in the tweet object (in a dictionary called ``retweeted_status``) and we can retrieve all relevant information, but we have to add another check to make sure that there are no duplicates. The duplicate check we ran in the cells above was only concerned with the tweet ID of the overall tweet object, but it did not remove duplicates in the retweet ID that can be found in the ``retweeted_status`` dictionary.
  
To illustrate the point, here is a shortened version of what a tweet object of a retweet looks like (we've made the information up to not accidentally display personal information protected by the GDPR, so it's not actually possible to look this tweet up).

```
{'created_at': 'Fri May 28 19:46:51 +0000 2021',
 'id': 1398312371249155565,
 'id_str': '1398312371249155565',
 'full_text': 'RT @hello_summer: Das ist ein Beispiel Tweet über Impfungen und Corona…',
 'truncated': False,
 'display_text_range': [0, 140],

 ...
 
 'user': {'id': 1375388000040222754,
  'id_str': '1375388000040222754',
  'name': 'Erika Musterfrau',
  'screen_name': 'EMusterfrau',
  'location': '',
 ...
          
 'retweeted_status': {'created_at': 'Fri May 28 19:34:49 +0000 2021',
  'id': 1333360002156410885,
  'id_str': '1333360002156410885',
  'full_text': 'Das ist ein Beispiel Tweet über Impfungen und Corona #Pfizer #Astra #Impfung #Stiko',
  'truncated': False,
 ...
  'user': {'id': 762755574677764929,
   'id_str': '762755574677764929',
   'name': 'Summer',
   'screen_name': 'hello_summer',
   'location': 'BW',

   ...
```

2. **Quotes/quoted tweets**: Quotes are retweets to which the user retweeting has added their own comment. These kinds of tweets pose a particular challenge since we have to decide: Do we just keep the comment that was added by the user? Do we add the quoted text and the user's comment up in one long string? Do we just keep what the quoted tweet originally said (that would mean treating the quotes as retweets)?
We've discussed the implications of the different approaches and in the end decided to only keep the user's comment since this will be most relevant for the topic modelling and the classification task. However, we are aware that this means missing out on the context of the user's comment since they usually refer to the contents of the quoted tweet. We therefore add a column that contains the original quoted tweet so that we can refer back to them if necessary.

In [None]:
# retrieving the tweet text (for the topic modelling which we'll do in another notebook)
# we define a function that we can use for all three datasets

def tweets(list_of_final_tweets):

    """
    Returns one dataframe with all tweets and one with the original tweets only.

        Input: List containing tweets (tweet format: dict of dicts)

        Output: Two pandas dataframes.
                1. All tweets with the columns id, user, date, text, is_retweet, is_quote
                   and quote. Retweets carry the retweeted text and is_retweet = 1. Quotes
                   carry the user's comment as text, is_quote = 1 and the quoted text in
                   'quote'. The retweeted / quoted tweets themselves are added as additional
                   rows with both flags set to 0.
                2. Original tweets with the columns id, user, date and text: plain tweets plus
                   the tweets that were retweeted or quoted (but not the retweets / quotes).
                In both dataframes every id is added only once (first occurrence wins) and
                'date' is converted with pd.to_datetime.
    """

    # rows collected for the two dataframes
    all_tweets = []
    original = []

    # ids that already have a row in the respective list
    all_ids = set()
    original_ids = set()

    for tweet in list_of_final_tweets:

        # information on the tweet object itself
        own_id = tweet['id_str']
        author = tweet['user']['screen_name']
        created = tweet['created_at']

        # 'embedded' is the retweeted / quoted tweet (None for a plain tweet);
        # 'own_row' is (id, user handle, date, tweet text, is retweet, is quote, quote text)
        if 'retweeted_status' in tweet:
            embedded = tweet['retweeted_status']
            own_row = (own_id, author, created, embedded['full_text'], 1, 0, np.nan)
        elif 'quoted_status' in tweet:
            embedded = tweet['quoted_status']
            own_row = (own_id, author, created, tweet['full_text'], 0, 1, embedded['full_text'])
        else:
            embedded = None
            own_row = (own_id, author, created, tweet['full_text'], 0, 0, np.nan)

        # 'source_row' is the row for the 'original' list: the plain tweet itself or the
        # retweeted / quoted tweet, which we treat as an "original" tweet as well
        if embedded is None:
            source_row = own_row[:4]
            candidates = [own_row]
        else:
            source_row = (embedded['id_str'], embedded['user']['screen_name'],
                          embedded['created_at'], embedded['full_text'])
            candidates = [own_row, source_row + (0, 0, np.nan)]

        # only add rows whose id has not been seen before
        all_tweets.extend([row for row in candidates if row[0] not in all_ids])
        if source_row[0] not in original_ids:
            original.append(source_row)

        # remember the ids (after both checks, so that they do not influence each other)
        all_ids.update(row[0] for row in candidates)
        original_ids.add(source_row[0])

    # turning the lists into pandas dataframes
    all_df = pd.DataFrame(
        all_tweets,
        columns=['id', 'user', 'date', 'text', 'is_retweet', 'is_quote', 'quote'],
    )
    original_df = pd.DataFrame(original, columns=['id', 'user', 'date', 'text'])

    # turn the date column into pandas datetime
    for frame in (all_df, original_df):
        frame['date'] = pd.to_datetime(frame['date'])

    return all_df, original_df

In [None]:
# build the "all tweets" and the "original tweets" dataframe for each country
# (order: German, Danish, Polish)
(
    (de_all_text, de_original_text),
    (da_all_text, da_original_text),
    (pl_all_text, pl_original_text),
) = map(tweets, (de_final, da_final, pl_final))

### Checking if any irrelevant tweets are left

In [None]:
# sanity check: count the ids in the "all tweets" dataframes that are still on the
# list of irrelevant ids (the expected result is 0 for every country)

# Germany
de = sum(1 for tweet_id in de_all_text['id'] if tweet_id in de_irr)
print(f'There are {de} issues with irrelevant tweets in the German data.')

# Denmark
da = sum(1 for tweet_id in da_all_text['id'] if tweet_id in da_irr)
print(f'There are {da} issues with irrelevant tweets in the Danish data.')

# Poland
pl = sum(1 for tweet_id in pl_all_text['id'] if tweet_id in pl_irr)
print(f'There are {pl} issues with irrelevant tweets in the Polish data.')

### Checking if there are any duplicates left

In [None]:
# function to check if we successfully removed all duplicates

def check_duplicates(pd_dataframe):

    """
    Prints the number of rows of the dataframe next to the number of distinct values in its
    'id' column. Nothing is returned; the two numbers match if there are no duplicate ids.
    """

    total_rows = pd_dataframe.shape[0]
    distinct_ids = len(set(pd_dataframe['id']))

    print(f"There are {total_rows} tweets in the dataframe and {distinct_ids} of them are unique.")

In [None]:
# apply the function to the different dataframes
# check_duplicates() prints its message itself and returns None, so embedding the call in an
# f-string would print "...: None"; instead the label is printed first (without a line break)
# and the function then completes the line
for label, frame in [
    ("German data all", de_all_text), ("German data original", de_original_text),
    ("Danish data all", da_all_text), ("Danish data original", da_original_text),
    ("Polish data all", pl_all_text), ("Polish data original", pl_original_text),
]:
    print(f"{label}: ", end="")
    check_duplicates(frame)

### Checking the length of the dataframe and displaying the first few rows

In [None]:
# size and a three-row preview of each "all tweets" dataframe (German, Danish, Polish)
for all_frame in (de_all_text, da_all_text, pl_all_text):
    n_tweets = len(all_frame)
    print(f"There are {n_tweets} tweets incl. retweets in this dataframe.")
    display(all_frame.head(3))

In [None]:
# size and a three-row preview of each "original tweets" dataframe (German, Danish, Polish)
for original_frame in (de_original_text, da_original_text, pl_original_text):
    n_tweets = len(original_frame)
    print(f"There are {n_tweets} original tweets in this dataframe.")
    display(original_frame.head(3))

### Checking the time period from which we've collected tweets

In [None]:
# report the earliest and the latest date found in each "original tweets" dataframe
time_df_list = [de_original_text, da_original_text, pl_original_text]

language_list = ['German', 'Danish','Polish']

# both lists are in the same order, so they can be walked through in parallel
for language, frame in zip(language_list, time_df_list):
    first_date = min(frame['date'])
    last_date = max(frame['date'])
    print(f"{language}: For the original tweets, the time period starts on {first_date} and ends on {last_date}.")

# 2: Twitter interactions (retweets, quotes, mentions, replies)
Here we create edgelists for retweets, quotes, mentions and replies. We are not yet sure if we need all of this information, or whether retweets will be enough, but we will extract all communicative interactions just in case we want to use them later.

In [None]:
def interactions(list_of_final_tweets):

    """
    Returns edgelist for retweets, quotes, replies and mentions.

        Input: List containing tweets (tweet format: dict of dicts). Every tweet needs the keys
        'user' (with 'screen_name'), 'in_reply_to_screen_name' and 'entities' (with
        'user_mentions'); 'retweeted_status' and 'quoted_status' are optional.

        Output: Four pandas dataframes (retweets, quotes, replies, mentions), each with the
        columns 'source' and 'target'. Both columns hold screen names: 'source' is the user who
        posted the tweet, 'target' the user who was retweeted, quoted, replied to or mentioned.
    """

    # empty lists to store the retweet, quote, mention and reply information in
    rt = []
    quote = []
    reply = []
    mention = []

    for tweet in list_of_final_tweets:

        # the user who posted the tweet is the source of every interaction found in it
        source = tweet['user']['screen_name']

        # retweets
        if 'retweeted_status' in tweet:
            rt.append((source, tweet['retweeted_status']['user']['screen_name']))

        # quotes: the target is the screen name of the quoted user (not the whole user dict),
        # in line with the other three edgelists
        if 'quoted_status' in tweet:
            quote.append((source, tweet['quoted_status']['user']['screen_name']))

        # replies: the field is empty (falsy) if the tweet is not a reply
        if tweet['in_reply_to_screen_name']:
            reply.append((source, tweet['in_reply_to_screen_name']))

        # mentions: 'user_mentions' holds one dict per user mentioned in the tweet text;
        # an empty collection simply adds nothing
        for mentioned_user in tweet['entities']['user_mentions']:
            mention.append((source, mentioned_user['screen_name']))

    # turning the lists into pandas dataframes
    rt = pd.DataFrame(rt, columns = ['source', 'target'])
    quote = pd.DataFrame(quote, columns = ['source', 'target'])
    reply = pd.DataFrame(reply, columns = ['source', 'target'])
    mention = pd.DataFrame(mention, columns = ['source', 'target'])

    return rt, quote, reply, mention

In [None]:
# applying the function to the different datasets
# NOTE(review): only the Danish call is active. The German and Polish calls are commented out,
# so this cell does not create the de_* and pl_* edgelists; uncomment the respective line
# to build them.

# German data
# de_edgelist_rt, de_edgelist_quote, de_edgelist_reply, de_edgelist_mention = interactions(de_final)

# Danish data
da_edgelist_rt, da_edgelist_quote, da_edgelist_reply, da_edgelist_mention = interactions(da_final)

# Polish data
# pl_edgelist_rt, pl_edgelist_quote, pl_edgelist_reply, pl_edgelist_mention = interactions(pl_final)

### Saving the dataframes
We save all the different dataframes we've created as csv files.

In [None]:
# saving the different dataframes as files

# (file name without extension, dataframe) pairs; only the Danish frames are active,
# the German and Polish entries are commented out
frames_to_save = [
#     ('de_all_text', de_all_text), ('de_original_text', de_original_text), ('de_edgelist_rt', de_edgelist_rt),
#     ('de_edgelist_quote', de_edgelist_quote), ('de_edgelist_reply', de_edgelist_reply), ('de_edgelist_mention', de_edgelist_mention),
    ('da_all_text', da_all_text), ('da_original_text', da_original_text), ('da_edgelist_rt', da_edgelist_rt),
    ('da_edgelist_quote', da_edgelist_quote), ('da_edgelist_reply', da_edgelist_reply), ('da_edgelist_mention', da_edgelist_mention),
#     ('pl_all_text', pl_all_text), ('pl_original_text', pl_original_text), ('pl_edgelist_rt', pl_edgelist_rt),
#     ('pl_edgelist_quote', pl_edgelist_quote), ('pl_edgelist_reply', pl_edgelist_reply), ('pl_edgelist_mention', pl_edgelist_mention),
]

# the two parallel lists, derived from the pairs above
fil_name_list = [file_stem for file_stem, _ in frames_to_save]
df_list = [frame for _, frame in frames_to_save]

# one csv file per dataframe, written without the index column
for file_stem, frame in zip(fil_name_list, df_list):
    frame.to_csv(f"final_data_prepare1\\{file_stem}.csv", index=False)