# Analysis of Tweets from a full archival search

In [4]:
import ast
import json
from os.path import join

import numpy as np
import pandas as pd

In [7]:
# folder that holds the per-query Tweet files; the cells below read
# '<src>/<query>.csv' and (de)compress '<query>.jsonl' in this folder
src = '../data/tweets'
# see folder /queries for the corresponding .sh files that use twarc to query
# tweets from the Twitter v2 API
# Each entry is the base name (without extension) of one query's data files.
queries = [
    '95vsWissZeitVG', # secondary relevance
    'ACertainDegreeOfFlexibility', # secondary relevance
    'bundestagswahl', # tertiary relevance
    'ESC', # tertiary relevance
    'euro', # tertiary relevance
    'FristIstFrust', # secondary relevance
    'HannaImBundestag', # primary relevance
    'Hanna_Others', # secondary relevance, combination of IchBinNichtHanna, IchBinMelek and IchBinJelena
    'IchBinHanna', # primary relevance
    'IchBinJelena', # secondary relevance
    'IchBinMelek', # secondary relevance
    'IchBinNichtHanna', # secondary relevance
    'video_url', # tertiary or secondary relevance -- NOTE(review): the original comment was ambiguous, confirm
    'WissZeitVG' # secondary relevance
]

Note: if the Tweets have already been downloaded and the Tweet data exists in the folder ```data``` as compressed ```.jsonl``` files, you can skip the "Query tweets" and "Compress data" steps and start processing at "Decompress data".

## Collect Tweets

### Query tweets

Note: the queries are saved in separate files. I do this to make the data collection process reproducible by saving the exact query parameters for every data file. Before a `.sh` file can be executed, it has to be given execute permission:  

`chmod +x ./queries/query.sh`

Then navigate to the `/queries` folder and run  

`./query.sh`  

Note: this can take a while, depending on the number of Tweets that need to be downloaded.

### Compress data

Note: on Windows, `.xz` files can be decompressed, for example, with [WinZIP](https://www.winzip.com/win/en/xz-file.html).

In [None]:
# Compress the downloaded .jsonl files of each query with xz.
# the parameter "-k" keeps the original file
# Lines that are commented out (bundestagswahl, ESC, euro) are skipped, i.e.
# no archive is created for these queries by this cell.
! xz -k ../data/tweets/95vsWissZeitVG.jsonl
! xz -k ../data/tweets/ACertainDegreeOfFlexibility.jsonl
#! xz -k ../data/tweets/bundestagswahl.jsonl
#! xz -k ../data/tweets/ESC.jsonl
#! xz -k ../data/tweets/euro.jsonl
! xz -k ../data/tweets/FristIstFrust.jsonl
! xz -k ../data/tweets/HannaImBundestag.jsonl
! xz -k ../data/tweets/Hanna_Others.jsonl
! xz -k ../data/tweets/IchBinHanna.jsonl
! xz -k ../data/tweets/IchBinJelena.jsonl
! xz -k ../data/tweets/IchBinMelek.jsonl
! xz -k ../data/tweets/IchBinNichtHanna.jsonl
! xz -k ../data/tweets/video_url.jsonl
! xz -k ../data/tweets/WissZeitVG.jsonl

### Decompress data

In [None]:
# Decompress the .jsonl.xz archive of every query back into a .jsonl file.
# Unlike the compression step, "-k" is not passed here, so xz removes each
# .xz archive once it has been decompressed (see the xz manual); re-running
# this cell therefore fails for archives that are already unpacked.
# NOTE(review): bundestagswahl, ESC and euro are decompressed here although
# their compression lines are commented out in the "Compress data" cell --
# presumably these archives stem from an earlier run; confirm.
! xz -d ../data/tweets/95vsWissZeitVG.jsonl.xz
! xz -d ../data/tweets/ACertainDegreeOfFlexibility.jsonl.xz
! xz -d ../data/tweets/bundestagswahl.jsonl.xz
! xz -d ../data/tweets/ESC.jsonl.xz
! xz -d ../data/tweets/euro.jsonl.xz
! xz -d ../data/tweets/FristIstFrust.jsonl.xz
! xz -d ../data/tweets/HannaImBundestag.jsonl.xz
! xz -d ../data/tweets/Hanna_Others.jsonl.xz
! xz -d ../data/tweets/IchBinHanna.jsonl.xz
! xz -d ../data/tweets/IchBinJelena.jsonl.xz
! xz -d ../data/tweets/IchBinMelek.jsonl.xz
! xz -d ../data/tweets/IchBinNichtHanna.jsonl.xz
! xz -d ../data/tweets/video_url.jsonl.xz
! xz -d ../data/tweets/WissZeitVG.jsonl.xz

### Convert to CSV

Removes duplicate tweets (by ID) but keeps referenced tweets.

In [99]:
! twarc2 csv ../data/tweets/95vsWissZeitVG.jsonl ../data/95vsWissZeitVG.csv
! twarc2 csv ../data/tweets/ACertainDegreeOfFlexibility.jsonl ../data/ACertainDegreeOfFlexibility.csv
! twarc2 csv ../data/tweets/bundestagswahl.jsonl ../data/bundestagswahl.csv
! twarc2 csv ../data/tweets/ESC.jsonl ../data/ESC.csv
#! twarc2 csv ../data/tweets/euro.jsonl ../data/euro.csv
! twarc2 csv ../data/tweets/FristIstFrust.jsonl ../data/FristIstFrust.csv
! twarc2 csv ../data/tweets/HannaImBundestag.jsonl ../data/HannaImBundestag.csv
! twarc2 csv ../data/tweets/Hanna_Others.jsonl ../data/Hanna_Others.csv
! twarc2 csv ../data/tweets/IchBinHanna.jsonl ../data/IchBinHanna.csv
! twarc2 csv ../data/tweets/IchBinJelena.jsonl ../data/IchBinJelena.csv
! twarc2 csv ../data/tweets/IchBinMelek.jsonl ../data/IchBinMelek.csv
! twarc2 csv ../data/tweets/IchBinNichtHanna.jsonl ../data/IchBinNichtHanna.csv
! twarc2 csv ../data/tweets/video_url.jsonl ../data/video_url.csv
! twarc2 csv ../data/tweets/WissZeigVG.jsonl ../data/WissZeigVG.csv

100%|██████████████████████████████████████| 21.3M/21.3M [00:07<00:00, 3.07MB/s]

ℹ️
Read 26950 tweets from 29 lines. 
13076 were referenced tweets, 12244 were duplicates.
Wrote 14706 rows and output 89 of 89 input columns in the CSV.

100%|██████████████████████████████████████| 19.3M/19.3M [00:06<00:00, 3.17MB/s]

ℹ️
Read 23443 tweets from 25 lines. 
11331 were referenced tweets, 10271 were duplicates.
Wrote 13172 rows and output 89 of 89 input columns in the CSV.

100%|████████████████████████████████████████| 455M/455M [01:59<00:00, 4.01MB/s]

ℹ️
Read 377626 tweets from 422 lines. 
172491 were referenced tweets, 146019 were duplicates.
Wrote 231607 rows and output 89 of 89 input columns in the CSV.

100%|████████████████████████████████████████| 105M/105M [00:18<00:00, 5.78MB/s]

ℹ️
Read 52325 tweets from 79 lines. 
14227 were referenced tweets, 12369 were duplicates.
Wrote 39956 rows and output 89 of 89 input columns in the CSV.

100%|██████████████████████████████████████| 23.9M/

## Extract conversation IDs

In [100]:
def get_conversation_IDs(src, dst, filename, chunks=False, chunksize=1000):
    """Save the unique conversation IDs of one query's Tweets as text file(s).

    Reads ``<src>/<filename>.csv``, collects the distinct, non-missing values
    of its ``conversation_id`` column (in order of first appearance) and
    writes them to ``dst``, one ID per line.

    Parameters
    ----------
    src : str
        Folder containing ``<filename>.csv``.
    dst : str
        Folder the ID file(s) are written to.
    filename : str
        Name of the query, i.e. the CSV file name without its extension.
    chunks : bool, optional
        If False (default), all IDs go to ``<filename>_ConversationIDs.txt``.
        If True, they are split into files named
        ``<filename>_ConversationIDs_<start>_to_<stop>.txt`` that hold at most
        ``chunksize`` IDs each; no empty trailing file is written.
    chunksize : int, optional
        Maximum number of IDs per file when ``chunks`` is True.

    Returns
    -------
    None
        If the CSV file does not exist, a warning is printed and nothing is
        written.
    """
    try:
        # Read the ID column as text. Parsed numerically, a single missing
        # value would turn the whole column into float64, which cannot hold
        # 19-digit IDs exactly and would silently corrupt them.
        data = pd.read_csv(join(src, '{}.csv'.format(filename)),
                           low_memory=False, dtype={'conversation_id': str})
    except FileNotFoundError:
        # report the file that was requested instead of a notebook global
        print('WARNING: no tweets found for query "{}"'.format(filename))
        return

    def to_int(value):
        try:
            return int(value)
        except ValueError:
            # tolerate IDs that were stored in float notation ("123.0")
            return int(float(value))

    # explicit int64: plain ``int`` is only 32 bit wide on some platforms
    conversationIDs = pd.Series(
        [to_int(value) for value in data['conversation_id'].dropna()],
        dtype='int64').unique()
    print('{}: There are {} Tweets from {} conversations'\
              .format(filename, len(data), len(conversationIDs)))

    if chunks:
        N_IDs = len(conversationIDs)
        for start in range(0, N_IDs, chunksize):
            # the last chunk may be shorter than chunksize
            stop = min(start + chunksize, N_IDs)
            np.savetxt(join(dst, '{}_ConversationIDs_{}_to_{}.txt'\
                .format(filename, start, stop)),
                conversationIDs[start:stop], fmt='%d')
    else:
        np.savetxt(join(dst, '{}_ConversationIDs.txt'.format(filename)),
                   conversationIDs, fmt='%d')

In [101]:
# folder the conversation-ID text files are written to
# NOTE(review): presumably this folder has to exist before the cell is run -- confirm
dst = '../data/conversation_IDs'
# write one ID file per query; `src` and `queries` come from the
# configuration cell near the top of the notebook
for query in queries:
    get_conversation_IDs(src, dst, query)

95vsWissZeitVG: There are 14532 Tweets from 13746 conversations
ACertainDegreeOfFlexibility: There are 13099 Tweets from 12216 conversations
bundestagswahl: There are 231607 Tweets from 209035 conversations
ESC: There are 39956 Tweets from 37594 conversations
FristIstFrust: There are 15988 Tweets from 15089 conversations
HannaImBundestag: There are 6465 Tweets from 6203 conversations
Hanna_Others: There are 47 Tweets from 42 conversations
IchBinHanna: There are 61554 Tweets from 57545 conversations
IchBinJelena: There are 76 Tweets from 74 conversations
IchBinMelek: There are 59 Tweets from 59 conversations
IchBinNichtHanna: There are 15 Tweets from 12 conversations
video_url: There are 1473 Tweets from 1423 conversations
WissZeitVG: There are 12198 Tweets from 11095 conversations


## Extract Tweet IDs

In [122]:
def get_Tweet_IDs(src, dst, filename, chunks=False, chunksize=1000):
    """Save the unique Tweet IDs of one query as text file(s).

    Reads ``<src>/<filename>.csv``, collects the distinct, non-missing values
    of its ``id`` column (in order of first appearance) and writes them to
    ``dst``, one ID per line.

    Parameters
    ----------
    src : str
        Folder containing ``<filename>.csv``.
    dst : str
        Folder the ID file(s) are written to.
    filename : str
        Name of the query, i.e. the CSV file name without its extension.
    chunks : bool, optional
        If False (default), all IDs go to ``<filename>_TweetIDs.txt``.
        If True, they are split into files named
        ``<filename>_TweetIDs_<start>_to_<stop>.txt`` that hold at most
        ``chunksize`` IDs each; no empty trailing file is written.
    chunksize : int, optional
        Maximum number of IDs per file when ``chunks`` is True.

    Returns
    -------
    None
        If the CSV file does not exist, a warning is printed and nothing is
        written.
    """
    try:
        # Read the ID column as text. Parsed numerically, a single missing
        # value would turn the whole column into float64, which cannot hold
        # 19-digit IDs exactly and would silently corrupt them.
        data = pd.read_csv(join(src, '{}.csv'.format(filename)),
                           low_memory=False, dtype={'id': str})
    except FileNotFoundError:
        # report the file that was requested instead of a notebook global
        print('WARNING: no tweets found for query "{}"'.format(filename))
        return

    def to_int(value):
        try:
            return int(value)
        except ValueError:
            # tolerate IDs that were stored in float notation ("123.0")
            return int(float(value))

    # explicit int64: plain ``int`` is only 32 bit wide on some platforms
    TweetIDs = pd.Series(
        [to_int(value) for value in data['id'].dropna()],
        dtype='int64').unique()
    print('{}: There are {} Tweets'\
              .format(filename, len(TweetIDs)))

    if chunks:
        N_IDs = len(TweetIDs)
        for start in range(0, N_IDs, chunksize):
            # the last chunk may be shorter than chunksize
            stop = min(start + chunksize, N_IDs)
            np.savetxt(join(dst, '{}_TweetIDs_{}_to_{}.txt'\
                .format(filename, start, stop)),
                TweetIDs[start:stop], fmt='%d')
    else:
        np.savetxt(join(dst, '{}_TweetIDs.txt'.format(filename)),
                   TweetIDs, fmt='%d')

In [123]:
# folder the Tweet-ID text files are written to (reuses the name `dst`)
# NOTE(review): presumably this folder has to exist before the cell is run -- confirm
dst = '../data/tweet_IDs'
# NOTE(review): the recorded output below lists only the reduced query set that
# is defined later in "Sanity check data", i.e. this cell was last run after
# `queries` had been redefined; in a top-to-bottom run it uses the full list.
for query in queries:
    get_Tweet_IDs(src, dst, query)

95vsWissZeitVG: There are 14532 Tweets
ACertainDegreeOfFlexibility: There are 13099 Tweets
FristIstFrust: There are 15988 Tweets
HannaImBundestag: There are 6465 Tweets
Hanna_Others: There are 47 Tweets
IchBinHanna: There are 61554 Tweets
IchBinJelena: There are 76 Tweets
IchBinMelek: There are 59 Tweets
IchBinNichtHanna: There are 15 Tweets
WissZeitVG: There are 12198 Tweets


## Sanity check data

In [110]:
def extract_hashtags(tagstring):
    """Return the list of hashtag texts stored in a serialized hashtag entry.

    Parameters
    ----------
    tagstring : str or float
        Text representation of a list of dicts that each have a ``'tag'`` key,
        or NaN if the Tweet has no hashtag entry.

    Returns
    -------
    list of str or float
        The ``'tag'`` value of every dict, in order; ``np.nan`` if
        ``tagstring`` is NaN.

    Raises
    ------
    ValueError, SyntaxError
        If ``tagstring`` is neither valid JSON nor a Python literal.
    """
    if tagstring != tagstring: # nan-check: NaN is the only value unequal to itself
        return np.nan

    # SECURITY: the text originates from downloaded Tweet data, so it must not
    # be passed to eval(). Parse it as JSON (which also understands
    # null/true/false) and fall back to a literal-only parser for strings that
    # were written in Python notation (single quotes).
    try:
        list_of_dicts = json.loads(tagstring)
    except ValueError:
        list_of_dicts = ast.literal_eval(tagstring)

    return [dct['tag'] for dct in list_of_dicts]
    
def extract_reference_type(referencestring):
    """Return the type of the first Tweet referenced by a Tweet.

    Parameters
    ----------
    referencestring : str or float
        Text representation of a list of dicts that each have a ``'type'``
        key, or NaN if the Tweet references no other Tweet.

    Returns
    -------
    str
        The ``'type'`` value of the first reference, or ``'no_reference'`` if
        ``referencestring`` is NaN or the list is empty.

    Raises
    ------
    ValueError, SyntaxError
        If ``referencestring`` is neither valid JSON nor a Python literal.
    """
    if referencestring != referencestring: # nan-check
        return 'no_reference'

    # SECURITY: the text originates from downloaded Tweet data, so it must not
    # be passed to eval(). JSON parsing handles true/false/null natively and
    # does not touch these words inside string values.
    try:
        references = json.loads(referencestring)
    except ValueError:
        # not JSON (e.g. single-quoted Python notation): map the JSON booleans
        # to Python literals and use a literal-only parser
        normalized = referencestring.replace('false', 'False')
        normalized = normalized.replace('true', 'True')
        references = ast.literal_eval(normalized)

    if not references:
        return 'no_reference'
    # almost all tweets reference a single other tweet. A minority reference
    # more than one. In these cases we use the reference type of the first
    # reference
    return references[0]['type']
    
def check_wanted_tag(tweet_tags, wanted=None):
    """Tell whether a Tweet carries at least one of the wanted hashtags.

    Parameters
    ----------
    tweet_tags : list of str or float
        Hashtags of the Tweet, or NaN if it has none.
    wanted : iterable of str, optional
        Hashtags to look for. If omitted, the notebook-level variable
        ``wanted_tags`` is used, which keeps one-argument calls such as
        ``Series.apply(check_wanted_tag)`` working.

    Returns
    -------
    bool
        True if ``tweet_tags`` and the wanted hashtags share at least one
        entry (exact, case-sensitive match), False otherwise or if
        ``tweet_tags`` is NaN.
    """
    if tweet_tags != tweet_tags: # nan-check
        return False
    if wanted is None:
        # fall back to the global that the sanity-check loop sets per query
        wanted = wanted_tags
    return not set(wanted).isdisjoint(tweet_tags)

In [111]:
# Queries that are sanity-checked below. This assignment overwrites the
# `queries` list from the top of the notebook: bundestagswahl, ESC, euro and
# video_url are left out, so every cell executed after this one only sees the
# reduced list.
queries = [
    '95vsWissZeitVG', # secondary relevance
    'ACertainDegreeOfFlexibility', # secondary relevance
    'FristIstFrust', # secondary relevance
    'HannaImBundestag', # primary relevance
    'Hanna_Others', # secondary relevance, combination of IchBinNichtHanna, IchBinMelek and IchBinJelena
    'IchBinHanna', # primary relevance
    'IchBinJelena', # secondary relevance
    'IchBinMelek', # secondary relevance
    'IchBinNichtHanna', # secondary relevance
    'WissZeitVG' # secondary relevance
]

In [121]:
# For every query: count, per reference type, how many Tweets do / do not
# carry one of the hashtags that the query searched for, then store the
# derived columns in the query's CSV file.
for query in queries:
    print(query)

    # read the search query hashtags (one per line) into a set;
    # check_wanted_tag() looks this variable up by name
    with open(join('hashtags', '{}.txt'.format(query)), 'r') as f:
        wanted_tags = {line.strip('\n') for line in f}

    csv_path = join('../data/tweets', '{}.csv'.format(query))
    tweets = pd.read_csv(csv_path, low_memory=False)

    # derived columns; 'wanted_tag' depends on 'hashtags', so order matters
    tweets['hashtags'] = tweets['entities.hashtags'].apply(extract_hashtags)
    tweets['reference_type'] = tweets['referenced_tweets'].apply(extract_reference_type)
    tweets['wanted_tag'] = tweets['hashtags'].apply(check_wanted_tag)

    for reference_type in tweets['reference_type'].unique():
        subset = tweets[tweets['reference_type'] == reference_type]
        N_found = int((subset['wanted_tag'] == True).sum())
        N_not_found = int((subset['wanted_tag'] == False).sum())
        print('\t*************')
        print('\treference type: {}'.format(reference_type))
        print('\tNumber of tweets with a wanted tag: {}'.format(N_found))
        print('\tNumber of tweets without a wanted tag: {}'.format(N_not_found))
        print()

    # overwrite the input CSV with the version that has the added columns
    tweets.to_csv(csv_path, index=False)

95vsWissZeitVG
	*************
	reference type: no_reference
	Number of tweets with a wanted tag: 820
	Number of tweets without a wanted tag: 330

	*************
	reference type: retweeted
	Number of tweets with a wanted tag: 2577
	Number of tweets without a wanted tag: 8750

	*************
	reference type: quoted
	Number of tweets with a wanted tag: 1042
	Number of tweets without a wanted tag: 131

	*************
	reference type: replied_to
	Number of tweets with a wanted tag: 526
	Number of tweets without a wanted tag: 356

ACertainDegreeOfFlexibility
	*************
	reference type: no_reference
	Number of tweets with a wanted tag: 786
	Number of tweets without a wanted tag: 483

	*************
	reference type: retweeted
	Number of tweets with a wanted tag: 2209
	Number of tweets without a wanted tag: 7373

	*************
	reference type: replied_to
	Number of tweets with a wanted tag: 656
	Number of tweets without a wanted tag: 399

	*************
	reference type: quoted
	Number of t