# Analysis of Tweets from a full archival search

In [25]:
import pandas as pd
from os.path import join
import numpy as np

In [5]:
src = '../data'
hashtags = ['WissZeitVG', 'FristIstFrust', '95vsWissZeitVG', 'IchBinHannah']

Note: if the Tweets have already been downloaded and the Tweet data exists in the folder ```data``` as compressed ```.jsonl``` files, you can skip the "Query tweets" and "Compress data" steps and start processing at "Decompress data".

## Collect Tweets

### Query tweets

Note: the queries are saved in separate files. I do this to make the data collection process reproducible by saving the exact query parameters for every data file.

In [None]:
# change file permissions such that execution is allowed
! chmod +x ./queries/video_url.sh
# run the query. Note: this can take a while, depending on the number of Tweets
# that need to be downloaded
! ./queries/video_url.sh

In [226]:
! chmod +x ./queries/95vsWissZeitVG.sh
! ./queries/95vsWissZeitVG.sh

In [226]:
! chmod +x ./queries/FristIstFrust.sh
! ./queries/FristIstFrust.sh

In [226]:
! chmod +x ./queries/WissZeitVG.sh
! ./queries/WissZeitVG.sh

In [271]:
# note: the more prevalent hashtag is "#IchBinHanna" (without the "h" at the end)
# the query for "#IchBinHannah" (and it's variants) includes the variant without
# the "h", since it's a substring.
! chmod +x ./queries/IchBinHannah.sh
! ./queries/IchBinHannah.sh

### Compress data

Note: under windows, .xz files can be decompressed for examply with [WinZIP](https://www.winzip.com/win/en/xz-file.html).

In [None]:
# the parameter "-k" keeps the original file
! xz -k ../data/95vsWissZeitVG.jsonl
! xz -k ../data/FristIstFrust.jsonl
! xz -k ../data/WissZeitVG.jsonl
! xz -k ../data/IchBinHannah.jsonl
! xz -k ../data/video_url.jsonl

### Decompress data

In [None]:
! xz -d ../data/95vsWissZeitVG.jsonl.xz
! xz -d ../data/FristIstFrust.jsonl.xz
! xz -d ../data/WissZeitVG.jsonl.xz
! xz -d ../data/IchBinHannah.jsonl.xz
! xz -d ../data/video_url.jsonl.xz

### Convert to CSV

Removes duplicate tweets (by ID) but keeps referenced tweets.

In [42]:
! twarc2 csv ../data/IchBinHannah.jsonl ../data/IchBinHannah.csv

100%|██████████████████████████████████████| 38.0M/38.0M [00:09<00:00, 4.12MB/s]

ℹ️
Read 40193 tweets from 43 lines. 
18770 were referenced tweets, 18192 were duplicates.
Wrote 22001 rows and output 89 of 89 input columns in the CSV.



In [40]:
! twarc2 csv ../data/95vsWissZeitVG.jsonl ../data/95vsWissZeitVG.csv

100%|██████████████████████████████████████| 19.4M/19.4M [00:06<00:00, 3.12MB/s]

ℹ️
Read 25382 tweets from 27 lines. 
12315 were referenced tweets, 11555 were duplicates.
Wrote 13827 rows and output 89 of 89 input columns in the CSV.



In [41]:
! twarc2 csv ../data/FristIstFrust.jsonl ../data/FristIstFrust.csv

100%|██████████████████████████████████████| 22.4M/22.4M [00:07<00:00, 3.13MB/s]

ℹ️
Read 27442 tweets from 29 lines. 
13034 were referenced tweets, 11760 were duplicates.
Wrote 15682 rows and output 89 of 89 input columns in the CSV.



In [43]:
! twarc2 csv ../data/video_url.jsonl ../data/video_url.csv

100%|██████████████████████████████████████| 2.17M/2.17M [00:00<00:00, 3.73MB/s]

ℹ️
Read 2698 tweets from 3 lines. 
1309 were referenced tweets, 1265 were duplicates.
Wrote 1433 rows and output 89 of 89 input columns in the CSV.



In [58]:
! twarc2 csv ../data/WissZeigVG.jsonl ../data/WissZeigVG.csv

100%|██████████████████████████████████████| 14.8M/14.8M [00:04<00:00, 3.37MB/s]

ℹ️
Read 17386 tweets from 19 lines. 
8355 were referenced tweets, 7584 were duplicates.
Wrote 9802 rows and output 89 of 89 input columns in the CSV.



## Extract conversation IDs

In [232]:
def get_conversation_IDs(src, filename, chunks=False, chunksize=1000):
    data = pd.read_csv(join(src, '{}.csv'.format(filename)), low_memory=False)
    conversationIDs = data['conversation_id'].dropna().astype(int).unique()
    print('{}: There are {} Tweets from {} conversations'\
              .format(filename, len(data), len(conversationIDs)))
    
    dst = join(src, 'conversation_IDs')
    
    if chunks:
        N_chunks = len(conversationIDs) // chunksize
        print(N_chunks)
        for i in range(N_chunks):
            ID_chunk = conversationIDs[i * chunksize : (i + 1) * chunksize]
            np.savetxt(join(dst, '{}_ConversationIDs_{}_to_{}.txt'\
                .format(filename, i * chunksize, (i + 1) * chunksize)),
                ID_chunk, fmt='%d')
        np.savetxt(join(dst, '{}_ConversationIDs_{}_to_{}.txt'\
                .format(filename, N_chunks * chunksize, len(conversationIDs))),
                conversationIDs[N_chunks * chunksize : ], fmt='%d')
            
    else:   
        np.savetxt(join(dst, '{}_ConversationIDs.txt'.format(filename)),
                   conversationIDs, fmt='%d')

In [234]:
get_conversation_IDs(src, 'IchBinHannah', chunks=True, chunksize=10)

IchBinHannah: There are 22001 Tweets from 20698 conversations
2069


In [221]:
get_conversation_IDs(src, 'FristIstFrust', chunks=True)

FristIstFrust: There are 15682 Tweets from 14802 conversations


In [222]:
get_conversation_IDs(src, '95vsWissZeitVG', chunks=True)

95vsWissZeitVG: There are 13827 Tweets from 13070 conversations


In [223]:
get_conversation_IDs(src, 'WissZeitVG', chunks=True)

WissZeitVG: There are 9802 Tweets from 8923 conversations


In [224]:
get_conversation_IDs(src, 'video_url')

video_url: There are 1433 Tweets from 1388 conversations


## Extract Tweet IDs

In [242]:
def get_Tweet_IDs(src, filename, chunks=False, chunksize=1000):
    data = pd.read_csv(join(src, '{}.csv'.format(filename)), low_memory=False)
    TweetIDs = data['id'].dropna().astype(int).unique()
    print('{}: There are {} Tweets'\
              .format(filename, len(TweetIDs)))
    
    dst = join(src, 'tweet_IDs')
    
    if chunks:
        N_chunks = len(TweetIDs) // chunksize
        print(N_chunks)
        for i in range(N_chunks):
            ID_chunk = TweetIDs[i * chunksize : (i + 1) * chunksize]
            np.savetxt(join(dst, '{}_TweetIDs_{}_to_{}.txt'\
                .format(filename, i * chunksize, (i + 1) * chunksize)),
                ID_chunk, fmt='%d')
        np.savetxt(join(dst, '{}_TweetIDs_{}_to_{}.txt'\
                .format(filename, N_chunks * chunksize, len(TweetIDs))),
                TweetIDs[N_chunks * chunksize : ], fmt='%d')
            
    else:   
        np.savetxt(join(dst, '{}_TweetIDs.txt'.format(filename)),
                   TweetIDs, fmt='%d')

In [246]:
get_Tweet_IDs(src, 'IchBinHannah', chunks=False)

IchBinHannah: There are 22001 Tweets


In [244]:
get_Tweet_IDs(src, 'FristIstFrust', chunks=False)

FristIstFrust: There are 15682 Tweets


In [245]:
get_Tweet_IDs(src, '95vsWissZeitVG', chunks=False)

95vsWissZeitVG: There are 13827 Tweets


In [247]:
get_Tweet_IDs(src, 'WissZeitVG', chunks=False)

WissZeitVG: There are 9802 Tweets


In [256]:
get_Tweet_IDs(src, 'video_url', chunks=False)

video_url: There are 1433 Tweets
