# Analysis of Tweets from a full archival search

In [25]:
import pandas as pd
from os.path import join
import numpy as np

In [349]:
# folder holding the downloaded Tweet data; it is passed as the input
# directory to the ID extraction functions further down
src = '../data/tweets'

# hashtags covered by the query scripts in ./queries
hashtags = [
    'WissZeitVG',
    'FristIstFrust',
    '95vsWissZeitVG',
    'IchBinHanna',
    'ACertainDegreeOfFlexibility',
    'IchBinJelena',
    'IchBinMelek',
    'IchBinNichtHanna',
]

Note: if the Tweets have already been downloaded and the Tweet data exists in the folder `data/tweets` as compressed `.jsonl.xz` files, you can skip the "Query tweets" and "Compress data" steps and start processing at "Decompress data".

## Collect Tweets

### Query tweets

Note: the queries are saved in separate files. I do this to make the data collection process reproducible by saving the exact query parameters for every data file.

In [301]:
# change file permissions such that execution is allowed
! chmod +x ./queries/video_url.sh
# run the query. Note: this can take a while, depending on the number of Tweets
# that need to be downloaded
! ./queries/video_url.sh

In [302]:
! chmod +x ./queries/95vsWissZeitVG.sh
! ./queries/95vsWissZeitVG.sh

In [303]:
! chmod +x ./queries/FristIstFrust.sh
! ./queries/FristIstFrust.sh

In [304]:
! chmod +x ./queries/WissZeitVG.sh
! ./queries/WissZeitVG.sh

In [324]:
# Note: the more prevalent hashtag is "#IchBinHanna" (without the "h" at the end).
# The query for "#IchBinHannah" (and its variants) includes the variant without
# the "h", since it is a substring.
! chmod +x ./queries/IchBinHanna.sh
! ./queries/IchBinHanna.sh

In [306]:
! chmod +x ./queries/ACertainDegreeOfFlexibility.sh
! ./queries/ACertainDegreeOfFlexibility.sh

In [307]:
# NOTE(review): this cell repeats the ACertainDegreeOfFlexibility query of the
# preceding cell verbatim - confirm whether a different query script was
# intended here or whether the cell can be removed.
! chmod +x ./queries/ACertainDegreeOfFlexibility.sh
! ./queries/ACertainDegreeOfFlexibility.sh

In [308]:
! chmod +x ./queries/IchBinJelena.sh
! ./queries/IchBinJelena.sh

In [309]:
! chmod +x ./queries/IchBinMelek.sh
! ./queries/IchBinMelek.sh

In [310]:
! chmod +x ./queries/IchBinNichtHanna.sh
! ./queries/IchBinNichtHanna.sh

### Compress data

Note: on Windows, `.xz` files can be decompressed, for example, with [WinZIP](https://www.winzip.com/win/en/xz-file.html).

In [312]:
# the parameter "-k" keeps the original file
# Note: xz does not overwrite an existing archive. If the .xz file is already
# present it reports "File exists" and leaves that file untouched.
! xz -k ../data/tweets/95vsWissZeitVG.jsonl
! xz -k ../data/tweets/FristIstFrust.jsonl
! xz -k ../data/tweets/WissZeitVG.jsonl
! xz -k ../data/tweets/IchBinHanna.jsonl
! xz -k ../data/tweets/video_url.jsonl
! xz -k ../data/tweets/ACertainDegreeOfFlexibility.jsonl
! xz -k ../data/tweets/IchBinJelena.jsonl
! xz -k ../data/tweets/IchBinMelek.jsonl
! xz -k ../data/tweets/IchBinNichtHanna.jsonl

xz: ../data/WissZeitVG.jsonl.xz: File exists
xz: ../data/video_url.jsonl.xz: File exists
xz: ../data/IchBinJelena.jsonl.xz: File exists
xz: ../data/IchBinMelek.jsonl.xz: File exists
xz: ../data/IchBinNichtHanna.jsonl.xz: File exists


### Decompress data

In [284]:
! xz -d ../data/tweets/95vsWissZeitVG.jsonl.xz
! xz -d ../data/tweets/FristIstFrust.jsonl.xz
! xz -d ../data/tweets/WissZeitVG.jsonl.xz
! xz -d ../data/tweets/IchBinHanna.jsonl.xz
! xz -d ../data/tweets/ACertainDegreeOfFlexibility.jsonl.xz
! xz -d ../data/tweets/IchBinJelena.jsonl.xz
! xz -d ../data/tweets/IchBinMelek.jsonl.xz
! xz -d ../data/tweets/IchBinNichtHanna.jsonl.xz

xz: ../data/95vsWissZeitVG.jsonl: File exists
xz: ../data/FristIstFrust.jsonl: File exists
xz: ../data/WissZeitVG.jsonl: File exists
xz: ../data/IchBinHannah.jsonl: File exists
xz: ../data/IchBinMelek.jsonl.xz: No such file or directory


### Convert to CSV

Removes duplicate tweets (by ID) but keeps referenced tweets.

In [326]:
! twarc2 csv ../data/tweets/IchBinHanna.jsonl ../data/IchBinHanna.csv

100%|██████████████████████████████████████| 47.9M/47.9M [00:12<00:00, 3.99MB/s]

ℹ️
Read 51580 tweets from 55 lines. 
24246 were referenced tweets, 23424 were duplicates.
Wrote 28156 rows and output 89 of 89 input columns in the CSV.



In [316]:
! twarc2 csv ../data/tweets/95vsWissZeitVG.jsonl ../data/95vsWissZeitVG.csv

100%|██████████████████████████████████████| 20.0M/20.0M [00:06<00:00, 3.00MB/s]

ℹ️
Read 26072 tweets from 28 lines. 
12653 were referenced tweets, 11875 were duplicates.
Wrote 14197 rows and output 89 of 89 input columns in the CSV.



In [317]:
! twarc2 csv ../data/tweets/FristIstFrust.jsonl ../data/FristIstFrust.csv

100%|██████████████████████████████████████| 22.7M/22.7M [00:07<00:00, 3.16MB/s]

ℹ️
Read 27714 tweets from 30 lines. 
13166 were referenced tweets, 11876 were duplicates.
Wrote 15838 rows and output 89 of 89 input columns in the CSV.



In [318]:
! twarc2 csv ../data/tweets/video_url.jsonl ../data/video_url.csv

100%|██████████████████████████████████████| 2.21M/2.21M [00:00<00:00, 3.59MB/s]

ℹ️
Read 2769 tweets from 3 lines. 
1343 were referenced tweets, 1296 were duplicates.
Wrote 1473 rows and output 89 of 89 input columns in the CSV.



In [319]:
! twarc2 csv ../data/tweets/WissZeigVG.jsonl ../data/WissZeigVG.csv

100%|██████████████████████████████████████| 15.8M/15.8M [00:05<00:00, 3.20MB/s]

ℹ️
Read 18611 tweets from 20 lines. 
8947 were referenced tweets, 8109 were duplicates.
Wrote 10502 rows and output 89 of 89 input columns in the CSV.



In [320]:
! twarc2 csv ../data/tweets/ACertainDegreeOfFlexibility.jsonl ../data/ACertainDegreeOfFlexibility.csv

100%|██████████████████████████████████████| 18.5M/18.5M [00:06<00:00, 3.02MB/s]

ℹ️
Read 23025 tweets from 24 lines. 
11128 were referenced tweets, 10102 were duplicates.
Wrote 12923 rows and output 89 of 89 input columns in the CSV.



In [321]:
! twarc2 csv ../data/tweets/IchBinJelena.jsonl ../data/IchBinJelena.csv

100%|████████████████████████████████████████| 124k/124k [00:00<00:00, 2.14MB/s]

ℹ️
Read 146 tweets from 1 lines. 
72 were referenced tweets, 70 were duplicates.
Wrote 76 rows and output 89 of 89 input columns in the CSV.



In [322]:
! twarc2 csv ../data/tweets/IchBinMelek.jsonl ../data/IchBinMelek.csv

100%|██████████████████████████████████████| 97.6k/97.6k [00:00<00:00, 1.93MB/s]

ℹ️
Read 115 tweets from 1 lines. 
57 were referenced tweets, 56 were duplicates.
Wrote 59 rows and output 89 of 89 input columns in the CSV.



In [323]:
! twarc2 csv ../data/tweets/IchBinNichtHanna.jsonl ../data/IchBinNichtHanna.csv

100%|███████████████████████████████████████| 25.1k/25.1k [00:00<00:00, 762kB/s]

ℹ️
Read 17 tweets from 1 lines. 
7 were referenced tweets, 2 were duplicates.
Wrote 15 rows and output 89 of 89 input columns in the CSV.



## Extract conversation IDs

In [354]:
def get_conversation_IDs(src, dst, filename, chunks=False, chunksize=1000):
    """Extract the unique conversation IDs of a Tweet CSV and save them as text.

    Reads ``<src>/<filename>.csv``, collects the distinct non-missing values
    of its ``conversation_id`` column and writes them, one ID per line, to
    text file(s) in ``dst``. A short summary is printed.

    Parameters
    ----------
    src : str
        Directory containing ``<filename>.csv``.
    dst : str
        Directory the ID file(s) are written to.
    filename : str
        Name of the CSV file without the ``.csv`` extension. Also used as
        prefix of the output file name(s).
    chunks : bool, optional
        If False (default), all IDs are written to
        ``<filename>_ConversationIDs.txt``. If True, the IDs are split into
        consecutive files named
        ``<filename>_ConversationIDs_<start>_to_<stop>.txt``.
    chunksize : int, optional
        Maximum number of IDs per file when ``chunks`` is True.

    Raises
    ------
    ValueError
        If ``chunks`` is True and ``chunksize`` is smaller than 1.
    """
    if chunks and chunksize < 1:
        raise ValueError('chunksize must be a positive integer')

    # Read the ID column as text. If it were parsed numerically, a single
    # missing value would turn the whole column into float64, which cannot
    # represent 64-bit IDs exactly and would silently alter them.
    data = pd.read_csv(join(src, '{}.csv'.format(filename)), low_memory=False,
                       dtype={'conversation_id': str})
    # explicit 64-bit integers: plain "int" may be 32-bit on some platforms
    conversationIDs = pd.to_numeric(data['conversation_id'].dropna())\
        .astype(np.int64).unique()
    print('{}: There are {} Tweets from {} conversations'\
              .format(filename, len(data), len(conversationIDs)))

    if chunks:
        N_IDs = len(conversationIDs)
        # stepping through the start offsets yields only non-empty chunks, so
        # no empty trailing file is written when N_IDs is a multiple of
        # chunksize
        for start in range(0, N_IDs, chunksize):
            stop = min(start + chunksize, N_IDs)
            np.savetxt(join(dst, '{}_ConversationIDs_{}_to_{}.txt'\
                .format(filename, start, stop)),
                conversationIDs[start:stop], fmt='%d')

    else:
        np.savetxt(join(dst, '{}_ConversationIDs.txt'.format(filename)),
                   conversationIDs, fmt='%d')

In [355]:
dst = '../data/conversation_IDs'

In [356]:
get_conversation_IDs(src, dst, 'IchBinHanna', chunks=False, chunksize=10)

IchBinHanna: There are 28156 Tweets from 26340 conversations


In [357]:
get_conversation_IDs(src, dst, 'FristIstFrust', chunks=False)

FristIstFrust: There are 15838 Tweets from 14945 conversations


In [358]:
get_conversation_IDs(src, dst, '95vsWissZeitVG', chunks=False)

95vsWissZeitVG: There are 14197 Tweets from 13428 conversations


In [359]:
get_conversation_IDs(src, dst, 'WissZeitVG', chunks=False)

WissZeitVG: There are 10502 Tweets from 9546 conversations


In [360]:
get_conversation_IDs(src, dst, 'ACertainDegreeOfFlexibility')

ACertainDegreeOfFlexibility: There are 12923 Tweets from 12054 conversations


In [361]:
get_conversation_IDs(src, dst, 'video_url')

video_url: There are 1473 Tweets from 1423 conversations


In [362]:
get_conversation_IDs(src, dst, 'IchBinJelena')

IchBinJelena: There are 76 Tweets from 74 conversations


In [363]:
get_conversation_IDs(src, dst, 'IchBinMelek')

IchBinMelek: There are 59 Tweets from 59 conversations


In [364]:
get_conversation_IDs(src, dst, 'IchBinNichtHanna')

IchBinNichtHanna: There are 15 Tweets from 12 conversations


## Extract Tweet IDs

In [367]:
def get_Tweet_IDs(src, dst, filename, chunks=False, chunksize=1000):
    """Extract the unique Tweet IDs of a Tweet CSV and save them as text.

    Reads ``<src>/<filename>.csv``, collects the distinct non-missing values
    of its ``id`` column and writes them, one ID per line, to text file(s) in
    ``dst``. A short summary is printed.

    Parameters
    ----------
    src : str
        Directory containing ``<filename>.csv``.
    dst : str
        Directory the ID file(s) are written to.
    filename : str
        Name of the CSV file without the ``.csv`` extension. Also used as
        prefix of the output file name(s).
    chunks : bool, optional
        If False (default), all IDs are written to
        ``<filename>_TweetIDs.txt``. If True, the IDs are split into
        consecutive files named ``<filename>_TweetIDs_<start>_to_<stop>.txt``.
    chunksize : int, optional
        Maximum number of IDs per file when ``chunks`` is True.

    Raises
    ------
    ValueError
        If ``chunks`` is True and ``chunksize`` is smaller than 1.
    """
    if chunks and chunksize < 1:
        raise ValueError('chunksize must be a positive integer')

    # Read the ID column as text. If it were parsed numerically, a single
    # missing value would turn the whole column into float64, which cannot
    # represent 64-bit IDs exactly and would silently alter them.
    data = pd.read_csv(join(src, '{}.csv'.format(filename)), low_memory=False,
                       dtype={'id': str})
    # explicit 64-bit integers: plain "int" may be 32-bit on some platforms
    TweetIDs = pd.to_numeric(data['id'].dropna()).astype(np.int64).unique()
    print('{}: There are {} Tweets'\
              .format(filename, len(TweetIDs)))

    if chunks:
        N_IDs = len(TweetIDs)
        # stepping through the start offsets yields only non-empty chunks, so
        # no empty trailing file is written when N_IDs is a multiple of
        # chunksize
        for start in range(0, N_IDs, chunksize):
            stop = min(start + chunksize, N_IDs)
            np.savetxt(join(dst, '{}_TweetIDs_{}_to_{}.txt'\
                .format(filename, start, stop)),
                TweetIDs[start:stop], fmt='%d')

    else:
        np.savetxt(join(dst, '{}_TweetIDs.txt'.format(filename)),
                   TweetIDs, fmt='%d')

In [368]:
dst = '../data/tweet_IDs'

In [369]:
get_Tweet_IDs(src, dst, 'IchBinHanna', chunks=False)

IchBinHanna: There are 28156 Tweets


In [370]:
get_Tweet_IDs(src, dst, 'FristIstFrust', chunks=False)

FristIstFrust: There are 15838 Tweets


In [371]:
get_Tweet_IDs(src, dst, '95vsWissZeitVG', chunks=False)

95vsWissZeitVG: There are 14197 Tweets


In [372]:
get_Tweet_IDs(src, dst, 'WissZeitVG', chunks=False)

WissZeitVG: There are 10502 Tweets


In [373]:
get_Tweet_IDs(src, dst, 'ACertainDegreeOfFlexibility', chunks=False)

ACertainDegreeOfFlexibility: There are 12923 Tweets


In [374]:
get_Tweet_IDs(src, dst, 'video_url', chunks=False)

video_url: There are 1473 Tweets


In [375]:
get_Tweet_IDs(src, dst, 'IchBinJelena', chunks=False)

IchBinJelena: There are 76 Tweets


In [376]:
get_Tweet_IDs(src, dst, 'IchBinMelek', chunks=False)

IchBinMelek: There are 59 Tweets


In [378]:
get_Tweet_IDs(src, dst, 'IchBinNichtHanna', chunks=False)

IchBinNichtHanna: There are 15 Tweets
