In [6]:
from twarc import Twarc2
import pandas as pd
from datetime import datetime, timezone, timedelta
import matplotlib.pyplot as plt
from os.path import join
from tqdm import tqdm
from multiprocess import Pool
import itertools
import emoji_resources as er
from importlib import reload
import os

In [165]:
# Load the API credentials through the project helper. The values of this
# mapping are later assigned as the 'bearer_token' of each count request.
credentials = er.get_credentials()

# Get emoji counts

In [138]:
# Query window and output location shared by every emoji count request.
emojis = er.emojis
languages = er.languages
start = datetime(2019, 1, 1, tzinfo=timezone.utc)
end = datetime(2021, 11, 28, tzinfo=timezone.utc)
dst = '../data/counts'

# One request per (language, emoji) pair, with the language varying slowest.
pairs = list(itertools.product(languages, emojis.values()))

# Repeat the credential list often enough to cover every request, so that the
# i-th request is paired with the i-th entry of the repeated list.
credlist = list(credentials.values()) * (len(pairs) // len(credentials) + 1)

combinations = [
    {'language': lang, 'string': emoji, 'bearer_token': token,
     'color': 'yellow', "start": start, "end": end, "dst": dst}
    for (lang, emoji), token in zip(pairs, credlist)
]

In [140]:
# One worker per credential.
number_of_cores = len(credentials)

# The context manager shuts the worker processes down once every result has
# been consumed; without it each re-run of this cell leaves a pool of idle
# workers behind.
with Pool(number_of_cores) as pool:
    # The yielded values are discarded: the loop only drives the requests and
    # the progress bar (presumably er.get_counts persists its own results
    # under `dst` -- verify against emoji_resources).
    for counts in tqdm(pool.imap_unordered(func=er.get_counts,
                        iterable=combinations), total=len(combinations)):
        pass

  0%|          | 0/420 [00:00<?, ?it/s]caught 503 from Twitter API, sleeping 1
caught 503 from Twitter API, sleeping 1
  1%|▏         | 6/420 [00:41<19:35,  2.84s/it]  caught 503 from Twitter API, sleeping 1
  4%|▍         | 18/420 [02:04<32:45,  4.89s/it]  caught 503 from Twitter API, sleeping 1
  6%|▌         | 24/420 [02:41<29:13,  4.43s/it]  caught 503 from Twitter API, sleeping 1
caught 503 from Twitter API, sleeping 1
  8%|▊         | 35/420 [03:48<19:10,  2.99s/it]  rate limit exceeded: sleeping 473.4118375778198 secs
  9%|▉         | 39/420 [04:23<32:51,  5.17s/it]  rate limit exceeded: sleeping 447.3930959701538 secs
 10%|█         | 44/420 [05:01<35:25,  5.65s/it]  rate limit exceeded: sleeping 409.48116540908813 secs
rate limit exceeded: sleeping 591.5193982124329 secs
 11%|█         | 46/420 [05:36<1:11:07, 11.41s/it]rate limit exceeded: sleeping 571.540109872818 secs
rate limit exceeded: sleeping 555.3751978874207 secs
 11%|█         | 47/420 [12:14<13:11:59, 127.40s/it]ra

# Get baseline counts

In [4]:
# Languages, UTC query window and output directory for the baseline counts.
languages = er.languages
start = datetime(2019, 1, 1, tzinfo=timezone.utc)
end = datetime(2021, 11, 28, tzinfo=timezone.utc)
dst = '../data/counts'

In [None]:
reload(er)
# The maximum number of returned counts seems to be around 55 million; if there
# are more counts, the request fails.
number_of_cores = len(credentials)

# The context manager shuts the worker processes down when the cell finishes;
# without it each re-run of this cell leaves a pool of idle workers behind.
with Pool(number_of_cores * 2) as pool:
    for lang in languages:
        # NOTE(review): the literal 12 looks like the query window length in
        # hours (a later cell talks about "the 12h time-window") -- confirm
        # against er.create_stopword_combinations.
        combinations = er.create_stopword_combinations(lang, start, end, 12,
                                                       credentials, dst)
        lang_dir = join(dst, lang)
        # Requests are only issued for languages whose directory does not
        # exist yet.
        # NOTE(review): the emoji count files are read from this same
        # directory elsewhere in the notebook, so an existing directory does
        # not necessarily mean the baseline was already fetched -- confirm the
        # guard is intended.
        if not os.path.exists(lang_dir):
            # makedirs also creates `dst` itself when it is missing.
            os.makedirs(lang_dir)
            for counts in tqdm(pool.imap_unordered(func=er.get_counts,
                                iterable=combinations), total=len(combinations)):
                pass

In [86]:
# Some count queries fail when initially requested. Here we check the existing
# vs. the expected count data for English ("en") and obtain the combinations
# whose counts are missing, so that they can be re-queried.
reload(er)
missing_counts = er.get_missing_combinations("en", start, end, 12, credentials, dst)

# Display how many combinations are still missing.
len(missing_counts)

19

In [100]:
# If the 12h time window is still too large for a request to succeed, split
# each missing request's window in half.
split_missing_counts = er.split_count_timewindow(missing_counts)

In [101]:
# The maximum number of returned counts seems to be around 55 million; if there
# are more counts, the request fails. Therefore we split the requests for en,
# ja, pt and th.
number_of_cores = len(credentials)

# The context manager shuts the worker processes down once every result has
# been consumed; without it each re-run of this cell leaves a pool of idle
# workers behind.
with Pool(number_of_cores * 2) as pool:
    # The yielded values are discarded: the loop only drives the requests and
    # the progress bar.
    for counts in tqdm(pool.imap_unordered(func=er.get_counts,
                        iterable=split_missing_counts), total=len(split_missing_counts)):
        pass

 32%|███▏      | 12/38 [00:03<00:04,  5.89it/s]caught 503 from Twitter API, sleeping 1
100%|██████████| 38/38 [00:12<00:00,  3.10it/s]


# Combine data

## Individual days of baseline counts

In [102]:
dst = "../data/counts"
language = "en"
lang_dir = join(dst, language)
out_name = f"counts_language-{language}_baseline_2019-01-01-to-2021-11-28.csv"

# Collect the per-window baseline files. The combined file written at the end
# of this cell also contains "baseline" in its name, so it is excluded here to
# keep a re-run from reading its own output back in.
files = [f for f in os.listdir(lang_dir) if "baseline" in f and f != out_name]
if not files:
    raise FileNotFoundError(f"no baseline count files found in {lang_dir}")

# Read every file first and concatenate once instead of growing the frame
# inside the loop.
counts = pd.concat([pd.read_csv(join(lang_dir, f)) for f in files],
                   ignore_index=True)
counts["end"] = pd.to_datetime(counts["end"])
counts["start"] = pd.to_datetime(counts["start"])

# Calendar day of each window start. `.dt.date` yields actual date objects;
# `apply(lambda x: x.date)` stored the bound method itself, so rows of the
# same day were never grouped together.
counts["date"] = counts["start"].dt.date

# Sum the counts of all windows that start on the same day. Only numeric
# columns are summed; the datetime columns "start" and "end" cannot be added.
counts = counts.groupby("date").sum(numeric_only=True).reset_index()

# Write next to the inputs of *this* language (`language`, not a `lang`
# variable left over from a loop in another cell).
counts.to_csv(join(lang_dir, out_name), index=False)

## Baseline and emoji counts

In [156]:
def check_df(df, expected_n=1062,
             expected_start=datetime(2019, 1, 1, 0, 0, 0, 0),
             expected_end=datetime(2021, 11, 27, 0, 0, 0, 0)):
    """Sanity-check a daily count frame before it is merged.

    Verifies that ``df`` has the expected number of rows and that the "date"
    value of its first and last row match the expected boundaries. The frame
    is expected to be sorted by "date" already; only the two boundary rows are
    inspected.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a "date" column.
    expected_n : int, default 1062
        Required number of rows (one per day from 2019-01-01 through
        2021-11-27 with the default boundaries).
    expected_start : datetime, default 2019-01-01
        Required "date" value of the first row.
    expected_end : datetime, default 2021-11-27
        Required "date" value of the last row.

    Raises
    ------
    AssertionError
        If any of the three expectations is violated; the message names the
        expected and the observed value.
    """
    assert len(df) == expected_n, \
        f"expected {expected_n} rows, found {len(df)}"

    first_date = df.iloc[0]["date"]
    last_date = df.iloc[-1]["date"]
    assert first_date == expected_start, \
        f"expected first date {expected_start}, found {first_date}"
    assert last_date == expected_end, \
        f"expected last date {expected_end}, found {last_date}"

In [164]:
dst = '../data/counts'
for lang in er.languages:
    # Daily baseline counts of this language; "tweet_count" becomes "baseline".
    df = pd.read_csv(
        join(dst, lang, f"counts_language-{lang}_baseline_2019-01-01-to-2021-11-28.csv"),
        parse_dates = ["date"])
    df = df.sort_values(by=["date"]).reset_index(drop=True)
    df = df.rename(columns={"tweet_count":"baseline"})
    check_df(df)

    # Add one column per emoji, named after the key in er.emojis.
    for emostring, emoji in er.emojis.items():
        fname = f"counts_language-{lang}_emoji-{emoji}_color-yellow_2019-01-01 00:00:00+00:00-to-2021-11-28 00:00:00+00:00.csv"
        tmp = pd.read_csv(join(dst, lang, fname), parse_dates=["date"])
        tmp = tmp.sort_values(by=["date"]).reset_index(drop=True)
        check_df(tmp)

        # Look the emoji count up by date. Plain column assignment aligns on
        # the row index, which after sort_values still reflects the row order
        # of each file, so files written in different orders would be paired
        # up on the wrong days.
        df[emostring] = df["date"].map(tmp.set_index("date")["tweet_count"])
        assert df[emostring].notna().all(), \
            f"{fname} has no count for some baseline dates"

    df.to_csv(join(dst, f"counts_language-{lang}_2019-01-01-to-2021-11-28.csv"),
              index=False)

In [162]:
# Display the most recently built combined frame (baseline plus one column per
# emoji) for a visual check.
df

Unnamed: 0,date,baseline,wavinghand,raisedhand,raisedbackhand,vulcansalute,okhand,pinchedfingers,pinchinghand,victoryhand,...,raisedfist,oncomingfist,leftfacingfist,rightfacingfist,clappinghands,raisinghands,openhands,palmsuptogether,handshake,foldedhands
0,2019-01-01,46487756,26158,14592,798,1959,65019,0,0,20679,...,73160,36157,1297,1434,105611,86409,4785,941,15795,232546
1,2019-01-02,47535219,24365,9058,556,1101,57782,0,0,19373,...,48502,36364,482,849,89014,63290,4470,522,12431,177351
2,2019-01-03,49919614,55622,7879,616,1304,67777,0,0,15976,...,55666,32000,313,720,134176,81033,4734,764,14133,176017
3,2019-01-04,50530786,19690,8492,1435,1125,80535,0,0,20784,...,68669,29375,381,440,152207,91019,5049,749,13929,159852
4,2019-01-05,48150935,26607,14137,6216,1015,64708,0,0,21154,...,56319,37311,333,348,145715,85738,4897,2897,20002,146453
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1057,2021-11-23,129061812,50007,49913,5439,3043,112329,7716,3720,53784,...,93489,68191,3455,2376,272232,188055,20795,12984,131826,520822
1058,2021-11-24,128499270,57763,50346,4481,6315,121080,12045,5322,51098,...,79784,52628,2702,2401,230055,178908,40126,12074,155128,750271
1059,2021-11-25,125370321,45871,43047,3170,2618,121694,9058,4864,59578,...,88336,56895,1927,2609,224465,204303,29429,12165,126554,623282
1060,2021-11-26,121528694,50897,43132,4526,3211,121615,7066,5423,49947,...,87551,48756,2439,2882,225470,220090,6948,12334,126219,602816
