In [1]:
import numpy as np
import pandas as pd

# Show the pandas version this notebook was executed with.
print(pd.__version__)

1.0.5


In [2]:
# Tweets are stored as a JSON Lines file (one record per line, hence lines=True).
tweets_path = '../../data/raw/tweets/TRECIS_2018_2019-tweets.jsonl'

# Explicit dtypes for the identifier columns so pandas does not guess them.
# Every numeric ID column has a `*_str` twin that must be kept as text.
#
# NOTE(review): the nullable numeric ID columns (in_reply_to_*, quoted_*) are
# reported as float64 by `.info()` because they contain missing values, so the
# requested np.int64 is not applied to them. 64-bit IDs do not round-trip
# through float64 exactly -- use the `*_str` columns whenever an exact
# identifier is needed.
# NOTE(review): `.info()` reports the `*_str` columns as fully non-null, so
# missing IDs are presumably stored as placeholder strings -- verify before
# filtering on them.
partial_schema = {
    'id': np.int64,
    'id_str': np.str_,
    'in_reply_to_status_id': np.int64,
    'in_reply_to_status_id_str': np.str_,
    'in_reply_to_user_id': np.int64,
    'in_reply_to_user_id_str': np.str_,
    'quoted_status_id': np.int64,
    # Fixed: this was np.int64, inconsistent with every other `*_str` column.
    'quoted_status_id_str': np.str_
}
tweets = pd.read_json(tweets_path, lines=True, dtype=partial_schema)

# Per-column non-null counts and dtypes, to check the schema was applied.
tweets.info(null_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32145 entries, 0 to 32144
Data columns (total 33 columns):
 #   Column                     Non-Null Count  Dtype              
---  ------                     --------------  -----              
 0   created_at                 32145 non-null  datetime64[ns, UTC]
 1   id                         32145 non-null  int64              
 2   id_str                     32145 non-null  object             
 3   full_text                  32145 non-null  object             
 4   truncated                  32145 non-null  bool               
 5   display_text_range         32145 non-null  object             
 6   entities                   32145 non-null  object             
 7   source                     32145 non-null  object             
 8   in_reply_to_status_id      2174 non-null   float64            
 9   in_reply_to_status_id_str  32145 non-null  object             
 10  in_reply_to_user_id        2878 non-null   float64            
 11  in

In [6]:
# Label file, stored as JSON Lines (one record per line).
labels_path = '../../data/processed/TRECIS_2018_2019-labels.jsonl'

# postID is read as text; it is the key later matched against tweets.id_str.
labels = pd.read_json(
    labels_path,
    dtype={'postID': np.str_},
    lines=True,
)

# Full column listing with non-null counts and dtypes.
labels.info(null_counts=True, verbose=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32145 entries, 0 to 32144
Data columns (total 29 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   eventType              32145 non-null  object 
 1   eventID                32145 non-null  object 
 2   postID                 32145 non-null  object 
 3   Advice                 32145 non-null  int64  
 4   CleanUp                32145 non-null  int64  
 5   ContextualInformation  32145 non-null  int64  
 6   Discussion             32145 non-null  int64  
 7   Donations              32145 non-null  int64  
 8   EmergingThreats        32145 non-null  int64  
 9   Factoid                32145 non-null  int64  
 10  FirstPartyObservation  32145 non-null  int64  
 11  GoodsServices          32145 non-null  int64  
 12  Hashtags               32145 non-null  int64  
 13  InformationWanted      32145 non-null  int64  
 14  Irrelevant             32145 non-null  int64  
 15  Lo

In [9]:
from pandas.testing import assert_series_equal

# Sanity check: the numeric `id`, rendered as text, must equal `id_str` for
# every row. Series names differ by construction, so they are not compared.
assert_series_equal(
    tweets['id'].astype(str),
    tweets['id_str'],
    check_names=False,
)

In [10]:
# Join every tweet with its label row on the string tweet ID.
# how='outer' keeps unmatched rows from either side, and validate='1:1' makes
# pandas raise if a key is duplicated in `tweets` or in `labels`.
annot_corpus = tweets.merge(
    labels,
    how='outer',
    left_on='id_str',
    right_on='postID',
    validate='1:1',
)

# Equal row counts across all three frames mean no row was left unmatched.
for _caption, _frame in (
    ("Total number of Tweets", tweets),
    ("Total number of labels", labels),
    ("Total number of merged Tweets and labels", annot_corpus),
):
    print(_caption, len(_frame))

Total number of Tweets 32145
Total number of labels 32145
Total number of merged Tweets and labels 32145


### `lang`

In [11]:
# Number of tweets per value of the `lang` column, most frequent first.
tweets['lang'].value_counts()

en     31945
tl        73
und       51
es        40
hi         7
pt         6
fr         5
in         4
ja         3
it         2
nl         2
pl         1
eu         1
lt         1
tr         1
ca         1
cy         1
et         1
Name: lang, dtype: int64

In [14]:
# Boolean mask: True where the tweet's `lang` value is 'en'.
lang_en = tweets['lang'] == 'en'

# Count the English tweets once and derive both figures from it.
_n_en = int(lang_en.sum())
overall_en_proportion = _n_en / len(tweets)

print('Overall proportion of english tweets ', overall_en_proportion)
print('Total number of non-english Tweets', len(tweets) - _n_en)

Overall proportion of english tweets  0.9937781925649402
Total number of non-english Tweets 200


In [15]:
# English vs. non-English tweet counts per event (`eventID`).
#
# The table is built with one vectorised groupby instead of filling an empty
# DataFrame cell by cell in a Python loop. That avoids repeated `.loc`
# enlargement and gives `en`/`other` a proper integer dtype instead of object.
_groupby = annot_corpus.groupby('eventID')

# Per-group row count and per-group count of rows whose `lang` is 'en'.
_total = _groupby.size()
_en = (
    annot_corpus['lang'].eq('en')
    .groupby(annot_corpus['eventID'])
    .sum()
    .astype('int64')  # the sum of booleans may come back as float
)

tweets_lang = pd.DataFrame({
    'en': _en,
    'other': _total - _en,
    'proportion_en': _en / _total,
})
# NOTE: despite its name, `diff_in_%` is a difference of proportions (a
# fraction, not multiplied by 100) between this group and the whole corpus.
# The column name is kept for compatibility.
tweets_lang['diff_in_%'] = tweets_lang['proportion_en'] - overall_en_proportion
# Drop the index name so the printed table has no extra header row.
tweets_lang = tweets_lang.rename_axis(None)

print(tweets_lang)

                                 en other  proportion_en  diff_in_%
albertaFloods2013               646     0       1.000000   0.006222
albertaWildfires2019           1858     0       1.000000   0.006222
australiaBushfire2013           579     0       1.000000   0.006222
bostonBombings2013              451     0       1.000000   0.006222
chileEarthquake2014             296     4       0.986667  -0.007112
coloradoStemShooting2019       1004     0       1.000000   0.006222
costaRicaEarthquake2012         216     1       0.995392   0.001614
cycloneKenneth2019             1811     0       1.000000   0.006222
earthquakeBohol2013             381    86       0.815846  -0.177932
earthquakeCalifornia2014        124     0       1.000000   0.006222
fireColorado2012                239     0       1.000000   0.006222
fireYMM2016                    2415     0       1.000000   0.006222
flSchoolShooting2018            801    42       0.950178  -0.043600
floodChoco2019                  325     0       

In [16]:
# English vs. non-English tweet counts per event type (`eventType`).
#
# The table is built with one vectorised groupby instead of filling an empty
# DataFrame cell by cell in a Python loop. That avoids repeated `.loc`
# enlargement and gives `en`/`other` a proper integer dtype instead of object.
_groupby = annot_corpus.groupby('eventType')

# Per-group row count and per-group count of rows whose `lang` is 'en'.
_total = _groupby.size()
_en = (
    annot_corpus['lang'].eq('en')
    .groupby(annot_corpus['eventType'])
    .sum()
    .astype('int64')  # the sum of booleans may come back as float
)

tweets_lang = pd.DataFrame({
    'en': _en,
    'other': _total - _en,
    'proportion_en': _en / _total,
})
# NOTE: despite its name, `diff_in_%` is a difference of proportions (a
# fraction, not multiplied by 100) between this group and the whole corpus.
# The column name is kept for compatibility.
tweets_lang['diff_in_%'] = tweets_lang['proportion_en'] - overall_en_proportion
# Drop the index name so the printed table has no extra header row.
tweets_lang = tweets_lang.rename_axis(None)

print(tweets_lang)

              en other  proportion_en  diff_in_%
bombing     2398     7       0.997089   0.003311
earthquake  8323   106       0.987424  -0.006354
flood       3646     3       0.999178   0.005400
shooting    4734    42       0.991206  -0.002572
typhoon     7753    42       0.994612   0.000834
wildfire    5091     0       1.000000   0.006222
