In [1]:
import numpy as np
import pandas as pd

# Record the pandas version this notebook was executed with (for reproducibility).
print(pd.__version__)

1.0.3


In [2]:
# Location of the raw tweet dump (one JSON object per line).
tweets_path = '../../data/raw/train/tweets/TRECIS_2018_2019-tweets.jsonl'

# Explicit dtypes for the identifier columns only; all other columns are
# left to pandas' inference. Every numeric id is requested as a 64-bit
# integer and every `*_str` twin as text.
#
# NOTE(review): the recorded `info()` output lists `in_reply_to_status_id`
# and `in_reply_to_user_id` as float64, i.e. the int64 request is not
# honoured for columns that contain missing values. Prefer the `*_str`
# columns whenever an exact id is needed.
partial_schema = {
    'id': np.int64,
    'id_str': np.str_,
    'in_reply_to_status_id': np.int64,
    'in_reply_to_status_id_str': np.str_,
    'in_reply_to_user_id': np.int64,
    'in_reply_to_user_id_str': np.str_,
    'quoted_status_id': np.int64,
    # Fixed: this `*_str` column was mapped to np.int64, unlike its siblings.
    'quoted_status_id_str': np.str_
}
tweets = pd.read_json(tweets_path, lines=True, dtype=partial_schema)

# Per-column non-null counts and dtypes.
# NOTE(review): newer pandas releases replace `null_counts` with
# `show_counts`; kept as-is because this notebook ran on pandas 1.0.3.
tweets.info(null_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 31960 entries, 0 to 31959
Data columns (total 33 columns):
 #   Column                     Non-Null Count  Dtype              
---  ------                     --------------  -----              
 0   created_at                 31960 non-null  datetime64[ns, UTC]
 1   id                         31960 non-null  int64              
 2   id_str                     31960 non-null  object             
 3   full_text                  31960 non-null  object             
 4   truncated                  31960 non-null  bool               
 5   display_text_range         31960 non-null  object             
 6   entities                   31960 non-null  object             
 7   source                     31960 non-null  object             
 8   in_reply_to_status_id      2169 non-null   float64            
 9   in_reply_to_status_id_str  31960 non-null  object             
 10  in_reply_to_user_id        2872 non-null   float64            
 11  in

In [3]:
# Location of the processed label file (one JSON object per line).
labels_path = '../../data/processed/train/labels/TRECIS_2018_2019-labels.jsonl'

# Read `postID` as text; it is later matched against the string tweet id.
labels = pd.read_json(
    labels_path,
    dtype={'postID': np.str_},
    lines=True,
)

# Full per-column listing with non-null counts and dtypes.
labels.info(null_counts=True, verbose=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 31960 entries, 0 to 31959
Data columns (total 29 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   eventType              31960 non-null  object 
 1   eventID                31960 non-null  object 
 2   postID                 31960 non-null  object 
 3   Advice                 31960 non-null  int64  
 4   CleanUp                31960 non-null  int64  
 5   ContextualInformation  31960 non-null  int64  
 6   Discussion             31960 non-null  int64  
 7   Donations              31960 non-null  int64  
 8   EmergingThreats        31960 non-null  int64  
 9   Factoid                31960 non-null  int64  
 10  FirstPartyObservation  31960 non-null  int64  
 11  GoodsServices          31960 non-null  int64  
 12  Hashtags               31960 non-null  int64  
 13  InformationWanted      31960 non-null  int64  
 14  Irrelevant             31960 non-null  int64  
 15  Lo

In [4]:
from pandas.testing import assert_series_equal

# Sanity check: the numeric `id`, rendered as text, must equal `id_str`
# row for row. The two series carry different names, so the name
# comparison is switched off.
assert_series_equal(
    left=tweets['id'].astype(str),
    right=tweets['id_str'],
    check_names=False,
)

In [5]:
# Outer join of tweets and labels on their string ids. `validate='1:1'`
# asks pandas to check that the merge keys are unique on both sides.
annot_corpus = tweets.merge(
    labels,
    how='outer',
    left_on='id_str',
    right_on='postID',
    validate='1:1',
)

# Row counts of both inputs and of the joined frame, one per line; with
# unique keys, three equal numbers mean every row found its partner.
print(len(tweets), len(labels), len(annot_corpus), sep='\n')

31960
31960
31960


### `lang`

In [6]:
# Frequency of each value in the `lang` column.
tweets['lang'].value_counts()

en    31960
Name: lang, dtype: int64

In [7]:
# Boolean mask: True where the `lang` column equals 'en'.
lang_en = tweets['lang'].eq('en')

# Share of all tweets that are English; reused below as the baseline
# for the per-event / per-event-type comparisons.
overall_en_proportion = len(tweets.loc[lang_en]) / len(tweets)

print('Overall proportion of english tweets ', overall_en_proportion)
# Number of tweets whose `lang` is anything other than 'en'.
print(len(tweets.loc[~lang_en]))

Overall proportion of english tweets  0.9937807077554575
200


In [8]:
# Language breakdown per event (`eventID`).
#
# Computed with a single vectorised groupby instead of growing the table
# cell by cell inside a Python loop; the count columns come out as
# integers rather than object dtype.

# 1 where `lang` equals 'en', 0 otherwise (a missing `lang` counts as "other").
_is_en = annot_corpus['lang'].eq('en').astype('int64')
_by_event = _is_en.groupby(annot_corpus['eventID'])
_en_counts = _by_event.sum()    # English tweets per event
_totals = _by_event.size()      # all tweets per event

tweets_lang = pd.DataFrame({
    'en': _en_counts,
    'other': _totals - _en_counts,
    'proportion_en': _en_counts / _totals,
})
# Despite the column name, this is a difference of fractions (0-1 scale)
# relative to the corpus-wide English share, not percentage points.
tweets_lang['diff_in_%'] = tweets_lang['proportion_en'] - overall_en_proportion
# The loop-built table had an unnamed index; drop the 'eventID' axis label
# so the printed layout stays the same.
tweets_lang = tweets_lang.rename_axis(None)

print(tweets_lang)

                                 en other  proportion_en  diff_in_%
albertaFloods2013               647     0       1.000000   0.006219
albertaWildfires2019           1860     0       1.000000   0.006219
australiaBushfire2013           580     0       1.000000   0.006219
bostonBombings2013              449     0       1.000000   0.006219
chileEarthquake2014             295     4       0.986622  -0.007159
coloradoStemShooting2019       1003     0       1.000000   0.006219
costaRicaEarthquake2012         217     1       0.995413   0.001632
cycloneKenneth2019             1808     0       1.000000   0.006219
earthquakeBohol2013             382    86       0.816239  -0.177541
earthquakeCalifornia2014        124     0       1.000000   0.006219
fireColorado2012                239     0       1.000000   0.006219
fireYMM2016                    2414     0       1.000000   0.006219
flSchoolShooting2018            801    42       0.950178  -0.043603
floodChoco2019                  325     0       

In [9]:
# Language breakdown per event type (`eventType`).
#
# Computed with a single vectorised groupby instead of growing the table
# cell by cell inside a Python loop; the count columns come out as
# integers rather than object dtype.
# NOTE: this rebinds `tweets_lang`, replacing any table previously stored
# under that name.

# 1 where `lang` equals 'en', 0 otherwise (a missing `lang` counts as "other").
_is_en = annot_corpus['lang'].eq('en').astype('int64')
_by_type = _is_en.groupby(annot_corpus['eventType'])
_en_counts = _by_type.sum()    # English tweets per event type
_totals = _by_type.size()      # all tweets per event type

tweets_lang = pd.DataFrame({
    'en': _en_counts,
    'other': _totals - _en_counts,
    'proportion_en': _en_counts / _totals,
})
# Despite the column name, this is a difference of fractions (0-1 scale)
# relative to the corpus-wide English share, not percentage points.
tweets_lang['diff_in_%'] = tweets_lang['proportion_en'] - overall_en_proportion
# The loop-built table had an unnamed index; drop the 'eventType' axis label
# so the printed layout stays the same.
tweets_lang = tweets_lang.rename_axis(None)

print(tweets_lang)

              en other  proportion_en  diff_in_%
bombing     2395     7       0.997086   0.003305
earthquake  8325   106       0.987427  -0.006353
flood       3657     3       0.999180   0.005400
shooting    4739    42       0.991215  -0.002565
typhoon     7749    42       0.994609   0.000828
wildfire    5093     0       1.000000   0.006219
