In [1]:
import numpy as np
import pandas as pd

import os

In [2]:
# Held-out events, one per event type (event type -> event id).
# Events whose id appears among these values are exported to the `test` split
# by the export cell; every other event goes to `train`.
_eval_set = {
    'bombing': 'parisAttacks2015',
    'earthquake': 'philippinesEarthquake2019',
    'flood': 'manilaFloods2013',
    'shooting': 'laAirportShooting2013',
    'typhoon': 'hurricaneFlorence2018',
    'wildfire': 'albertaWildfires2019'
}

In [3]:
# dtype overrides handed to `pd.read_json` when loading the raw tweets.
# Numeric id fields are requested as 64-bit integers; their `*_str` twins are
# kept as strings so the exact id text survives loading.
# NOTE: the `tweets.info()` output further down shows the sparsely populated
# reply-id columns coming back as float64 despite the int64 request here, so
# prefer the `*_str` columns whenever an exact id is needed.
tweet_schema = {
    'id': np.int64,
    'id_str': np.str_,
    'in_reply_to_status_id': np.int64,
    'in_reply_to_status_id_str': np.str_,
    'in_reply_to_user_id': np.int64,
    'in_reply_to_user_id_str': np.str_,
    'quoted_status_id': np.int64,
    # Was np.int64: a `*_str` field must stay a string like its siblings.
    'quoted_status_id_str': np.str_,
}

In [11]:
# Load the label file (JSON Lines: one record per line), requesting int64 for
# the `postID` column.
labels = pd.read_json('../../data/processed/train/labels/TRECIS_2018_2019-labels.jsonl',
                      lines=True, dtype=dict(postID=np.int64))

# NOTE(review): this path looks cut off ('.../processed/tr') -- confirm the
# intended file before re-running.
categories = pd.read_json('../../data/processed/tr')
# NOTE(review): `annots` is not assigned in any visible cell; on a fresh kernel
# this line would raise NameError. Presumably it is derived from `labels`
# and/or `categories` -- TODO confirm and define it explicitly.
annots.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32145 entries, 0 to 32144
Data columns (total 29 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   eventType              32145 non-null  object 
 1   eventID                32145 non-null  object 
 2   postID                 32145 non-null  int64  
 3   Advice                 32145 non-null  int64  
 4   CleanUp                32145 non-null  int64  
 5   ContextualInformation  32145 non-null  int64  
 6   Discussion             32145 non-null  int64  
 7   Donations              32145 non-null  int64  
 8   EmergingThreats        32145 non-null  int64  
 9   Factoid                32145 non-null  int64  
 10  FirstPartyObservation  32145 non-null  int64  
 11  GoodsServices          32145 non-null  int64  
 12  Hashtags               32145 non-null  int64  
 13  InformationWanted      32145 non-null  int64  
 14  Irrelevant             32145 non-null  int64  
 15  Lo

In [15]:
# Load the raw tweets (JSON Lines: one record per line), applying the dtype
# overrides from `tweet_schema`.
# NOTE: the info() output of this cell lists `in_reply_to_status_id` and
# `in_reply_to_user_id` as float64 with only a few thousand non-null rows, so
# the int64 request is not honoured for them -- presumably because of the
# missing values. Use the `*_str` columns when the exact id matters.
tweets = pd.read_json('../../data/raw/train/tweets/TRECIS_2018_2019-tweets.jsonl',
                      lines=True, dtype=tweet_schema)
tweets.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32145 entries, 0 to 32144
Data columns (total 33 columns):
 #   Column                     Non-Null Count  Dtype              
---  ------                     --------------  -----              
 0   created_at                 32145 non-null  datetime64[ns, UTC]
 1   id                         32145 non-null  int64              
 2   id_str                     32145 non-null  object             
 3   full_text                  32145 non-null  object             
 4   truncated                  32145 non-null  bool               
 5   display_text_range         32145 non-null  object             
 6   entities                   32145 non-null  object             
 7   source                     32145 non-null  object             
 8   in_reply_to_status_id      2174 non-null   float64            
 9   in_reply_to_status_id_str  32145 non-null  object             
 10  in_reply_to_user_id        2878 non-null   float64            
 11  in

In [16]:
# Sanity check: annotations and tweets must cover exactly the same ids.
# Set equality ignores duplicates, so this alone does not prove a 1:1 mapping;
# the merge's validate='one_to_one' covers that part.
set(annots.postID) == set(tweets.id)

True

In [39]:
# Attach each tweet's annotation row to the tweet itself.
# The join keys are given as column *labels*. Passing the Series objects
# (tweets.id / annots.postID) instead makes pandas treat them as anonymous
# array keys and add a synthetic `key_0` column to the result, which then has
# to be dropped again.
# validate='one_to_one' makes the merge raise if either side repeats a key.
annot_corpus = pd.merge(tweets, annots, how='outer',
                        left_on='id', right_on='postID',
                        validate='one_to_one')

# Both frames hold the same ids (checked in the cell above), so `postID` only
# repeats `id` after the join.
annot_corpus = annot_corpus.drop('postID', axis='columns')

annot_corpus.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 32145 entries, 0 to 32144
Data columns (total 61 columns):
 #   Column                     Non-Null Count  Dtype              
---  ------                     --------------  -----              
 0   created_at                 32145 non-null  datetime64[ns, UTC]
 1   id                         32145 non-null  int64              
 2   id_str                     32145 non-null  object             
 3   full_text                  32145 non-null  object             
 4   truncated                  32145 non-null  bool               
 5   display_text_range         32145 non-null  object             
 6   entities                   32145 non-null  object             
 7   source                     32145 non-null  object             
 8   in_reply_to_status_id      2174 non-null   float64            
 9   in_reply_to_status_id_str  32145 non-null  object             
 10  in_reply_to_user_id        2878 non-null   float64            
 11  in

In [76]:
# Distinct event ids present in the merged corpus.
event_ids = annot_corpus['eventID'].unique()
# The annotation columns are the last 26 columns of the merged frame.
target_labels = annot_corpus.columns[-26:]

In [77]:
# Display the selected columns to check that the tail slice picked up the
# intended label columns.
target_labels

Index(['Advice', 'CleanUp', 'ContextualInformation', 'Discussion', 'Donations',
       'EmergingThreats', 'Factoid', 'FirstPartyObservation', 'GoodsServices',
       'Hashtags', 'InformationWanted', 'Irrelevant', 'Location', 'MovePeople',
       'MultimediaShare', 'NewSubEvent', 'News', 'Official', 'OriginalEvent',
       'SearchAndRescue', 'Sentiment', 'ServiceAvailable',
       'ThirdPartyObservation', 'Volunteer', 'Weather', 'Priority'],
      dtype='object')

In [89]:
path_to_storage = '../../data/prepared/'
os.makedirs(path_to_storage, exist_ok=True)


def get_prep_data_dest_path(event_id, label=None):
    """Return the directory that receives the export for one event and label.

    Events listed in `_eval_set` are routed to the `test` split, all other
    events to `train`.

    Parameters
    ----------
    event_id : str
        Event identifier, compared against the values of `_eval_set`.
    label : str, optional
        Target label used as the sub-directory name. It is now an explicit
        argument; previously the function silently read whatever the
        module-level loop variable `label` happened to hold. When omitted,
        the split directory itself is returned.

    Returns
    -------
    str
        `<path_to_storage>/<split>/<label>`, or `<path_to_storage>/<split>`
        when no label is given.
    """
    split = 'test' if event_id in _eval_set.values() else 'train'
    if label is None:
        return os.path.join(path_to_storage, split)
    return os.path.join(path_to_storage, split, label)


# Every column that is not a target label is carried into each export. The
# count comes from `target_labels` instead of repeating the literal 26.
feature_cols = list(annot_corpus.columns[:-len(target_labels)])

for event_id in event_ids:
    event_data = annot_corpus[annot_corpus.eventID == event_id]
    event_type = event_data.eventType.iloc[0]
    # The file name depends on the event only, so build it once per event.
    target_file_name = f'{event_type}.{event_id}.jsonl'

    for label in target_labels:
        target_label_path = get_prep_data_dest_path(event_id, label)
        os.makedirs(target_label_path, exist_ok=True)
        target_file_path = os.path.join(target_label_path, target_file_name)

        # One file per (label, event): shared feature columns plus this label,
        # renamed to the generic column name `target`.
        _cols = feature_cols + [label]
        event_data[_cols].rename({label: 'target'}, axis='columns') \
                         .to_json(target_file_path, orient='records', lines=True)

        print(target_file_path)

../../data/prepared/train/Advice/typhoon.joplinTornado2011.jsonl
../../data/prepared/train/CleanUp/typhoon.joplinTornado2011.jsonl
../../data/prepared/train/ContextualInformation/typhoon.joplinTornado2011.jsonl
../../data/prepared/train/Discussion/typhoon.joplinTornado2011.jsonl
../../data/prepared/train/Donations/typhoon.joplinTornado2011.jsonl
../../data/prepared/train/EmergingThreats/typhoon.joplinTornado2011.jsonl
../../data/prepared/train/Factoid/typhoon.joplinTornado2011.jsonl
../../data/prepared/train/FirstPartyObservation/typhoon.joplinTornado2011.jsonl
../../data/prepared/train/GoodsServices/typhoon.joplinTornado2011.jsonl
../../data/prepared/train/Hashtags/typhoon.joplinTornado2011.jsonl
../../data/prepared/train/InformationWanted/typhoon.joplinTornado2011.jsonl
../../data/prepared/train/Irrelevant/typhoon.joplinTornado2011.jsonl
../../data/prepared/train/Location/typhoon.joplinTornado2011.jsonl
../../data/prepared/train/MovePeople/typhoon.joplinTornado2011.jsonl
../../data/p

../../data/prepared/train/Volunteer/earthquake.costaRicaEarthquake2012.jsonl
../../data/prepared/train/Weather/earthquake.costaRicaEarthquake2012.jsonl
../../data/prepared/train/Priority/earthquake.costaRicaEarthquake2012.jsonl
../../data/prepared/train/Advice/earthquake.guatemalaEarthquake2012.jsonl
../../data/prepared/train/CleanUp/earthquake.guatemalaEarthquake2012.jsonl
../../data/prepared/train/ContextualInformation/earthquake.guatemalaEarthquake2012.jsonl
../../data/prepared/train/Discussion/earthquake.guatemalaEarthquake2012.jsonl
../../data/prepared/train/Donations/earthquake.guatemalaEarthquake2012.jsonl
../../data/prepared/train/EmergingThreats/earthquake.guatemalaEarthquake2012.jsonl
../../data/prepared/train/Factoid/earthquake.guatemalaEarthquake2012.jsonl
../../data/prepared/train/FirstPartyObservation/earthquake.guatemalaEarthquake2012.jsonl
../../data/prepared/train/GoodsServices/earthquake.guatemalaEarthquake2012.jsonl
../../data/prepared/train/Hashtags/earthquake.guate

../../data/prepared/train/News/bombing.westTexasExplosion2013.jsonl
../../data/prepared/train/Official/bombing.westTexasExplosion2013.jsonl
../../data/prepared/train/OriginalEvent/bombing.westTexasExplosion2013.jsonl
../../data/prepared/train/SearchAndRescue/bombing.westTexasExplosion2013.jsonl
../../data/prepared/train/Sentiment/bombing.westTexasExplosion2013.jsonl
../../data/prepared/train/ServiceAvailable/bombing.westTexasExplosion2013.jsonl
../../data/prepared/train/ThirdPartyObservation/bombing.westTexasExplosion2013.jsonl
../../data/prepared/train/Volunteer/bombing.westTexasExplosion2013.jsonl
../../data/prepared/train/Weather/bombing.westTexasExplosion2013.jsonl
../../data/prepared/train/Priority/bombing.westTexasExplosion2013.jsonl
../../data/prepared/train/Advice/flood.albertaFloods2013.jsonl
../../data/prepared/train/CleanUp/flood.albertaFloods2013.jsonl
../../data/prepared/train/ContextualInformation/flood.albertaFloods2013.jsonl
../../data/prepared/train/Discussion/flood.al

../../data/prepared/train/InformationWanted/earthquake.earthquakeBohol2013.jsonl
../../data/prepared/train/Irrelevant/earthquake.earthquakeBohol2013.jsonl
../../data/prepared/train/Location/earthquake.earthquakeBohol2013.jsonl
../../data/prepared/train/MovePeople/earthquake.earthquakeBohol2013.jsonl
../../data/prepared/train/MultimediaShare/earthquake.earthquakeBohol2013.jsonl
../../data/prepared/train/NewSubEvent/earthquake.earthquakeBohol2013.jsonl
../../data/prepared/train/News/earthquake.earthquakeBohol2013.jsonl
../../data/prepared/train/Official/earthquake.earthquakeBohol2013.jsonl
../../data/prepared/train/OriginalEvent/earthquake.earthquakeBohol2013.jsonl
../../data/prepared/train/SearchAndRescue/earthquake.earthquakeBohol2013.jsonl
../../data/prepared/train/Sentiment/earthquake.earthquakeBohol2013.jsonl
../../data/prepared/train/ServiceAvailable/earthquake.earthquakeBohol2013.jsonl
../../data/prepared/train/ThirdPartyObservation/earthquake.earthquakeBohol2013.jsonl
../../data/

../../data/prepared/train/Advice/typhoon.typhoonHagupit2014.jsonl
../../data/prepared/train/CleanUp/typhoon.typhoonHagupit2014.jsonl
../../data/prepared/train/ContextualInformation/typhoon.typhoonHagupit2014.jsonl
../../data/prepared/train/Discussion/typhoon.typhoonHagupit2014.jsonl
../../data/prepared/train/Donations/typhoon.typhoonHagupit2014.jsonl
../../data/prepared/train/EmergingThreats/typhoon.typhoonHagupit2014.jsonl
../../data/prepared/train/Factoid/typhoon.typhoonHagupit2014.jsonl
../../data/prepared/train/FirstPartyObservation/typhoon.typhoonHagupit2014.jsonl
../../data/prepared/train/GoodsServices/typhoon.typhoonHagupit2014.jsonl
../../data/prepared/train/Hashtags/typhoon.typhoonHagupit2014.jsonl
../../data/prepared/train/InformationWanted/typhoon.typhoonHagupit2014.jsonl
../../data/prepared/train/Irrelevant/typhoon.typhoonHagupit2014.jsonl
../../data/prepared/train/Location/typhoon.typhoonHagupit2014.jsonl
../../data/prepared/train/MovePeople/typhoon.typhoonHagupit2014.json

../../data/prepared/train/NewSubEvent/shooting.shootingDallas2017.jsonl
../../data/prepared/train/News/shooting.shootingDallas2017.jsonl
../../data/prepared/train/Official/shooting.shootingDallas2017.jsonl
../../data/prepared/train/OriginalEvent/shooting.shootingDallas2017.jsonl
../../data/prepared/train/SearchAndRescue/shooting.shootingDallas2017.jsonl
../../data/prepared/train/Sentiment/shooting.shootingDallas2017.jsonl
../../data/prepared/train/ServiceAvailable/shooting.shootingDallas2017.jsonl
../../data/prepared/train/ThirdPartyObservation/shooting.shootingDallas2017.jsonl
../../data/prepared/train/Volunteer/shooting.shootingDallas2017.jsonl
../../data/prepared/train/Weather/shooting.shootingDallas2017.jsonl
../../data/prepared/train/Priority/shooting.shootingDallas2017.jsonl
../../data/prepared/train/Advice/shooting.flSchoolShooting2018.jsonl
../../data/prepared/train/CleanUp/shooting.flSchoolShooting2018.jsonl
../../data/prepared/train/ContextualInformation/shooting.flSchoolShoo

../../data/prepared/test/Volunteer/earthquake.philippinesEarthquake2019.jsonl
../../data/prepared/test/Weather/earthquake.philippinesEarthquake2019.jsonl
../../data/prepared/test/Priority/earthquake.philippinesEarthquake2019.jsonl
../../data/prepared/train/Advice/typhoon.cycloneKenneth2019.jsonl
../../data/prepared/train/CleanUp/typhoon.cycloneKenneth2019.jsonl
../../data/prepared/train/ContextualInformation/typhoon.cycloneKenneth2019.jsonl
../../data/prepared/train/Discussion/typhoon.cycloneKenneth2019.jsonl
../../data/prepared/train/Donations/typhoon.cycloneKenneth2019.jsonl
../../data/prepared/train/EmergingThreats/typhoon.cycloneKenneth2019.jsonl
../../data/prepared/train/Factoid/typhoon.cycloneKenneth2019.jsonl
../../data/prepared/train/FirstPartyObservation/typhoon.cycloneKenneth2019.jsonl
../../data/prepared/train/GoodsServices/typhoon.cycloneKenneth2019.jsonl
../../data/prepared/train/Hashtags/typhoon.cycloneKenneth2019.jsonl
../../data/prepared/train/InformationWanted/typhoon.

../../data/prepared/test/CleanUp/wildfire.albertaWildfires2019.jsonl
../../data/prepared/test/ContextualInformation/wildfire.albertaWildfires2019.jsonl
../../data/prepared/test/Discussion/wildfire.albertaWildfires2019.jsonl
../../data/prepared/test/Donations/wildfire.albertaWildfires2019.jsonl
../../data/prepared/test/EmergingThreats/wildfire.albertaWildfires2019.jsonl
../../data/prepared/test/Factoid/wildfire.albertaWildfires2019.jsonl
../../data/prepared/test/FirstPartyObservation/wildfire.albertaWildfires2019.jsonl
../../data/prepared/test/GoodsServices/wildfire.albertaWildfires2019.jsonl
../../data/prepared/test/Hashtags/wildfire.albertaWildfires2019.jsonl
../../data/prepared/test/InformationWanted/wildfire.albertaWildfires2019.jsonl
../../data/prepared/test/Irrelevant/wildfire.albertaWildfires2019.jsonl
../../data/prepared/test/Location/wildfire.albertaWildfires2019.jsonl
../../data/prepared/test/MovePeople/wildfire.albertaWildfires2019.jsonl
../../data/prepared/test/MultimediaSha

In [86]:
# Number of rows in the merged corpus per event type.
annot_corpus.eventType.value_counts()

earthquake    8429
typhoon       7795
wildfire      5091
shooting      4776
flood         3649
bombing       2405
Name: eventType, dtype: int64