In [1]:
import pandas as pd
# Read the pre-computed feature table from disk.
df = pd.read_parquet("features.parquet")
# Coerce every label to a plain Python bool via the built-in bool().
df["target"] = df["target"].map(bool)
# Last expression of the cell: show which columns are available.
df.columns

Index(['id', 'keyword', 'location', 'text', 'hashtag',
       'hashtags_have_l1_synonyms', 'hashtags_have_l2_synonyms', 'subj',
       'verb', 'obj', 'contains_l1_synonyms', 'contains_l2_synonyms',
       'contains_damaged_words', 'mentioned_news_org', 'mentioned_relief_org',
       'mentions', 'orgs', 'gpes', 'facs', 'target'],
      dtype='object')

# Feature selection

General statistics about the features:

In [2]:
# Total number of rows in the loaded feature table.
num_total = len(df)
def pct_with_value_for_col(column_name, df):
    """Return the percentage of rows of ``df`` that hold a value in ``column_name``.

    A cell counts as "having a value" when it is truthy, with one special
    case: a NumPy array is considered empty only when it is a one-element
    array holding the placeholder string ``'_'``; every other array
    (including a zero-length one) counts as a value.

    Parameters
    ----------
    column_name : str
        Name of the column to inspect.
    df : pandas.DataFrame
        Frame containing the column. The percentage is computed against the
        number of rows of *this* frame, so filtered frames give correct
        results too.

    Returns
    -------
    float or int
        Percentage rounded to two decimals, or ``0`` when no row has a value
        (which includes an empty frame).

    Raises
    ------
    KeyError
        If ``column_name`` is not a column of ``df``. A misspelled column
        name is reported instead of silently yielding 0%.
    """
    # Imported once per call instead of once per inspected cell.
    from numpy import ndarray

    placeholder = '_'

    def to_bool(cell):
        """Tell whether a single cell carries a real (non-placeholder) value."""
        if isinstance(cell, ndarray):
            # Arrays are "present" unless they are exactly [placeholder].
            return not (len(cell) == 1 and cell.tolist() == [placeholder])
        return bool(cell)

    has_value = df[column_name].map(to_bool)
    # Summing the boolean mask avoids the KeyError that value_counts()[True]
    # raises when no row has a value.
    num = int(has_value.sum())
    total = len(df)
    if num == 0 or total == 0:
        return 0
    return round(float(num) / total * 100, 2)

# Report, for every column except id, text and target, the share of rows
# that hold a value.
print("Percentage of dataset with values for the specified columns:")
for col in (name for name in df.columns if name not in ('id', 'text', 'target')):
    print(f"Value for {col}: {pct_with_value_for_col(col, df)}%")

Percentage of dataset with values for the specified columns:
Value for keyword: 100.0%
Value for location: 66.92%
Value for hashtag: 22.44%
Value for hashtags_have_l1_synonyms: 0.12%
Value for hashtags_have_l2_synonyms: 0.03%
Value for subj: 52.9%
Value for verb: 79.23%
Value for obj: 79.39%
Value for contains_l1_synonyms: 12.58%
Value for contains_l2_synonyms: 5.1%
Value for contains_damaged_words: 8.0%
Value for mentioned_news_org: 0.95%
Value for mentioned_relief_org: 0.04%
Value for mentions: 25.4%
Value for orgs: 59.72%
Value for gpes: 15.19%
Value for facs: 1.73%


Does the presence of a location indicate that a tweet relates to a disaster?
Not really.

In [15]:
import plotnine as p9
    
# Each entry pairs a bar label with the DataFrame.query expression that
# selects the rows counted for that bar.
combinations = [
    ['positive, has location', 'target == 1 and location != ""'],
    ['positive, no location', 'target == 1 and location == ""'],
    ['negative, has location', 'target == 0 and location != ""'],
    ['negative, no location', 'target == 0 and location == ""'],
]

# One horizontal bar per label/location combination.
(
    p9.ggplot(
        pd.DataFrame(
            {
                'features': [label for label, _ in combinations],
                'count': [len(df.query(expr)) for _, expr in combinations],
            },
            columns=['features', 'count'],
        )
    )
    + p9.aes(x='features', y='count')
    + p9.geom_col(fill='cornflowerblue')
    + p9.xlab("label vs location present")
    + p9.ylab("number of instances")
    + p9.coord_flip()
)



For the few rows that mention a news organisation, does the mention mean they are disaster tweets?

In [17]:
# Restrict to rows whose mentioned_news_org flag is set and draw one bar
# per target value.
(
    p9.ggplot(df.loc[df['mentioned_news_org']])
    + p9.aes(x='target')
    + p9.geom_bar(fill='cornflowerblue')
    + p9.xlab('label')
    + p9.ylab('number of instances')
)



The same question for relief organisations: if a tweet mentions one, is it a disaster tweet?

In [16]:
# Restrict to rows whose mentioned_relief_org flag is set and draw one bar
# per target value.
(
    p9.ggplot(df.loc[df['mentioned_relief_org']])
    + p9.aes(x='target')
    + p9.geom_bar(fill='cornflowerblue')
    + p9.xlab('label')
    + p9.ylab('number of instances')
)



What about the correlation between containing a synonym for 'disaster' and being a tweet _about_ a disaster?

In [18]:
# Restrict to rows whose contains_l1_synonyms flag is set and draw one bar
# per target value.
(
    p9.ggplot(df.loc[df['contains_l1_synonyms']])
    + p9.aes(x='target')
    + p9.geom_bar(fill='cornflowerblue')
    + p9.xlab('label')
    + p9.ylab('number of instances')
)



In [19]:
# Restrict to rows whose contains_l2_synonyms flag is set and draw one bar
# per target value.
(
    p9.ggplot(df.loc[df['contains_l2_synonyms']])
    + p9.aes(x='target')
    + p9.geom_bar(fill='cornflowerblue')
    + p9.xlab('label')
    + p9.ylab('number of instances')
)



In [20]:
# Restrict to rows whose contains_damaged_words flag is set and draw one bar
# per target value.
(
    p9.ggplot(df.loc[df['contains_damaged_words']])
    + p9.aes(x='target')
    + p9.geom_bar(fill='cornflowerblue')
    + p9.xlab('label')
    + p9.ylab('number of instances')
)



In [21]:
def _num_entities(cell, placeholder='_'):
    """Count the items of one list-like cell, ignoring a lone placeholder.

    ``pct_with_value_for_col`` in this notebook treats a one-element array
    holding only ``'_'`` as "no value"; the same convention is applied here
    so that such a cell contributes 0 entities instead of 1.
    NOTE(review): presumably orgs/gpes/facs use that placeholder for "none"
    -- confirm against the feature-extraction step. If they never do, this
    returns exactly ``len(cell)``.
    """
    if len(cell) == 1 and list(cell) == [placeholder]:
        return 0
    return len(cell)


# Total number of organisation, geo-political and facility entities per row.
# Column-wise maps replace the row-wise apply, and the frame is built in one
# step instead of assigning columns into an empty DataFrame.
plotdf = pd.DataFrame({
    'num_ents': (df['orgs'].map(_num_entities)
                 + df['gpes'].map(_num_entities)
                 + df['facs'].map(_num_entities)),
    'target': df['target'],
})

# Stacked bars: how many rows have each entity count, split by label.
(p9.ggplot(plotdf)
 + p9.aes(x='num_ents', fill='target')
 + p9.geom_bar()
 + p9.scale_x_continuous(breaks=range(0, plotdf['num_ents'].max() + 1))
 + p9.xlab('number of entities')
 + p9.ylab('number of instances')
 + p9.coord_flip()
)




It looks a _bit_ like the disaster tweets have more entities.

What about the number of hashtags?

In [24]:
def _num_hashtags(cell, placeholder='_'):
    """Count the hashtags in one cell, ignoring a lone placeholder.

    ``pct_with_value_for_col`` in this notebook treats a one-element array
    holding only ``'_'`` as "no value"; the same convention is applied here
    so that a tweet without hashtags is counted as 0 rather than 1.
    NOTE(review): presumably the hashtag column uses that placeholder for
    "none" -- confirm against the feature-extraction step. If it never does,
    this returns exactly ``len(cell)``.
    """
    if len(cell) == 1 and list(cell) == [placeholder]:
        return 0
    return len(cell)


# Number of hashtags per row next to the label. A column-wise map replaces
# the row-wise apply, and the frame is built in one step instead of assigning
# columns into an empty DataFrame.
plotdf = pd.DataFrame({
    'num_hashtags': df['hashtag'].map(_num_hashtags),
    'target': df['target'],
})

In [25]:
# Stacked bars: how many rows have each hashtag count, split by label,
# with one axis break per integer count.
(
    p9.ggplot(plotdf)
    + p9.aes(x='num_hashtags', fill='target')
    + p9.geom_bar()
    + p9.scale_x_continuous(breaks=range(0, plotdf['num_hashtags'].max() + 1))
    + p9.xlab('number of hashtags')
    + p9.ylab('number of instances')
    + p9.coord_flip()
)



In [26]:
# Same chart as for all rows, restricted to rows with at least 4 hashtags
# so the sparse upper range becomes readable.
(
    p9.ggplot(plotdf.loc[plotdf['num_hashtags'] >= 4])
    + p9.aes(x='num_hashtags', fill='target')
    + p9.geom_bar()
    + p9.scale_x_continuous(breaks=range(4, plotdf['num_hashtags'].max() + 1))
    + p9.xlab('number of hashtags')
    + p9.ylab('number of instances')
    + p9.coord_flip()
)



Disaster tweets seem to have more hashtags in general, though it is hard to say for certain: for example, tweets with 9 or 11 hashtags are not about a disaster, for some reason.