In [1]:
import pandas as pd
import numpy as np
import re

pd.set_option('display.max_colwidth', None)

In [2]:
train = pd.read_csv("./data/train.csv", encoding = 'utf-8')
test = pd.read_csv("./data/test.csv", encoding = 'utf-8')
print(test.columns)

Index(['id', 'keyword', 'location', 'text'], dtype='object')


In [3]:
# Check target balance
train.target.value_counts(normalize=True)

target
0    0.57034
1    0.42966
Name: proportion, dtype: float64

In [4]:
# Check target null proportion
train.target.isna().value_counts(normalize=True)

target
False    1.0
Name: proportion, dtype: float64

In [5]:
# Check null proportion of keyword and location
print(train.keyword.isna().value_counts(normalize=True))
print(train.location.isna().value_counts(normalize=True))

keyword
False    0.991987
True     0.008013
Name: proportion, dtype: float64
location
False    0.66728
True     0.33272
Name: proportion, dtype: float64


In [6]:
# Proportion of non-null keywords
train[~train.keyword.isna()].keyword.value_counts(normalize=True)

keyword
fatalities               0.005959
deluge                   0.005561
armageddon               0.005561
sinking                  0.005429
damage                   0.005429
                           ...   
forest%20fire            0.002516
epicentre                0.001589
threat                   0.001457
inundation               0.001324
radiation%20emergency    0.001192
Name: proportion, Length: 221, dtype: float64

In [7]:
# Proportion of non-null locations
train[~train.location.isna()].location.value_counts(normalize=True)

location
USA                    0.020472
New York               0.013976
United States          0.009843
London                 0.008858
Canada                 0.005709
                         ...   
MontrÌ©al, QuÌ©bec     0.000197
Montreal               0.000197
ÌÏT: 6.4682,3.18287    0.000197
Live4Heed??            0.000197
Lincoln                0.000197
Name: proportion, Length: 3341, dtype: float64

### Stratified k-fold cross validation split


In [8]:
from sklearn.model_selection import StratifiedKFold

In [9]:
skf = StratifiedKFold(n_splits=10,shuffle=True,random_state = 56)
X = train[['id','keyword','location','text']]
y = train[['target']]

In [10]:
# Create a generator from the above giving the actual dataset
def data_gen(X,y):
    # Creates a generator giving indices for each split
    for index_arr in skf.split(X,y):
        # Yields corresponding X_train, X_test, y_train, y_test
        yield X.loc[index_arr[0]],X.loc[index_arr[1]], y.loc[index_arr[0]],y.loc[index_arr[1]]

data = data_gen(X,y)

### Feature Addition and Engineering

* create another feature that extracts #words from text 
* create another feature that extracts @mention from text
* remove links from text
* remove @mention from text
* remove # symbol from text

In [11]:
class PreProcess():
    
    
    def add_hashtags(text):

        tags = re.findall(r'#\w+',text)
        tags = ','.join([e.replace('#','') for e in tags])
        tags = tags.lower()

        return tags if tags != '' else np.nan

    def add_mentions(text):

        tags = re.findall(r'@\w+',text)
        tags = ','.join([e.replace('@','') for e in tags])

        return tags if tags != '' else np.nan

    def text_processing(text):

        # Remove links
        text = re.sub(r'http[s]?://\S+','',text)
        #text = re.sub(r'#\w+','',text)
        text = text.replace('#','')
        text = re.sub(r'@\w+','',text)
        # Remove non ascii characters
        text = re.sub(r'[^\x00-\x7F]+', '', text)
        text = text.lower()

        return text
    
    def get_places(text):
        
        doc = nlp(text)
        ents = [ent.text for ent in doc.ents if ent.label_ in ['GPE']]
        ents = ','.join(ents)
     
        
        return ents if ents != '' else np.nan
    
    def get_events(text):
        
        doc = nlp(text)
        ents = [ent.text for ent in doc.ents if ent.label_ in ['EVENT']]
        ents = ','.join(ents)
     
        
        return ents if ents != '' else np.nan

    def preprocess(X):
        
        nlp = spacy.load('en_core_web_sm')
        X_copy = X.copy()
        X_copy['hashtags'] = X_copy['text'].apply(PreProcess.add_hashtags)
        X_copy['mentions'] = X_copy['text'].apply(PreProcess.add_mentions)
        X_copy['gist'] = X_copy['text'].apply(PreProcess.text_processing)
        X_copy['places'] = X_copy['gist'].apply(PreProcess.get_places)
        #X_copy['events'] = X_copy['gist'].apply(PreProcess.get_events)
        
        return X_copy
    

### Spacy analysis

* Train spacy ner to classify disasters as a new rule

In [12]:
import spacy

nlp = spacy.load('en_core_web_md')

In [13]:
train_pro = PreProcess.preprocess(train)

In [14]:
train_pro

Unnamed: 0,id,keyword,location,text,target,hashtags,mentions,gist,places
0,1,,,Our Deeds are the Reason of this #earthquake May ALLAH Forgive us all,1,earthquake,,our deeds are the reason of this earthquake may allah forgive us all,
1,4,,,Forest fire near La Ronge Sask. Canada,1,,,forest fire near la ronge sask. canada,canada
2,5,,,All residents asked to 'shelter in place' are being notified by officers. No other evacuation or shelter in place orders are expected,1,,,all residents asked to 'shelter in place' are being notified by officers. no other evacuation or shelter in place orders are expected,
3,6,,,"13,000 people receive #wildfires evacuation orders in California",1,wildfires,,"13,000 people receive wildfires evacuation orders in california",california
4,7,,,Just got sent this photo from Ruby #Alaska as smoke from #wildfires pours into a school,1,"alaska,wildfires",,just got sent this photo from ruby alaska as smoke from wildfires pours into a school,alaska
...,...,...,...,...,...,...,...,...,...
7608,10869,,,Two giant cranes holding a bridge collapse into nearby homes http://t.co/STfMbbZFB5,1,,,two giant cranes holding a bridge collapse into nearby homes,
7609,10870,,,@aria_ahrary @TheTawniest The out of control wild fires in California even in the Northern part of the state. Very troubling.,1,,"aria_ahrary,TheTawniest",the out of control wild fires in california even in the northern part of the state. very troubling.,california
7610,10871,,,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. http://t.co/zDtoyd8EbJ,1,,,m1.94 [01:04 utc]?5km s of volcano hawaii.,hawaii
7611,10872,,,Police investigating after an e-bike collided with a car in Little Portugal. E-bike rider suffered serious non-life threatening injuries.,1,,,police investigating after an e-bike collided with a car in little portugal. e-bike rider suffered serious non-life threatening injuries.,portugal


In [15]:
text = 'Our deeds are the reason for this earthquaqe near Apple and Google office'
doc = nlp(text)
ls = list(doc.ents)
string = ''.join(ls)

TypeError: sequence item 0: expected str instance, spacy.tokens.span.Span found

In [None]:
doc = nlp(text)

In [None]:
doc.ents

In [None]:
ls = [ent.text for ent in doc.ents]

In [None]:
','.join(ls)