In [1]:
import pandas as pd
import numpy as np
import re

pd.set_option('display.max_colwidth', None)

In [2]:
train = pd.read_csv("./data/train.csv", encoding = 'utf-8')
test = pd.read_csv("./data/test.csv", encoding = 'utf-8')
print(test.columns)

Index(['id', 'keyword', 'location', 'text'], dtype='object')


In [3]:
# Check target balance
train.target.value_counts(normalize=True)

target
0    0.57034
1    0.42966
Name: proportion, dtype: float64

In [4]:
# Check target null proportion
train.target.isna().value_counts(normalize=True)

target
False    1.0
Name: proportion, dtype: float64

In [5]:
# Check null proportion of keyword and location
print(train.keyword.isna().value_counts(normalize=True))
print(train.location.isna().value_counts(normalize=True))

keyword
False    0.991987
True     0.008013
Name: proportion, dtype: float64
location
False    0.66728
True     0.33272
Name: proportion, dtype: float64


In [6]:
# Proportion of non-null keywords
train[~train.keyword.isna()].keyword.value_counts(normalize=True)

keyword
fatalities               0.005959
deluge                   0.005561
armageddon               0.005561
sinking                  0.005429
damage                   0.005429
                           ...   
forest%20fire            0.002516
epicentre                0.001589
threat                   0.001457
inundation               0.001324
radiation%20emergency    0.001192
Name: proportion, Length: 221, dtype: float64

In [7]:
# Proportion of non-null locations
train[~train.location.isna()].location.value_counts(normalize=True)

location
USA                    0.020472
New York               0.013976
United States          0.009843
London                 0.008858
Canada                 0.005709
                         ...   
MontrÌ©al, QuÌ©bec     0.000197
Montreal               0.000197
ÌÏT: 6.4682,3.18287    0.000197
Live4Heed??            0.000197
Lincoln                0.000197
Name: proportion, Length: 3341, dtype: float64

### Stratified k-fold cross validation split


In [8]:
from sklearn.model_selection import StratifiedKFold

In [9]:
skf = StratifiedKFold(n_splits=10,shuffle=True,random_state = 56)
X = train[['id','keyword','location','text']]
y = train[['target']]

In [10]:
# Create a generator from the above giving the actual dataset
def data_gen(X,y):
    # Creates a generator giving indices for each split
    for index_arr in skf.split(X,y):
        # Yields corresponding X_train, X_test, y_train, y_test
        yield X.loc[index_arr[0]],X.loc[index_arr[1]], y.loc[index_arr[0]],y.loc[index_arr[1]]

data = data_gen(X,y)

### Feature Addition and Engineering

* create another feature that extracts #words from text 
* create another feature that extracts @mention from text
* remove links from text
* remove @mention from text
* remove # symbol from text

In [11]:
class PreProcess():
    
    
    def add_hashtags(text):

        tags = re.findall(r'#\w+',text)
        tags = ','.join([e.replace('#','') for e in tags])
        tags = tags.lower()

        return tags if tags != '' else np.nan

    def add_mentions(text):

        tags = re.findall(r'@\w+',text)
        tags = ','.join([e.replace('@','') for e in tags])

        return tags if tags != '' else np.nan

    def text_processing(text):

        # Remove links
        text = re.sub(r'http[s]?://\S+','',text)
        #text = re.sub(r'#\w+','',text)
        text = text.replace('#','')
        text = re.sub(r'@\w+','',text)
        # Remove non ascii characters
        text = re.sub(r'[^\x00-\x7F]+', '', text)
        text = text.lower()

        return text
    
    def get_places(text):
        
        doc = nlp(text)
        ents = [ent.text for ent in doc.ents if ent.label_ in ['GPE']]
        ents = ','.join(ents)
     
        
        return ents if ents != '' else np.nan
    
    def get_events(text):
        
        doc = nlp(text)
        ents = [ent.text for ent in doc.ents if ent.label_ in ['EVENT']]
        ents = ','.join(ents)
     
        
        return ents if ents != '' else np.nan

    def preprocess(X):
        
        nlp = spacy.load('en_core_web_sm')
        X_copy = X.copy()
        X_copy['hashtags'] = X_copy['text'].apply(PreProcess.add_hashtags)
        X_copy['mentions'] = X_copy['text'].apply(PreProcess.add_mentions)
        X_copy['gist'] = X_copy['text'].apply(PreProcess.text_processing)
        X_copy['places'] = X_copy['gist'].apply(PreProcess.get_places)
        #X_copy['events'] = X_copy['gist'].apply(PreProcess.get_events)
        
        return X_copy
    

### Spacy analysis

In [12]:
import spacy

nlp = spacy.load('en_core_web_md')

In [13]:
train_pro = PreProcess.preprocess(train)

In [17]:
train_pro[~train_pro.events.isna()]

Unnamed: 0,id,keyword,location,text,target,hashtags,mentions,gist,places,events
692,997,blazing,Your screen,S3XLEAK!!!\nPh0tos of 19yrs old Ash@wo lady in Festac town from Delta exp0sed on BBM 5 leaked pictures... http://t.co/ixREhM05yq,0,,wo,s3xleak!!!\nph0tos of 19yrs old ash lady in festac town from delta exp0sed on bbm 5 leaked pictures...,delta,bbm 5
3064,4396,earthquake,world,Earthquake : M 3.4 - 96km N of Brenas Puerto Rico: Time2015-08-05 10:34:24 UTC2015-08-05 06:34:24 -04:00 atÛ_ http://t.co/sDZrrfZhMy,1,,,earthquake : m 3.4 - 96km n of brenas puerto rico: time2015-08-05 10:34:24 utc2015-08-05 06:34:24 -04:00 at_,brenas puerto rico,utc2015-08-05
3308,4739,evacuate,,The U.S. also flew over each bomb site in World War II with warning letters telling people to evacuate,1,,,the u.s. also flew over each bomb site in world war ii with warning letters telling people to evacuate,u.s.,world war ii
3367,4823,evacuation,,VIETNAM WAR PATCH US 71st EVACUATION HOSPITAL HIGHLAND MEDICS http://t.co/kIF7M3FQLx http://t.co/Oz6vlWwTNR,1,,,vietnam war patch us 71st evacuation hospital highland medics,,vietnam war
4508,6406,hurricane,??? ??? ????? ??? ???.,Be careful during hurricane season ???? https://t.co/bFtOU2nybW,1,,,be careful during hurricane season ????,,hurricane season
4516,6417,hurricane,,AngelRiveraLibÛ_ #Snowden 'may have' broken laws? Hurricane Katrina may have caused some damage. http://t.co/jAaWuiOvdc Without Snowden hÛ_,1,snowden,,angelriveralib_ snowden 'may have' broken laws? hurricane katrina may have caused some damage. without snowden h_,,hurricane katrina
4525,6430,hurricane,NYC,Mr. T stopped wearing gold chains in 2005 because he thought it would be an insult to the people who lost everything after Hurricane Katrina,0,,,mr. t stopped wearing gold chains in 2005 because he thought it would be an insult to the people who lost everything after hurricane katrina,,hurricane katrina
4762,6775,lightning,,World War II book LIGHTNING JOE An Autobiography by General J. Lawton Collins http://t.co/BzdfznKvoG http://t.co/eRhdH37rDh,0,,,world war ii book lightning joe an autobiography by general j. lawton collins,,world war ii
4770,6788,lightning,,World War II book LIGHTNING JOE An Autobiography by General J. Lawton Collins http://t.co/R4khEH7iaf http://t.co/qSZgJfUutu,1,,,world war ii book lightning joe an autobiography by general j. lawton collins,,world war ii


In [15]:
text = 'Our deeds are the reason for this earthquaqe near Apple and Google office'
doc = nlp(text)
ls = list(doc.ents)
string = ''.join(ls)

TypeError: sequence item 0: expected str instance, spacy.tokens.span.Span found

In [None]:
doc = nlp(text)

In [None]:
doc.ents

In [None]:
ls = [ent.text for ent in doc.ents]

In [None]:
','.join(ls)