In [1]:
import pandas as pd
import numpy as np
import re

pd.set_option('display.max_colwidth', None)

In [2]:
train = pd.read_csv("./data/train.csv", encoding = 'utf-8')
test = pd.read_csv("./data/test.csv", encoding = 'utf-8')
print(train.columns)

Index(['id', 'keyword', 'location', 'text', 'target'], dtype='object')


In [3]:
train.text.head()

0                                                                    Our Deeds are the Reason of this #earthquake May ALLAH Forgive us all
1                                                                                                   Forest fire near La Ronge Sask. Canada
2    All residents asked to 'shelter in place' are being notified by officers. No other evacuation or shelter in place orders are expected
3                                                                        13,000 people receive #wildfires evacuation orders in California 
4                                                 Just got sent this photo from Ruby #Alaska as smoke from #wildfires pours into a school 
Name: text, dtype: object

In [4]:
# Check target balance
train.target.value_counts(normalize=True)

target
0    0.57034
1    0.42966
Name: proportion, dtype: float64

In [5]:
# Check target null proportion
train.target.isna().value_counts(normalize=True)

target
False    1.0
Name: proportion, dtype: float64

In [6]:
# Check null proportion of keyword and location
print(train.keyword.isna().value_counts(normalize=True))
print(train.location.isna().value_counts(normalize=True))

keyword
False    0.991987
True     0.008013
Name: proportion, dtype: float64
location
False    0.66728
True     0.33272
Name: proportion, dtype: float64


In [7]:
# Proportion of non-null keywords
train[~train.keyword.isna()].keyword.value_counts(normalize=True)

keyword
fatalities               0.005959
deluge                   0.005561
armageddon               0.005561
damage                   0.005429
body%20bags              0.005429
                           ...   
forest%20fire            0.002516
epicentre                0.001589
threat                   0.001457
inundation               0.001324
radiation%20emergency    0.001192
Name: proportion, Length: 221, dtype: float64

In [8]:
# Proportion of non-null locations
train[~train.location.isna()].location.value_counts(normalize=True)

location
USA                            0.020472
New York                       0.013976
United States                  0.009843
London                         0.008858
Canada                         0.005709
                                 ...   
Click the link below, okay     0.000197
Milwaukee County               0.000197
Gwersyllt, Wales               0.000197
Primum non nocere              0.000197
Alabama, USA                   0.000197
Name: proportion, Length: 3341, dtype: float64

### Stratified k-fold cross validation split


In [9]:
from sklearn.model_selection import StratifiedKFold

In [10]:
skf = StratifiedKFold(n_splits=10,shuffle=True,random_state = 56)
X = train[['keyword','location','text']]
y = train[['target']]

In [11]:
# Create a generator from the above giving the actual dataset
def data_gen(X,y):
    # Creates a generator giving indices for each split
    for index_arr in skf.split(X,y):
        # Yields corresponding X_train, X_test, y_train, y_test
        yield X.loc[index_arr[0]],X.loc[index_arr[1]], y.loc[index_arr[0]],y.loc[index_arr[1]]

data = data_gen(X,y)

### Feature Addition and Engineering

* create another feature that extracts #words from text 
* create another feature that extracts @mention from text
* remove links from text
* remove @mention from text
* remove # symbol from text

In [17]:
def add_hashtags(text):
    
    tags = re.findall(r'#\w+',text)
    tags = ','.join([e.replace('#','') for e in tags])
    tags = tags.lower()
    
    return tags if tags != '' else np.nan

def add_mentions(text):
    
    tags = re.findall(r'@\w+',text)
    tags = ','.join([e.replace('@','') for e in tags])
    
    return tags if tags != '' else np.nan

def text_processing(text):
    
    # Remove links
    text = re.sub(r'http[s]?://\S+','',text)
    text = re.sub(r'#\w+','',text)
    text = re.sub(r'@\w+','',text)
    text = text.lower()
    
    
    return text
    
    

In [18]:
# extract hashtag words without hashtag
train['hashtags'] = train['text'].apply(add_hashtags)
# extract mention words
train['mentions'] = train['text'].apply(add_mentions)
# remove 
train['gist'] = train['text'].apply(text_processing)

In [19]:
train[~train.location.isna()]

Unnamed: 0,id,keyword,location,text,target,hashtags,mentions,gist
31,48,ablaze,Birmingham,@bbcmtd Wholesale Markets ablaze http://t.co/lHYXEOHY6C,1,,bbcmtd,wholesale markets ablaze
32,49,ablaze,Est. September 2012 - Bristol,We always try to bring the heavy. #metal #RT http://t.co/YAo1e0xngw,0,"metal,rt",,we always try to bring the heavy.
33,50,ablaze,AFRICA,#AFRICANBAZE: Breaking news:Nigeria flag set ablaze in Aba. http://t.co/2nndBGwyEi,1,africanbaze,,: breaking news:nigeria flag set ablaze in aba.
34,52,ablaze,"Philadelphia, PA",Crying out for more! Set me ablaze,0,,,crying out for more! set me ablaze
35,53,ablaze,"London, UK",On plus side LOOK AT THE SKY LAST NIGHT IT WAS ABLAZE http://t.co/qqsmshaJ3N,0,,,on plus side look at the sky last night it was ablaze
...,...,...,...,...,...,...,...,...
7575,10826,wrecked,TN,On the bright side I wrecked http://t.co/uEa0txRHYs,0,,,on the bright side i wrecked
7577,10829,wrecked,#NewcastleuponTyne #UK,@widda16 ... He's gone. You can relax. I thought the wife who wrecked her cake was a goner mind lol #whoops,0,whoops,widda16,... he's gone. you can relax. i thought the wife who wrecked her cake was a goner mind lol
7579,10831,wrecked,"Vancouver, Canada",Three days off from work and they've pretty much all been wrecked hahaha shoutout to my family for that one,0,,,three days off from work and they've pretty much all been wrecked hahaha shoutout to my family for that one
7580,10832,wrecked,London,#FX #forex #trading Cramer: Iger's 3 words that wrecked Disney's stock http://t.co/7enNulLKzM,0,"fx,forex,trading",,cramer: iger's 3 words that wrecked disney's stock


In [15]:
test[test.id == 557].text

172    Mourning notices for stabbing arson victims stir Û÷politics of griefÛª in Israel: Posters for Shira Banki and A... http://t.co/WbCtkGGTY9
Name: text, dtype: object

In [16]:
'.'.join([]) == ''

True

Mourning notices for stabbing arson victims stir ¬â√õ√∑politics of grief¬â√õ¬™ in Israel: Posters for Shira Banki and A... http://t.co/WbCtkGGTY9