In [2]:
import pandas as pd

### 1. Loading of data and initial data exploration

In [3]:
test = pd.read_csv("data/test.csv")
train = pd.read_csv("data/train.csv")

In [4]:
print(f"There are {train.shape[0]} entries in the train set")
print(f"There are {test.shape[0]} entries in the test set")
print(f"There are {test.shape[0] + train.shape[0]} total entries")


There are 7613 entries in the train set
There are 3263 entries in the test set
There are 10876 total entries


In [5]:
print(f"There are {sum(train.isna().keyword)} NAs in the keyword entries ")
print(f"There are {sum(train.isna().location)} NAs in the location entries ")


There are 61 NAs in the keyword entries 
There are 2533 NAs in the location entries 


In [6]:
num_locs = train.location.unique()
num_keywords = train.keyword.unique()
print(f"There are {len(num_locs) - 1} distinct locations")
print(f"There are {len(num_keywords) - 1} distinct keywords")


There are 3341 distinct locations
There are 221 distinct keywords


In [7]:
train.location.value_counts()[:10]

USA                104
New York            71
United States       50
London              45
Canada              29
Nigeria             28
UK                  27
Los Angeles, CA     26
India               24
Mumbai              22
Name: location, dtype: int64

In [8]:
train.keyword.value_counts()[:10]

fatalities     45
deluge         42
armageddon     42
sinking        41
damage         41
harm           41
body%20bags    41
outbreak       40
evacuate       40
fear           40
Name: keyword, dtype: int64

Some initial thoughts:
 - Keywords seem to be somewhat more useful. 
 - I see many links and many tags as well, maybe the dataset should be expanded a little bit to include information about those as well.

In [9]:
train.text[3345]

'Our thoughts are with these local residents! Time for some heavy rain!!! http://t.co/x3g2OX6K8R'

### 2. Feature Engineering

In [10]:
import re

#### 2.1 Isolate the links

In [11]:
# count how many valid links exist
a = [1 if "http://" in text else 0 for text in train.text]
b = [1 if "#" in text else 0 for text in train.text]
c = [1 if "@" in text else 0 for text in train.text]
print(f'There are {sum(a)} texts with links, {sum(b)} text with hashtags, and {sum(c)} texts with nametags')


There are 3604 texts with links, 1761 text with hashtags, and 2039 texts with nametags


In [12]:
test = "Abla laa @ #13333 http:/"

pattern_link = re.compile(r'http://\w+')
pattern_hashtag = re.compile(r'#\w+')
pattern_nametag = re.compile(r'@\w+')
match = re.search(pattern_hashtag,test)
# match = re.findall(pattern_hashtag,test)
# match.span()
print(test[0:match.span()[0]] + test[match.span()[1]:])
# len(match)


Abla laa @  http:/


In [13]:
train = pd.read_csv("data/train.csv")
train[['text_clean','hashtag','nametag','URL']] = ''

In [14]:
pattern_link = re.compile(r'http://\S+')
pattern_hashtag = re.compile(r'#\S+')
pattern_nametag = re.compile(r'@\S+')

for i, rows in train.iterrows():
    
    text = rows.text
    
    # Deal with URLs
    if len(re.findall(pattern_link, text)) > 0:
        match = re.search(pattern_link,text)
        train.loc[i,'URL'] = text[match.span()[0]:match.span()[1]]
        train.loc[i,'text_clean'] = text[0:match.span()[0]] + text[match.span()[1]:]
    else:
        train.loc[i,'URL'] = "no"
        train.loc[i,'text_clean'] = train.loc[i,'text_clean']
    
    # Deal with hashtags
    if len(re.findall(pattern_hashtag, text)) > 0:
        match = re.search(pattern_hashtag,text)
        train.loc[i,'hashtag'] = text[match.span()[0]:match.span()[1]]
    else:
        train.loc[i,'hashtag'] = "no"
    
    # Deal with nametags
    if len(re.findall(pattern_nametag, text)) > 0:
        match = re.search(pattern_nametag,text)
        train.loc[i,'nametag'] = text[match.span()[0]:match.span()[1]]
    else:
        train.loc[i,'nametag'] = "no"
    

In [15]:
train.head()

Unnamed: 0,id,keyword,location,text,target,text_clean,hashtag,nametag,URL
0,1,,,Our Deeds are the Reason of this #earthquake M...,1,,#earthquake,no,no
1,4,,,Forest fire near La Ronge Sask. Canada,1,,no,no,no
2,5,,,All residents asked to 'shelter in place' are ...,1,,no,no,no
3,6,,,"13,000 people receive #wildfires evacuation or...",1,,#wildfires,no,no
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1,,#Alaska,no,no


In [16]:
filter = train.URL != 'no'
print(train[filter].shape)

filter = train.hashtag != 'no'
print(train[filter].shape)

filter = train.nametag != 'no'
print(train[filter].shape)

(3604, 9)
(1756, 9)
(2018, 9)


In [17]:
# we are still losing a couple of hashtags
# the one is missing because it is randomly in the end, one represents number, one is
# randomly at start and two go with a phone number


train_filt = train[train.hashtag == "no"]
train_filt_2 = train_filt[[True if "#" in text else False for text in train_filt.text]]
train_filt_2.text.iloc[4]

'Beat:B2 MOTOR VEHICLE COLLISION at N 35 ST / FREMONT AV N reported on 8/5/2015 6:52 PM Call# 15000270364'

### 3. Models

In [22]:
import sklearn

#### 3.1 Logistic regression classifier

Let's start with a simple model and let's initially build a simple model that has the keyword, location and text_clean.

In [18]:
train.head()

Unnamed: 0,id,keyword,location,text,target,text_clean,hashtag,nametag,URL
0,1,,,Our Deeds are the Reason of this #earthquake M...,1,,#earthquake,no,no
1,4,,,Forest fire near La Ronge Sask. Canada,1,,no,no,no
2,5,,,All residents asked to 'shelter in place' are ...,1,,no,no,no
3,6,,,"13,000 people receive #wildfires evacuation or...",1,,#wildfires,no,no
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1,,#Alaska,no,no


In [24]:
X = train.text
y = train.target

In [28]:
from sklearn.feature_extraction.text import CountVectorizer

count_vect = CountVectorizer()
X_counts = count_vect.fit_transform(X)
X_counts.shape

(7613, 21637)