# Creating Classification Model

### Labeling text message data

### Training a classifier on labeled text message data

### Using the classifier to predict the urgency of tweet data

### Imports

In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from nltk.tokenize import RegexpTokenizer
from nltk import WordNetLemmatizer

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB

from sklearn.metrics import confusion_matrix
    
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

### Reading in Data

In [6]:
train = pd.read_csv('datasets/disaster_response_messages_training.csv')
test = pd.read_csv('datasets/disaster_response_messages_test.csv')

  interactivity=interactivity, compiler=compiler, result=result)


In [7]:
train.head();

## Initial EDA

In [4]:
train.shape

(21046, 42)

In [5]:
train.columns;

Index(['id', 'split', 'message', 'original', 'genre', 'related', 'PII',
       'request', 'offer', 'aid_related', 'medical_help', 'medical_products',
       'search_and_rescue', 'security', 'military', 'child_alone', 'water',
       'food', 'shelter', 'clothing', 'money', 'missing_people', 'refugees',
       'death', 'other_aid', 'infrastructure_related', 'transport',
       'buildings', 'electricity', 'tools', 'hospitals', 'shops',
       'aid_centers', 'other_infrastructure', 'weather_related', 'floods',
       'storm', 'fire', 'earthquake', 'cold', 'other_weather',
       'direct_report'],
      dtype='object')

In [1]:
# Checking column distributions

# for i in train.columns[4:]:
    #print(i)
    #print(train[i].value_counts(normalize = True))
    #print('')

In [7]:
# Dropping columns with heavily skewed distributions

train = train.drop(columns = ['id', 'split', 'original', 'tools', 'child_alone', 'PII'])

In [8]:
train.columns[3:35]

Index(['request', 'offer', 'aid_related', 'medical_help', 'medical_products',
       'search_and_rescue', 'security', 'military', 'water', 'food', 'shelter',
       'clothing', 'money', 'missing_people', 'refugees', 'death', 'other_aid',
       'infrastructure_related', 'transport', 'buildings', 'electricity',
       'hospitals', 'shops', 'aid_centers', 'other_infrastructure',
       'weather_related', 'floods', 'storm', 'fire', 'earthquake', 'cold',
       'other_weather'],
      dtype='object')

#### Creating a target variable

Based on the presence of certain features and a message being direct (not news)

In [9]:
need_help = []
for row in range(train.shape[0]):
    val = 0
    for i in train.columns[3:35]:
        if train[i][row] == 1 and train['genre'][row] == 'direct':
            val = 1
    need_help.append(val)
print(len(need_help))
    

21046


In [10]:
train['need_help'] = need_help

In [11]:
train['need_help'].value_counts(normalize = True)

0    0.804761
1    0.195239
Name: need_help, dtype: float64

#### Validating model through example messages

Messages categorized as urgent

In [12]:
train[train['need_help'] == 1].sample(10)['message']

707     We need food, water, medicines. Thank you. We ...
8604    canned and fresh food ( apples and veggies ) d...
4360    We are in Carrefour Airport streets every day,...
5408    HI,I NEVER FIND ANYTHING TO SURVIVE WITH MY SE...
8729    I have baby blankets , diapers , formula , war...
2755    Is it possible to know how many people died du...
5795    they gave me some tablets which can treat wate...
1219    Im Haitian and i've lost everything i own. I d...
3940    On Dumas we are hungry - in the Croix des Bouq...
2542    We are tired of sending messages. They do not ...
Name: message, dtype: object

In [13]:
train[train['need_help'] == 1]['message'][527]

'we live in Fontamara 27. We have problems with lack of food and shelter. please help us'

In [14]:
train[train['need_help'] == 1]['message'][540]

"I am asking for your help please, because we can't no longer take this. Come rescue us, we can't find any help. Just stop by and come see us. I am waiting and I thamk you in advance"

In [15]:
train[train['need_help'] == 1]['message'][3814]

'We are located in the first section of PetitBois, commune of Croix des Bouquets, we are in needs of food,tents in Dume, waiting for your help. '

In [16]:
train[train['need_help'] == 1]['message'][9053]

'I have jackets , hoodies , baby formula , baby bottles and new nipples for bottles , rice cereal for babies , toys for babies and toddlers , towels , baby clothes and dried food'

### Message processing

- Tokenizing
- Lemmatizing


In [8]:
# Creating functions for tokenizing and lemmatizing

# Processing message data

def tokenize(x):
    tokenizer = RegexpTokenizer(r'\w+')
    return tokenizer.tokenize(x)

train['tokens'] = train['message'].map(tokenize)
    
def lemmatize(x):
    lemmatizer = WordNetLemmatizer()
    return ' '.join([lemmatizer.lemmatize(word) for word in x])

train['lemma'] = train['tokens'].map(lemmatize)

In [18]:
train.sample(10)

Unnamed: 0,message,genre,related,request,offer,aid_related,medical_help,medical_products,search_and_rescue,security,...,floods,storm,fire,earthquake,cold,other_weather,direct_report,need_help,tokens,lemma
10609,"According to the BNGRC's Soa, ""Ivan passed tho...",news,1,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,"[According, to, the, BNGRC, s, Soa, Ivan, pass...",According to the BNGRC s Soa Ivan passed thoug...
17342,Our own domestic disaster response model can s...,news,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"[Our, own, domestic, disaster, response, model...",Our own domestic disaster response model can s...
2725,"From where I am lying, I cannot walk to Delmas...",direct,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"[From, where, I, am, lying, I, cannot, walk, t...",From where I am lying I cannot walk to Delmas ...
12459,"In this Republic, the rice and potato crop vir...",news,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"[In, this, Republic, the, rice, and, potato, c...",In this Republic the rice and potato crop virt...
10143,People braving the early stages of the storm. ...,social,1,0,0,0,0,0,0,0,...,0,1,0,0,0,0,1,0,"[People, braving, the, early, stages, of, the,...",People braving the early stage of the storm hu...
6321,"Cyclon! but we don't have, we are in trouble",direct,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"[Cyclon, but, we, don, t, have, we, are, in, t...",Cyclon but we don t have we are in trouble
11754,The association has set up a service consistin...,news,1,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,"[The, association, has, set, up, a, service, c...",The association ha set up a service consisting...
5710,"iv, santo 6 I would like us me messenger the g...",direct,1,1,0,1,0,0,0,0,...,0,0,0,0,0,0,1,1,"[iv, santo, 6, I, would, like, us, me, messeng...",iv santo 6 I would like u me messenger the goo...
2215,We are almost dead at Impasse Mousin ( mouzen ...,direct,1,1,0,1,0,0,0,0,...,0,0,0,0,0,0,1,1,"[We, are, almost, dead, at, Impasse, Mousin, m...",We are almost dead at Impasse Mousin mouzen of...
14957,Tractors and Oxen: The use of tractors continu...,news,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"[Tractors, and, Oxen, The, use, of, tractors, ...",Tractors and Oxen The use of tractor continues...


### Vectorizing before modelling (Tfidf)

In [19]:
tf = TfidfVectorizer(ngram_range=(1, 2), stop_words='english')
tf.fit_transform(train['lemma'])

<21046x215177 sparse matrix of type '<class 'numpy.float64'>'
	with 524279 stored elements in Compressed Sparse Row format>

## Modeling

### Select model and gridsearch parameters according to preference

Random forest, logistic regression, naive bayes are given in the notebook.

### Random Forest Gridsearch

In [20]:
# Creating X and y for training data (split already done)

X = train['lemma']
y = train['need_help']

In [None]:
# Random forest gridsearch

rf = RandomForestClassifier()


pgrid_rf = {
    'tf__max_features' : [2000, 3000, 5000],
    'tf__stop_words' : ['english', None],
    'tf__ngram_range' : [(1,2)],
    'tf__use_idf' : [True, False],
    'rf__n_estimators' : [5, 10, 35],
    'rf__max_depth' : [4, 5, 6],
    'rf__max_features' : [None, 3, 6]
}

pipe_rf = Pipeline(steps = [('tf', TfidfVectorizer()), ('rf', RandomForestClassifier())])

gs_rf = GridSearchCV(pipe_rf, pgrid_rf, cv = 5, n_jobs = -1, verbose=1)

gs_rf.fit(X, y)

gs_rf.score(X, y)

### Logistic Regression for coefficients


In [53]:
tf = TfidfVectorizer(stop_words='english', ngram_range=(1,2), min_df=.01)

lr = LogisticRegression()

X_lr = tf.fit_transform(X)

lr.fit(X_lr, y)

# Creating a dataframe from exponentiated coeficients and vectorizer vocabulary


coefs = pd.DataFrame(np.exp(lr.coef_), columns=tf.vocabulary_).T




coefs.columns = ['coef']

coefs.sort_values(by = 'coef', ascending=False)[0:10]

Unnamed: 0,coef
home,50.185096
need help,29.974784
near,27.524558
important,23.202539
national,22.495263
damage,20.193607
province,12.800056
according,12.768996
flood,11.908526
condition,10.770423


### Reading in tweet data

In [21]:
tweets = pd.read_csv('datasets/df_combined.csv')

  interactivity=interactivity, compiler=compiler, result=result)


In [22]:
tweets.head()

Unnamed: 0,date,id,tweet
0,2017-08-13,8.96827e+17,Gert could become a quite intense post-tropica...
1,2017-08-14,8.97013e+17,"Weather Street: Tropical Storm Harvey, Hurrica..."
2,2017-08-14,8.97087e+17,Tropical Storm #Gert intensifying. Tropical St...
3,2017-08-14,8.97088e+17,Tropical Storm #Gert intensifying. Tropical St...
4,2017-08-14,8.97088e+17,"RT YourNews15 ""Tropical Storm #Gert intensifyi..."


### Processing tweet data

In [23]:
tweets['tokens'] = tweets['tweet'].map(tokenize)
    
tweets['lemma'] = tweets['tokens'].map(lemmatize)

### Predicting based on tweet data

#### Random forest predictions

In [24]:
X_tweets = tweets['lemma']

In [None]:
preds = gs_rf.predict(X_tweets)

tweets['need_help'] = preds

tweets['need_help'].value_counts()

In [None]:
tweets[tweets['need_help'] == 1].sample(10)['tweet']

In [None]:
tweets[['need_help']].to_csv('labels.csv')

### Multinomial Naive Bayes 

In [25]:
pipe_nb = Pipeline(steps = [('tf', TfidfVectorizer()), ('nb', MultinomialNB())])

pgrid_nb = {
    'tf__max_features' : [2000, 3000, 5000],
    'tf__stop_words' : ['english', None],
    'tf__ngram_range' : [(1,2), (1,2)],
    'tf__use_idf' : [True, False],
    'nb__alpha' : [0.1, 0.5, 1]
}

gs_nb = GridSearchCV(pipe_nb,pgrid_nb,cv=5,n_jobs=-1, verbose=1)

gs_nb.fit(X, y)

Fitting 5 folds for each of 72 candidates, totalling 360 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   25.9s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:  1.8min
[Parallel(n_jobs=-1)]: Done 360 out of 360 | elapsed:  3.7min finished


GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=Pipeline(memory=None,
                                steps=[('tf',
                                        TfidfVectorizer(analyzer='word',
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.float64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=True,
                                                        max_df=1.0,
                                                        max_features=None,
                                                        min_df=1,
                                                        ngram_range=(1, 1),
                                            

In [26]:
gs_nb.score(X, y)

0.9004086287180462

### Naive Bayes predictions on tweet data

In [29]:
nb_preds = gs_nb.predict(X_tweets)

tweets['nb_label'] = nb_preds


tweets[tweets['nb_label'] == 1].sample(10)['tweet']

245883    Anybody know if there is anyway to donate mone...
87960     My power just went out In Katy,  tx and we hav...
266573    There are literally people dying from #Hurrica...
265757     #WakeUpAmerica \n\n#PresidentTrump is doing e...
83590     Anyone know where they have Fiji Water? I can'...
249344    please donate to an org that helps victims of ...
122814    They really need to start giving these hurrica...
61405     Weight gained during #hurricaneharvey, 2lbs an...
63338     God bless #Texas. Texans are a resilient bunch...
247275    But will @TheNotoriousMMA and @FloydMayweather...
Name: tweet, dtype: object

### Checking for proper classification

In [30]:
# Actually needs help but no address
tweets['tweet'][266083]

'PLEASE HELP FIND HIM @RedCross #HurricaneHarvey #ElijahGriffin #5Yearsold #Houston '

In [31]:
# Needs help but not urgent. Incomplete location
tweets['tweet'][70478]

'No electricity, but lots of ice, canned food and water... #HurricaneHarvey safe in Ingleside, TX'

In [32]:
# Needs help and provides address
tweets['tweet'][267654 ]

"I've got a scared friend at 4724 Amalie St. please send help @KHOU @houstonpolice @abc13houston @HoustonTX  "

In [33]:
# Not properly classified- joke
tweets['tweet'][36323]

"Me: *texts food and necessities we need for the weekend*\r\nMom: Okay but just so you know, I GOT SANGRIA SO WE'RE GOOD! #HurricaneHarvey"

In [None]:
pd.DataFrame(nb_preds, columns = ['nb_label']).to_csv('nb_labels.csv')

### Twitter word counts

**Proceed with caution**

Creating a vocabulary dataframe will lead to a MemoryError (personal machine has 12 gb available memory)

In [None]:
cvec = CountVectorizer(ngram_range=(1,2), stop_words='english', max_df=.9)
cvec.fit(tweets['lemma'])

In [2]:
#cvec.vocabulary_.keys()

In [None]:
pd.DataFrame(cvec.vocabulary_, index= range(len(cvec.vocabulary_.keys())))