# Disaster Tweets
## Data Visualization

In [1]:
# Read the training and test sets from their CSV files
import pandas as pd
import numpy as np

df_train, df_test = (
    pd.read_csv(csv_path, delimiter=',') for csv_path in ("train.csv", "test.csv")
)

# Report (rows, columns) of each frame
for dimension_message, frame in (
    ("Training dataset dimension : {}", df_train),
    ("Test dataset dimension : {}", df_test),
):
    print(dimension_message.format(frame.shape))

Training dataset dimension : (7613, 5)
Test dataset dimension : (3263, 4)


In [2]:
# First rows of the training set
print(df_train.head())

# Column dtypes and summary statistics (numeric and non-numeric columns)
print("Data description :")
print(df_train.dtypes)
print(df_train.describe(include="all"))
print("\n")

# Percentage of missing values per column, largest first
print("Missing data :")
missing_pct_train = df_train.isnull().sum() * 100 / len(df_train)
print(missing_pct_train.round(2).sort_values(ascending=False))

   id keyword location                                               text  \
0   1     NaN      NaN  Our Deeds are the Reason of this #earthquake M...   
1   4     NaN      NaN             Forest fire near La Ronge Sask. Canada   
2   5     NaN      NaN  All residents asked to 'shelter in place' are ...   
3   6     NaN      NaN  13,000 people receive #wildfires evacuation or...   
4   7     NaN      NaN  Just got sent this photo from Ruby #Alaska as ...   

   target  
0       1  
1       1  
2       1  
3       1  
4       1  
Data description :
id           int64
keyword     object
location    object
text        object
target       int64
dtype: object
                  id     keyword location  \
count    7613.000000        7552     5080   
unique           NaN         221     3341   
top              NaN  fatalities      USA   
freq             NaN          45      104   
mean     5441.934848         NaN      NaN   
std      3137.116090         NaN      NaN   
min         1.000000  

### Keyword

In [3]:
pd.set_option('display.max_rows', 500)
# Share of disaster tweets (mean of the 0/1 target) per keyword, highest first.
# Only the `target` column is aggregated: averaging the whole frame would also
# try to average the text columns, which raises a TypeError on pandas >= 2.0.
df_train.groupby('keyword')[['target']].mean().sort_values(by='target', ascending=False)

Unnamed: 0_level_0,target
keyword,Unnamed: 1_level_1
debris,1.0
wreckage,1.0
derailment,1.0
outbreak,0.975
oil%20spill,0.973684
typhoon,0.973684
suicide%20bombing,0.969697
suicide%20bomber,0.967742
bombing,0.931034
rescuers,0.914286


## Data engineering

In [4]:
# We extract the targets
targets = df_train['target']
print("Targets dataset dimension : {}".format(targets.shape))
# Work on a label-free copy instead of dropping in place: df_train keeps its
# `target` column, so this cell can be re-run without a KeyError.
# (`columns=` replaces the positional axis argument removed in pandas 2.0.)
df_train_features = df_train.drop(columns=['target'])

# Data combination: training rows first, test rows after them, so the first
# N_train rows of every matrix derived from df_combined are the training rows.
N_train = df_train_features.shape[0]
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
# reset_index() keeps each row's position in its original frame in an extra
# `index` column.
df_combined = pd.concat([df_train_features, df_test]).reset_index()
print("Combined dataset dimension : {}".format(df_combined.shape))

Targets dataset dimension : (7613,)
Combined dataset dimension : (10876, 5)


In [5]:
# First rows of the combined (train + test) frame
print(df_combined.head())

# Column dtypes and summary statistics (numeric and non-numeric columns)
print("Data description :")
print(df_combined.dtypes)
print(df_combined.describe(include="all"))
print("\n")

# Percentage of missing values per column, largest first
print("Missing data :")
missing_pct_combined = df_combined.isnull().sum() * 100 / len(df_combined)
print(missing_pct_combined.round(2).sort_values(ascending=False))

   index  id keyword location  \
0      0   1     NaN      NaN   
1      1   4     NaN      NaN   
2      2   5     NaN      NaN   
3      3   6     NaN      NaN   
4      4   7     NaN      NaN   

                                                text  
0  Our Deeds are the Reason of this #earthquake M...  
1             Forest fire near La Ronge Sask. Canada  
2  All residents asked to 'shelter in place' are ...  
3  13,000 people receive #wildfires evacuation or...  
4  Just got sent this photo from Ruby #Alaska as ...  
Data description :
index        int64
id           int64
keyword     object
location    object
text        object
dtype: object
               index            id   keyword location  \
count   10876.000000  10876.000000     10789     7238   
unique           NaN           NaN       221     4521   
top              NaN           NaN  upheaval      USA   
freq             NaN           NaN        50      141   
mean     3153.460004   5437.500000       NaN      NaN   
s

## Keyword

In [6]:
# One-hot encode the keyword column: one `keyword_<value>` column per distinct keyword
keyword_dummies = pd.get_dummies(df_combined['keyword'], prefix='keyword')

# The feature frame holds only the indicator columns (no raw `keyword` column)
df_combined_keyword = keyword_dummies.copy()

# Plain array form, used when stacking feature blocks
data_combined_keyword = df_combined_keyword.to_numpy()

## Location

In [7]:
# Top location occurrences
k = 100  # number of most frequent locations to keep
raw_location_counts = df_combined['location'].value_counts()
top_k_locations = raw_location_counts.iloc[:k].sort_values(ascending=False)
pd.set_option('display.max_rows', 200)
print(top_k_locations)

USA                               141
New York                          109
United States                      65
London                             58
Canada                             42
Nigeria                            40
India                              35
Worldwide                          35
Los Angeles, CA                    34
UK                                 33
Kenya                              32
Washington, DC                     31
Mumbai                             28
United Kingdom                     26
California                         25
Australia                          25
Los Angeles                        24
San Francisco                      23
Chicago, IL                        23
New York, NY                       22
Florida                            21
California, USA                    21
NYC                                21
San Francisco, CA                  20
Chicago                            17
Toronto                            16
Washington, 

In [8]:
# We extract the location before the comma ("Los Angeles, CA" -> "Los Angeles")
df_combined['location_cleansed'] = df_combined['location'].str.split(',').str[0]

# New top k (value_counts already returns the counts in descending order)
cleansed_location_counts = df_combined['location_cleansed'].value_counts()
top_k_location_values = cleansed_location_counts.iloc[:k]
pd.set_option('display.max_rows', 200)
print(top_k_location_values)

# Other locations to set as NaN.
# Series.where keeps the values that are in the top k and puts a real missing
# value everywhere else. The previous code wrote the *string* "NaN", which
# isnull() does not see and which get_dummies turns into its own
# `location_cleansed_NaN` category. isin is also a hash lookup, far cheaper
# than replace() with thousands of keys.
bottom_k_location_keys = cleansed_location_counts.index[k:].to_numpy()
is_top_k_location = df_combined['location_cleansed'].isin(top_k_location_values.index)
df_combined['location_cleansed'] = df_combined['location_cleansed'].where(is_top_k_location)

print(df_combined.describe(include="all"))

USA                               147
New York                          143
London                             89
Los Angeles                        68
United States                      66
Washington                         61
California                         53
Chicago                            50
San Francisco                      44
Canada                             43
Nigeria                            43
Mumbai                             37
Worldwide                          35
UK                                 35
India                              35
Calgary                            33
Kenya                              32
Seattle                            32
Toronto                            32
United Kingdom                     27
Florida                            27
Denver                             26
Atlanta                            25
Australia                          25
NYC                                24
Manchester                         24
Texas       

In [9]:
# One-hot encode the cleansed location: one `location_cleansed_<value>` column per value
location_cleansed_dummies = pd.get_dummies(
    df_combined['location_cleansed'], prefix='location_cleansed'
)

# The feature frame holds only the indicator columns (no raw `location_cleansed` column)
df_combined_location_cleansed = location_cleansed_dummies.copy()

print(df_combined_location_cleansed.describe(include="all"))
# Plain array form, used when stacking feature blocks
data_combined_location_cleansed = df_combined_location_cleansed.to_numpy()

       location_cleansed_ Road to the Billionaires Club  \
count                                      10876.000000   
mean                                           0.000919   
std                                            0.030310   
min                                            0.000000   
25%                                            0.000000   
50%                                            0.000000   
75%                                            0.000000   
max                                            1.000000   

       location_cleansed_304  location_cleansed_??????  \
count           10876.000000              10876.000000   
mean                0.001011                  0.000828   
std                 0.031788                  0.028756   
min                 0.000000                  0.000000   
25%                 0.000000                  0.000000   
50%                 0.000000                  0.000000   
75%                 0.000000                  0.000000   
max 

## Text

In [10]:
from sklearn.feature_extraction.text import CountVectorizer
from nltk import wordpunct_tokenize          
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.corpus import words
from string import punctuation

class LemmaTokenizer(object):
    """Callable tokenizer: split a document, filter the tokens, lemmatize the rest.

    Instances are meant to be passed as ``tokenizer=`` to ``CountVectorizer``.

    Parameters
    ----------
    remove_non_words : bool, default True
        When true, drop every token that is not in the NLTK ``words`` corpus.
    min_length : int, default 4
        Minimum number of characters a token must have to be kept. The default
        reproduces the previous hard-coded ``len(word) > 3`` filter (which the
        old comment wrongly described as removing 1-character words).
    """
    def __init__(self, remove_non_words=True, min_length=4):
        self.wnl = WordNetLemmatizer()
        self.stopwords = set(stopwords.words('english'))
        self.words = set(words.words())
        self.remove_non_words = remove_non_words
        self.min_length = min_length

    def _keep(self, word):
        """Return True when ``word`` passes every filter and should be lemmatized."""
        # Compare in lower case so capitalised stop words are also caught when
        # the tokenizer is called on text that was not lower-cased beforehand.
        if word.lower() in self.stopwords:
            return False
        # Optional dictionary filter
        if self.remove_non_words and word not in self.words:
            return False
        # Drop short tokens and anything that is not purely alphabetic
        # (numbers, punctuation runs such as "://")
        return len(word) >= self.min_length and word.isalpha()

    def __call__(self, doc):
        """Tokenize ``doc`` and return the lemmas of the tokens that pass the filters.

        Parameters
        ----------
        doc : str
            Text of one document.

        Returns
        -------
        list of str
            Lemmatized tokens, in document order.
        """
        # wordpunct_tokenize separates words from punctuation; a single pass
        # then filters and lemmatizes.
        return [self.wnl.lemmatize(word) for word in wordpunct_tokenize(doc) if self._keep(word)]
# Processing: bag-of-words counts for every tweet, tokenized by LemmaTokenizer
tweet_tokenizer = LemmaTokenizer(remove_non_words=False)
countvect = CountVectorizer(tokenizer=tweet_tokenizer)
word_count = countvect.fit_transform(df_combined['text'])
# Reverse lookup: feature (column) index -> vocabulary term
feat2word = dict(zip(countvect.vocabulary_.values(), countvect.vocabulary_.keys()))
print("Number of words:", len(feat2word))

Number of words: 17404


In [11]:
# Show one tweet next to its bag-of-words representation

n = 59 # Tweet number

print("Original text")
text = df_combined['text'][n]
print(text)

print("Dictionary")
# Columns holding a non-zero count for tweet n, mapped back to their terms
count_row = word_count[n, :]
_, count_columns = count_row.nonzero()
textBagOfWords = {feat2word[column]: word_count[n, column] for column in count_columns}
print(textBagOfWords)

print("Vector")
print(count_row)

Original text
How the West was burned: Thousands of wildfires ablaze in #California alone http://t.co/iCSjGZ9tE1 #climate #energy http://t.co/9FxmN0l0Bd
Dictionary
{'wildfire': 1, 'california': 1, 'ablaze': 1, 'http': 2, 'west': 1, 'burned': 1, 'thousand': 1, 'alone': 1, 'climate': 1, 'energy': 1}
Vector
  (0, 16780)	1
  (0, 2177)	1
  (0, 42)	1
  (0, 7056)	2
  (0, 16669)	1
  (0, 2060)	1
  (0, 15291)	1
  (0, 453)	1
  (0, 2800)	1
  (0, 4800)	1


In [12]:
# Re-weight the raw counts by term frequency-inverse document frequency
from sklearn.feature_extraction.text import TfidfTransformer
tfidf_transformer = TfidfTransformer()
tfidf = tfidf_transformer.fit_transform(word_count)

In [13]:
# Show one tweet next to its tf-idf representation

n = 0 # Tweet number

print("Original text")
text = df_combined['text'][n]
print(text)

print("Dictionary")
# Columns holding a non-zero weight for tweet n, mapped back to their terms
weight_row = tfidf[n, :]
_, weight_columns = weight_row.nonzero()
textBagOfWords = {feat2word[column]: tfidf[n, column] for column in weight_columns}
print(textBagOfWords)

print("Vector")
print(weight_row)

Original text
Our Deeds are the Reason of this #earthquake May ALLAH Forgive us all
Dictionary
{'reason': 0.37458579040936646, 'forgive': 0.5014479476131515, 'earthquake': 0.3507491579786775, 'deed': 0.530941719553609, 'allah': 0.45090060997148146}
Vector
  (0, 12322)	0.37458579040936646
  (0, 5650)	0.5014479476131515
  (0, 4541)	0.3507491579786775
  (0, 3752)	0.530941719553609
  (0, 422)	0.45090060997148146


In [14]:
from scipy.sparse import csr_matrix

# Feature matrix used by the models: the tf-idf weights only.
#
# The matrix is kept sparse. The previous version rebound `tfidf` to a dense
# array of about 10876 x 17404 values (on the order of a gigabyte), built two
# even larger concatenations with the keyword and location dummies, and then
# discarded them with `data_combined = tfidf`. Rebinding `tfidf` also meant
# that this cell and the tf-idf preview cell failed when re-run.
#
# csr_matrix(...) accepts a sparse or dense input, so this cell is safe to
# re-run. Row slicing and the estimators used below accept CSR input.
# To add the keyword / location dummies later, stack them with
# scipy.sparse.hstack rather than densifying the tf-idf matrix.
data_combined = csr_matrix(tfidf)
print(data_combined.shape)

(10876, 17404)


In [15]:
from sklearn.model_selection import train_test_split
# The first N_train rows are the labelled training tweets, the remainder the test tweets
train, test = data_combined[:N_train], data_combined[N_train:]
# Hold out 30% of the labelled rows for evaluation (fixed seed)
X_train, X_test, y_train, y_test = train_test_split(
    train,
    targets,
    random_state=5,
    test_size=0.3,
)

In [16]:
# Candidate classifiers. Every estimator is instantiated so it can be added to
# `Models` easily; only the ones listed in `Models` are cross-validated below.
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.neural_network import MLPClassifier
from sklearn import svm

# Estimators with a random component get a fixed seed (the same value as the
# hold-out split) so that scores are reproducible from run to run.

# Random forest
rf = RandomForestClassifier(n_estimators=200, criterion='entropy', random_state=5)

# Log reg
lr_classifier = LogisticRegression(penalty='l2', C=1., solver='saga', random_state=5)

# Multinomial NB
mnb = MultinomialNB()

# NN: one hidden layer of 25 units. `(25,)` is a 1-tuple; the previous `(25)`
# was just the integer 25 in parentheses.
nn = MLPClassifier(hidden_layer_sizes=(25,), solver='sgd', max_iter=2000,
                   learning_rate_init=0.01, random_state=5)

# SVM Linear
svm_lin = svm.SVC(kernel='linear', C=1.)

# SVM RBF
svm_rbf = svm.SVC(kernel='rbf')

# Models that are actually evaluated (rf, nn, svm_lin and svm_rbf are left out)
Models = [lr_classifier, mnb]

In [17]:
import random
import numpy as np
from sklearn.model_selection import StratifiedKFold

# Cross validation for each model
def learn(models, N, X=None, y=None, random_state=5):
    """Cross-validate each model with stratified K-fold and print its scores.

    For every model this prints the model, one ``*`` per completed fold, then
    the mean and standard deviation of ``model.score`` over the folds.

    Parameters
    ----------
    models : iterable
        Estimators exposing ``fit(X, y)`` and ``score(X, y)``. They are fitted
        in place; after the call each one holds the fit of its last fold.
    N : int
        Number of folds.
    X : array-like or sparse matrix, optional
        Feature matrix supporting row selection by an integer array. Defaults
        to the notebook-level ``train``.
    y : array-like, optional
        Labels aligned with the rows of ``X``. Defaults to the notebook-level
        ``targets``.
    random_state : int or None, default 5
        Seed for the fold shuffling. ``None`` restores the previous behaviour
        of a different shuffle on every call.

    Returns
    -------
    None
    """
    features = train if X is None else X
    # Plain array so the fold indices are positional: indexing a pandas Series
    # with an integer array looks up labels, which is only correct when the
    # index is 0..n-1.
    labels = np.asarray(targets if y is None else y)

    Kfold = StratifiedKFold(n_splits=N, shuffle=True, random_state=random_state)
    # Build the folds once so that every model is scored on the same splits
    # and the averages are directly comparable.
    folds = list(Kfold.split(features, labels))

    for model in models:
        print(model)
        score = []
        for train_index, test_index in folds:
            Xtrain, Xtest = features[train_index], features[test_index]
            ytrain, ytest = labels[train_index], labels[test_index]
            model.fit(Xtrain, ytrain)
            score.append(model.score(Xtest, ytest))
            print('*', end='')
        print(" done!")
        print("Average generalization score:", np.mean(score))
        print("Standard deviation:", np.std(score))
        print()
        
# Cross-validate every estimator in Models with 4 stratified folds
learn(Models,4)

LogisticRegression(solver='saga')
**** done!
Average generalization score: 0.7962693386603196
Standard deviation: 0.0056590043918633365

MultinomialNB()
**** done!
Average generalization score: 0.79824011843308
Standard deviation: 0.010109295357390545



In [18]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report

# Hyperparameters of log reg (np.arange(1, 2, 1) yields the single value C=1)
parameters = {
    'solver': ['saga'],
    'penalty': ['l2'],
    'C': np.arange(1, 2, 1),
}
lr_classifier = LogisticRegression(solver='saga')
clf = GridSearchCV(lr_classifier, parameters, verbose=1)
clf.fit(X_train, y_train)

print("Best parameters set found on development set:")
print()
print(clf.best_params_)
print()

# Mean score and spread (two standard deviations) of every candidate
print("Grid scores on development set:")
print()
cv_results = clf.cv_results_
means = cv_results['mean_test_score']
stds = cv_results['std_test_score']
for mean, std, params in zip(means, stds, cv_results['params']):
    score_line = "%0.3f (+/-%0.03f) for %r" % (mean, std * 2, params)
    print(score_line)
print()

# Evaluate the fitted search on the held-out split
print("Detailed classification report:")
print()
print("The model is trained on the full development set.")
print("The scores are computed on the full evaluation set.")
print()
y_true = y_test
y_pred = clf.predict(X_test)
print(classification_report(y_true, y_pred))
print()

Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  1.7min finished


Best parameters set found on development set:

{'C': 1, 'penalty': 'l2', 'solver': 'saga'}

Grid scores on development set:

0.795 (+/-0.019) for {'C': 1, 'penalty': 'l2', 'solver': 'saga'}

Detailed classification report:

The model is trained on the full development set.
The scores are computed on the full evaluation set.

              precision    recall  f1-score   support

           0       0.76      0.90      0.83      1304
           1       0.83      0.62      0.71       980

    accuracy                           0.78      2284
   macro avg       0.80      0.76      0.77      2284
weighted avg       0.79      0.78      0.78      2284




## Submission

In [19]:
# Fit the final logistic regression on all labelled rows
# NOTE(review): C=2. is used here while the grid search only evaluated C=1 — confirm this is intended
lr_classifier = LogisticRegression(solver='saga', C=2., penalty='l2')
lr_classifier.fit(train, targets)
# Predict the labels of the unlabelled test rows
y_pred = lr_classifier.predict(test)

In [20]:
# Submission frame: the test ids next to the predicted target
dfypred = pd.Series(y_pred, name='target').to_frame()
submission_columns = [df_test['id'], dfypred]
dfsubmission = pd.concat(submission_columns, axis=1)

# Write the submission file without the row index
dfsubmission.to_csv('out.csv', index = False)