In [1]:
import os
import gc
import re
import string
import operator
import numpy as np # Linear algebra
import pandas as pd # Data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import tensorflow as tf
import tensorflow_hub as hub
from tensorflow import keras
import matplotlib.pyplot as plt
from wordcloud import STOPWORDS
from collections import defaultdict
from tensorflow.keras.optimizers import SGD, Adam
from tensorflow.keras.models import Model, Sequential
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.model_selection import StratifiedKFold, StratifiedShuffleSplit
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping, Callback
from tensorflow.keras.layers import Dense, Input, Dropout, GlobalAveragePooling1D
from sklearn import feature_extraction, linear_model, model_selection, preprocessing
import tokenization
SEED = 1337

In [2]:
pwd

'/root/capsule/code'

In [3]:
# Reading the training and testing dataset
train_df = pd.read_csv("/root/capsule/data/train.csv")
test_df = pd.read_csv("/root/capsule/data/test.csv")

In [4]:
print('There are {} rows and {} columns in training data frame\nTraining data frame memory usage is {:.2f} MB'.format(train_df.shape[0], train_df.shape[1], train_df.memory_usage().sum() / 1024**2))

There are 7613 rows and 5 columns in training data frame
Training data frame memory usage is 0.29 MB


In [5]:
print('There are {} rows and {} columns in testing data frame\nTesting data frame memory usage is {:.2f} MB'.format(test_df.shape[0], test_df.shape[1], test_df.memory_usage().sum() / 1024**2))

There are 3263 rows and 4 columns in testing data frame
Testing data frame memory usage is 0.10 MB


In [6]:
train_df.head(10)

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1
5,8,,,#RockyFire Update => California Hwy. 20 closed...,1
6,10,,,#flood #disaster Heavy rain causes flash flood...,1
7,13,,,I'm on top of the hill and I can see a fire in...,1
8,14,,,There's an emergency evacuation happening now ...,1
9,15,,,I'm afraid that the tornado is coming to our a...,1


In [7]:
count_vectorizer = feature_extraction.text.CountVectorizer()

In [8]:
# Getting counts for the first 5 tweets in the data
example_train_vectors = count_vectorizer.fit_transform(train_df["text"][0:5])
#.todense() here because these vectors are "sparse" (only non-zero elements are kept to save space)
print(example_train_vectors[0].todense().shape)
print(example_train_vectors[0].todense())

(1, 54)
[[0 0 0 1 1 1 0 0 0 0 0 0 1 1 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 1 0
  0 0 0 1 0 0 0 0 0 0 0 0 0 1 1 0 1 0]]


In [9]:
# Creating vectors for all of the tweets
train_vectors = count_vectorizer.fit_transform(train_df["text"])
# note that we're NOT using .fit_transform() here. Using just .transform() makes sure
# that the tokens in the train vectors are the only ones mapped to the test vectors - 
# i.e. that the train and test vectors use the same set of tokens.
test_vectors = count_vectorizer.transform(test_df["text"])

In [10]:
clf = linear_model.RidgeClassifier()

In [11]:
scores = model_selection.cross_val_score(clf, train_vectors, train_df["target"], cv=3, scoring="f1")
scores

array([0.59421842, 0.56498283, 0.64113893])

In [12]:
clf.fit(train_vectors, train_df["target"])

RidgeClassifier(alpha=1.0, class_weight=None, copy_X=True, fit_intercept=True,
                max_iter=None, normalize=False, random_state=None,
                solver='auto', tol=0.001)

In [13]:
sample_submission = pd.read_csv("/root/capsule/data/sample_submission.csv")

In [14]:
sample_submission["target"] = clf.predict(test_vectors)

In [15]:
sample_submission.head()

Unnamed: 0,id,target
0,0,0
1,2,1
2,3,1
3,9,0
4,11,1


In [16]:
sample_submission.to_csv("submission.csv", index=False)

In [85]:
train_df['word_count'] = train_df['text'].apply(lambda x: len(str(x).split()))
test_df['word_count'] = test_df['text'].apply(lambda x: len(str(x).split()))

# unique_word_count
train_df['unique_word_count'] = train_df['text'].apply(lambda x: len(set(str(x).split())))
test_df['unique_word_count'] = test_df['text'].apply(lambda x: len(set(str(x).split())))

# stop_word_count
train_df['stop_word_count'] = train_df['text'].apply(lambda x: len([w for w in str(x).lower().split() if w in STOPWORDS]))
test_df['stop_word_count'] = test_df['text'].apply(lambda x: len([w for w in str(x).lower().split() if w in STOPWORDS]))

# url_count
train_df['url_count'] = train_df['text'].apply(lambda x: len([w for w in str(x).lower().split() if 'http' in w or 'https' in w]))
test_df['url_count'] = test_df['text'].apply(lambda x: len([w for w in str(x).lower().split() if 'http' in w or 'https' in w]))

# mean_word_length
train_df['mean_word_length'] = train_df['text'].apply(lambda x: np.mean([len(w) for w in str(x).split()]))
test_df['mean_word_length'] = test_df['text'].apply(lambda x: np.mean([len(w) for w in str(x).split()]))

# char_count
train_df['char_count'] = train_df['text'].apply(lambda x: len(str(x)))
test_df['char_count'] = test_df['text'].apply(lambda x: len(str(x)))

# punctuation_count
train_df['punctuation_count'] = train_df['text'].apply(lambda x: len([c for c in str(x) if c in string.punctuation]))
test_df['punctuation_count'] = test_df['text'].apply(lambda x: len([c for c in str(x) if c in string.punctuation]))

# hashtag_count
train_df['hashtag_count'] = train_df['text'].apply(lambda x: len([c for c in str(x) if c == '#']))
test_df['hashtag_count'] = test_df['text'].apply(lambda x: len([c for c in str(x) if c == '#']))

# mention_count
train_df['mention_count'] = train_df['text'].apply(lambda x: len([c for c in str(x) if c == '@']))
test_df['mention_count'] = test_df['text'].apply(lambda x: len([c for c in str(x) if c == '@']))

In [86]:
# Concatenate pandas objects along a particular axis with optional set logic along the other axes.
df=pd.concat([train_df,test_df])
df.shape

(10876, 14)

In [19]:
# Removing The URLs from the training and testing dataset
def remove_URL(text):
    url = re.compile(r'https?://\S+|www\.\S+')
    return url.sub(r'',text)

In [20]:
df['text']=df['text'].apply(lambda x : remove_URL(x))

In [21]:
# Removing HTML tags from the training and testing dataset
def remove_html(text):
    html=re.compile(r'<.*?>')
    return html.sub(r'',text)

In [22]:
df['text']=df['text'].apply(lambda x : remove_html(x))

In [23]:
# Removing the emojis from the traing and testing dataset
# Reference : https://gist.github.com/slowkow/7a7f61f495e3dbb7e3d767f97bd7304b
def remove_emoji(text):
    emoji_pattern = re.compile("["
                           u"\U0001F600-\U0001F64F"  # emoticons
                           u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                           u"\U0001F680-\U0001F6FF"  # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', text)

In [24]:
df['text']=df['text'].apply(lambda x: remove_emoji(x))

In [25]:
# Removing punstuations from training and testing dataset
def remove_punct(text):
    table=str.maketrans('','',string.punctuation)
    return text.translate(table)

In [26]:
df['text']=df['text'].apply(lambda x : remove_punct(x))

In [27]:
# Using PySpellChecker correct the spelling of training and testing dataset
from spellchecker import SpellChecker

spell = SpellChecker()
def correct_spellings(text):
    corrected_text = []
    misspelled_words = spell.unknown(text.split())
    for word in text.split():
        if word in misspelled_words:
            corrected_text.append(spell.correction(word))
        else:
            corrected_text.append(word)
    return " ".join(corrected_text)

In [28]:
df['text']=df['text'].apply(lambda x : correct_spellings(x))

In [30]:
!pip install nltk

Collecting nltk
[?25l  Downloading https://files.pythonhosted.org/packages/92/75/ce35194d8e3022203cca0d2f896dbb88689f9b3fce8e9f9cff942913519d/nltk-3.5.zip (1.4MB)
[K     |████████████████████████████████| 1.4MB 16.0MB/s eta 0:00:01
[?25hCollecting click (from nltk)
[?25l  Downloading https://files.pythonhosted.org/packages/dd/c0/4d8f43a9b16e289f36478422031b8a63b54b6ac3b1ba605d602f10dd54d6/click-7.1.1-py2.py3-none-any.whl (82kB)
[K     |████████████████████████████████| 92kB 32.2MB/s eta 0:00:01
Building wheels for collected packages: nltk
  Building wheel for nltk (setup.py) ... [?25ldone
[?25h  Stored in directory: /root/.cache/pip/wheels/ae/8c/3f/b1fe0ba04555b08b57ab52ab7f86023639a526d8bc8d384306
Successfully built nltk
Installing collected packages: click, nltk
Successfully installed click-7.1.1 nltk-3.5


In [67]:
from nltk.tokenize import word_tokenize
from tqdm import tqdm
from nltk.corpus import stopwords
import re
import tensorflow as tf
from tensorflow import keras

In [68]:
from nltk.tokenize import word_tokenize
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Embedding,LSTM,Dense,SpatialDropout1D
from keras.initializers import Constant
import nltk
nltk.download('stopwords')
nltk.download('punkt')
stop=set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [87]:
def create_corpus(df):
    corpus=[]
    for train_df in tqdm(df['text']):
        words=[word.lower() for word in word_tokenize(train_df) if((word.isalpha()==1) & (word not in stop))]
        corpus.append(words)
    return corpus


In [88]:
corpus=create_corpus(df)

100%|██████████| 10876/10876 [00:02<00:00, 4303.95it/s]


In [89]:
embedding_dict={}
with open('/root/capsule/data/Glove/glove.twitter.27B.100d.txt','r') as f:
    for line in f:
        values=line.split()
        word=values[0]
        vectors=np.asarray(values[1:],'float32')
        embedding_dict[word]=vectors
f.close()

In [90]:
MAX_LEN=50
tokenizer_obj=Tokenizer()
tokenizer_obj.fit_on_texts(corpus)
sequences=tokenizer_obj.texts_to_sequences(corpus)

train_pad=pad_sequences(sequences,maxlen=MAX_LEN,truncating='post',padding='post')

In [91]:
word_index=tokenizer_obj.word_index
print('Number of unique words:',len(word_index))

Number of unique words: 18736


In [92]:
num_words=len(word_index)+1
embedding_matrix=np.zeros((num_words,100))

for word,i in tqdm(word_index.items()):
    if i > num_words:
        continue
    
    emb_vec=embedding_dict.get(word)
    if emb_vec is not None:
        embedding_matrix[i]=emb_vec

100%|██████████| 18736/18736 [00:00<00:00, 321290.65it/s]


In [93]:
model=Sequential()

embedding=Embedding(num_words,100,embeddings_initializer=Constant(embedding_matrix),
                   input_length=MAX_LEN,trainable=False)

model.add(embedding)
model.add(SpatialDropout1D(0.2))
model.add(LSTM(64, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(1, activation='sigmoid'))


optimzer=Adam(learning_rate=1e-5)

model.compile(loss='binary_crossentropy',optimizer=optimzer,metrics=['accuracy'])

In [76]:
model.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 50, 100)           1873700   
_________________________________________________________________
spatial_dropout1d_2 (Spatial (None, 50, 100)           0         
_________________________________________________________________
lstm_2 (LSTM)                (None, 64)                42240     
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 65        
Total params: 1,916,005
Trainable params: 42,305
Non-trainable params: 1,873,700
_________________________________________________________________


In [94]:
train=train_pad[:train_df.shape[0]]
test=train_pad[train_df.shape[0]:]

In [95]:
from sklearn.model_selection import train_test_split

In [96]:
X_train,X_test,y_train,y_test=train_test_split(train,train_df['target'].values,test_size=0.15)
print('Shape of train',X_train.shape)
print("Shape of Validation ",X_test.shape)

Shape of train (6471, 50)
Shape of Validation  (1142, 50)


In [80]:
history=model.fit(X_train,y_train,batch_size=4,epochs=15,validation_data=(X_test,y_test),verbose=2)

Train on 6471 samples, validate on 1142 samples
Epoch 1/15
 - 81s - loss: 0.6906 - accuracy: 0.5787 - val_loss: 0.6915 - val_accuracy: 0.5219
Epoch 2/15
 - 80s - loss: 0.6579 - accuracy: 0.6192 - val_loss: 0.5980 - val_accuracy: 0.7294
Epoch 3/15
 - 80s - loss: 0.5908 - accuracy: 0.7140 - val_loss: 0.5531 - val_accuracy: 0.7513
Epoch 4/15
 - 80s - loss: 0.5691 - accuracy: 0.7246 - val_loss: 0.5371 - val_accuracy: 0.7618
Epoch 5/15
 - 80s - loss: 0.5486 - accuracy: 0.7418 - val_loss: 0.5275 - val_accuracy: 0.7636
Epoch 6/15
 - 80s - loss: 0.5502 - accuracy: 0.7444 - val_loss: 0.5148 - val_accuracy: 0.7706
Epoch 7/15
 - 80s - loss: 0.5430 - accuracy: 0.7481 - val_loss: 0.5118 - val_accuracy: 0.7636
Epoch 8/15
 - 80s - loss: 0.5415 - accuracy: 0.7506 - val_loss: 0.5080 - val_accuracy: 0.7627
Epoch 9/15
 - 80s - loss: 0.5308 - accuracy: 0.7594 - val_loss: 0.5035 - val_accuracy: 0.7732
Epoch 10/15
 - 80s - loss: 0.5324 - accuracy: 0.7633 - val_loss: 0.5036 - val_accuracy: 0.7697
Epoch 11/15

In [81]:
submission = pd.read_csv("/root/capsule/data/sample_submission.csv")

In [83]:
y_pre=model.predict(test)
y_pre=np.round(y_pre).astype(int).reshape(3263)
sub=pd.DataFrame({'id':submission['id'].values.tolist(),'target':y_pre})
sub.to_csv('submission-GLoVe.csv',index=False)

In [97]:
!curl https://raw.githubusercontent.com/tensorflow/models/master/official/nlp/bert/tokenization.py

# coding=utf-8
# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tokenization classes implementation.

The file is forked from:
https://github.com/google-research/bert/blob/master/tokenization.py.
"""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import collections
import re
import unicodedata

import six
import tensorflow as tf

import sent

In [98]:
import numpy as np
import pandas as pd
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import ModelCheckpoint
import tensorflow_hub as hub
import tokenization

In [99]:
def bert_encode(texts, tokenizer, max_len=512):
    all_tokens = []
    all_masks = []
    all_segments = []
    
    for text in texts:
        text = tokenizer.tokenize(text)
            
        text = text[:max_len-2]
        input_sequence = ["[CLS]"] + text + ["[SEP]"]
        pad_len = max_len - len(input_sequence)
        
        tokens = tokenizer.convert_tokens_to_ids(input_sequence)
        tokens += [0] * pad_len
        pad_masks = [1] * len(input_sequence) + [0] * pad_len
        segment_ids = [0] * max_len
        
        all_tokens.append(tokens)
        all_masks.append(pad_masks)
        all_segments.append(segment_ids)
    
    return np.array(all_tokens), np.array(all_masks), np.array(all_segments)

In [100]:
def build_model(bert_layer, max_len=512):
    input_word_ids = Input(shape=(max_len,), dtype=tf.int32, name="input_word_ids")
    input_mask = Input(shape=(max_len,), dtype=tf.int32, name="input_mask")
    segment_ids = Input(shape=(max_len,), dtype=tf.int32, name="segment_ids")

    _, sequence_output = bert_layer([input_word_ids, input_mask, segment_ids])
    clf_output = sequence_output[:, 0, :]
    out = Dense(1, activation='sigmoid')(clf_output)
    
    model = Model(inputs=[input_word_ids, input_mask, segment_ids], outputs=out)
    model.compile(Adam(lr=2e-6), loss='binary_crossentropy', metrics=['accuracy'])
    
    return model

In [101]:
%%time
module_url = "https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/1"
bert_layer = hub.KerasLayer(module_url, trainable=True)

CPU times: user 6.59 s, sys: 769 ms, total: 7.36 s
Wall time: 7.12 s


In [102]:
vocab_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()
do_lower_case = bert_layer.resolved_object.do_lower_case.numpy()
tokenizer = tokenization.FullTokenizer(vocab_file, do_lower_case)

In [103]:
train_input = bert_encode(train_df.text.values, tokenizer, max_len=160)
test_input = bert_encode(test_df.text.values, tokenizer, max_len=160)
train_labels = train_df.target.values

In [104]:
model = build_model(bert_layer, max_len=160)
model.summary()

Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_word_ids (InputLayer)     [(None, 160)]        0                                            
__________________________________________________________________________________________________
input_mask (InputLayer)         [(None, 160)]        0                                            
__________________________________________________________________________________________________
segment_ids (InputLayer)        [(None, 160)]        0                                            
__________________________________________________________________________________________________
keras_layer_1 (KerasLayer)      [(None, 768), (None, 109482241   input_word_ids[0][0]             
                                                                 input_mask[0][0]           

In [105]:
train_history = model.fit(
    train_input, train_labels,
    validation_split=0.2,
    epochs=15,
    batch_size=16
)

model.save('model.h5')

Train on 6090 samples, validate on 1523 samples
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


In [106]:
test_pred = model.predict(test_input)

In [107]:
submission = pd.read_csv("/root/capsule/data/sample_submission.csv")

In [108]:
submission['target'] = test_pred.round().astype(int)
submission.to_csv('submission-Bert-15.csv', index=False)