###  Simple model for the disaster tweet analysis

The aim is to get a maximum possible score of the classification of the disaster tweet without using any pretrained model.
The best score in public notebooks is obtained so far by using a pretrained BERT model with the avarage score=0.84 

To run this notebook load the following file from the Kaggle https://www.kaggle.com/c/nlp-getting-started 
 - train.csv
 - test.csv
 - sample_submission.csv
 
The best public test score for this model is 0.7961



This notebook is created by Vitaly Shklyar


In [1]:
import numpy as np 
import pandas as pd 
from sklearn import feature_extraction
from sklearn.utils import shuffle
from nltk import FreqDist 

from tensorflow.keras import layers
from tensorflow.keras.layers import GRU,LSTM,Embedding, Input, Dense
from tensorflow.keras.models import Model, Sequential

from tensorflow.keras import regularizers, callbacks

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import tensorflow.keras.backend as K

from sklearn.model_selection import train_test_split
from tensorflow.keras.regularizers import  l2,l1,Regularizer
import re

In [2]:
def unique_words(df_text):
    words = []
    for s in df_text:
        words += s.split() 
    return set(words)

def keep_words(sentence, words_list ):
    words = sentence.split()
    return ' '.join([w for w in words if w in words_list])

# Reference : https://gist.github.com/slowkow/7a7f61f495e3dbb7e3d767f97bd7304b
def remove_emoji(text):
    emoji_pattern = re.compile(
        "["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
        u"\U00002702-\U000027B0"
        u"\U000024C2-\U0001F251"
        "]+",
        flags=re.UNICODE)
    return emoji_pattern.sub(r'', text)

def find_emoji(text):
    emoji_pattern = re.compile(
        "["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
        u"\U00002702-\U000027B0"
        u"\U000024C2-\U0001F251"
        "]+",
        flags=re.UNICODE)
    print( emoji_pattern.findall(r'', text))
    
def lowerer(t):
    return t.lower()    

def remove_hyperlins(text):
    res = re.sub(r'(\S{0,}http\S{0,})+',' ',text.lower())
    return res

def keep_letters_only(text):
    return ' '.join(re.findall(r'[a-zA-Z]{3,}',text))

def remove_ets(text):
    res = re.sub(r'(\S{0,}@\S{0,})+',' ',text.lower())
    return res

def decontracted(phrase):
    """Convert contractions like "can't" into "can not"
    """
    # specific
    phrase = re.sub(r"won\'t", "will not", phrase)
    phrase = re.sub(r"can\'t", "can not", phrase)

    # general
    #phrase = re.sub(r"n't", " not", phrase) # resulted in "ca not" when sentence started with "can't"
    phrase = re.sub(r"\'re", " are", phrase)
    phrase = re.sub(r"\'s", " is", phrase)
    phrase = re.sub(r"\'d", " would", phrase)
    phrase = re.sub(r"\'ll", " will", phrase)
    phrase = re.sub(r"\'t", " not", phrase)
    phrase = re.sub(r"\'ve", " have", phrase)
    phrase = re.sub(r"\'m", " am", phrase)
    return phrase

def clean(tweet):
    """
    substitute special charackters and number by empty spaces; may be it is not good since the numbers could be inromative 
    """
    # Punctuations at the start or end of words    
    for punctuation in "!@#$%^&*()_{}[]+?""\':;,./-+=0123456789\n\\":
        tweet = tweet.replace(punctuation, f' ')
    return tweet.strip()

def remove_words(sentence, words_list ):
    words = sentence.split()
    return ' '.join([w for w in words if w not in words_list])

def unitque_words_count(df_text):
    words = []
    for s in df_text:
        words += s.split() 
    return len(set(words))

def prepare_tockens(tockenizer, text):
    tensor = tockenizer.texts_to_sequences(text)
    tensor = pad_sequences(tensor,maxlen=50)
    (n,m ) = tensor.shape
    return tensor.reshape((n,m))
    

### 1. Load the training set and shuffle it

In [3]:
train_df = pd.read_csv("train.csv")
test_df = pd.read_csv("test.csv")
train_df  = shuffle(train_df)

### 2. Remove emoji 

In [4]:
train_df['no_emoji'] = train_df['text'].apply(lambda  x: remove_emoji(x))
test_df['no_emoji'] = test_df['text'].apply(lambda  x: remove_emoji(x))

### 3 Lower the text

In [5]:
train_df['text_lower'] = train_df['no_emoji'].apply(lambda  x: lowerer(x))
test_df['text_lower'] = test_df['no_emoji'].apply(lambda  x: lowerer(x))

### 3 Remove hyperlinks


In [6]:
train_df['no_hyperlinks'] = train_df['text_lower'].apply(lambda  x: remove_hyperlins(x))
test_df['no_hyperlinks'] = test_df['text_lower'].apply(lambda  x: remove_hyperlins(x))

In [7]:
train_df['no_ets'] = train_df['no_hyperlinks'].apply(lambda  x: remove_ets(x))
test_df['no_ets'] = test_df['no_hyperlinks'].apply(lambda  x: remove_ets(x))

### 4 Decontract 

In [8]:
train_df["decontracted"] = train_df["no_ets"].apply(lambda x: decontracted(x))
test_df["decontracted"]  = test_df["no_ets"].apply(lambda x: decontracted(x))

### 5 Remove special characters


In [9]:
train_df['alpha'] = train_df['decontracted'].apply(lambda  x: clean(x))
test_df['alpha'] = test_df['decontracted'].apply(lambda  x: clean(x))

### 6 Find the rare words in the training set

In [10]:
words = []
word_frequencsy=2
for s in  train_df["alpha"]:
    words+=s.split()
print(f"unique words: {len(set(words) ) } ")

fdist1 = FreqDist(words)

rare_words = [w for w in set(words) if fdist1[w] <= word_frequencsy]
print(f"number of rare words which appear less then {word_frequencsy} times, count = {len(rare_words)}" )

unique words: 14493 
number of rare words which appear less then 2 times, count = 10148


### 7. Concatenate the rare and unique words and remove then from the training and the test sets

In [11]:
words_to_remove=feature_extraction.text.ENGLISH_STOP_WORDS.union(rare_words)

In [12]:
train_df['text_no_rare_words']      = train_df['alpha'].apply(lambda x : remove_words(x,words_to_remove))
test_df['text_no_rare_words']       = test_df['alpha'].apply(lambda x : remove_words(x,words_to_remove))


### 8. Check the number of unique words 

In [13]:
unitque_words_count(train_df['text_no_rare_words'])

4094

In [14]:
unitque_words_count(test_df['text_no_rare_words'])

6634

### 9. Keep words with length => 3 and consisting only from letters

In [15]:
train_df["cleared_letters_only"] = train_df["text_no_rare_words"].apply(lambda x:keep_letters_only(x) )

In [16]:
test_df["cleared_letters_only"] = test_df["text_no_rare_words"].apply(lambda x:keep_letters_only(x) )

In [17]:
unitque_words_count(train_df["cleared_letters_only"])

3897

In [18]:
unitque_words_count(test_df["cleared_letters_only"])

6356

### 10. Tockenize for embedding

In [19]:
train_txt      = train_df["cleared_letters_only"]
test_txt       = test_df["cleared_letters_only"]

In [20]:
lang_tokenizer = Tokenizer( filters='')
lang_tokenizer.fit_on_texts(train_txt)

In [21]:
train_np = prepare_tockens(lang_tokenizer, train_txt)

### 11. Create and fit the model

In [32]:
model = Sequential()
model.add(Embedding(4000, 64, embeddings_regularizer=regularizers.l2(0.0000003) ))
model.add(GRU(16, recurrent_regularizer=regularizers.l2(0.00001) ) )
model.add(Dense(1, activation = "sigmoid"))

In [33]:
model.compile(optimizer='rmsprop',loss='binary_crossentropy',metrics=['acc'])

In [36]:
early_stopping = callbacks.EarlyStopping(monitor='val_loss', mode='min', patience=3, restore_best_weights=True, verbose=1)
history = model.fit(train_np, train_df["target"].values, batch_size =16 , epochs =100, validation_split=0.2, shuffle = True, callbacks = [early_stopping])

Epoch 1/10000
Epoch 2/10000
Epoch 3/10000
Epoch 4/10000
Restoring model weights from the end of the best epoch.
Epoch 00004: early stopping


In [25]:
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, None, 60)          240000    
_________________________________________________________________
gru (GRU)                    (None, 10)                2160      
_________________________________________________________________
dense (Dense)                (None, 1)                 11        
Total params: 242,171
Trainable params: 242,171
Non-trainable params: 0
_________________________________________________________________


### 12. Keep only known words in the test set for predictions

In [26]:
words_to_keep = unique_words(train_df["cleared_letters_only"])

In [27]:
test_df['cleared_letters_only'] = test_df['cleared_letters_only'].apply(lambda x : keep_words(x,words_to_keep))

In [28]:
unitque_words_count(test_df["cleared_letters_only"])

3180

In [29]:
test_txt = test_df["cleared_letters_only"]
test_np  = prepare_tockens(lang_tokenizer, test_txt)

In [30]:
preds = model.predict(x=test_np)
preds[preds>0.5] = 1
preds[preds<0.5] = 0
preds=preds.astype(int)

In [31]:
sample_submission = pd.read_csv("sample_submission.csv")
sample_submission['target'] = preds.astype(int)
sample_submission.to_csv("SimpleRNNnoPretraining.csv", index=False)