In [121]:
import re
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from nltk.stem import WordNetLemmatizer
import spacy
from spacy.tokenizer import Tokenizer
import torch
import torch.nn as nn
from torchtext import data, legacy
from torchtext.vocab import Vectors, GloVe
import tqdm
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [80]:
train = pd.read_csv('../train.csv')
test = pd.read_csv('../test.csv')
smaple = pd.read_csv('../sample_submission.csv')
print(train.shape)
print(test.shape)

(7613, 5)
(3263, 4)


# Exploration

In [81]:
train.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [82]:
test.head()

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan


# Data Cleaning

In [83]:
print(train.info())
print("=================")
print(test.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7613 entries, 0 to 7612
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        7613 non-null   int64 
 1   keyword   7552 non-null   object
 2   location  5080 non-null   object
 3   text      7613 non-null   object
 4   target    7613 non-null   int64 
dtypes: int64(2), object(3)
memory usage: 297.5+ KB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3263 entries, 0 to 3262
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        3263 non-null   int64 
 1   keyword   3237 non-null   object
 2   location  2158 non-null   object
 3   text      3263 non-null   object
dtypes: int64(1), object(3)
memory usage: 102.1+ KB
None


In [84]:
train.drop(['location', 'keyword'], axis=1, inplace=True)
test.drop(['location', 'keyword'], axis=1, inplace=True)

In [85]:
train.shape, test.shape

((7613, 3), (3263, 2))

In [86]:
for i in range(50):
    print(train['text'][i])
    print("------------")

Our Deeds are the Reason of this #earthquake May ALLAH Forgive us all
------------
Forest fire near La Ronge Sask. Canada
------------
All residents asked to 'shelter in place' are being notified by officers. No other evacuation or shelter in place orders are expected
------------
13,000 people receive #wildfires evacuation orders in California 
------------
Just got sent this photo from Ruby #Alaska as smoke from #wildfires pours into a school 
------------
#RockyFire Update => California Hwy. 20 closed in both directions due to Lake County fire - #CAfire #wildfires
------------
#flood #disaster Heavy rain causes flash flooding of streets in Manitou, Colorado Springs areas
------------
I'm on top of the hill and I can see a fire in the woods...
------------
There's an emergency evacuation happening now in the building across the street
------------
I'm afraid that the tornado is coming to our area...
------------
Three people died from the heat wave so far
------------
Haha South Tamp

In [87]:
puncts = [',', '.', '"', ':', ')', '(', '-', '!', '?', '|', ';', "'", '$', '&', '/', 
          '[', ']', '>', '%', '=', '#', '*', '+', '\\', '•',  '~', '@', '£', '·', '_', 
          '{', '}', '©', '^', '®', '`',  '<', '→', '°', '€', '™', '›',  '♥', '←', '×',
          '§', '″', '′', 'Â', '█', '½', 'à', '…', '“', '★', '”', '–', '●', 'â', '►', '−', 
          '¢', '²', '¬', '░', '¶', '↑', '±', '¿', '▾', '═', '¦', '║', '―', '¥', '▓', '—', 
          '‹', '─', '▒', '：', '¼', '⊕', '▼', '▪', '†', '■', '’', '▀', '¨', '▄', '♫', '☆', 
          'é', '¯', '♦', '¤', '▲', 'è', '¸', '¾', 'Ã', '⋅', '‘', '∞', '∙', '）', '↓', '、', 
          '│', '（', '»', '，', '♪', '╩', '╚', '³', '・', '╦', '╣', '╔', '╗', '▬', '❤', 'ï', 
          'Ø', '¹', '≤', '‡', '√', ]


def clean_text(x):
    x = str(x)
    for punct in puncts:
        x = x.replace(punct, f' {punct} ')
    return x


def clean_numbers(x):
    x = re.sub('[0-9]{5, }', '#####', x)
    x = re.sub('[0-9]{4}', '####', x)
    x = re.sub('[0-9]{3}', '###', x)
    x = re.sub('[0-9]{2}', '##', x)
    return x

In [88]:
mispell_dict = {"aren't": "are not", "can't": "cannot", 
                "couldn't": "could not", "didn't": "did not",
                "doesn't": "does not", "don't": "do not", 
                "hadn't": "had not", "hasn't": "has not",
                "haven't": "have not", "he'd": "he would", 
                "he'll": "he will", "he's": "he is",
                "i'd": "I would", "i'd": "I had", "i'll": 
                "I will", "i'm" : "I am", "isn't": "is not",
                "it's": "it is", "it'll": "it will", 
                "i've" : "I have", "let's": "let us", 
                "mightn't": "might not", "mustn't": "must not", 
                "shan't" : "shall not", "she'd": "she would",
                "she'll": "she will", "she's": "she is", 
                "shouldn't": "should not", "that's": "that is", 
                "there's": "there is","they'd": "they would", 
                "they'll": "they will", "they're": "they are",
                "they've": "they have", "we'd": "we would", 
                "we're": "we are", "weren't": "were not",
                "we've": "we have", "what'll": "what will", 
                "what're": "what are", "what's": "what is", 
                "what've": "what have", "where's": "where is", 
                "who'd": "who would", "who'll": "who will",
                "who're": "who are", "who's": "who is", 
                "who've": "who have", "won't": "will not",
                "wouldn't" : "would not", "you'd": "you would", 
                "you'll": "you will", "you're": "you are",
                "you've": "you have", "'re": " are", 
                "wasn't": "was not", "we'll": " will", 
                "didn't": "did not", "tryin'": "trying", 
                "colour": "color", "centre": "center",
                "didnt": "did not", "doesnt": "does not",
                "isnt": "is not", "shouldnt": "should not",
                "favourite": "favorite", "travelling": "traveling",
                "counselling": "counseling", "theatre": "theater",
                "cancelled": "canceled", "labour": "labor",
                "organisation": "organization", "wwii": "world war 2",
                "citicise": "criticize", "instagram": "social medium",
                "whatsapp": "social medium", "snapchat": "social medium"}


def get_mispell(mispell_dict):
    mispell_re = re.compile('(%s)' % '|'.join(mispell_dict.keys()))
    return mispell_dict, mispell_re

def replace_typical_misspell(text):
    def replace(match):
        return mispellings[match.group(0)]
    return mispellings_re.sub(replace, text)

In [89]:
mispellings, mispellings_re = get_mispell(mispell_dict)
mispellings, mispellings_re

({"'re": ' are',
  "aren't": 'are not',
  "can't": 'cannot',
  'cancelled': 'canceled',
  'centre': 'center',
  'citicise': 'criticize',
  'colour': 'color',
  "couldn't": 'could not',
  'counselling': 'counseling',
  "didn't": 'did not',
  'didnt': 'did not',
  "doesn't": 'does not',
  'doesnt': 'does not',
  "don't": 'do not',
  'favourite': 'favorite',
  "hadn't": 'had not',
  "hasn't": 'has not',
  "haven't": 'have not',
  "he'd": 'he would',
  "he'll": 'he will',
  "he's": 'he is',
  "i'd": 'I had',
  "i'll": 'I will',
  "i'm": 'I am',
  "i've": 'I have',
  'instagram': 'social medium',
  "isn't": 'is not',
  'isnt': 'is not',
  "it'll": 'it will',
  "it's": 'it is',
  'labour': 'labor',
  "let's": 'let us',
  "mightn't": 'might not',
  "mustn't": 'must not',
  'organisation': 'organization',
  "shan't": 'shall not',
  "she'd": 'she would',
  "she'll": 'she will',
  "she's": 'she is',
  "shouldn't": 'should not',
  'shouldnt': 'should not',
  'snapchat': 'social medium',
  "that's

In [90]:
def remove_emoji(sentence):
    emoji_pattern = re.compile("["
                           u"\U0001F600-\U0001F64F"  # emoticons
                           u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                           u"\U0001F680-\U0001F6FF"  # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           "]+", flags=re.UNICODE)
    
    return emoji_pattern.sub(r'', sentence)

def remove_stopwords(sentence):
    words = sentence.split()
    words = [word for word in words if word not in stopwords.words('english')]
    
    return ' '.join(words)

In [91]:
stemmer = SnowballStemmer('english')
wl = WordNetLemmatizer()
def lemmatize_words(sentence):
    words = sentence.split()
    words = [wl.lemmatize(word) for word in words]
    return ' '.join(words)

In [92]:
# Clean the text
train["text"] = train["text"].apply(lambda x: clean_text(x.lower()))
test["text"] = test["text"].apply(lambda x: clean_text(x.lower()))

# Clean numbers
train["text"] = train["text"].apply(lambda x: clean_numbers(x))
test["text"] = test["text"].apply(lambda x: clean_numbers(x))

# Clean spellings
train["text"] = train["text"].apply(lambda x: replace_typical_misspell(x))
test["text"] = test["text"].apply(lambda x: replace_typical_misspell(x))

# Clear emojis
train["text"] = train["text"].apply(lambda x: remove_emoji(x))
test["text"] = test["text"].apply(lambda x: remove_emoji(x))

# Stopwords
train["text"] = train["text"].apply(lambda x: remove_stopwords(x))
test["text"] = test["text"].apply(lambda x: remove_stopwords(x))

# Lemmatization
train["text"] = train["text"].apply(lambda x: lemmatize_words(x))
test["text"] = test["text"].apply(lambda x: lemmatize_words(x))

In [93]:
for i in range(50):
    print(train['text'][i])
    print("------------")

deed reason # earthquake may allah forgive u
------------
forest fire near la ronge sask . canada
------------
resident asked ' shelter place ' notified officer . evacuation shelter place order expected
------------
## , ### people receive # wildfire evacuation order california
------------
got sent photo ruby # alaska smoke # wildfire pours school
------------
# rockyfire update = > california hwy . ## closed direction due lake county fire - # cafire # wildfire
------------
# flood # disaster heavy rain cause flash flooding street manitou , colorado spring area
------------
' top hill see fire wood . . .
------------
' emergency evacuation happening building across street
------------
' afraid tornado coming area . . .
------------
three people died heat wave far
------------
haha south tampa getting flooded hah - wait second live south tampa gonna gonna fvck # flooding
------------
# raining # flooding # florida # tampabay # tampa ## ## day . ' lost count
------------
# flood bago my

In [94]:
print(train.head())
print("\n================================================================\n")
print(test.head())

   id                                               text  target
0   1       deed reason # earthquake may allah forgive u       1
1   4            forest fire near la ronge sask . canada       1
2   5  resident asked ' shelter place ' notified offi...       1
3   6  ## , ### people receive # wildfire evacuation ...       1
4   7  got sent photo ruby # alaska smoke # wildfire ...       1


   id                                               text
0   0                        happened terrible car crash
1   2  heard # earthquake different city , stay safe ...
2   3  forest fire spot pond , goose fleeing across s...
3   9         apocalypse lighting . # spokane # wildfire
4  11              typhoon soudelor kill ## china taiwan


In [95]:
def get_iterator(dataset, batch_size, train=True,
                 shuffle=True, repeat=False):
    
    device = torch.device('cuda:0' if torch.cuda.is_available()
                          else 'cpu')
    
    dataset_iter = legacy.data.Iterator(
        dataset, batch_size=batch_size, device=device,
        train=train, shuffle=shuffle, repeat=repeat,
        sort=False
    )
    
    return dataset_iter

In [104]:
def prepare_csv(df_train, df_test, seed=27, val_ratio=0.3):
    idx = np.arange(df_train.shape[0])
    
    np.random.seed(seed)
    np.random.shuffle(idx)
    
    val_size = int(len(idx) * val_ratio)
    
    if not os.path.exists('cache'):
        os.makedirs('cache')
    
    df_train.iloc[idx[val_size:], :][['id', 'target', 'text']].to_csv(
        'cache/dataset_train.csv', index=False
    )
    
    df_train.iloc[idx[:val_size], :][['id', 'target', 'text']].to_csv(
        'cache/dataset_val.csv', index=False
    )
    
    df_test[['id', 'text']].to_csv('cache/dataset_test.csv',
                   index=False)


In [111]:
import logging
from copy import deepcopy
import random
import os

SEED=12
LOGGER = logging.getLogger('tweets_dataset')

def get_dataset(fix_length=100, lower=False, vectors=None):
    
    if vectors is not None:
        lower=True
        
    LOGGER.debug('Preparing CSV files...')
    prepare_csv(train, test)
    
    TEXT = legacy.data.Field(sequential=True, 
                      lower=True, 
                      include_lengths=True, 
                      batch_first=True, 
                      fix_length=25)

    LABEL = legacy.data.Field(use_vocab=True,
                       sequential=False,
                       dtype=torch.float16)

    ID = legacy.data.Field(use_vocab=False,
                    sequential=False,
                    dtype=torch.float16)
    
    LOGGER.debug('Reading train csv files...')
    
    train_temp, val_temp = legacy.data.TabularDataset.splits(
        path='cache/', format='csv', skip_header=True,
        train='dataset_train.csv', validation='dataset_val.csv',
        fields=[
            ('id', ID),
            ('target', LABEL),
            ('text', TEXT)
        ]
    )
    
    LOGGER.debug('Reading test csv file...')
    
    test_temp = legacy.data.TabularDataset(
        path='cache/dataset_test.csv', format='csv',
        skip_header=True,
        fields=[
            ('id', ID),
            ('text', TEXT)
        ]
    )
    
    LOGGER.debug('Building vocabulary...')
    
    TEXT.build_vocab(
        train_temp, val_temp, test_temp,
        max_size=20000,
        min_freq=10,
        vectors=GloVe(name='6B', dim=300)  # We use it for getting vocabulary of words
    )

    LABEL.build_vocab(
        train_temp
    )

    ID.build_vocab(
        train_temp, val_temp, test_temp
    )

    word_embeddings = TEXT.vocab.vectors
    vocab_size = len(TEXT.vocab)
    
    train_iter = get_iterator(train_temp, batch_size=32, 
                              train=True, shuffle=True,
                              repeat=False)

    val_iter = get_iterator(val_temp, batch_size=32, 
                            train=True, shuffle=True,
                            repeat=False)

    test_iter = get_iterator(test_temp, batch_size=32, 
                             train=False, shuffle=False,
                             repeat=False)    
    
    LOGGER.debug('Done preparing the datasets')
    
    return TEXT, vocab_size, word_embeddings, train_iter, val_iter, test_iter

In [None]:
TEXT, vocab_size, word_embeddings, train_iter, val_iter, test_iter = get_dataset()

In [117]:
class LSTMClassifier(torch.nn.Module):
    def __init__(self, vocab_size, output_size, embedding_dim, hidden_dim, n_layers, weights):
        super(LSTMClassifier, self).__init__()
        
        self.output_size = output_size
        self.n_layers = n_layers
        self.hidden_dim = hidden_dim
        
        self.word_embeddings = torch.nn.Embedding(vocab_size,
                                                  embedding_dim)
        self.word_embeddings.weight.data.copy_(weights)
        
        self.dropout_1 = torch.nn.Dropout(0.3)
        self.lstm = torch.nn.LSTM(embedding_dim,
                                  hidden_dim,
                                  n_layers,
                                  dropout=0.3,
                                  batch_first=True)
        
        self.dropout_2 = torch.nn.Dropout(0.3)
        self.label_layer = torch.nn.Linear(hidden_dim, output_size)
        
        self.act = torch.nn.Sigmoid()
        
    def forward(self, x, hidden):
        batch_size = x.size(0)
        
        x = self.word_embeddings(x)
        
        x = self.dropout_1(x)
        
        lstm_out, hidden = self.lstm(x, hidden)
                
        lstm_out = lstm_out.contiguous().view(-1, self.hidden_dim)
        
        out = self.dropout_2(lstm_out)
        out = self.label_layer(out)    
        
        out = out.view(batch_size, -1, self.output_size)
        out = out[:, -1, :]

        out = self.act(out)
        
        return out, hidden
    
    def init_hidden(self, batch_size):
        weight = next(self.parameters()).data
        
        hidden = (weight.new(self.n_layers, batch_size, self.hidden_dim).zero_().to(device),
                  weight.new(self.n_layers, batch_size, self.hidden_dim).zero_().to(device))
        
        return hidden

In [123]:
def train_model(model, train_iter, val_iter, optim, loss, num_epochs, batch_size=32):
    h = model.init_hidden(batch_size)
    
    clip = 5
    val_loss_min = np.Inf
    
    total_train_epoch_loss = list()
    total_train_epoch_acc = list()
        
    total_val_epoch_loss = list()
    total_val_epoch_acc = list()
    
    device = torch.device('cuda:0' if torch.cuda.is_available()
                           else 'cpu')
    
    for epoch in range(num_epochs):

        model.train()
        
        train_epoch_loss = list()
        train_epoch_acc = list()
        
        val_epoch_loss = list()
        val_epoch_acc = list()
        
        for idx, batch in enumerate(train_iter):
            h = tuple([e.data for e in h])

            text = batch.text[0]
            target = batch.target
            target = target - 1
            target = target.type(torch.LongTensor)

            text = text.to(device)
            target = target.to(device)

            optim.zero_grad()
            
            if text.size()[0] is not batch_size:
                continue
            
            prediction, h = model(text, h)
                
            loss_train = loss(prediction.squeeze(), target)
            loss_train.backward()

            num_corrects = (torch.max(prediction, 1)[1].
                                view(target.size()).data == target.data).float().sum()

            acc = 100.0 * num_corrects / len(batch)

            train_epoch_loss.append(loss_train.item())
            train_epoch_acc.append(acc.item())
            
            torch.nn.utils.clip_grad_norm_(model.parameters(), 5)
            
            optim.step()
    
        print(f'Train Epoch: {epoch}, Training Loss: {np.mean(train_epoch_loss):.4f}, Training Accuracy: {np.mean(train_epoch_acc): .2f}%')

        model.eval()

        with torch.no_grad():
            for idx, batch in enumerate(val_iter):
                val_h = tuple([e.data for e in h])

                text = batch.text[0]
                target = batch.target
                target = target - 1
                target = target.type(torch.LongTensor)
                
                text = text.to(device)
                target = target.to(device)
                
                if text.size()[0] is not batch_size:
                    continue

                prediction, h = model(text, h)
                loss_val = loss(prediction.squeeze(), target)

                num_corrects = (torch.max(prediction, 1)[1].
                                view(target.size()).data == target.data).float().sum()

                acc = 100.0 * num_corrects / len(batch)

                val_epoch_loss.append(loss_val.item())
                val_epoch_acc.append(acc.item())
                
            print(f'Vadlidation Epoch: {epoch}, Training Loss: {np.mean(val_epoch_loss):.4f}, Training Accuracy: {np.mean(val_epoch_acc): .2f}%')
                
            if np.mean(val_epoch_loss) <= val_loss_min:
#                 torch.save(model.state_dict(), 'state_dict.pth')
                print('Validation loss decreased ({:.6f} --> {:.6f})'.
                      format(val_loss_min, np.mean(val_epoch_loss)))
                
                val_loss_min = np.mean(val_epoch_loss)
                
        total_train_epoch_loss.append(np.mean(train_epoch_loss))
        total_train_epoch_acc.append(np.mean(train_epoch_acc))
    
        total_val_epoch_loss.append(np.mean(val_epoch_loss))
        total_val_epoch_acc.append(np.mean(val_epoch_acc))
    
    return (total_train_epoch_loss, total_train_epoch_acc,
            total_val_epoch_loss, total_val_epoch_acc)

In [124]:
lr = 1e-4
batch_size = 32
output_size = 2
hidden_size = 128
embedding_length = 300

model = LSTMClassifier(vocab_size=vocab_size, 
                       output_size=output_size, 
                       embedding_dim=embedding_length,
                       hidden_dim=hidden_size,
                       n_layers=2,
                       weights=word_embeddings
)

device = torch.device('cuda:0' if torch.cuda.is_available()
                      else 'cpu')
    
model.to(device)
optim = torch.optim.Adam(model.parameters(), lr=lr)
loss = torch.nn.CrossEntropyLoss()
    
train_loss, train_acc, val_loss, val_acc = train_model(model=model,
                                                       train_iter=train_iter,
                                                       val_iter=val_iter,
                                                       optim=optim,
                                                       loss=loss,
                                                       num_epochs=20,
                                                       batch_size=batch_size)

Train Epoch: 0, Training Loss: 0.6742, Training Accuracy:  59.07%
Vadlidation Epoch: 0, Training Loss: 0.6501, Training Accuracy:  62.10%
Validation loss decreased (inf --> 0.650052)
Train Epoch: 1, Training Loss: 0.6157, Training Accuracy:  67.85%
Vadlidation Epoch: 1, Training Loss: 0.6119, Training Accuracy:  69.59%
Validation loss decreased (0.650052 --> 0.611857)
Train Epoch: 2, Training Loss: 0.5690, Training Accuracy:  74.28%
Vadlidation Epoch: 2, Training Loss: 0.5662, Training Accuracy:  74.56%
Validation loss decreased (0.611857 --> 0.566180)
Train Epoch: 3, Training Loss: 0.5414, Training Accuracy:  76.64%
Vadlidation Epoch: 3, Training Loss: 0.5443, Training Accuracy:  76.72%
Validation loss decreased (0.566180 --> 0.544306)
Train Epoch: 4, Training Loss: 0.5266, Training Accuracy:  78.24%
Vadlidation Epoch: 4, Training Loss: 0.5384, Training Accuracy:  77.20%
Validation loss decreased (0.544306 --> 0.538442)
Train Epoch: 5, Training Loss: 0.5092, Training Accuracy:  80.05%

In [125]:
results_target = list()

with torch.no_grad():
    for batch in test_iter:
        for text, idx in zip(batch.text[0], batch.id):
            text = text.unsqueeze(0)
            res, _ = model(text, hidden=None)

            target = np.round(res.cpu().numpy())
            
            results_target.append(target[0][1])


In [128]:
smaple['target'] = list(map(int, results_target))

In [129]:
smaple.to_csv('submission.csv', index=False)