In [47]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

In [48]:
#Objective: To understand the usage of nltk and spacy libraries with respect to categorising tweets as disaster or not.
#Data used: From Kaggle competitions "getting started with NLP", "NLP with Disaster tweets"

In [78]:
tweets = pd.read_csv(r"C:\Users\kamalam.s\Desktop\kamalam's\nlp dev\data\disaster tweets\train.csv")
tweets.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [79]:
len(tweets)

7613

In [None]:
import spacy
nlp = spacy.load("en_core_web_sm")

tweets['spacy objects'] = tweets['text'].apply(nlp)

for doc in tweets['spacy objects']:
    x = doc[0].text
    print(x)

In [81]:
#Target Distribution
tweets[tweets['target']==1].count()

id               3271
keyword          3229
location         2196
text             3271
target           3271
spacy objects    3271
dtype: int64

In [82]:
tweets[tweets['target']==0].count()

id               4342
keyword          4323
location         2884
text             4342
target           4342
spacy objects    4342
dtype: int64

In [83]:
import re
#pre-processing the texts
def preprocess_text(text):
    #remove special characters
    text = re.sub(r'@\w+', '', text) #removes @kamalam
    text = re.sub(r'#\w+', '', text) #removes #kamalam
    text = re.sub(r'[^a-zA-Z\s]', '', text) #removes any character other than alphabets and whitspace
    text = re.sub(r'http\S+', '', text) #removes "https://kamalamsivakumar.com"
    return text

In [84]:
#sample preprocess
preprocess_text("https://www.kaggle.com/code/zinebelhouz/nlp-with-disaster-tweets-beginner-friendly/notebook#RNN")

''

In [85]:
#NLTK based pre-processing

In [86]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer

In [87]:
#nlp preprocessing
def nlp_preprocess_stem(text):
    text = preprocess_text(text) #preliminary preprocessing
    text = text.lower()
    
    tokens = word_tokenize(text)
    
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words]
    
    stemmer = PorterStemmer()
    tokens = [stemmer.stem(token) for token in tokens]
    
    processed_text = ' '.join(tokens)
    return processed_text

In [88]:
nlp_preprocess_stem('@kartikmention is in an earthquake,#hastag URL : https://www.kaggle.com')

'earthquak url'

In [89]:
def nlp_preprocess_lemma(text):
    text = preprocess_text(text) #preliminary preprocessing
    text = text.lower()
    
    tokens = word_tokenize(text)
    tokens = [re.sub(r'[^a-zA-Z0-9]', '', token) for token in tokens if token.strip()]
    
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words]

    pos_tags = []
    for token in tokens:
        if token:
            pos_tags.append(nltk.pos_tag([token])[0])
    
    return pos_tags

In [90]:
ans = nlp_preprocess_lemma('@kartikmention is in an earthquake,#hastag URL : https://www.kaggle.com')

In [91]:
ans[0:]

[('earthquake', 'NN'), ('url', 'NN')]

In [92]:
lemmatizer = WordNetLemmatizer()
lemmatized_tokens = []
for token, pos in ans[0:]:
    pos = pos[0].lower()  
    pos = pos if pos in ['a', 's', 'r', 'n', 'v'] else 'n' 
    lemmatized_token = lemmatizer.lemmatize(token, pos=pos)
    lemmatized_tokens.append(lemmatized_token)

# Join tokens back into a single string
processed_text = ' '.join(lemmatized_tokens)

In [93]:
processed_text

'earthquake url'

In [94]:
#spaCy based preprocessing

In [95]:
import spacy
nlp = spacy.load("en_core_web_sm")

def spacy_preprocess(text):
    text = preprocess_text(text)
    text = text.lower()
    
    doc = nlp(text)

    lemmatized_tokens = [token.lemma_ for token in doc if not token.is_stop]
    processed_text = ' '.join(lemmatized_tokens)
    
    return processed_text

In [96]:
spacy_preprocess('@kartikmention is in an earthquake,#hastag URL : https://www.kaggle.com')

'  earthquake url  '

In [97]:
tweets['processed_text'] = [spacy_preprocess(text) for text in tweets['text']]
tweets.head()

Unnamed: 0,id,keyword,location,text,target,spacy objects,processed_text
0,1,,,Our Deeds are the Reason of this #earthquake M...,1,"(Our, Deeds, are, the, Reason, of, this, #, ea...",deed reason allah forgive
1,4,,,Forest fire near La Ronge Sask. Canada,1,"(Forest, fire, near, La, Ronge, Sask, ., Canada)",forest fire near la ronge sask canada
2,5,,,All residents asked to 'shelter in place' are ...,1,"(All, residents, asked, to, ', shelter, in, pl...",resident ask shelter place notify officer evac...
3,6,,,"13,000 people receive #wildfires evacuation or...",1,"(13,000, people, receive, #, wildfires, evacua...",people receive evacuation order california
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1,"(Just, got, sent, this, photo, from, Ruby, #, ...",get send photo ruby smoke pour school


In [98]:
tweets_test = pd.read_csv(r"C:\Users\kamalam.s\Desktop\kamalam's\nlp dev\data\disaster tweets\test.csv")
tweets_test['processed_text'] = [spacy_preprocess(text) for text in tweets_test['text']]
tweets_test.head()

Unnamed: 0,id,keyword,location,text,processed_text
0,0,,,Just happened a terrible car crash,happen terrible car crash
1,2,,,"Heard about #earthquake is different cities, s...",hear different city stay safe
2,3,,,"there is a forest fire at spot pond, geese are...",forest fire spot pond geese flee street save
3,9,,,Apocalypse lighting. #Spokane #wildfires,apocalypse light
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan,typhoon soudelor kill china taiwan


In [99]:
y_train = tweets['target']
X_train = tweets['processed_text']
X_test = tweets_test['processed_text']

In [103]:
X_train

0                             deed reason   allah forgive
1                   forest fire near la ronge sask canada
2       resident ask shelter place notify officer evac...
3            people receive   evacuation order california
4               get send photo ruby   smoke   pour school
                              ...                        
7608         giant crane hold bridge collapse nearby home
7609       control wild fire california northern state...
7610                           m   utckm s volcano hawaii
7611    police investigate ebike collide car little po...
7612    late home raze northern california wildfire   ...
Name: processed_text, Length: 7613, dtype: object

In [27]:
#Multinomial NB classification
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import cross_val_score, KFold, StratifiedKFold
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

nb = MultinomialNB(alpha = 0.1)

In [28]:
tfidf = TfidfVectorizer() #returns tfidf matrix for a raw document/text
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.fit_transform(X_test)

In [29]:
print("X_train_tfidf shape:", X_train_tfidf.shape)
print("X_train shape:", X_train.shape)

X_train_tfidf shape: (7613, 11092)
X_train shape: (7613,)


In [30]:
stratified_kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

f1_scores = []

for fold, (train_index, test_index) in enumerate(stratified_kfold.split(X_train_tfidf, y_train)):
    print(f'Training Model for Fold {fold+1}')

    X_fold_train, X_fold_test = X_train_tfidf[train_index], X_train_tfidf[test_index]
    y_fold_train, y_fold_test = y_train.iloc[train_index] , y_train.iloc[test_index]

    nb.fit(X_fold_train, y_fold_train)
    predictions = nb.predict(X_fold_test)

    f1 = f1_score(y_fold_test, predictions)
    f1_scores.append(f1)
    print(f'Fold{fold+1} F1-score : {f1}')

print()
print(f"Average F1-score across all folds: {np.mean(f1_scores)}")

Training Model for Fold 1
Fold1 F1-score : 0.7421686746987952
Training Model for Fold 2
Fold2 F1-score : 0.7531847133757962
Training Model for Fold 3
Fold3 F1-score : 0.7240829346092504
Training Model for Fold 4
Fold4 F1-score : 0.7256347256347256
Training Model for Fold 5
Fold5 F1-score : 0.7389558232931727

Average F1-score across all folds: 0.7368053743223479


In [31]:
nb.fit(X_train_tfidf[:, :6748], y_train)
y_test = nb.predict(X_test_tfidf)

In [None]:
#include id in the test dataset
for i in range(len(y_test)):
    print(i,"\t", y_test[i])

In [54]:
#Using a pre-trained model for text classification
#Fine-Tuning BERT for text classification
import transformers
from transformers import BertForSequenceClassification, AdamW, BertConfig,BertTokenizer,get_linear_schedule_with_warmup
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

In [110]:
tweet = X_train
labels = y_train

In [105]:
print(' Original: ', tweet[0])
# Print the sentence split into tokens.
print('Tokenized: ', tokenizer.tokenize(tweet[0]))
# Print the sentence mapped to token ids.
print('Token IDs: ', tokenizer.convert_tokens_to_ids(tokenizer.tokenize(tweet[0])))

 Original:  deed reason   allah forgive
Tokenized:  ['deed', 'reason', 'allah', 'forgive']
Token IDs:  [15046, 3114, 16455, 9641]


In [106]:
max_len = 0
for sent in tweet:
    # Tokenize the text and add `[CLS]` and `[SEP]` tokens.
    input_ids = tokenizer.encode(sent, add_special_tokens=True)
    # Update the maximum sentence length.
    max_len = max(max_len, len(input_ids))
print('Max sentence length: ', max_len)

Max sentence length:  40


In [112]:
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler,random_split

input_ids = []
attention_masks = []
for tweets in tweet:
    encoded_dict = tokenizer.encode_plus(
                        tweets,                     # Sentence to encode.
                        add_special_tokens = True, # Add '[CLS]' and '[SEP]'
                        max_length = max_len,           
                        pad_to_max_length = True,  # Pad & truncate all sentences.
                        return_attention_mask = True,   # Construct attn. masks.
                        return_tensors = 'pt',     # Return pyTorch tensors. "tf" for tensorflow tensors
                   )
    
    # Add the encoded sentence to the list.    
    input_ids.append(encoded_dict['input_ids'])
    
    # And its attention mask (simply differentiates padding from non-padding).
    attention_masks.append(encoded_dict['attention_mask'])

# Convert the lists into tensors.
input_ids = torch.cat(input_ids, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)
labels = torch.tensor(labels)

# Print sentence 0, now as a list of IDs.
print('Original: ', tweet[0])
print('Token IDs:', input_ids[0])

Original:  deed reason   allah forgive
Token IDs: tensor([  101, 15046,  3114, 16455,  9641,   102,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0])


In [113]:
dataset = TensorDataset(input_ids, attention_masks, labels)

#80%-20% train-validation split.
# Calculate the number of samples to include in each set.
train_size = int(0.8 * len(dataset))
#val_size = int(0.2 * len(dataset))
val_size = len(dataset)  - train_size

# Divide the dataset by randomly selecting samples.
train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

print('{:>5,} training samples'.format(train_size))
print('{:>5,} validation samples'.format(val_size))

6,090 training samples
1,523 validation samples


In [114]:
#batch_size for training
batch_size = 32
 
train_dataloader = DataLoader(
            train_dataset,  # The training samples.
            sampler = RandomSampler(train_dataset), # Select batches randomly
            batch_size = batch_size # Trains with this batch size.
        )

validation_dataloader = DataLoader(
            val_dataset, # The validation samples.
            sampler = SequentialSampler(val_dataset), # Pull out batches sequentially.
            batch_size = batch_size # Evaluate with this batch size.
        )

In [116]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device

device(type='cpu')

In [117]:
# Load BertForSequenceClassification, the pretrained BERT model with a single linear classification layer on top. 
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased", # Use the 12-layer BERT model, with an uncased vocab.
    num_labels = 2, # The number of output labels--2 for binary classification.   
    output_attentions = False, # Whether the model returns attentions weights.
    output_hidden_states = False, # Whether the model returns all hidden-states.
)

model = model.to(device)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [118]:
optimizer = AdamW(model.parameters(), lr = 2e-5, eps = 1e-8)

In [119]:
epochs = 3

# Total number of training steps is [number of batches] x [number of epochs]. 
total_steps = len(train_dataloader) * epochs

# Create the learning rate scheduler.
scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps = 0, 
                                            num_training_steps = total_steps)

In [120]:
# Function to calculate the accuracy of our predictions vs labels
def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)

In [121]:
def format_time(elapsed):
    #Takes a time in seconds and returns a string hh:mm:ss
    # Round to the nearest second.
    elapsed_rounded = int(round((elapsed)))
    # Format as hh:mm:ss
    return str(datetime.timedelta(seconds=elapsed_rounded))

In [127]:
import random
import time
import datetime
seed_val = 42
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)
training_stats = []

total_t0 = time.time()

for epoch_i in range(0, epochs):
    # Perform one full pass over the training set.
    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    print('Training...')
    # Measure how long the training epoch takes.
    t0 = time.time()
    total_train_loss = 0
    model.train()
    for step, batch in enumerate(train_dataloader):
        
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)
        optimizer.zero_grad()
        output = model(b_input_ids, 
                             token_type_ids=None, 
                             attention_mask=b_input_mask, 
                             labels=b_labels)        
        loss = output.loss
        total_train_loss += loss.item()
        # Perform a backward pass to calculate the gradients.
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()

    # Calculate the average loss over all of the batches.
    avg_train_loss = total_train_loss / len(train_dataloader)            
    
    # Measure how long this epoch took.
    training_time = format_time(time.time() - t0)
    print("")
    print("  Average training loss: {0:.2f}".format(avg_train_loss))
    print("  Training epoch took: {:}".format(training_time))

    #Validation Set
    print("")
    print("Running Validation...")
    t0 = time.time()
    model.eval()
    # Tracking variables 
    total_eval_accuracy = 0
    best_eval_accuracy = 0
    total_eval_loss = 0
    nb_eval_steps = 0
    # Evaluate data for one epoch
    for batch in validation_dataloader:
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)
        with torch.no_grad():        
            output= model(b_input_ids, 
                                   token_type_ids=None, 
                                   attention_mask=b_input_mask,
                                   labels=b_labels)
        loss = output.loss
        total_eval_loss += loss.item()
        logits = output.logits
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()
        # Calculate the accuracy for this batch of test sentences, and accumulate it over all batches.
        total_eval_accuracy += flat_accuracy(logits, label_ids)
    # Report the final accuracy for this validation run.
    avg_val_accuracy = total_eval_accuracy / len(validation_dataloader)
    print("  Accuracy: {0:.2f}".format(avg_val_accuracy))
    # Calculate the average loss over all of the batches.
    avg_val_loss = total_eval_loss / len(validation_dataloader)
    # Measure how long the validation run took.
    validation_time = format_time(time.time() - t0)
    if avg_val_accuracy > best_eval_accuracy:
        torch.save(model, 'bert_model')
        best_eval_accuracy = avg_val_accuracy
    training_stats.append(
        {
            'epoch': epoch_i + 1,
            'Training Loss': avg_train_loss,
            'Valid. Loss': avg_val_loss,
            'Valid. Accur.': avg_val_accuracy,
            'Training Time': training_time,
            'Validation Time': validation_time
        }
    )
print("")
print("Training complete!")
print("Total training took {:} (h:mm:ss)".format(format_time(time.time()-total_t0)))


Training...

  Average training loss: 0.28
  Training epoch took: 0:50:21

Running Validation...
  Accuracy: 0.83

Training...

  Average training loss: 0.31
  Training epoch took: 0:36:39

Running Validation...
  Accuracy: 0.83

Training...

  Average training loss: 0.28
  Training epoch took: 0:34:21

Running Validation...
  Accuracy: 0.83

Training complete!
Total training took 2:11:32 (h:mm:ss)


In [128]:
model = torch.load('bert_model')

In [130]:
test_input_ids = []
test_attention_masks = []
for tweet in X_test:
    encoded_dict = tokenizer.encode_plus(
                        tweet,                     
                        add_special_tokens = True, 
                        max_length = max_len,           
                        pad_to_max_length = True,
                        return_attention_mask = True,
                        return_tensors = 'pt',
                   )
    test_input_ids.append(encoded_dict['input_ids'])
    test_attention_masks.append(encoded_dict['attention_mask'])
test_input_ids = torch.cat(test_input_ids, dim=0)
test_attention_masks = torch.cat(test_attention_masks, dim=0)

In [131]:
test_dataset = TensorDataset(test_input_ids, test_attention_masks)
test_dataloader = DataLoader(
            test_dataset, # The validation samples.
            sampler = SequentialSampler(test_dataset), # Pull out batches sequentially.
            batch_size = batch_size # Evaluate with this batch size.
        )

In [132]:
predictions = []
for batch in test_dataloader:
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        with torch.no_grad():        
            output= model(b_input_ids, 
                                   token_type_ids=None, 
                                   attention_mask=b_input_mask)
            logits = output.logits
            logits = logits.detach().cpu().numpy()
            pred_flat = np.argmax(logits, axis=1).flatten()
            
            predictions.extend(list(pred_flat))

In [133]:
df_output = pd.DataFrame()
df_output['id'] = tweets_test['id']
df_output['target'] = predictions
df_output

Unnamed: 0,id,target
0,0,1
1,2,1
2,3,1
3,9,0
4,11,1
...,...,...
3258,10861,0
3259,10865,1
3260,10868,1
3261,10874,1
