In [1]:
import torch
from torch.utils.data import DataLoader, TensorDataset
from transformers import BertTokenizer, BertForSequenceClassification, AdamW

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import spacy
import pandas as pd
import numpy as np
from spacy.util import minibatch
from spacy.training.example import Example
import random
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split, StratifiedKFold
import xgboost as xgb
import optuna
from optuna.samplers import TPESampler
from joblib import dump
#import keras_core as keras

In [3]:
train = pd.read_csv("data/train.csv")
test = pd.read_csv("data/test.csv")

In [64]:
train.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [65]:
train.describe()

Unnamed: 0,id,target
count,7613.0,7613.0
mean,5441.934848,0.42966
std,3137.11609,0.49506
min,1.0,0.0
25%,2734.0,0.0
50%,5408.0,0.0
75%,8146.0,1.0
max,10873.0,1.0


In [66]:
train["keyword"].value_counts()

keyword
fatalities               45
deluge                   42
armageddon               42
sinking                  41
damage                   41
                         ..
forest%20fire            19
epicentre                12
threat                   11
inundation               10
radiation%20emergency     9
Name: count, Length: 221, dtype: int64

In [67]:
train["location"].value_counts()

location
USA                    104
New York                71
United States           50
London                  45
Canada                  29
                      ... 
MontrÌ©al, QuÌ©bec       1
Montreal                 1
ÌÏT: 6.4682,3.18287      1
Live4Heed??              1
Lincoln                  1
Name: count, Length: 3341, dtype: int64

In [68]:
nlp = spacy.blank("en")

In [69]:
text_cat = nlp.add_pipe("textcat")

In [70]:
text_cat.add_label("not_disaster")
text_cat.add_label("disaster")

1

In [71]:
train_texts = train["text"].values
train_labels = [{"cats": {"not_disaster": label == 0, "disaster": label == 1}} for label in train["target"]]

In [72]:
train_data = list(zip(train_texts, train_labels))

In [73]:
len(train_data)

7613

In [74]:
0.2 * len(train_data)

1522.6000000000001

In [75]:
NUM_EPOCHS = 50
PATIENCE = 4

In [76]:
spacy.util.fix_random_seed(1)
optimizer = nlp.begin_training()

losses = {}
last_scores = []
for epoch in range(NUM_EPOCHS):
    random.shuffle(train_data)
    val_data = train_data[:1522]
    train_batches = train_data[1522:]
    batches = minibatch(train_batches, size=10)
    for batch in batches:
        for text, label in batch:
            doc = nlp.make_doc(text)
            example = Example.from_dict(doc, label)
            nlp.update([example], sgd=optimizer, losses=losses)
    val_docs = [nlp.tokenizer(data[0]) for data in val_data]
    textcat = nlp.get_pipe('textcat')
    probs = textcat.predict(val_docs)
    predicted_labels = probs.argmax(axis=1)
    score = f1_score([1 if data[1]["cats"]["disaster"] else 0 for data in val_data], predicted_labels)
    last_scores.append(score)
    print(f"Loss: {losses['textcat']}, Score: {score}")
    if len(last_scores) == PATIENCE:
        count = 0
        for i in range(PATIENCE - 1):
            if last_scores[0] > last_scores[i + 1]:
                count += 1
        if count == PATIENCE - 1:
            break
        last_scores.pop(0)


[0.6150907354345749]
Loss: 1358.025030974966, Score: 0.6150907354345749
[0.6150907354345749, 0.7142857142857143]
Loss: 2404.181656655929, Score: 0.7142857142857143
[0.6150907354345749, 0.7142857142857143, 0.7752053771471247]
Loss: 3242.1701252296034, Score: 0.7752053771471247
[0.6150907354345749, 0.7142857142857143, 0.7752053771471247, 0.8235294117647058]
Loss: 3955.957848990022, Score: 0.8235294117647058
[0.7142857142857143, 0.7752053771471247, 0.8235294117647058, 0.8546465448768864]
Loss: 4506.599670834137, Score: 0.8546465448768864
[0.7752053771471247, 0.8235294117647058, 0.8546465448768864, 0.8835725677830941]
Loss: 4959.281331146276, Score: 0.8835725677830941
[0.8235294117647058, 0.8546465448768864, 0.8835725677830941, 0.9084139985107967]
Loss: 5328.507772018813, Score: 0.9084139985107967
[0.8546465448768864, 0.8835725677830941, 0.9084139985107967, 0.9179834462001505]
Loss: 5663.874517079755, Score: 0.9179834462001505
[0.8835725677830941, 0.9084139985107967, 0.9179834462001505, 0.

In [77]:
test_docs = [nlp.tokenizer(text) for text in test["text"]]
textcat = nlp.get_pipe('textcat')
probs = textcat.predict(test_docs)
predicted_labels = probs.argmax(axis=1)

In [78]:
predicted_labels

array([1, 1, 0, ..., 1, 1, 1], dtype=int64)

In [79]:
submission = pd.concat([test["id"], pd.Series(predicted_labels)], axis=1)
submission.columns = ["id", "target"]

In [80]:
submission.head()

Unnamed: 0,id,target
0,0,1
1,2,1
2,3,0
3,9,0
4,11,1


In [81]:
submission["target"].value_counts()

target
0    2042
1    1221
Name: count, dtype: int64

In [82]:
submission.to_csv("data/submission.csv", index=False)

In [87]:
nlp = spacy.load('en_core_web_lg')

In [93]:
with nlp.disable_pipes():
    text_embeddings = np.array([nlp(text).vector for text in train["text"]])
text_embeddings.shape

(7613, 300)

In [94]:
with nlp.disable_pipes():
    keyword_embeddings = np.array([nlp(str(text)).vector for text in train["keyword"]])
keyword_embeddings.shape

(7613, 300)

In [92]:
with nlp.disable_pipes():
    location_embeddings = np.array([nlp(str(text)).vector for text in train["location"]])
location_embeddings.shape

(7613, 300)

In [112]:
embeddings = np.concatenate([text_embeddings, keyword_embeddings, location_embeddings], axis=1)
# embeddings = text_embeddings

In [113]:
embeddings.shape

(7613, 900)

In [114]:
X_train, X_val, y_train, y_val = train_test_split(embeddings, train["target"], test_size=0.1, random_state=1)

In [115]:
cv = StratifiedKFold(5, shuffle=True, random_state=1)
target = train["target"].to_numpy()

In [117]:
run_optimization = False

def objective(trial):
    
    params = {
        'grow_policy': trial.suggest_categorical('grow_policy', ["depthwise", "lossguide"]),
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 1.0, log=True),
        'gamma' : trial.suggest_float('gamma', 1e-9, 0.5),
        'subsample': trial.suggest_float('subsample', 0.3, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.3, 1.0),
        'max_depth': trial.suggest_int('max_depth', 0, 12),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 7),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-9, 100.0, log=True),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-9, 100.0, log=True),
        
    }
    
    params['booster'] = 'gbtree'
    params['objective'] = 'binary:logistic'
    params["device"] = "cpu"
    params["verbosity"] = 0
    params['tree_method'] = "hist"

    cv_splits = cv.split(embeddings, y=train["target"])
    cv_scores = list()

    for train_idx, val_idx in cv_splits:
    
        xgb_model = xgb.XGBClassifier(**params)

        X_train_fold, X_val_fold = embeddings[train_idx], embeddings[val_idx]
        y_train_fold, y_val_fold = target[train_idx], target[val_idx]

        xgb_model.fit(X_train_fold, y_train_fold)

        y_val_pred = xgb_model.predict(X_val_fold)
        cv_scores.append(f1_score(train["target"].iloc[val_idx], y_val_pred))

    cv_evaluation = np.mean(cv_scores)
    
    return cv_evaluation

if run_optimization:
    
    sqlite_db = "sqlite:///sqlite.db"
    study_name = "disaster_tweets_classification"
    study = optuna.create_study(storage=sqlite_db, study_name=study_name, 
                                sampler=TPESampler(n_startup_trials=20, multivariate=False, seed=0),
                                direction="maximize", load_if_exists=True)

    study.optimize(objective, n_trials=100)
    best_cls_params = study.best_params
    best_value = study.best_value

else:
    
    best_value = 0.7543939986952639
    best_cls_params = {'grow_policy': 'depthwise', 
                       'n_estimators': 599, 
                       'learning_rate': 0.028803128737724382, 
                       'gamma': 0.22688396307743608, 
                       'subsample': 0.47210935600035175, 
                       'colsample_bytree': 0.6575932611006123, 
                       'max_depth': 6, 
                       'min_child_weight': 3, 
                       'reg_lambda': 1.690636802393145e-08, 
                       'reg_alpha': 3.50367253905031}

print(f"best optmized accuracy: {best_value:0.5f}")
print(f"best hyperparameters: {best_cls_params}")

best_cls_params['objective'] = 'binary:logistic'
best_cls_params['tree_method'] = "hist"
best_cls_params["device"] = "cpu"
best_cls_params["verbosity"] = 0

best optmized accuracy: 0.75439
best hyperparameters: {'grow_policy': 'depthwise', 'n_estimators': 599, 'learning_rate': 0.028803128737724382, 'gamma': 0.22688396307743608, 'subsample': 0.47210935600035175, 'colsample_bytree': 0.6575932611006123, 'max_depth': 6, 'min_child_weight': 3, 'reg_lambda': 1.690636802393145e-08, 'reg_alpha': 3.50367253905031}


In [118]:
xgb_model = xgb.XGBClassifier(**best_cls_params)
xgb_model.fit(X_train, y_train)

In [119]:
f1_score(y_val, xgb_model.predict(X_val))

0.7416107382550335

In [123]:
with nlp.disable_pipes():
    text_embeddings = np.array([nlp(text).vector for text in test["text"]])
    keyword_embeddings = np.array([nlp(str(text)).vector for text in test["keyword"]])
    location_embeddings = np.array([nlp(str(text)).vector for text in test["location"]])

In [127]:
print(text_embeddings.shape)
print(keyword_embeddings.shape)
print(location_embeddings.shape)

(3263, 300)
(3263, 300)
(3263, 300)


In [124]:
test_embeddings = np.concatenate([text_embeddings, keyword_embeddings, location_embeddings], axis=1)

In [125]:
submission = pd.concat([test["id"], pd.Series(xgb_model.predict(test_embeddings))], axis=1)
submission.columns = ["id", "target"]

In [126]:
submission.to_csv("data/submission.csv", index=False)

In [25]:
train["not_disaster"] = train["target"].map({1: 0, 0: 1})
train["disaster"] = train["target"]

In [28]:
train[["not_disaster", "disaster"]].iloc[50:58]

Unnamed: 0,not_disaster,disaster
50,0,1
51,0,1
52,1,0
53,0,1
54,1,0
55,0,1
56,0,1
57,1,0


In [59]:
# X_train, X_val, y_train, y_val = train_test_split(train["text"], train["target"], test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(train["text"], train[["not_disaster", "disaster"]], test_size=0.2, random_state=42)

In [60]:
# Token and Encode Function
def tokenize_and_encode(tokenizer, texts, labels=None, max_length=128):
    # Initialize empty lists to store tokenized inputs and attention masks
    input_ids = []
    attention_masks = []

    # Iterate through each comment in the 'comments' list
    for text in texts:
        # Tokenize and encode the comment using the BERT tokenizer
        encoded_dict = tokenizer.encode_plus(
            text,
            # Add special tokens like [CLS] and [SEP]
            add_special_tokens=True,
            # Truncate or pad the comment to 'max_length'
            max_length=max_length,
            # Pad the comment to 'max_length' with zeros if needed
            pad_to_max_length=True,
            # Return attention mask to mask padded tokens
            return_attention_mask=True,
            # Return PyTorch tensors
            return_tensors='pt'
        )

        # Append the tokenized input and attention mask to their respective lists
        input_ids.append(encoded_dict['input_ids'])
        attention_masks.append(encoded_dict['attention_mask'])

    # Concatenate the tokenized inputs and attention masks into tensors
    input_ids = torch.cat(input_ids, dim=0)
    attention_masks = torch.cat(attention_masks, dim=0)

    # Convert the labels to a PyTorch tensor with the data type float32
    if labels is not None:
        labels = torch.tensor(labels, dtype=torch.float32)

    # Return the tokenized inputs, attention masks, and labels as PyTorch tensors
    return input_ids, attention_masks, labels

In [61]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

In [62]:
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [63]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model = model.to(device)

In [64]:
# Tokenize and Encode the comments and labels for the training set
input_ids, attention_masks, labels = tokenize_and_encode(
    tokenizer,
    X_train,
    y_train.values
)

# Tokenize and Encode the comments and labels for the validation set
val_input_ids, val_attention_masks, val_labels = tokenize_and_encode(
    tokenizer,
    X_val,
    y_val.values
)

# Tokenize and Encode the comments and labels for the test set
test_input_ids, test_attention_masks, test_labels = tokenize_and_encode(
    tokenizer,
    test['text'],
)

print('Training Texts :',X_train.shape)
print('Input Ids         :',input_ids.shape)
print('Attention Mask    :',attention_masks.shape)
print('Labels            :',labels.shape)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Training Texts : (6090,)
Input Ids         : torch.Size([6090, 128])
Attention Mask    : torch.Size([6090, 128])
Labels            : torch.Size([6090, 2])


In [65]:
k = 5
print('Training Comments -->>',X_train.values[k])
print('\nInput Ids -->>\n',input_ids[k])
print('\nDecoded Ids -->>\n',tokenizer.decode(input_ids[k]))
print('\nAttention Mask -->>\n',attention_masks[k])
print('\nLabels -->>',labels[k])

Training Comments -->> Wreckage 'Conclusively Confirmed' as From MH370: Malaysia PM: Investigators and the families of those who were... http://t.co/yi54XRHQGB

Input Ids -->>
 tensor([  101, 21056,  1005,  9530, 23633,  2135,  4484,  1005,  2004,  2013,
         1049,  2232, 24434,  2692,  1024,  6027,  7610,  1024, 14766,  1998,
         1996,  2945,  1997,  2216,  2040,  2020,  1012,  1012,  1012,  8299,
         1024,  1013,  1013,  1056,  1012,  2522,  1013, 12316, 27009,  2595,
        25032,  4160, 18259,   102,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0, 

In [66]:
# Creating DataLoader for the balanced dataset
batch_size = 32
train_dataset = TensorDataset(input_ids, attention_masks, labels)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

# validation set 
val_dataset = TensorDataset(val_input_ids, val_attention_masks, val_labels)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

# testing set 
# test_dataset = TensorDataset(test_input_ids, test_attention_masks, test_labels)
# test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

In [67]:
print('Batch Size :',train_loader.batch_size)
Batch =next(iter(train_loader))
print('Each Input ids shape :',Batch[0].shape)
print('Input ids :\n',Batch[0][0])
print('Corresponding Decoded text:\n',tokenizer.decode(Batch[0][0]))
print('Corresponding Attention Mask :\n',Batch[1][0])
print('Corresponding Label:',Batch[2][0])

Batch Size : 32
Each Input ids shape : torch.Size([32, 128])
Input ids :
 tensor([  101, 20772,  3035,  2730,  2006,  2126,  2188,  2013,  1996, 20877,
         2011,  7186,  5380,   999,  1001,  2591,  2638,  9333,  8299,  1024,
         1013,  1013,  1056,  1012,  2522,  1013,  1058,  2213,  3489,  2595,
         3501,  3723,  2290,  2549,   102,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     

In [68]:
optimizer = AdamW(model.parameters(), lr=2e-5)



In [69]:
# Function to Train the Model
def train_model(model, train_loader, val_loader, optimizer, device, num_epochs):
    # Loop through the specified number of epochs
    for epoch in range(num_epochs):
        # Set the model to training mode
        model.train()
        # Initialize total loss for the current epoch
        total_loss = 0

        # Loop through the batches in the training data
        for batch in train_loader:
            input_ids, attention_mask, labels = [t.to(device) for t in batch]

            optimizer.zero_grad()

            outputs = model(
                input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            total_loss += loss.item()

            loss.backward()
            optimizer.step()

        model.eval()  # Set the model to evaluation mode
        val_loss = 0

        # Disable gradient computation during validation
        with torch.no_grad():
            for batch in val_loader:
                input_ids, attention_mask, labels = [
                    t.to(device) for t in batch]

                outputs = model(
                    input_ids, attention_mask=attention_mask, labels=labels)
                loss = outputs.loss
                val_loss += loss.item()
        # Print the average loss for the current epoch
        print(
            f'Epoch {epoch+1}, Training Loss: {total_loss/len(train_loader)},Validation loss:{val_loss/len(val_loader)}')


# Call the function to train the model
train_model(model, train_loader, val_loader, optimizer, device, num_epochs=3)

Epoch 1, Training Loss: 0.4559187924206569,Validation loss:0.4208747750769059
Epoch 2, Training Loss: 0.3309956993420087,Validation loss:0.38140885904431343
Epoch 3, Training Loss: 0.24313477481378934,Validation loss:0.4788528708741069
Epoch 4, Training Loss: 0.17983919563483816,Validation loss:0.5756213931987683
Epoch 5, Training Loss: 0.13365930649915134,Validation loss:0.5994240629176298
Epoch 6, Training Loss: 0.11110296706956718,Validation loss:0.5934469640875856
Epoch 7, Training Loss: 0.07668575344616986,Validation loss:0.7903188789884249
Epoch 8, Training Loss: 0.06609501547048463,Validation loss:0.739552770430843
Epoch 9, Training Loss: 0.058860252740554,Validation loss:0.8426448286821445
Epoch 10, Training Loss: 0.048459347574581535,Validation loss:0.8041484331091245


In [72]:
# Evaluate the Model
def evaluate_model(model, val_loader, device):
    model.eval()  # Set the model to evaluation mode

    true_labels = []
    predicted_probs = []

    with torch.no_grad():
        for batch in val_loader:
            input_ids, attention_mask, labels = [t.to(device) for t in batch]

            # Get model's predictions
            outputs = model(input_ids, attention_mask=attention_mask)
            # Use sigmoid for multilabel classification
            predicted_probs_batch = torch.sigmoid(outputs.logits)
            predicted_probs.append(predicted_probs_batch.cpu().numpy())

            true_labels_batch = labels.cpu().numpy()
            true_labels.append(true_labels_batch)

    # Combine predictions and labels for evaluation
    true_labels = np.concatenate(true_labels, axis=0)
    predicted_probs = np.concatenate(predicted_probs, axis=0)
    predicted_labels = (predicted_probs > 0.5).astype(
        int)  # Apply threshold for binary classification

    # Calculate evaluation metrics
    f1 = f1_score(true_labels, predicted_labels, average="macro")

    # Print the evaluation metrics
    print(f'F1 Score: {f1:.6f}')


# Call the function to evaluate the model on the test data
evaluate_model(model, val_loader, device)

F1 Score: 0.800131


In [73]:
# Save the tokenizer and model in the same directory
output_dir = "models/bert"
# Save model's state dictionary and configuration
model.save_pretrained(output_dir)
# Save tokenizer's configuration and vocabulary
tokenizer.save_pretrained(output_dir)

('models/bert\\tokenizer_config.json',
 'models/bert\\special_tokens_map.json',
 'models/bert\\vocab.txt',
 'models/bert\\added_tokens.json')

In [74]:
# Load the tokenizer and model from the saved directory
model_name = "models/bert"
Bert_Tokenizer = BertTokenizer.from_pretrained(model_name)
Bert_Model = BertForSequenceClassification.from_pretrained(
    model_name).to(device)

In [81]:
def predict_texts(texts, model=Bert_Model, tokenizer=Bert_Tokenizer, device=device):
    user_encodings = tokenizer(texts, truncation=True, padding=True, return_tensors="pt")

    user_dataset = TensorDataset(user_encodings['input_ids'], user_encodings['attention_mask'])
    user_loader = DataLoader(user_dataset, batch_size=len(texts), shuffle=False)

    model.eval()
    with torch.no_grad():
        for batch in user_loader:
            input_ids, attention_mask = [t.to(device) for t in batch]
            outputs = model(input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            predictions = torch.sigmoid(logits)

    predicted_labels = (predictions.cpu().numpy() > 0.5).astype(int)
    return predicted_labels


In [82]:
test["target"] = predict_texts(test["text"].to_list())[:,1]

In [83]:
test

Unnamed: 0,id,keyword,location,text,target
0,0,,,Just happened a terrible car crash,1
1,2,,,"Heard about #earthquake is different cities, s...",1
2,3,,,"there is a forest fire at spot pond, geese are...",1
3,9,,,Apocalypse lighting. #Spokane #wildfires,1
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan,1
...,...,...,...,...,...
3258,10861,,,EARTHQUAKE SAFETY LOS ANGELES ÛÒ SAFETY FASTE...,1
3259,10865,,,Storm in RI worse than last hurricane. My city...,1
3260,10868,,,Green Line derailment in Chicago http://t.co/U...,1
3261,10874,,,MEG issues Hazardous Weather Outlook (HWO) htt...,1


In [84]:
test["target"].value_counts()

target
0    1791
1    1472
Name: count, dtype: int64

In [85]:
submission = test[["id", "target"]]

In [86]:
submission.to_csv("data/submission.csv", index=False)