# Natural Language Processing with Disaster Tweets

In this competition, you’re challenged to build a machine learning model that predicts which Tweets are about real disasters and which ones aren’t. You’ll have access to a dataset of 10,000 tweets that were hand classified. If this is your first time working on an NLP problem, we've created a quick tutorial to get you up and running.

Different things to keep in mind compared to main.ipynb:
- Use all the columns
- Processing pipeline (lowercasing, stopword removal, punctuation removal, lemmatization, tokenization, and padding)
- Use ML classification algorithms

In [23]:
import pandas as pd

import numpy as np

import contractions

import os

import re

import tensorflow as tf


from textblob import TextBlob

from transformers import BertTokenizer
from transformers import TFBertForSequenceClassification

from sklearn.model_selection import train_test_split


from datetime import datetime

In [24]:
# Toggle between the Kaggle environment and a local checkout of the data.
kaggle_run = False

if not kaggle_run:
    # Local run: the CSV files live in the relative data/ folder.
    train = pd.read_csv('data/train.csv')
    test = pd.read_csv('data/test.csv')
    submission = pd.read_csv('data/sample_submission.csv')
else:
    # Kaggle run: list every file available under the input directory first.
    for folder, _, names in os.walk('/kaggle/input'):
        for name in names:
            print(os.path.join(folder, name))
    train = pd.read_csv('/kaggle/input/nlp-getting-started/train.csv')
    test = pd.read_csv('/kaggle/input/nlp-getting-started/test.csv')

# Preprocessing

In [25]:
def expand_contractions(text):
    """Return ``text`` after passing it through ``contractions.fix``."""
    expanded = contractions.fix(text)
    return expanded

def remove_numbers(text):
    """Return ``text`` with every run of digits deleted."""
    digit_runs = re.compile(r'\d+')
    return digit_runs.sub('', text)

def preprocessing(df):
    """Build a cleaned ``combined_text`` column from ``keyword`` and ``text``.

    Missing values are replaced with empty strings, ``keyword`` and ``text``
    are joined with a single space, and the ``id``, ``keyword``, ``location``
    and ``text`` columns are dropped. The combined text is then lowercased,
    stripped of URLs, @-mentions, digits and a few noise symbols, and has its
    contractions expanded.

    Args:
        df: DataFrame with ``id``, ``keyword``, ``location`` and ``text``
            columns. It is not modified.

    Returns:
        A new DataFrame containing ``combined_text`` plus every column of
        ``df`` that was not dropped.

    Raises:
        KeyError: If one of the required columns is missing (for example
            when the frame has already been preprocessed).
    """
    url_pattern = re.compile(r'http[s]?://\S+|www\.\S+')
    # Mentions: '@' plus everything up to the next whitespace character.
    mention_pattern = re.compile(r'@[^ \t\n\r\f\v]+')

    # fillna without inplace returns a new frame, so the caller's DataFrame
    # is left untouched by everything below.
    df = df.fillna('')

    # TODO: maybe apply TextBlob spelling correction for misspelled words.

    df['combined_text'] = df['keyword'] + ' ' + df['text']
    df = df.drop(['id', 'keyword', 'location', 'text'], axis=1)

    text = df['combined_text'].str.lower()
    text = text.apply(lambda value: url_pattern.sub('', value))
    text = text.apply(lambda value: mention_pattern.sub('', value))
    # A stand-alone separator is collapsed to one space (not to nothing) so
    # that the words on either side of it are not glued together.
    text = text.str.replace(' @ ', ' ', regex=False)

    text = text.apply(expand_contractions)
    text = text.apply(remove_numbers)

    # Applied in this order. Removing every '"' already covers the variants
    # with surrounding spaces, so those need no separate replacement.
    replacements = (
        ('#', ''),
        ('"', ''),
        (' | ', ' '),
        ('+', ''),
        ('*', ''),
        (' via ', ' '),
    )
    for old, new in replacements:
        text = text.str.replace(old, new, regex=False)

    df['combined_text'] = text
    return df

In [26]:
# Clean both splits with the same preprocessing function.
train, test = map(preprocessing, (train, test))

In [27]:
# Hold out 20% of the labelled tweets for validation (fixed seed).
train_texts, val_texts, train_labels, val_labels = train_test_split(
    train['combined_text'].tolist(),
    train['target'].tolist(),
    test_size=0.2,
    random_state=42,
)

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Settings shared by every tokenizer call and every dataset in this cell.
tokenizer_options = {'padding': True, 'truncation': True, 'return_tensors': 'tf'}
batch_size = 16

# Training split: tokenize, pair with labels, shuffle, then batch.
train_encodings = tokenizer(train_texts, **tokenizer_options)
train_dataset = (
    tf.data.Dataset
    .from_tensor_slices((dict(train_encodings), train_labels))
    .shuffle(len(train_texts))
    .batch(batch_size)
)

# Validation split: same encoding, batched without shuffling.
val_encodings = tokenizer(val_texts, **tokenizer_options)
val_dataset = (
    tf.data.Dataset
    .from_tensor_slices((dict(val_encodings), val_labels))
    .batch(batch_size)
)

model = TFBertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=2,
)

# Adam with an exponentially decaying learning rate starting at 3e-5.
lr_schedule = tf.keras.optimizers.schedules.ExponentialDecay(
    initial_learning_rate=3e-5,
    decay_steps=10000,
    decay_rate=0.9,
)
optimizer = tf.keras.optimizers.Adam(learning_rate=lr_schedule)

# The loss is configured with from_logits=True.
model.compile(
    optimizer=optimizer,
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=['accuracy'],
)

# Stop once val_loss has not improved for 3 epochs and restore the best weights.
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    patience=3,
    restore_best_weights=True,
)

history = model.fit(
    train_dataset,
    validation_data=val_dataset,
    epochs=20,
    callbacks=[early_stopping],
)

# Test split: features only (no labels), batched in file order.
test_encodings = tokenizer(test['combined_text'].tolist(), **tokenizer_options)
test_dataset = (
    tf.data.Dataset
    .from_tensor_slices(dict(test_encodings))
    .batch(batch_size)
)

predictions = model.predict(test_dataset)

All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.




In [28]:
# Show the raw object returned by model.predict; its `.logits` attribute is
# what the later prediction cell converts into probabilities.
print(predictions)

TFSequenceClassifierOutput(loss=None, logits=array([[-0.57171196,  0.9919148 ],
       [-0.52915317,  0.6366399 ],
       [-0.85720074,  1.1818699 ],
       ...,
       [-1.295325  ,  1.4281175 ],
       [-0.7490389 ,  1.11756   ],
       [-0.0909982 ,  0.02976773]], dtype=float32), hidden_states=None, attentions=None)


# Prepare data

In [11]:
# ----- Train preprocess ------
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenize the full (already preprocessed) training text as TF tensors.
tokenized_data = tokenizer(
    train['combined_text'].tolist(),
    padding=True,
    truncation=True,
    return_tensors='tf'
)

labels = train['target'].tolist()

train_dataset = tf.data.Dataset.from_tensor_slices((
    dict(tokenized_data), 
    labels 
))

train_dataset = train_dataset.shuffle(len(train)).batch(16)

# ----- Test preprocess ------

# preprocessing() drops the raw 'keyword' and 'text' columns, so calling it
# on a frame that has already been processed raises KeyError. Only run it
# while the cleaned column is still missing.
if 'combined_text' not in test.columns:
    test = preprocessing(test)

# NOTE: from here on `test` is rebound, first to the tokenizer output and
# then to a batched tf.data.Dataset; it is no longer a DataFrame afterwards.
test = tokenizer(
    test['combined_text'].tolist(),
    padding=True,
    truncation=True,
    return_tensors='tf'
)

test = tf.data.Dataset.from_tensor_slices((dict(test)))
test = test.batch(16)

## Build model

In [8]:
# Load BERT model for sequence classification
model = TFBertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

# Compile the model.
# The string 'adam' would select Keras' default learning rate (1e-3), which
# is far larger than what is used for fine-tuning a pretrained BERT. Use the
# same 3e-5 starting rate as the other training cell in this notebook.
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=3e-5),
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=['accuracy']
)

# Train the model
model.fit(train_dataset, epochs=10)

Epoch 1/10
Cause: for/else statement not yet supported


I0000 00:00:1725296783.571265     106 service.cc:145] XLA service 0x7e36f07cd710 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1725296783.571325     106 service.cc:153]   StreamExecutor device (0): Tesla T4, Compute Capability 7.5
I0000 00:00:1725296783.571331     106 service.cc:153]   StreamExecutor device (1): Tesla T4, Compute Capability 7.5
I0000 00:00:1725296783.748529     106 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tf_keras.src.callbacks.History at 0x7e37477ae110>

## Prediction on new data 

In [29]:
# `predictions` is the output of the earlier model.predict call; its logits
# are turned into per-class probabilities and then into class indices.
logits = predictions.logits
probabilities = tf.nn.softmax(logits, axis=-1)

# Index of the most probable class for each row.
class_ids = tf.argmax(probabilities, axis=1)
predicted_classes = class_ids.numpy()

print(f"Probabilities \n {probabilities}")
print(f"Predicted classes \n {predicted_classes}")

Probabilities 
 [[0.17312683 0.8268731 ]
 [0.23761626 0.76238376]
 [0.1151614  0.8848386 ]
 ...
 [0.06160416 0.9383959 ]
 [0.13393575 0.86606425]
 [0.46984515 0.5301548 ]]
Predicted classes 
 [1 1 1 ... 1 1 1]


## Prepare upload

In [30]:
choosen_model_name = 'bert_e1_new_attempt'
choosen_model_predictions = predicted_classes

# Timestamp the file name so that repeated runs do not overwrite each other.
now = datetime.now()
date_time_str = now.strftime("%Y%m%d_%H%M%S")

# The two environments differ only in where the ids are read from and where
# the submission file is written.
if kaggle_run:
    test_csv_path = '/kaggle/input/nlp-getting-started/test.csv'
    output_dir = '/kaggle/working'
else:
    test_csv_path = 'data/test.csv'
    output_dir = 'output'

# The ids are re-read from the raw test file and paired with the predictions.
submission = pd.DataFrame({
    'id': pd.read_csv(test_csv_path)['id'],
    'target': choosen_model_predictions
})

# to_csv does not create missing directories, so make sure the target exists.
os.makedirs(output_dir, exist_ok=True)
submission.to_csv(
    f'{output_dir}/submission_{choosen_model_name}_{date_time_str}.csv',
    index=False,
)