# Natural Language Processing with Disaster Tweets

In this competition, you’re challenged to build a machine learning model that predicts which Tweets are about real disasters and which ones aren’t. You’ll have access to a dataset of 10,000 tweets that were hand classified. If this is your first time working on an NLP problem, we've created a quick tutorial to get you up and running.

Differences to keep in mind compared to main.ipynb:
- Use all the columns
- Processing pipeline (lowercasing, stopword removal, punctuation removal, lemmatization, tokenization, and padding)
- Use ML classification algorithms

In [None]:
%pip install contractions

In [3]:
import pandas as pd

import numpy as np

import contractions

import os

import re

import tensorflow as tf

from textblob import TextBlob

from transformers import BertTokenizer, TFBertForSequenceClassification

from sklearn.model_selection import train_test_split

from datetime import datetime

In [4]:
# Toggle between the Kaggle environment and a local checkout of the data.
kaggle_run = True

if not kaggle_run:
    # Local run: the CSV files live under ./data, including the sample submission.
    train = pd.read_csv('data/train.csv')
    test = pd.read_csv('data/test.csv')
    submission = pd.read_csv('data/sample_submission.csv')
else:
    # Kaggle run: list every file mounted under /kaggle/input, then load the
    # competition's train and test splits.
    for root, _, names in os.walk('/kaggle/input'):
        for name in names:
            print(os.path.join(root, name))
    train = pd.read_csv('/kaggle/input/nlp-getting-started/train.csv')
    test = pd.read_csv('/kaggle/input/nlp-getting-started/test.csv')

/kaggle/input/nlp-getting-started/sample_submission.csv
/kaggle/input/nlp-getting-started/train.csv
/kaggle/input/nlp-getting-started/test.csv


# Preprocessing

In [5]:
def expand_contractions(text):
    """Return *text* after passing it through ``contractions.fix``.

    The ``contractions`` package is expected to rewrite contracted forms
    (e.g. "don't") into their long form; the result is returned unchanged.
    """
    expanded = contractions.fix(text)
    return expanded

def remove_numbers(text):
    """Return *text* with every run of digits deleted.

    Surrounding characters are left untouched, so whitespace on either side
    of a removed number is preserved.
    """
    digit_runs = re.compile(r'\d+')
    return digit_runs.sub('', text)

def correct_spelling(text):
    """Return the result of ``TextBlob(text).correct()`` converted to ``str``."""
    corrected_blob = TextBlob(text).correct()
    return str(corrected_blob)

def preprocessing(df):
    """Build a cleaned ``combined_text`` column from ``keyword`` and ``text``.

    Cleaning steps, applied in this order:
      1. replace missing values with the empty string;
      2. join ``keyword`` and ``text`` with a single space and lowercase;
      3. remove URLs (``http://``, ``https://`` and ``www.`` forms);
      4. remove ``@mention`` tokens and stand-alone `` @ ``;
      5. expand contractions, then remove digit runs;
      6. remove a fixed list of literal tokens (``#``, ``"``, `` | ``,
         ``+``, ``*``, `` via ``).

    Args:
        df: DataFrame holding at least the columns ``id``, ``keyword``,
            ``location`` and ``text``. It is not modified.

    Returns:
        A new DataFrame without ``id``, ``keyword``, ``location`` and
        ``text``, with the cleaned ``combined_text`` column appended. Any
        other column (e.g. ``target``) is carried over.

    Raises:
        KeyError: if one of the required columns is missing.
    """
    # Non-inplace fillna returns a new frame, so the caller's DataFrame is
    # never mutated as a hidden side effect.
    filled = df.fillna('')

    combined = filled['keyword'] + ' ' + filled['text']
    combined = combined.str.lower()

    # URLs first, so '@' or digits inside a link never reach later steps.
    combined = combined.apply(lambda x: re.sub(r'http[s]?://\S+|www\.\S+', '', x))

    # '@' followed by non-whitespace is a mention; a lone ' @ ' is dropped too.
    combined = combined.apply(lambda x: re.sub(r'@[^ \t\n\r\f\v]+', '', x))
    combined = combined.str.replace(' @ ', '', regex=False)

    combined = combined.apply(expand_contractions)
    combined = combined.apply(remove_numbers)

    # Order matters: ' | ' must be tested before '+' and '*' are removed, and
    # ' via ' after. Removing '"' already covers quotes next to spaces, so no
    # separate space+quote variants are needed.
    # NOTE(review): ' | ' and ' via ' are replaced by '' (not ' '), which
    # glues the neighbouring words together - confirm this is intended.
    for literal in ('#', '"', ' | ', '+', '*', ' via '):
        combined = combined.str.replace(literal, '', regex=False)

    cleaned = filled.drop(['id', 'keyword', 'location', 'text'], axis=1)
    cleaned['combined_text'] = combined
    return cleaned

In [6]:
# Clean both splits with the same pipeline; each name is rebound to the
# frame returned by preprocessing().
train, test = preprocessing(train), preprocessing(test)

In [7]:
# Hold out 20% of the labelled tweets for validation; the fixed seed keeps
# the split reproducible between runs.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    train['combined_text'].tolist(),
    train['target'].tolist(),
    test_size=0.2,
    random_state=42
)

# Single source of truth so the tokenizer and the model can never end up on
# different checkpoints. The first run used 'bert-base-uncased'.
MODEL_NAME = 'bert-large-uncased'
BATCH_SIZE = 16

tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)


def _encode(texts):
    """Tokenize a list of strings into TensorFlow tensors.

    Sequences are padded to a common length within the call and cut off if
    they exceed the model's maximum input length.
    """
    return tokenizer(
        texts,
        padding=True,
        truncation=True,
        return_tensors='tf'
    )


train_encodings = _encode(train_texts)

# Get data in TF format: (feature dict, label) pairs.
train_dataset = tf.data.Dataset.from_tensor_slices((
    dict(train_encodings),
    train_labels
))

# Shuffle over the whole training set, then group into batches.
train_dataset = train_dataset.shuffle(len(train_texts)).batch(BATCH_SIZE)

val_encodings = _encode(val_texts)

val_dataset = tf.data.Dataset.from_tensor_slices((
    dict(val_encodings),
    val_labels
))

# Validation data is only batched, never shuffled.
val_dataset = val_dataset.batch(BATCH_SIZE)

model = TFBertForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2)

lr_schedule = tf.keras.optimizers.schedules.ExponentialDecay(
    initial_learning_rate=3e-5,
    decay_steps=10000,
    decay_rate=0.9
)

optimizer = tf.keras.optimizers.Adam(learning_rate=lr_schedule)

# The model outputs raw logits, hence from_logits=True with integer labels.
model.compile(
    optimizer=optimizer,
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=['accuracy']
)

# Stop once val_loss has not improved for 3 epochs and roll back to the
# best weights seen.
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    patience=3,
    restore_best_weights=True
)

history = model.fit(
    train_dataset,
    validation_data=val_dataset,
    epochs=10,
    callbacks=[early_stopping]
)

test_encodings = _encode(test['combined_text'].tolist())

# No labels for the test split: the dataset yields feature dicts only.
test_dataset = tf.data.Dataset.from_tensor_slices(dict(test_encodings))
test_dataset = test_dataset.batch(BATCH_SIZE)

predictions = model.predict(test_dataset)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/10
Cause: for/else statement not yet supported


I0000 00:00:1725452751.087094     121 service.cc:145] XLA service 0x7eaa3f3e54d0 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1725452751.087145     121 service.cc:153]   StreamExecutor device (0): Tesla T4, Compute Capability 7.5
I0000 00:00:1725452751.087149     121 service.cc:153]   StreamExecutor device (1): Tesla T4, Compute Capability 7.5
I0000 00:00:1725452751.254062     121 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10


## Prediction on new data 

In [8]:
# Turn the raw logits into per-class probabilities, then keep the index of
# the most probable class for each row as a NumPy array.
class_probabilities = tf.nn.softmax(predictions.logits, axis=-1)
predictions = tf.argmax(class_probabilities, axis=1).numpy()

## Prepare upload

In [9]:
choosen_model_name = 'bert_e20'
choosen_model_predictions = predictions

now = datetime.now()
date_time_str = now.strftime("%Y%m%d_%H%M%S")

# Pick input/output locations per environment. Local runs get a file name
# tagged with the model name and a timestamp.
if kaggle_run:
    test_csv_path = '/kaggle/input/nlp-getting-started/test.csv'
    submission_path = '/kaggle/working/submission.csv'
else:
    test_csv_path = 'data/test.csv'
    submission_path = f'output/submission_{choosen_model_name}_{date_time_str}.csv'
    # to_csv does not create missing directories, so make sure it exists.
    os.makedirs('output', exist_ok=True)

# The ids are re-read from the raw test file and paired with the predicted
# labels.
submission = pd.DataFrame({
    'id': pd.read_csv(test_csv_path)['id'],
    'target': choosen_model_predictions
})

if kaggle_run:
    print(submission['target'].tolist())

submission.to_csv(submission_path, index=False)

[1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 