# Natural Language Processing with Disaster Tweets

In this competition, you’re challenged to build a machine learning model that predicts which Tweets are about real disasters and which one’s aren’t. You’ll have access to a dataset of 10,000 tweets that were hand classified. If this is your first time working on an NLP problem, we've created a quick tutorial to get you up and running.

Different things to keep in mind compare to main.ipynb:
- Use all the columns
- Processing pipeline (lowercasing, stopword removal, punctuation removal, lemmatization, tokenization, and padding)
- Use ML classification algorithms

In [1]:
import pandas as pd

import numpy as np

import os

import re

import spacy

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.optimizers import Adam

from transformers import BertTokenizer
from transformers import TFBertForSequenceClassification

from datetime import datetime

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

In [2]:
kaggle_run = True
if kaggle_run:
    for dirname, _, filenames in os.walk('/kaggle/input'):
        for filename in filenames:
            print(os.path.join(dirname, filename))
    train = pd.read_csv('/kaggle/input/nlp-getting-started/train.csv')
    test = pd.read_csv('/kaggle/input/nlp-getting-started/test.csv')
else:
    train = pd.read_csv('data/train.csv')
    test = pd.read_csv('data/test.csv')
    submission = pd.read_csv('data/sample_submission.csv')

/kaggle/input/nlp-getting-started/sample_submission.csv
/kaggle/input/nlp-getting-started/train.csv
/kaggle/input/nlp-getting-started/test.csv


# Exploratory data analysis

## Preprocessing

In [3]:
def preprocessing(df):
    df.fillna('', inplace=True)
    
    df['combined_text'] = df['keyword'] + ' ' + df['location'] + ' ' + df['text']
    df = df.drop(['id','keyword','location','text'], axis=1)
    return df

In [4]:
train = preprocessing(train)

In [5]:
# ----- Train preprocess ------
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

tokenized_data = tokenizer(
    train['combined_text'].tolist(),
    padding=True,
    truncation=True,
    return_tensors='tf'
)

labels = train['target'].tolist()

train_dataset = tf.data.Dataset.from_tensor_slices((
    dict(tokenized_data), 
    labels  # Corresponding labels
))

train_dataset = train_dataset.shuffle(len(train)).batch(16)



# ----- Test preprocess ------

test = preprocessing(test)

test = tokenizer(
    test['combined_text'].tolist(),
    padding=True,
    truncation=True,
    return_tensors='tf'
)

test = tf.data.Dataset.from_tensor_slices((dict(test)))
test = test.batch(16)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Build model

In [8]:
# Load BERT model for sequence classification
model = TFBertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

# Compile the model
model.compile(
    optimizer='adam',
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=['accuracy']
)

# Train the model
model.fit(train_dataset, epochs=10)

Epoch 1/10
Cause: for/else statement not yet supported


I0000 00:00:1725296783.571265     106 service.cc:145] XLA service 0x7e36f07cd710 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1725296783.571325     106 service.cc:153]   StreamExecutor device (0): Tesla T4, Compute Capability 7.5
I0000 00:00:1725296783.571331     106 service.cc:153]   StreamExecutor device (1): Tesla T4, Compute Capability 7.5
I0000 00:00:1725296783.748529     106 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tf_keras.src.callbacks.History at 0x7e37477ae110>

## Prediction on new data 

In [14]:
predictions = model.predict(test)
print(f"Direct model predictions: \n {predictions}")

probabilities = tf.nn.softmax(predictions.logits, axis=-1)
print(f"Probabilities \n {probabilities}")

predicted_classes = tf.argmax(probabilities, axis=1).numpy()
#final_predictions = tf.where(probabilities >= 0.5, 1.0, 0.0)

print(f"Predicted classes \n {predicted_classes}")

Direct model predictions: 
 TFSequenceClassifierOutput(loss=None, logits=array([[ 0.14093712, -0.07750207],
       [ 0.14093712, -0.07750208],
       [ 0.14093712, -0.07750207],
       ...,
       [ 0.14093712, -0.07750209],
       [ 0.1409371 , -0.07750207],
       [ 0.1409371 , -0.07750208]], dtype=float32), hidden_states=None, attentions=None)
Probabilities 
 [[0.5543937  0.44560632]
 [0.5543937  0.44560632]
 [0.5543937  0.44560632]
 ...
 [0.5543937  0.44560632]
 [0.5543937  0.44560632]
 [0.5543937  0.44560632]]
Predicted classes 
 [0 0 0 ... 0 0 0]


## Prepare upload

In [18]:
choosen_model_name = 'bert_e10'
choosen_model_predictions = predicted_classes

now = datetime.now()
date_time_str = now.strftime("%Y%m%d_%H%M%S")

if kaggle_run:
    submission = pd.DataFrame({
        'id': pd.read_csv('/kaggle/input/nlp-getting-started/test.csv')['id'],
        'target': choosen_model_predictions
    })

    submission.to_csv(f'/kaggle/working/submission_{choosen_model_name}_{date_time_str}.csv', index=False)
else:
    submission = pd.DataFrame({
        'id': pd.read_csv('data/test.csv')['id'],
        'target': choosen_model_predictions
    })
    submission.to_csv(f'output/submission_{choosen_model_name}_{date_time_str}.csv', index=False)