# Natural Language Processing with Disaster Tweets

In this competition, you’re challenged to build a machine learning model that predicts which Tweets are about real disasters and which ones aren’t. You’ll have access to a dataset of 10,000 tweets that were hand-classified. If this is your first time working on an NLP problem, we've created a quick tutorial to get you up and running.

Things to keep in mind that differ from main.ipynb:
- Use all the columns
- Processing pipeline (lowercasing, stopword removal, punctuation removal, lemmatization, tokenization, and padding)
- Use ML classification algorithms

In [2]:
import pandas as pd

import numpy as np

import os

import re

#from .autonotebook import tqdm as notebook_tqdm

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.optimizers import Adam

from transformers import BertTokenizer
from transformers import TFBertForSequenceClassification

from datetime import datetime

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

In [3]:
# Switch between the Kaggle environment and a local copy of the data.
kaggle_run = True

if not kaggle_run:
    # Local run: the CSV files are read from the relative `data/` directory.
    train = pd.read_csv('data/train.csv')
    test = pd.read_csv('data/test.csv')
    submission = pd.read_csv('data/sample_submission.csv')
else:
    # Kaggle run: print every file found under the input directory, then
    # read the competition's train and test CSVs.
    for root, _, names in os.walk('/kaggle/input'):
        for name in names:
            print(os.path.join(root, name))
    train = pd.read_csv('/kaggle/input/nlp-getting-started/train.csv')
    test = pd.read_csv('/kaggle/input/nlp-getting-started/test.csv')

/kaggle/input/nlp-getting-started/sample_submission.csv
/kaggle/input/nlp-getting-started/train.csv
/kaggle/input/nlp-getting-started/test.csv


# Exploratory data analysis

## Preprocessing

In [4]:
def preprocessing(df):
    """Merge the keyword, location and text columns into one text column.

    Missing values are replaced by empty strings, then ``keyword``,
    ``location`` and ``text`` are joined with single spaces into a new
    ``combined_text`` column. The ``id``, ``keyword``, ``location`` and
    ``text`` columns are dropped from the result; any other column is
    passed through.

    The input frame is not modified: all work happens on a copy, so the
    function can be called repeatedly on the same raw data.

    Args:
        df: DataFrame containing at least the ``id``, ``keyword``,
            ``location`` and ``text`` columns.

    Returns:
        A new DataFrame with the remaining columns followed by
        ``combined_text``.
    """
    # Without ``inplace=True`` fillna returns a new frame, which keeps the
    # caller's DataFrame free of side effects.
    filled = df.fillna('')

    filled['combined_text'] = (
        filled['keyword'] + ' ' + filled['location'] + ' ' + filled['text']
    )
    return filled.drop(['id', 'keyword', 'location', 'text'], axis=1)

In [5]:
# Replace the raw training frame with its preprocessed version: the text
# fields are merged into `combined_text` and the id/keyword/location/text
# columns are dropped.
train = preprocessing(train)

In [6]:
# ----- Train preprocess ------
# The same pretrained tokenizer is used for the train and test texts.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

train_texts = train['combined_text'].tolist()
tokenized_data = tokenizer(train_texts, padding=True, truncation=True, return_tensors='tf')

labels = train['target'].tolist()

# Pair the tokenizer output (as a plain dict) with the labels, shuffle with
# a buffer as large as the training frame, and batch in groups of 16.
train_features = dict(tokenized_data)
train_dataset = (
    tf.data.Dataset.from_tensor_slices((train_features, labels))
    .shuffle(len(train))
    .batch(16)
)

# ----- Test preprocess ------
# `test` is rebound step by step: raw frame -> preprocessed frame ->
# tokenizer output -> batched, unshuffled dataset without labels.
test = preprocessing(test)
test = tokenizer(test['combined_text'].tolist(), padding=True, truncation=True, return_tensors='tf')
test = tf.data.Dataset.from_tensor_slices(dict(test)).batch(16)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]



## Build model

In [None]:
# Load the pretrained BERT encoder with a sequence-classification head for
# the two target classes.
model = TFBertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

# Compile the model.
# Fine-tuning a pretrained transformer needs a small learning rate: the
# default used by optimizer='adam' (1e-3) is far too high for fine-tuning
# BERT. 2e-5 is within the range recommended for BERT fine-tuning.
# The model outputs raw logits, hence from_logits=True in the loss.
model.compile(
    optimizer=Adam(learning_rate=2e-5),
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=['accuracy']
)

# Train the model
model.fit(train_dataset, epochs=20)

## Prediction on new data 

In [None]:
# Run the fine-tuned model over the batched test dataset.
predictions = model.predict(test)
print(f"Direct model predictions: \n {predictions}")

# Turn the logits into per-class probabilities.
logits = predictions.logits
probabilities = tf.nn.softmax(logits, axis=-1)
print(f"Probabilities \n {probabilities}")

# The predicted class is the index of the largest probability in each row.
predicted_classes = tf.argmax(probabilities, axis=1).numpy()

print(f"Predicted classes \n {predicted_classes}")

## Prepare upload

In [None]:
# Label used in the submission file name; 'e20' matches the epochs=20 used
# when fitting the model.
choosen_model_name = 'bert_e20'
choosen_model_predictions = predicted_classes

# Timestamp the file so repeated runs do not overwrite earlier submissions.
now = datetime.now()
date_time_str = now.strftime("%Y%m%d_%H%M%S")

# Only the input and output locations differ between the two environments.
if kaggle_run:
    test_csv_path = '/kaggle/input/nlp-getting-started/test.csv'
    output_dir = '/kaggle/working'
else:
    test_csv_path = 'data/test.csv'
    output_dir = 'output'

# to_csv does not create missing directories; make sure the target exists.
os.makedirs(output_dir, exist_ok=True)

# The ids are re-read from the original CSV because `test` has been rebound
# to a tf.data.Dataset by this point.
submission = pd.DataFrame({
    'id': pd.read_csv(test_csv_path)['id'],
    'target': choosen_model_predictions
})

if kaggle_run:
    print(submission)
    print(submission['target'])

submission.to_csv(f'{output_dir}/submission_{choosen_model_name}_{date_time_str}.csv', index=False)