<a href="https://colab.research.google.com/github/Mark-Barbaric/Kaggle/blob/AP-36-Fine-tune-BERT-Pretrained-Model-for-Disaster-Tweets/disaster_tweets/NLP_With_Disaster_Tweets_Pretrained_BERT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [15]:

# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES
# TO THE CORRECT LOCATION (/kaggle/input) IN YOUR NOTEBOOK,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

import os
import sys
from tempfile import NamedTemporaryFile
from urllib.request import urlopen
from urllib.parse import unquote, urlparse
from urllib.error import HTTPError
from zipfile import ZipFile
import tarfile
import shutil

# Number of bytes read from the HTTP response per iteration of the download loop.
CHUNK_SIZE = 40960
# Comma-separated list of '<directory>:<url-encoded download URL>' entries.
# NOTE(review): the URL is a signed link carrying X-Goog-Date / X-Goog-Expires
# parameters, so it is time-limited; the download loop reports HTTP failures as
# "likely expired". Regenerate the notebook from Kaggle if the download fails.
DATA_SOURCE_MAPPING = 'nlp-getting-started:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-competitions-data%2Fkaggle-v2%2F17777%2F869809%2Fbundle%2Farchive.zip%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20240724%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20240724T071108Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3D20d6ecea972c4dc584acdf62d0e2e81c3e8fff0e379f16deacef5ca2594a0d7910f1eac7339d38c8bda1f99910cab2fb6a7e21948f183ae1b372f5feeabcd4c276fcb537fd4842547248104f1dec0c3b4c73580c374e73e635d2cb0f725ce1279a68d817fcaf6394ecda5d40fdf029ad537a88665ac7af93925f67ed34b5f4e4ba002dd6cc7318e4595ffa6dda69f0cedc7f2754723b5fbf4a353fc52b1986b45ffafdbcc3c692ae53e4e710dcb5014f8cc768a28dc3db05796907940507b41e95ac384039d1b7872b0cf0a9ab9051c5f89d25f2327e71dfe60f75559864e5c9238a9454c23fc58b1f1f5a93e6652154f5f1918de20b77ef2b380f520a15229c'

# Directory that receives the extracted data sources (one sub-directory per entry).
KAGGLE_INPUT_PATH='/kaggle/input'
# Working directory created alongside the input directory.
KAGGLE_WORKING_PATH='/kaggle/working'
# Not referenced by the code visible in this notebook.
KAGGLE_SYMLINK='kaggle'

!umount /kaggle/input/ 2> /dev/null
shutil.rmtree('/kaggle/input', ignore_errors=True)
os.makedirs(KAGGLE_INPUT_PATH, 0o777, exist_ok=True)
os.makedirs(KAGGLE_WORKING_PATH, 0o777, exist_ok=True)

try:
  os.symlink(KAGGLE_INPUT_PATH, os.path.join("..", 'input'), target_is_directory=True)
except FileExistsError:
  pass
try:
  os.symlink(KAGGLE_WORKING_PATH, os.path.join("..", 'working'), target_is_directory=True)
except FileExistsError:
  pass

# Download every configured data source into KAGGLE_INPUT_PATH/<directory> and
# unpack it there. Failures are reported and the loop moves on to the next entry.
for data_source_mapping in DATA_SOURCE_MAPPING.split(','):
    # Each entry is '<directory>:<url-encoded URL>'. Split on the first colon
    # only, so a stray ':' in the URL part cannot break the unpacking.
    directory, download_url_encoded = data_source_mapping.split(':', 1)
    download_url = unquote(download_url_encoded)
    filename = urlparse(download_url).path
    destination_path = os.path.join(KAGGLE_INPUT_PATH, directory)
    try:
        with urlopen(download_url) as fileres, NamedTemporaryFile() as tfile:
            # The Content-Length header is optional; without it the progress
            # bar cannot be scaled, so only the running byte count is shown.
            total_length = fileres.headers['content-length']
            total_bytes = int(total_length) if total_length else 0
            print(f'Downloading {directory}, {total_length} bytes compressed')
            dl = 0
            # read() returns b'' at end of stream, which ends the iteration.
            for data in iter(lambda: fileres.read(CHUNK_SIZE), b''):
                dl += len(data)
                tfile.write(data)
                if total_bytes:
                    done = int(50 * dl / total_bytes)
                    sys.stdout.write(f"\r[{'=' * done}{' ' * (50-done)}] {dl} bytes downloaded")
                else:
                    sys.stdout.write(f"\r{dl} bytes downloaded")
                sys.stdout.flush()
            # Push buffered bytes to the file and rewind before handing the
            # same file object to the archive readers.
            tfile.flush()
            tfile.seek(0)
            if filename.endswith('.zip'):
                with ZipFile(tfile) as zfile:
                    zfile.extractall(destination_path)
            else:
                # Read through the open file object (no reopen by name) and
                # bind the archive to its own name so the `tarfile` module is
                # not shadowed for later iterations or cells.
                with tarfile.open(fileobj=tfile) as tar_archive:
                    tar_archive.extractall(destination_path)
            print(f'\nDownloaded and uncompressed: {directory}')
    except HTTPError:
        # Must precede OSError: HTTPError is one of its subclasses.
        print(f'Failed to load (likely expired) {download_url} to path {destination_path}')
    except OSError:
        print(f'Failed to load {download_url} to path {destination_path}')

print('Data source import complete.')


Downloading nlp-getting-started, 607343 bytes compressed
Downloaded and uncompressed: nlp-getting-started
Data source import complete.


# Disaster Tweets Using KerasNLP and Pretrained BERT

![image.png](attachment:image.png)

In [2]:
import os

In [3]:
# True when the Kaggle-style input directory exists.
# NOTE(review): the data-import cell at the top of this notebook creates
# /kaggle/input itself (os.makedirs), so once that cell has run this flag is
# True on any host — it signals "data is under /kaggle/input", not "running on
# Kaggle".
IS_KAGGLE = os.path.exists('/kaggle/input')
print(IS_KAGGLE)

True


If this is being run in Kaggle then the dependencies will need to be installed directly into the image.

In [4]:
if IS_KAGGLE:
    print("Installing additional libs")
    !pip install keras-core --upgrade
    !pip install -q keras-nlp --upgrade
    !pip install nltk tweet-preprocessor

Installing additional libs
Collecting tweet-preprocessor
  Downloading tweet_preprocessor-0.6.0-py3-none-any.whl (27 kB)
Installing collected packages: tweet-preprocessor
Successfully installed tweet-preprocessor-0.6.0


In [5]:
# Select the Keras backend through the environment. This cell runs before the
# keras_core / keras_nlp imports below.
# NOTE(review): presumably the variable must be set before those imports to
# take effect — confirm against the Keras documentation before reordering cells.
os.environ['KERAS_BACKEND'] = 'tensorflow'

## Lib Imports

In [6]:
import numpy as np
import pandas as pd
import tensorflow as tf
import keras_core as keras
import keras_nlp
import seaborn as sns
import matplotlib.pyplot as plt

Using TensorFlow backend


In [8]:
import nltk

# Fetch the NLTK stopwords corpus that stopwords.words('english') reads below.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [9]:
from nltk.corpus import stopwords
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split

# Kept as a set: preprocess_text tests every token with `w not in stopwords`.
ENGLISH_STOPWORDS = set(stopwords.words('english'))

# Local Helper Method Imports

Add the current working directory to `sys.path` so that local helper modules can be imported.

In [10]:
import sys
from pathlib import Path

# Make modules that live in the current working directory importable.
current_path = Path.cwd()
print(f"current_path: {current_path}")
# sys.path entries must be plain strings (a Path object is ignored by the
# import system). The membership check keeps re-runs of this cell from
# appending the same entry again.
if str(current_path) not in sys.path:
    sys.path.append(str(current_path))

current_path: /content


In [11]:
#from disaster_tweet_helpers import preprocess_text

In [12]:
import preprocessor as tweet_preprocessor
import string


def preprocess_text(text, stopwords):
    """Normalise a single tweet into a space-separated string of tokens.

    The text is lower-cased, stripped of every character in
    ``string.punctuation``, whitespace-normalised, passed through
    ``tweet_preprocessor.clean`` and finally filtered of stopwords.

    Args:
        text (str): Raw tweet text.
        stopwords: Container of lower-case tokens to drop; only membership
            tests (``in``) are performed on it, so a ``set`` is the best fit.

    Returns:
        str: The remaining tokens joined by single spaces. Empty when every
        token was removed.
    """
    text = text.lower()
    # Delete all punctuation in one pass instead of one str.replace per character.
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Collapse whitespace runs. The previous str.replace('\s\s+', ' ') treated
    # the pattern as a literal string, so it never matched anything.
    text = ' '.join(text.split())
    # NOTE(review): punctuation is removed *before* tweet_preprocessor.clean, so
    # markers such as '@', '#' and '://' are already gone when clean() runs —
    # confirm this ordering is intended.
    text = tweet_preprocessor.clean(text)
    # split() without an argument discards empty tokens, so the result never
    # contains leading, trailing or doubled spaces.
    return ' '.join(w for w in text.split() if w not in stopwords)

# Constants

In [13]:
# Seed passed to train_test_split so the split is reproducible.
RANDOM_STATE = 42
# Fraction of the labelled data held out by train_test_split.
TEST_SIZE = 0.2
# Batch size passed to classifier.fit.
BATCH_SIZE = 32
# Name of the label column in the training dataframe.
Y_COLUMN = 'target'

## 1. Dataset Loading and Analysis

### 1.1 Train and Test Dataset Loading

In [16]:
# Paths of the train/test CSV *files* (despite the _DIR suffix): under the
# Kaggle-style input directory when IS_KAGGLE is set, otherwise relative to the
# current working directory.
TRAIN_DF_DIR = '/kaggle/input/nlp-getting-started/train.csv' if IS_KAGGLE else 'train.csv'
TEST_DF_DIR = '/kaggle/input/nlp-getting-started/test.csv' if IS_KAGGLE else 'test.csv'

In [17]:
# Read both CSV files in one pass: the labelled training set first, then the test set.
train_df, test_df = (pd.read_csv(csv_path) for csv_path in (TRAIN_DF_DIR, TEST_DF_DIR))

In [None]:
# Preview the first rows of the training data.
train_df.head()

In [None]:
# Preview the first rows of the test data.
test_df.head()

### 1.2 EDA

In [None]:
# Clean every tweet; Series.apply forwards the keyword argument to preprocess_text.
train_df['preprocessed_text'] = train_df['text'].apply(preprocess_text, stopwords=ENGLISH_STOPWORDS)
train_df.head()

In [None]:
# Number of tokens per preprocessed tweet. Splitting on whitespace runs (rather
# than on single spaces) makes an empty tweet count as 0 tokens instead of 1.
train_df['length'] = train_df['preprocessed_text'].str.split().str.len()
train_df.head()

In [None]:
# Distribution of token counts; labelled so the figure can be read on its own.
ax = train_df['length'].hist(bins=20, figsize=(6, 6))
ax.set(title='Tokens per preprocessed tweet', xlabel='Token count', ylabel='Number of tweets');

In [None]:
# Summary statistics of the per-tweet token counts.
train_df['length'].describe()

In [None]:
# Features are the cleaned tweets, labels the target column. The held-out part
# (X_test / y_test) comes from the labelled training data, not from test_df.
X, y = train_df['preprocessed_text'], train_df[Y_COLUMN]
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=TEST_SIZE,
    random_state=RANDOM_STATE,
)

## 2. Model Loading and Fine Tuning

In [None]:
# Load the DistilBERT classifier from the named KerasNLP preset.
# num_classes=2 — presumably the two values of the `target` column; confirm
# against the data.
preset = 'distil_bert_base_en_uncased'
classifier = keras_nlp.models.DistilBertClassifier.from_preset(preset, num_classes=2)
classifier.summary()

### 2.2 Model Fine Tuning

In [None]:
# Compile and fine-tune the classifier.
# NOTE(review): the loss and optimizer come from tf.keras while this notebook
# imports keras_core as `keras` and builds the model with keras_nlp — confirm
# these objects are compatible with the installed keras_nlp backend.
# from_logits=True assumes the classifier outputs raw logits — TODO confirm.
classifier.compile(
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    optimizer=tf.keras.optimizers.Adam(1e-5),
    metrics=['accuracy']
)

# The hold-out produced by train_test_split (X_test / y_test) is used as the
# validation set here.
history = classifier.fit(x=X_train,
                         y=y_train,
                         batch_size=BATCH_SIZE,
                         epochs=2,
                         validation_data=(X_test, y_test),
                         verbose=True)