In [2]:
# Keras Core, multi-backend version of Keras.
# Backend is TensorFlow
!pip install keras-core --upgrade
!pip install -q keras-nlp --upgrade
import os
os.environ['KERAS_BACKEND'] = 'tensorflow'
import numpy as np 
import pandas as pd 
import tensorflow as tf
import keras_core as keras
import keras_nlp
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

# Report the library versions in use so the run can be reproduced.
print(f"TensorFlow version: {tf.__version__}")
print(f"KerasNLP version: {keras_nlp.__version__}")

[0mUsing TensorFlow backend
TensorFlow version: 2.16.1
KerasNLP version: 0.15.1


Input contains:
id
keyword: A keyword from that tweet (may be blank)
location: The location the tweet was sent from (may be blank)
text: The text of a tweet
target: 1 if the tweet is a real disaster or 0 if not

In [5]:
#read input
# Load the labelled training tweets and the unlabelled test tweets.
df_train = pd.read_csv("/kaggle/input/nlp-getting-started/train.csv")
df_test = pd.read_csv("/kaggle/input/nlp-getting-started/test.csv")

# A notebook only echoes the last expression of a cell, so the training
# preview has to be displayed explicitly or it is silently discarded.
display(df_train.head())
df_test.head()

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan


In [6]:
#Data information
# Character length of every tweet, stored as a new "length" column.
df_train["length"] = df_train["text"].apply(len)
df_test["length"] = df_test["text"].apply(len)

# Summary statistics of the tweet lengths; the empty string adds the blank
# separator line between the two reports.
print("Train Length Stat", df_train["length"].describe(), "", sep="\n")

print("Test Length Stat", df_test["length"].describe(), sep="\n")

Train Length Stat
count    7613.000000
mean      101.037436
std        33.781325
min         7.000000
25%        78.000000
50%       107.000000
75%       133.000000
max       157.000000
Name: length, dtype: float64

Test Length Stat
count    3263.000000
mean      102.108183
std        33.972158
min         5.000000
25%        78.000000
50%       109.000000
75%       134.000000
max       151.000000
Name: length, dtype: float64


In [7]:
#Preprocess Data
# train_test_split splits X and y into training and validation sets
from sklearn.model_selection import train_test_split

# BATCH_SIZE: number of examples processed at a time during training.
BATCH_SIZE = 32

# Total number of rows in the training DataFrame
NUM_TRAINING_EXAMPLES = df_train.shape[0]

# 80% of the labelled data for training, 20% for validation
TRAIN_SPLIT = 0.8
VAL_SPLIT = 0.2

# Number of *full* batches per epoch = training examples // batch size.
# int() must wrap the whole product: applying it to the row count alone left
# a float operand, so the floor division produced a float (e.g. 190.0).
# A trailing partial batch, if any, is not counted here.
STEPS_PER_EPOCH = int(NUM_TRAINING_EXAMPLES * TRAIN_SPLIT) // BATCH_SIZE

# 2 full passes through the data
EPOCHS = 2

# Lets tf.data choose the level of parallelism automatically.
# tf.data.AUTOTUNE is the non-experimental name of the same constant.
AUTO = tf.data.AUTOTUNE

# Features: the raw tweet text
X = df_train["text"]

# Labels: the target column
y = df_train["target"]

# Split using the ratio assigned above; fixed random seed for reproducibility
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=VAL_SPLIT, random_state=42)

# Raw tweet text of the test data
X_test = df_test["text"]

In [8]:
#Load a DistilBERT model from Keras NLP
#
# Raw text has to be turned into numeric token ids before it reaches the
# model. The preprocessor layer below does that, and because it is attached
# to the classifier it is applied to raw inputs during fit(), predict() and
# evaluate().

# Preset name: English, lowercased (case-insensitive) DistilBERT backbone
# with its pre-trained weights.
preset = "distil_bert_base_en_uncased"

# Tokenizer + input packer built from the preset's tokenizing rules.
# sequence_length caps the length of the tokenized sequence.
preprocessor = keras_nlp.models.DistilBertPreprocessor.from_preset(
    preset,
    sequence_length=160,
    name="preprocessor_4_tweets",
)

# Pretrained classifier for a binary task (2 output classes), wired to the
# preprocessor above so input text is tokenized consistently.
classifier = keras_nlp.models.DistilBertClassifier.from_preset(
    preset,
    num_classes=2,
    preprocessor=preprocessor,
)

classifier.summary()



In [None]:
from tensorflow.keras.losses import SparseCategoricalCrossentropy
from tensorflow.keras.optimizers import Adam  # optimizer and loss both come from tf.keras

# Fine-tune the pretrained DistilBERT classifier on the tweet data.

classifier.compile(
    # Very small learning rate: pre-trained BERT weights are sensitive to
    # large updates during fine-tuning.
    optimizer=Adam(learning_rate=1e-5),
    # "Sparse" means the labels are plain integers rather than one-hot
    # vectors; from_logits=True makes the loss apply softmax internally
    # before computing the cross-entropy.
    loss=SparseCategoricalCrossentropy(from_logits=True),
    # Fraction of correctly predicted examples, tracked for training and
    # validation.
    metrics=["accuracy"],
)

# Train; the validation set is evaluated after each epoch.
history = classifier.fit(
    X_train,                         # training features
    y_train,                         # target labels
    validation_data=(X_val, y_val),  # held-out data evaluated every epoch
    epochs=EPOCHS,                   # passes through the data
    batch_size=BATCH_SIZE,           # samples processed at a time
)

Epoch 1/2
[1m191/191[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3621s[0m 19s/step - accuracy: 0.7007 - loss: 0.5737 - val_accuracy: 0.8431 - val_loss: 0.3916
Epoch 2/2
[1m 38/191[0m [32m━━━[0m[37m━━━━━━━━━━━━━━━━━[0m [1m44:28[0m 17s/step - accuracy: 0.8289 - loss: 0.3885

In [3]:
#plot confusion matrix
def displayConfusionMatrix(y_true, y_pred, dataset):
    """Plot a binary confusion matrix and show the F1 score in its title.

    Args:
        y_true: True class labels for the dataset (0 = not disaster,
            1 = disaster).
        y_pred: Per-class scores with one row per example; the predicted
            class is the index of the largest score in each row.
        dataset: Name of the dataset, used in the plot title.

    Returns:
        None. The figure is drawn as a side effect.
    """
    # Pin both classes explicitly so the matrix is always 2x2, even when only
    # one class occurs in y_true/y_pred. Without this the 4-way unpack below
    # and the two display labels would fail on such input.
    class_ids = [0, 1]

    # Convert scores to discrete class predictions once and reuse them.
    predicted_classes = np.argmax(y_pred, axis=1)

    disp = ConfusionMatrixDisplay.from_predictions(
        y_true,
        predicted_classes,
        labels=class_ids,
        display_labels=["Not Disaster","Disaster"],
        cmap=plt.cm.Blues
    )

    # Flatten the 2x2 matrix into tn (true negatives), fp (false positives),
    # fn (false negatives) and tp (true positives).
    tn, fp, fn, tp = confusion_matrix(y_true, predicted_classes, labels=class_ids).ravel()

    # F1 is the harmonic mean of precision and recall: tp / (tp + (fn + fp) / 2).
    # When there are no positives at all the denominator is 0; report 0.0
    # instead of dividing by zero.
    denominator = tp + ((fn + fp) / 2)
    f1 = float(tp / denominator) if denominator else 0.0

    disp.ax_.set_title("Confusion Matrix on " + dataset + " Dataset -- F1 Score: " + str(round(f1, 2)))

In [None]:
# Score the training tweets with the fine-tuned classifier, then plot the
# confusion matrix for them.
y_pred_train = classifier.predict(x=X_train)

displayConfusionMatrix(y_true=y_train, y_pred=y_pred_train, dataset="Training")

In [None]:
# Score the held-out validation tweets, then plot the confusion matrix for
# them.
y_pred_val = classifier.predict(x=X_val)

displayConfusionMatrix(y_true=y_val, y_pred=y_pred_val, dataset="Validation")

In [None]:
# Load the competition's sample submission file and preview its first rows.
sample_submission = pd.read_csv("/kaggle/input/nlp-getting-started/sample_submission.csv")
sample_submission.head()

In [None]:
# Predicted class per test tweet = index of the highest score in each row.
# NOTE(review): assumes the sample submission rows are in the same order as
# the test set - confirm.
sample_submission["target"] = classifier.predict(X_test).argmax(axis=1)

In [None]:
# Summary statistics of the submission frame, as a quick sanity check.
sample_submission.describe()

In [None]:
# Write the submission to submission.csv; index=False leaves out the row index.
sample_submission.to_csv("submission.csv", index=False)