In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory
import os

# Collect (and echo) the full path of every file found under the Kaggle input tree.
file_names = []
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(root, name)
        print(full_path)
        file_names.append(full_path)
# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Import required libraries
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.utils import shuffle

In [None]:
SEED = 4243

# Seed the global NumPy and TensorFlow generators as well: without this, SEED
# only affects the sklearn shuffle, while weight initialisation, dropout and
# tf.data shuffling would differ on every "Restart & Run All".
np.random.seed(SEED)
tf.random.set_seed(SEED)

In [None]:
# Show the input file paths discovered above; the loading cells below read from this list
file_names

In [None]:
# Pick the sample submission by file name rather than by list position:
# directory listings come back in arbitrary order, so file_names[0] is not
# guaranteed to be this file. Falls back to the original positional choice
# when no file with that name exists.
# NOTE(review): assumes the file is called "sample_submission.csv" — confirm against the listing above.
sample_path = next(
    (path for path in file_names if os.path.basename(path) == "sample_submission.csv"),
    file_names[0],
)
sample = pd.read_csv(sample_path)
sample.head(10)

In [None]:
# Pick the training file by name rather than by list position: directory
# listings come back in arbitrary order, so file_names[1] is not guaranteed
# to be the training set. Falls back to the original positional choice when
# no file with that name exists.
# NOTE(review): assumes the file is called "train.csv" — confirm against the listing above.
train_path = next(
    (path for path in file_names if os.path.basename(path) == "train.csv"),
    file_names[1],
)
train = pd.read_csv(train_path)
train.head()

In [None]:
# Pick the test file by name rather than by list position: directory listings
# come back in arbitrary order, so file_names[2] is not guaranteed to be the
# test set. Falls back to the original positional choice when no file with
# that name exists.
# NOTE(review): assumes the file is called "test.csv" — confirm against the listing above.
test_path = next(
    (path for path in file_names if os.path.basename(path) == "test.csv"),
    file_names[2],
)
test = pd.read_csv(test_path)
test.head()

# **EDA of the dataset**

In [None]:
# Name of the label column in the training frame (used by the target plot below)
TARGET = 'target'

In [None]:
# Create a function to plot the null values
def plot_null(df):
    """Print the row count of ``df`` and draw a heatmap of its per-column null counts."""
    row_count = df.shape[0]
    print("The number of values in the dataset:", row_count)
    # One row per column of df, holding that column's number of missing values
    null_counts = df.isnull().sum().to_frame()
    sns.heatmap(null_counts, annot=True, fmt="d", cmap="crest")
    plt.title("Heatmap of the null values")

In [None]:
# Missing-value counts per column of the training frame
print("Null values in the training dataset")
plot_null(train)

In [None]:
# Missing-value counts per column of the test frame
print("Number of null values in test data")
plot_null(test)

In [None]:
# Substitute the placeholder string "0" for every missing entry in both frames
train = train.fillna("0")
test = test.fillna("0")

In [None]:
# Re-plot the training frame to confirm no null values remain after filling
plot_null(train)

In [None]:
# Re-plot the test frame to confirm no null values remain after filling
plot_null(test)

In [None]:
# Checking the duplicate values.
# The id column is left out of the comparison: it is presumably unique per row
# (TODO confirm), so including it would keep otherwise identical rows from ever
# being reported. errors="ignore" keeps the cell runnable once id has been removed.
train[train.drop(columns=["id"], errors="ignore").duplicated()]

#  **Target Analysis**
Even though we know this is a binary target variable, there is no harm in confirming it.

In [None]:
# Histogram of the label column to see which values it takes and how often
target_ax = sns.histplot(x=train[TARGET])
target_ax.set_title("Checking the distribution of the target value")

Now we can confirm that the target variable contains no values other than 0 and 1.

# **Feature Analysis**

In [None]:
# Number of training rows per distinct "keyword" value
keyword = train.groupby("keyword")["target"].count()

# Turn the counts into a two-column frame (keyword, count), most frequent first
keyword_df = keyword.reset_index(name="count").sort_values(by=["count"], ascending=False)
keyword_df

In [None]:
# Bar chart of the 25 most frequent keywords
top_keywords = keyword_df.head(25)
fig, ax = plt.subplots(figsize=(12, 5))
sns.barplot(data=top_keywords, x="keyword", y="count", ax=ax)
plt.xticks(rotation=40)
ax.set_ylabel('count')
ax.set_title("Analysing the top 25 words in the tweet")

In [None]:
# Number of training rows per distinct "location" value
location = train.groupby("location")["target"].count()

# Turn the counts into a two-column frame (location, count), most frequent first
location_df = location.reset_index(name="count").sort_values(by=["count"], ascending=False)
location_df

In [None]:
# Bar chart of the 25 most frequent locations
top_locations = location_df.head(25)
fig, ax = plt.subplots(figsize=(12, 5))
sns.barplot(data=top_locations, x="location", y="count", ax=ax)
plt.xticks(rotation=40)
ax.set_ylabel('count')
ax.set_title("Analysing the top 25 locations in the tweet")

In [None]:
# Same location chart, but with the single most frequent entry left out
locations_without_first = location_df.head(25).iloc[1:, :]
fig, ax = plt.subplots(figsize=(12, 5))
sns.barplot(data=locations_without_first, x="location", y="count", ax=ax)
plt.xticks(rotation=40)
ax.set_ylabel('count')
ax.set_title("Analysing the top 25 locations in the tweet")

In [None]:
# Helper that reports the longest entry of a column, measured with len()
def get_max_length_sentence(df, key):
    """Print the length of the longest value in ``df[key]``.

    Length is ``len(value)``, i.e. the number of characters for string
    values. An empty column is reported as 0. Nothing is returned.
    """
    longest = max((len(entry) for entry in df[key]), default=0)
    print(f"maximum length of the {key} column is: {longest}")

In [None]:
# Longest entry of each text-like column in the training frame
for column in ("text", "keyword", "location"):
    get_max_length_sentence(train, column)

In [None]:
# Longest entry of each text-like column in the test frame
for column in ("text", "keyword", "location"):
    get_max_length_sentence(test, column)

# **Data Preprocessing**

In [None]:
# Preview the training frame before preprocessing starts
train.head()

In [None]:
# Remove the identifier column from the training frame.
# drop(..., errors="ignore") keeps this cell re-runnable: pop("id") raised a
# KeyError on a second run and echoed the whole id column as cell output.
train = train.drop(columns=["id"], errors="ignore")

In [None]:
# Preview the training frame again to check the id column is gone
train.head()

In [None]:
# (rows, columns) of the training frame
train.shape

In [None]:
# Shuffle the rows of the training frame; the fixed random_state makes the
# resulting order repeatable between runs
train = shuffle(train, random_state=SEED)
train.head()

In [None]:
BATCH = 32

# Size of the tf.data shuffle buffer (same value as before, now named).
SHUFFLE_BUFFER = int((SEED*13)/8)

# Join the three text columns with spaces. Plain "+" glued the last word of one
# column to the first word of the next (e.g. keyword + location + tweet became a
# single token), which corrupted the vocabulary learned by the vectorizer.
train_texts = train["keyword"] + " " + train["location"] + " " + train["text"]
test_texts = test["keyword"] + " " + test["location"] + " " + test["text"]

# Convert the train data to a tf.data.Dataset of (text, label) pairs
train_tf = tf.data.Dataset.from_tensor_slices((train_texts, train["target"]))

# Shuffle once, then batch. reshuffle_each_iteration=False is essential here:
# the validation split is later carved out with take()/skip() on this dataset,
# and with the default (reshuffle on every pass) each epoch would draw a
# different set of rows into the validation batches, leaking validation rows
# into training.
train_tf = train_tf.shuffle(
    SHUFFLE_BUFFER, seed=SEED, reshuffle_each_iteration=False
).batch(BATCH)

# Convert the test data to a batched tf.data.Dataset (order preserved, no labels)
test_tf = tf.data.Dataset.from_tensor_slices(test_texts)
test_tf = test_tf.batch(BATCH)


# **Text Vectorization**
To enable the model to process the text data, we need to convert it into integer values through a process called text vectorization.

In [None]:
# Vectorizer settings: tokens kept per sample and vocabulary size cap
max_length = 165
max_tokens = 20_000

# Layer that maps raw strings to fixed-length sequences of integer token ids
text_vect = layers.TextVectorization(
    max_tokens=max_tokens,
    output_mode='int',
    output_sequence_length=max_length,
)

In [None]:
# Build the vocabulary from the training texts only (labels are dropped first)
texts_only = train_tf.map(lambda text, _label: text)
text_vect.adapt(texts_only)

In [None]:
# Fetch the learned vocabulary, report its size, and keep it as an array
# so it can be indexed with a whole sequence of token ids at once
vocab_terms = text_vect.get_vocabulary()
print("Size of the vocabulary= ", len(vocab_terms))
vocab = np.array(vocab_terms)

In [None]:
# Replace the raw text of every training batch by its token ids; labels pass through
train_tf = train_tf.map(
    lambda text, label: (text_vect(text), label),
    num_parallel_calls=tf.data.AUTOTUNE,
)

# Same conversion for the (unlabelled) test batches
test_tf = test_tf.map(
    lambda text: text_vect(text),
    num_parallel_calls=tf.data.AUTOTUNE,
)

In [None]:
# Define a function to print the tokenized data
def print_sample(data):
    """Print the first three samples of the first (samples, target) batch in ``data``.

    Each sample is converted with ``.numpy()`` before printing and is followed
    by blank lines. Nothing is printed when ``data`` yields no batch.
    """
    labels = ("1st sample:", "2nd sample:", "3rd sample:")
    for batch, _ in data:
        for position, label in enumerate(labels):
            print(label, batch[position].numpy())
            print("\n")
        # Only the first batch is inspected
        return

In [None]:
# Show the first three vectorized samples of the first training batch
print_sample(train_tf)

In [None]:
# Show the first tweet of the first batch as token ids and mapped back to words
for token_batch, _ in train_tf.take(1):
    first_tweet = token_batch[0]
    print("\t\t\t\tVectorized Tweet:\n", first_tweet)
    decoded_words = vocab[first_tweet.numpy()]
    print("\n\n\t\t\t\tDecoded Tweet:\n", " ".join(decoded_words))

# **Model**
To classify the tweets, we will employ a Transformer model specifically designed as an Encoder-only model. In this approach, we will define the Encoder layer and incorporate a Positional Embedding layer using Keras subclassing. By utilizing these components, we aim to enhance the accuracy and effectiveness of our classification model.

Moreover, the Transformer model's architecture enables it to capture contextual relationships and dependencies among words or tokens within the tweets. The Encoder layer acts as a powerful feature extractor, learning representations that contribute to the tweet classification task.

Additionally, the Positional Embedding layer adds crucial positional information to the input tokens, enabling the model to discern the sequential order of words in the tweet. This positional encoding facilitates the Transformer model in capturing long-range dependencies and effectively processing the input text.

By combining the Transformer's robust architecture with Keras subclassing, we can create a powerful and flexible model that yields accurate tweet classification results.

In [None]:
#Define a Transformer Encoder using subclassed layer
class TransformerEncoder(layers.Layer):
    """Single Transformer encoder block.

    Applies multi-head self-attention followed by a two-layer feed-forward
    projection; each of the two steps is wrapped in a residual connection and
    a layer normalization. The output has the same last dimension as the input.

    Args:
        embed_dim: Size of each token vector entering and leaving the block
            (it is also passed to the attention layer as ``key_dim``).
        dense_dim: Width of the hidden layer of the feed-forward projection.
        num_heads: Number of attention heads.
        **kwargs: Forwarded to ``layers.Layer``.
    """

    def __init__(self, embed_dim, dense_dim, num_heads, **kwargs):
        super().__init__(**kwargs)
        #Size of the token vectors (embedding width, not the vocabulary size)
        self.embed_dim = embed_dim
        #Size of the inner dense layer
        self.dense_dim = dense_dim
        #Number of attention heads
        self.num_heads = num_heads

        self.attention = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        # Feed-forward part: expand/contract to dense_dim, then project back to embed_dim
        self.dense_proj = keras.Sequential(
                   [layers.Dense(dense_dim, activation="relu"),
                    layers.Dense(embed_dim),]
                                        )
        self.layernorm_1 = layers.LayerNormalization()
        self.layernorm_2 = layers.LayerNormalization()
        # The previously created GlobalMaxPooling1D sub-layer was never used in
        # call() (pooling is done by the model after this block), so it is gone.

    #Define a call() method  where forward pass is implemented
    def call(self, inputs, mask=None):
        """Run the block on ``inputs``; ``mask``, when given, restricts attention."""
        if mask is not None:
            # Insert an axis so the 2-D mask broadcasts over the query positions
            mask = mask[:, tf.newaxis, :]

        #Apply self-attention (inputs act as both query and value)
        attention_output = self.attention(inputs, inputs, attention_mask=mask)
        #Residual connection + normalization around the attention step
        proj_input = self.layernorm_1(inputs + attention_output)
        #Apply the dense layer
        proj_output = self.dense_proj(proj_input)
        #Residual connection + normalization around the dense step
        return self.layernorm_2(proj_input + proj_output)

    #Define configuration method
    def get_config(self):
        """Return the constructor arguments so the layer can be re-created from config."""
        config = super().get_config()
        config.update({
                    "embed_dim": self.embed_dim,
                    "num_heads": self.num_heads,
                    "dense_dim": self.dense_dim,
                    })
        return config

In [None]:
# Implementing positional embedding as a subclassed layer
class PositionalEmbedding(layers.Layer):
    """Token embedding plus a learned embedding of each token's position.

    Args:
        sequence_length: Number of distinct positions the position table holds.
        input_dim: Number of distinct token ids the token table holds.
        output_dim: Size of every embedding vector.
        **kwargs: Forwarded to ``layers.Layer``.
    """

    def __init__(self, sequence_length, input_dim, output_dim, **kwargs):
        super().__init__(**kwargs)
        self.sequence_length = sequence_length
        self.input_dim = input_dim
        self.output_dim = output_dim
        # One lookup table indexed by token id, one indexed by position
        self.token_embeddings = layers.Embedding(input_dim=input_dim, output_dim=output_dim)
        self.position_embeddings = layers.Embedding(input_dim=sequence_length, output_dim=output_dim)

    def call(self, inputs):
        """Return token embeddings with the position embeddings added on."""
        seq_len = tf.shape(inputs)[-1]
        position_ids = tf.range(seq_len)
        return self.token_embeddings(inputs) + self.position_embeddings(position_ids)

    def compute_mask(self, inputs, mask=None):
        """Mark every entry whose token id is not 0 as valid."""
        return tf.math.not_equal(inputs, 0)

    def get_config(self):
        """Return the constructor arguments so the layer can be re-created from config."""
        base_config = super().get_config()
        return {
            **base_config,
            "output_dim": self.output_dim,
            "sequence_length": self.sequence_length,
            "input_dim": self.input_dim,
        }

In [None]:
#Construct the model

# Width of every token vector. The embedding output and the encoder input must
# agree, so the value is defined once instead of being repeated as a literal.
EMBED_DIM = 256

#Define the input: variable-length sequences of integer token ids
inputs = keras.Input(shape=(None,), dtype="int64")

#Apply positional embeddings.
# sequence_length / input_dim reuse the vectorizer settings (max_length,
# max_tokens) instead of repeating 165 / 20_000, so changing the vectorizer
# can no longer leave the embedding tables with the wrong size.
pos_embed = PositionalEmbedding(sequence_length=max_length,
                        input_dim=max_tokens,
                        output_dim=EMBED_DIM)(inputs)

#Apply the encoder
encoded = TransformerEncoder(embed_dim=EMBED_DIM,
                             dense_dim=32,
                             num_heads=8)(pos_embed)

# Collapse the sequence axis, regularize, and map to a single probability
x = layers.GlobalMaxPooling1D()(encoded)
x = layers.Dropout(0.5)(x)
output = layers.Dense(units=1, activation="sigmoid")(x)

model = keras.Model(inputs=inputs,outputs=output)

In [None]:
# Adam with a small learning rate and custom beta_2 / epsilon
optimizer = tf.keras.optimizers.Adam(
    learning_rate=1e-5, beta_1=0.9, beta_2=0.98, epsilon=1e-9
)

# Binary cross-entropy matches the single sigmoid output; accuracy is tracked as the metric
model.compile(
    optimizer=optimizer,
    loss=tf.keras.losses.BinaryCrossentropy(),
    metrics=['accuracy'],
)

In [None]:
# Print the layer-by-layer summary of the model
model.summary()

In [None]:
# Render the model graph to model.png with shapes, names, activations and trainability shown
plot_options = dict(
    to_file="model.png",
    show_shapes=True,
    show_layer_names=True,
    expand_nested=True,
    show_layer_activations=True,
    show_trainable=True,
)
keras.utils.plot_model(model=model, **plot_options)

In [None]:
# Define the callbacks: save the model to "tweet_classifier.tf" during training,
# keeping only the best version seen so far (save_best_only=True).
# NOTE(review): the prediction cell further down calls model.predict on the
# in-memory model, so this checkpoint is presumably never reloaded — confirm
# that predicting with the last-epoch weights is intended.
callbacks = [keras.callbacks.ModelCheckpoint("tweet_classifier.tf", save_best_only=True)]

In [None]:
# Share of the batches reserved for validation
VALIDATION_FRACTION = 0.25

# Number of batches (train_tf is already batched) that go to validation
val_size = int(VALIDATION_FRACTION * len(train_tf))

In [None]:
# Split the batched dataset: the first val_size batches are used for
# validation, all remaining batches for training
val_data = train_tf.take(val_size)
train_data = train_tf.skip(val_size)

In [None]:
# Train the model for 150 epochs, evaluating on val_data after each epoch;
# the returned History object holds the per-epoch metrics plotted below
history = model.fit(train_data,
                    epochs=150,
                    validation_data=val_data,
                    callbacks=callbacks)

In [None]:
# Per-epoch training loss (dots) against validation loss (line)
loss = history.history["loss"]
val_loss = history.history["val_loss"]
epochs = range(1, len(loss) + 1)

fig, ax = plt.subplots()
ax.plot(epochs, loss, "bo", label="Training loss")
ax.plot(epochs, val_loss, "b", label="Validation loss")
ax.set_title("Training and validation loss")
ax.set_xlabel("Epochs")
ax.set_ylabel("Loss")
ax.legend()

In [None]:
# Per-epoch training accuracy (dots) against validation accuracy (line)
acc = history.history["accuracy"]
val_acc = history.history["val_accuracy"]

fig, ax = plt.subplots()
ax.plot(epochs, acc, "bo", label="Training accuracy")
ax.plot(epochs, val_acc, "b", label="Validation accuracy")
ax.set_title("Training and validation accuracy")
ax.set_xlabel("Epochs")
ax.set_ylabel("Accuracy")
ax.legend()

In [None]:
#Classify the tweets of test data: run the model over the batched, vectorized
#test set; the sigmoid output layer yields one score per tweet
predictions = model.predict(test_tf)

In [None]:
#Display the raw model outputs before they are thresholded
predictions

In [None]:
threshold = 0.5
# Vectorised thresholding: scores strictly above the threshold become 1, all
# others 0; the result is a plain list of Python ints
final_predictions = (predictions.ravel() > threshold).astype(int).tolist()
final_predictions[:10]

In [None]:
# Display the sample submission as a reference for the expected output layout
sample

In [None]:
# Assemble the submission frame in one step: test ids next to the predicted labels
submissions = pd.DataFrame({"id": test["id"], "target": final_predictions})
submissions

In [None]:
#Write the submission frame to submission.csv, leaving out the DataFrame index
submissions.to_csv("submission.csv", index=False)