In [1]:
!pip uninstall -y tensorflow keras-nlp
!pip install tensorflow==2.17 keras-nlp==0.5.0

Found existing installation: tensorflow 2.17.1
Uninstalling tensorflow-2.17.1:
  Successfully uninstalled tensorflow-2.17.1
[0mCollecting tensorflow==2.17
  Downloading tensorflow-2.17.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.2 kB)
Collecting keras-nlp==0.5.0
  Downloading keras_nlp-0.5.0-py3-none-any.whl.metadata (5.7 kB)
Collecting tensorflow-text (from keras-nlp==0.5.0)
  Downloading tensorflow_text-2.18.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1.8 kB)
INFO: pip is looking at multiple versions of tensorflow-text to determine which version is compatible with other requirements. This could take a while.
  Downloading tensorflow_text-2.18.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1.8 kB)
  Downloading tensorflow_text-2.17.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1.8 kB)
Downloading tensorflow-2.17.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (601.3 M

In [110]:
import tensorflow as tf
from tensorflow.keras.layers import *
import keras_nlp
import re
import os
from tensorflow import keras
from tensorflow.keras.models import Model
from google.colab import files
import pandas as pd
from tensorflow.keras.layers import Input, Embedding, Dropout, LayerNormalization, Dense, GlobalAveragePooling1D, BatchNormalization, MultiHeadAttention, Bidirectional, LSTM
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau, ModelCheckpoint
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.optimizers.schedules import ExponentialDecay
from sklearn.utils.class_weight import compute_class_weight
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np
import requests
import zipfile
from tensorflow import keras
from sklearn.model_selection import train_test_split
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Fetch the NLTK corpora this notebook relies on: the stop-word lists and the
# WordNet database that WordNetLemmatizer looks words up in. Without 'wordnet'
# the first lemmatize() call raises LookupError on a fresh runtime.
nltk.download('stopwords')
nltk.download('wordnet')

# Initialize the lemmatizer
lemmatizer = WordNetLemmatizer()

# Constants
MAX_WORDS = 10000      # vocabulary cap shared by the tokenizer and the embedding matrix
MAX_LEN = 512          # every token sequence is padded/truncated to this length
EMBEDDING_DIM = 300    # selects which glove.6B.<dim>d.txt file is loaded

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Create Necessary Functions

In [111]:
# Download GloVe and create embedding matrix
def download_glove_embeddings():
    """Download and unpack the GloVe 6B archive unless the needed file exists.

    The check looks for the vector file that matches EMBEDDING_DIM in the
    current working directory, i.e. the same file create_embedding_matrix
    opens. The archive is streamed to disk in chunks instead of being held in
    memory, and is deleted afterwards whether or not extraction succeeded.

    Raises:
        requests.HTTPError: if the server answers with an error status.
    """
    glove_file = f'glove.6B.{EMBEDDING_DIM}d.txt'
    if os.path.exists(glove_file):
        return

    print("Downloading GloVe embeddings...")
    url = 'http://nlp.stanford.edu/data/glove.6B.zip'
    archive_path = 'glove.6B.zip'
    try:
        # stream=True keeps the body out of RAM; the timeout bounds how long a
        # stalled connection may block.
        with requests.get(url, stream=True, timeout=60) as response:
            # Fail here rather than writing an HTML error page to the .zip file.
            response.raise_for_status()
            with open(archive_path, 'wb') as f:
                for chunk in response.iter_content(chunk_size=1 << 20):
                    f.write(chunk)
        with zipfile.ZipFile(archive_path, 'r') as zip_ref:
            zip_ref.extractall()
    finally:
        # Never leave a partial or already-extracted archive behind.
        if os.path.exists(archive_path):
            os.remove(archive_path)
    print("Download complete!")

def create_embedding_matrix(word_index, embedding_dim=None, max_words=None, glove_path=None):
    """Build an embedding matrix for a tokenizer vocabulary from a GloVe text file.

    Only vectors for words that can actually land in the matrix are parsed and
    kept, so the full GloVe vocabulary is never held in memory.

    Args:
        word_index: mapping of word -> integer index (index 0 is left as zeros).
        embedding_dim: vector width; defaults to EMBEDDING_DIM.
        max_words: vocabulary cap; words with index >= max_words are skipped.
            Defaults to MAX_WORDS.
        glove_path: path of the GloVe text file; defaults to
            'glove.6B.<embedding_dim>d.txt' in the working directory.

    Returns:
        (embedding_matrix, vocab_size) where embedding_matrix has shape
        (vocab_size, embedding_dim) and vocab_size is
        min(max_words, len(word_index) + 1). Rows of words without a GloVe
        vector stay all-zero.
    """
    if embedding_dim is None:
        embedding_dim = EMBEDDING_DIM
    if max_words is None:
        max_words = MAX_WORDS
    if glove_path is None:
        glove_path = f'glove.6B.{embedding_dim}d.txt'

    # Words beyond the cap never get a row, so their vectors need not be parsed.
    wanted = {word for word, i in word_index.items() if i < max_words}

    print("Loading GloVe embeddings...")
    embeddings_index = {}
    with open(glove_path, encoding='utf-8') as f:
        for line in f:
            # "<word> <v1> <v2> ..." -> [word, "v1 v2 ..."]; blank lines give [].
            parts = line.split(maxsplit=1)
            if len(parts) < 2 or parts[0] not in wanted:
                continue
            embeddings_index[parts[0]] = np.asarray(parts[1].split(), dtype='float32')

    print("Creating embedding matrix...")
    vocab_size = min(max_words, len(word_index) + 1)
    embedding_matrix = np.zeros((vocab_size, embedding_dim))
    for word, vector in embeddings_index.items():
        embedding_matrix[word_index[word]] = vector

    return embedding_matrix, vocab_size


# Fetch the GloVe archive now; the call skips the download when the file it checks for already exists.
download_glove_embeddings()

#Data preprocessing
_ENGLISH_STOP_WORDS = None


def _english_stop_words():
    """Return the NLTK English stop-word set, reading the corpus only once."""
    global _ENGLISH_STOP_WORDS
    if _ENGLISH_STOP_WORDS is None:
        _ENGLISH_STOP_WORDS = frozenset(stopwords.words('english'))
    return _ENGLISH_STOP_WORDS


def preprocessing(text):
    """Normalize one raw discussion text for tokenization.

    Steps, in order:
      1. coerce the input to ``str`` (so numeric cells do not crash),
      2. drop every whitespace-separated word that starts with an Arabic letter,
      3. delete all characters other than ASCII letters, digits and whitespace
         (URLs are not removed as a whole; only their punctuation is),
      4. drop English stop words (case-insensitive comparison),
      5. lemmatize each remaining word with the module-level ``lemmatizer``.

    Args:
        text: the raw text; any object accepted by ``str()``.

    Returns:
        The cleaned words joined by single spaces (possibly an empty string).
    """
    text = str(text)
    #remove arabic words
    text = ' '.join(word for word in text.split() if not re.match(r'[\u0600-\u06FF]', word))
    #remove special characters & punctuation
    text = re.sub(r'[^A-Za-z0-9\s]', '', text)
    #remove stop words and get each word's lemma; split() already collapses
    #runs of whitespace, so no separate normalization pass is needed
    stop_words = _english_stop_words()
    lemmas = [
        lemmatizer.lemmatize(word)
        for word in text.split()
        if word.lower() not in stop_words
    ]

    return ' '.join(lemmas).strip()

#Get the Positional Encoding Vector
def get_positional_encoding(max_len, d_model):
    """Compute the sinusoidal positional-encoding table.

    Even feature columns hold the sine and odd feature columns the cosine of
    ``position / 10000 ** (2 * (column // 2) / d_model)``.

    Args:
        max_len: number of positions (rows) to encode.
        d_model: number of feature columns per position.

    Returns:
        A float32 tensor of shape (1, max_len, d_model).
    """
    pos = np.arange(max_len)[:, np.newaxis]
    dim = np.arange(d_model)[np.newaxis, :]

    # One angle per (position, column) pair.
    angles = pos * (1 / np.power(10000, (2 * (dim // 2)) / d_model))

    table = np.zeros(angles.shape)
    table[:, 0::2] = np.sin(angles[:, 0::2])
    table[:, 1::2] = np.cos(angles[:, 1::2])

    # Add the leading batch axis after converting to a float32 tensor.
    return tf.expand_dims(tf.cast(table, dtype=tf.float32), 0)

# Apply Text Preprocessing and Embedding



In [112]:
# Load and preprocess data
print("Loading and preprocessing data...")
data = pd.read_csv("train.csv")
data = data.drop('SampleID', axis=1)
# Rows without any discussion text are dropped rather than imputed.
data = data.dropna(subset=['Discussion'])
data['Discussion'] = data['Discussion'].apply(preprocessing)

# Tokenization
print("Tokenizing text...")
tokenizer = Tokenizer(num_words=MAX_WORDS)
tokenizer.fit_on_texts(data['Discussion'])
sequences = tokenizer.texts_to_sequences(data['Discussion'])
padded_sequences = pad_sequences(sequences, maxlen=MAX_LEN, padding='post')

# Category mapping
category_map = {"Politics": 0, "Sports": 1, "Media": 2, "Market & Economy": 3, "STEM": 4}
mapped_labels = data['Category'].map(category_map)
# A category missing from category_map (or a missing value) would otherwise
# turn into a NaN label and silently poison training; fail loudly instead.
unmapped = mapped_labels.isna()
if unmapped.any():
    unknown = sorted(data.loc[unmapped, 'Category'].astype(str).unique())
    raise ValueError(f"Categories missing from category_map: {unknown}")
labels = mapped_labels.astype('int64').values

# Download GloVe and create embedding matrix
download_glove_embeddings()
embedding_matrix, vocab_size = create_embedding_matrix(tokenizer.word_index)

# Create datasets
print("Creating datasets...")
dataset = tf.data.Dataset.from_tensor_slices((padded_sequences, labels))
# NOTE(review): this shuffle is unseeded and, with the default settings,
# re-shuffles on every iteration. If a later cell splits `dataset` with
# take()/skip(), confirm that the train/validation split stays disjoint.
dataset = dataset.shuffle(buffer_size=2048)


#######################################################################################################################

Loading and preprocessing data...
Tokenizing text...
Loading GloVe embeddings...
Creating embedding matrix...
Creating datasets...


# Build the Transformer Model

In [None]:
#Build the transformer model
def transformer_model(vocab_size, embedding_matrix, num_classes=None, ff_dim=512):
    """Build a one-block Transformer encoder classifier on frozen pretrained embeddings.

    Architecture: frozen embedding -> dropout -> sinusoidal positional encoding
    -> multi-head self-attention with residual + layer norm -> position-wise
    feed-forward with residual + layer norm -> global average pooling ->
    softmax classifier.

    Args:
        vocab_size: number of rows of the embedding table.
        embedding_matrix: pretrained weights of shape (vocab_size, EMBEDDING_DIM),
            as returned by create_embedding_matrix.
        num_classes: number of output classes; defaults to len(category_map).
        ff_dim: hidden width of the feed-forward sub-layer.

    Returns:
        An uncompiled keras Model taking (batch, MAX_LEN) token ids and
        producing (batch, num_classes) class probabilities. Because the output
        is already a softmax, pair it with a loss that expects probabilities
        (e.g. sparse categorical cross-entropy without from_logits).
    """
    if num_classes is None:
        num_classes = len(category_map)

    inputs = Input(shape=(MAX_LEN,))

    # Embedding layer with pretrained weights
    x = Embedding(
        vocab_size,
        EMBEDDING_DIM,
        weights=[embedding_matrix],
        trainable=False
    )(inputs)
    x = Dropout(0.2)(x)

    # Add positional encoding
    pos_encoding = get_positional_encoding(MAX_LEN, EMBEDDING_DIM)
    x = x + pos_encoding[:, :MAX_LEN, :]

    # First transformer block
    attention = MultiHeadAttention(
        num_heads=20,
        key_dim=200,
        dropout=0.5
    )(x, x, x)
    attention = Dropout(0.1)(attention)
    x = LayerNormalization(epsilon=1e-6)(attention + x)

    # Feed Forward: expand to ff_dim, project back to the model width so the
    # residual connection lines up, then normalize.
    feed_forward = Dense(ff_dim, activation='relu')(x)
    feed_forward = Dense(EMBEDDING_DIM)(feed_forward)
    feed_forward = Dropout(0.1)(feed_forward)
    x = LayerNormalization(epsilon=1e-6)(feed_forward + x)

    # Global pooling and classification
    x = GlobalAveragePooling1D()(x)
    x = Dropout(0.2)(x)
    outputs = Dense(num_classes, activation='softmax')(x)

    return Model(inputs, outputs)

# Training and Validating the Model