In [None]:
# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES
# TO THE CORRECT LOCATION (/kaggle/input) IN YOUR NOTEBOOK,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

import os
import sys
from tempfile import NamedTemporaryFile
from urllib.request import urlopen
from urllib.parse import unquote, urlparse
from urllib.error import HTTPError
from zipfile import ZipFile
import tarfile
import shutil

CHUNK_SIZE = 40960
DATA_SOURCE_MAPPING = 'arabic-youtube-comments-by-khalaya:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-data-sets%2F4605500%2F7852763%2Fbundle%2Farchive.zip%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20240524%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20240524T231813Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3D85525cb905776cf0a1cdbe2eac8d0031abf975fd145c664935453bd98ea51773a96b18342eac576a8229b4149c579c6cc6087b66f558085bf0c1233beff83e0a2e99b975fde18884a7af19ca7759572e113a53d8850ede5a1fc8640a955d2c8d8afbbda81cd7071ecc381823ff349ef4a2d18341215a546b54b6dd642675f2561eccea88dea7e09b7ee829fd46f1e31f34bdec4f739b6ab2b78581eb931c01d5d69863a890ba44308f997bac8e64f14fe5e6969612c62e5c626c604c9ea9b8970ef48b6c4752d9ad6077fee33250b666fbb1a278c4b1eea71ebf7bb3bb94db25526c32012da98f51ee0b37f1e98f86ea3a84274c5b7583228db8235e57d17c38'

KAGGLE_INPUT_PATH='/kaggle/input'
KAGGLE_WORKING_PATH='/kaggle/working'
KAGGLE_SYMLINK='kaggle'

!umount /kaggle/input/ 2> /dev/null
shutil.rmtree('/kaggle/input', ignore_errors=True)
os.makedirs(KAGGLE_INPUT_PATH, 0o777, exist_ok=True)
os.makedirs(KAGGLE_WORKING_PATH, 0o777, exist_ok=True)

try:
  os.symlink(KAGGLE_INPUT_PATH, os.path.join("..", 'input'), target_is_directory=True)
except FileExistsError:
  pass
try:
  os.symlink(KAGGLE_WORKING_PATH, os.path.join("..", 'working'), target_is_directory=True)
except FileExistsError:
  pass

# Download every "<directory>:<percent-encoded URL>" entry of DATA_SOURCE_MAPPING
# into a temporary file and unpack it under KAGGLE_INPUT_PATH/<directory>.
for data_source_mapping in DATA_SOURCE_MAPPING.split(','):
    # Split on the first ':' only, so a stray colon in the URL part cannot
    # break the two-way unpacking.
    directory, download_url_encoded = data_source_mapping.split(':', 1)
    download_url = unquote(download_url_encoded)
    filename = urlparse(download_url).path
    destination_path = os.path.join(KAGGLE_INPUT_PATH, directory)
    try:
        with urlopen(download_url) as fileres, NamedTemporaryFile() as tfile:
            total_length = fileres.headers['content-length']
            print(f'Downloading {directory}, {total_length} bytes compressed')
            # The Content-Length header can be absent; in that case the
            # progress bar simply stays empty instead of raising on int(None).
            expected_bytes = int(total_length) if total_length else 0
            dl = 0
            # Named `chunk` rather than `data` so this loop does not overwrite
            # the notebook-level `data` variable used for the DataFrame.
            chunk = fileres.read(CHUNK_SIZE)
            while len(chunk) > 0:
                dl += len(chunk)
                tfile.write(chunk)
                done = min(50, int(50 * dl / expected_bytes)) if expected_bytes else 0
                sys.stdout.write(f"\r[{'=' * done}{' ' * (50-done)}] {dl} bytes downloaded")
                sys.stdout.flush()
                chunk = fileres.read(CHUNK_SIZE)
            # tarfile.open() below reopens the file by name, so buffered
            # writes must reach the disk before extraction starts.
            tfile.flush()
            if filename.endswith('.zip'):
                with ZipFile(tfile) as zfile:
                    zfile.extractall(destination_path)
            else:
                # Bind the archive to its own name; reusing `tarfile` here
                # would rebind the imported module for the rest of the session.
                with tarfile.open(tfile.name) as archive:
                    archive.extractall(destination_path)
            print(f'\nDownloaded and uncompressed: {directory}')
    except HTTPError:
        # HTTPError is a subclass of OSError, so it has to be handled first.
        print(f'Failed to load (likely expired) {download_url} to path {destination_path}')
        continue
    except OSError:
        print(f'Failed to load {download_url} to path {destination_path}')
        continue

print('Data source import complete.')


Downloading arabic-youtube-comments-by-khalaya, 7895521 bytes compressed
Downloaded and uncompressed: arabic-youtube-comments-by-khalaya
Data source import complete.


# Import libs

In [None]:
%%capture
!pip install tensorflow==2.15.0
!pip install transformers==4.37.2

In [None]:
import tensorflow as tf
from transformers import AutoTokenizer,TFAutoModelForMaskedLM,TFAutoModelForSequenceClassification
import pandas as pd
import numpy as np
from tensorflow.keras import backend as K
from sklearn.model_selection import train_test_split,KFold
from sklearn.metrics import confusion_matrix,f1_score,classification_report,auc,roc_curve,RocCurveDisplay,precision_score,recall_score
import matplotlib.pyplot as plt
import seaborn as sns
import os
from tqdm import tqdm
import warnings
# Suppress FutureWarning messages
import logging, os
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
logging.disable(logging.WARNING)
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
import tensorflow as tf
warnings.simplefilter(action='ignore', category=FutureWarning)
pd.options.mode.chained_assignment = None

In [None]:
tf.config.optimizer.set_experimental_options({"auto_mixed_precision": True})
print('Mixed precision enabled')

Mixed precision enabled


In [None]:
import tensorflow as tf
print("Tensorflow version " + tf.__version__)

try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()  # TPU detection
except ValueError as err:
    # Raise an ordinary Exception subclass (not bare BaseException) so the
    # failure behaves like any other runtime error, and keep the resolver's
    # original error attached as the cause.
    raise RuntimeError('ERROR: Not connected to a TPU runtime; please see the previous cell in this notebook for instructions!') from err

# Connect to the resolved cluster, initialise the TPU system and build the
# distribution strategy that later cells use via `tpu_strategy.scope()`.
tf.config.experimental_connect_to_cluster(tpu)
tf.tpu.experimental.initialize_tpu_system(tpu)
tpu_strategy = tf.distribute.TPUStrategy(tpu)

Tensorflow version 2.12.0


# Hyperparameters

In [None]:
EPOCHS = 7
LEARNING_RATE_MAX = 2e-5
LEARNING_RATE = 2e-5
PCT = 0.02
BATCH_SIZE = 512
WD = 0.001
MAX_LENGTH = 128
DROP_OUT = 0.1

# Functions

In [None]:
def f_beta_score(y_true, y_pred):
    """Batch-level F-score (with beta fixed to 1) computed with Keras backend ops.

    Both tensors are clipped to [0, 1] and rounded before counting, so a
    prediction counts as positive once it rounds to 1. Counts are summed over
    every element of the tensors, and K.epsilon() guards each division.

    Args:
        y_true: ground-truth tensor.
        y_pred: prediction tensor, element-wise compatible with ``y_true``.

    Returns:
        A scalar tensor: (beta + 1) * P * R / (P + R + epsilon).
    """
    beta = 1

    # Counts of true positives, actual positives and predicted positives.
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    actual_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))

    precision = true_positives / (predicted_positives+K.epsilon())
    recall = true_positives / (actual_positives+K.epsilon())

    return (beta+1)*((precision*recall)/(precision+recall+K.epsilon()))

In [None]:
def get_ds(data,Xcol,ycol,max_padding,tokenizer, batch_size=32):
    """Build a batched ``tf.data.Dataset`` of tokenized text and 12-wide one-hot labels.

    The label vector is laid out as: columns 0:4 sentiment, 4:10 speech act,
    10:12 sarcasm. The label columns are looked up by name, so the layout does
    not depend on the order in which they are listed in ``ycol``.

    Args:
        data: DataFrame holding the text column and the label columns.
        Xcol: name of the text column.
        ycol: list of label column names; must include 'sentiment',
            'speech_act' and 'sarcasm'.
        max_padding: length every sequence is padded/truncated to.
        tokenizer: callable tokenizer, invoked with ``truncation=True``,
            ``padding='max_length'`` and ``max_length=max_padding``.
        batch_size: number of examples per batch.

    Returns:
        A ``tf.data.Dataset`` yielding ``(token_dict, labels)`` batches.

    Raises:
        KeyError: if a label value is missing from the encoders below.
        AssertionError: if any row does not end up with exactly three ones.
    """
    speech_act_encoder = {'Expression':0,'Assertion':1,'Question':2,'Recommendation':3,'Request':4,'Miscellaneous':5}
    sentiment_encoder = {'Positive':0,'Neutral':1,'Negative':2,'Mixed':3}

    texts = data[Xcol].tolist()

    # Explicit copy: the encoding below must never write into the caller's frame.
    labels = data[ycol].copy()
    labels['speech_act'] = labels['speech_act'].apply(lambda x:speech_act_encoder[x])
    labels['sentiment'] = labels['sentiment'].apply(lambda x:sentiment_encoder[x])
    labels['sarcasm'] = labels['sarcasm'].astype(int)

    n_rows = labels.shape[0]
    row_idx = np.arange(n_rows)
    one_hot = np.zeros([n_rows,12])
    # Each assignment goes through a basic-slice view, so an out-of-range
    # class index raises instead of spilling into a neighbouring segment.
    one_hot[:,0:4][row_idx, labels['sentiment'].to_numpy()] = 1
    one_hot[:,4:10][row_idx, labels['speech_act'].to_numpy()] = 1
    one_hot[:,10:12][row_idx, labels['sarcasm'].to_numpy()] = 1

    # Every row must carry exactly one active class per task.
    assert one_hot.shape[0]*3 == np.sum(one_hot)

    encoded = tokenizer(texts, truncation=True, padding='max_length',max_length=max_padding)

    dataset = tf.data.Dataset.from_tensor_slices((
        dict(encoded),
        one_hot.tolist()
    ))

    return dataset.batch(batch_size)

In [None]:
class Categorical_loss(tf.keras.losses.Loss):
    """Sum of three focal cross-entropy terms, one per slice of the output vector.

    Columns 0:4 are scored with ``sentiment_alpha``, columns 4:10 with
    ``speech_act_alpha`` and columns 10:12 with ``sarcasm_alpha``. Each term is
    averaged with ``tf.reduce_mean`` before the three are added together.

    ``dangerous_alpha`` is accepted but not used: the matching loss term is
    currently disabled.
    """
    def __init__(self,speech_act_alpha,
                 sentiment_alpha,
                 sarcasm_alpha,
                 dangerous_alpha,reduction=tf.keras.losses.Reduction.NONE,
                 name='Categorical_loss',):
        super().__init__(reduction=reduction, name=name)
        # One focal loss per task, each carrying its own per-class alpha.
        focal = tf.keras.losses.CategoricalFocalCrossentropy
        self.cce_sen = focal(reduction=reduction,alpha=sentiment_alpha)
        self.cce_sa = focal(reduction=reduction,alpha=speech_act_alpha)
        self.cce_sar = focal(reduction=reduction,alpha=sarcasm_alpha)

    def call(self, y_true, y_pred):
        """Return the sum of the batch-mean speech-act, sentiment and sarcasm losses."""
        speech_act_term = tf.reduce_mean(self.cce_sa(y_true[:, 4:10], y_pred[:, 4:10]))
        sentiment_term = tf.reduce_mean(self.cce_sen(y_true[:, 0:4], y_pred[:, 0:4]))
        sarcasm_term = tf.reduce_mean(self.cce_sar(y_true[:, 10:12], y_pred[:, 10:12]))

        return speech_act_term + sentiment_term + sarcasm_term
class Categorical_loss_sentiment(tf.keras.losses.Loss):
    """Mean categorical cross-entropy restricted to output columns 0:4."""
    def __init__(self, reduction=tf.keras.losses.Reduction.NONE, name='Categorical_loss_sentiment'):
        super().__init__(reduction=reduction, name=name)
        # The inner loss uses the same reduction; averaging happens in call().
        self.cce = tf.keras.losses.CategoricalCrossentropy(reduction=reduction)

    def call(self, y_true, y_pred):
        """Return the batch mean of the cross-entropy on columns 0:4."""
        segment_loss = self.cce(y_true[:, 0:4], y_pred[:, 0:4])
        return tf.reduce_mean(segment_loss)

class Categorical_loss_sarcasm(tf.keras.losses.Loss):
    """Mean categorical cross-entropy restricted to output columns 10:12."""
    def __init__(self, reduction=tf.keras.losses.Reduction.NONE, name='Categorical_loss_sarcasm'):
        super().__init__(reduction=reduction, name=name)
        # The inner loss uses the same reduction; averaging happens in call().
        self.cce = tf.keras.losses.CategoricalCrossentropy(reduction=reduction)

    def call(self, y_true, y_pred):
        """Return the batch mean of the cross-entropy on columns 10:12."""
        segment_loss = self.cce(y_true[:, 10:12], y_pred[:, 10:12])
        return tf.reduce_mean(segment_loss)

class Categorical_loss_speech_act(tf.keras.losses.Loss):
    """Mean categorical cross-entropy restricted to output columns 4:10."""
    def __init__(self, reduction=tf.keras.losses.Reduction.NONE, name='Categorical_loss_speech_act'):
        super().__init__(reduction=reduction, name=name)
        # The inner loss uses the same reduction; averaging happens in call().
        self.cce = tf.keras.losses.CategoricalCrossentropy(reduction=reduction)

    def call(self, y_true, y_pred):
        """Return the batch mean of the cross-entropy on columns 4:10."""
        segment_loss = self.cce(y_true[:, 4:10], y_pred[:, 4:10])
        return tf.reduce_mean(segment_loss)

class Categorical_loss_dangerous(tf.keras.losses.Loss):
    """Mean categorical cross-entropy restricted to output columns 12 onward.

    NOTE(review): the model built in this notebook concatenates 4 + 6 + 2 = 12
    output columns, so this slice is empty for that model.
    """
    def __init__(self, reduction=tf.keras.losses.Reduction.NONE, name='Categorical_loss_dangerous'):
        super().__init__(reduction=reduction, name=name)
        # The inner loss uses the same reduction; averaging happens in call().
        self.cce = tf.keras.losses.CategoricalCrossentropy(reduction=reduction)

    def call(self, y_true, y_pred):
        """Return the batch mean of the cross-entropy on columns 12 onward."""
        segment_loss = self.cce(y_true[:, 12:], y_pred[:, 12:])
        return tf.reduce_mean(segment_loss)


In [None]:
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
import logging

logging.getLogger('tensorflow').setLevel(logging.ERROR)

from tensorflow.keras.callbacks import Callback

class CosineAnnealer:
    """Stateful half-cosine interpolation from ``start`` to ``end`` over ``steps`` calls.

    Each call to ``step()`` advances an internal counter ``n`` and returns the
    value for that position; after ``steps`` calls the value equals ``end``.
    """

    def __init__(self, start, end, steps):
        self.start, self.end, self.steps = start, end, steps
        self.n = 0

    def step(self):
        """Advance the counter by one and return the annealed value."""
        self.n += 1
        progress = self.n / self.steps
        # cos(pi * progress) + 1 falls from 2 to 0 as progress goes from 0 to 1.
        cosine_term = np.cos(np.pi * progress) + 1
        half_span = (self.start - self.end) / 2.
        return self.end + half_span * cosine_term


class OneCycleScheduler(Callback):
    """Keras callback that drives learning rate and momentum through two cosine phases.

    Phase 1 lasts ``steps * phase_1_pct`` batches: the learning rate anneals
    from ``lr_max / div_factor`` to ``lr_max`` while momentum goes from
    ``mom_max`` to ``mom_min``. Phase 2 covers the remaining batches: the
    learning rate anneals from ``lr_max`` to ``lr_max / (div_factor * 100)``
    and momentum returns from ``mom_min`` to ``mom_max``.

    New values are written to the optimizer after every training batch. If the
    optimizer has no ``lr`` or ``momentum`` attribute, the corresponding
    get/set call is silently skipped.
    """

    def __init__(self, lr_max, steps, mom_min=0.85, mom_max=0.95, phase_1_pct=0.25, div_factor=2.):
        super().__init__()
        initial_lr = lr_max / div_factor
        final_lr = lr_max / (div_factor * 1e2)
        warmup_steps = steps * phase_1_pct
        decay_steps = steps - warmup_steps

        self.phase_1_steps = warmup_steps
        self.phase_2_steps = decay_steps
        self.phase = 0
        self.step = 0

        # Each phase is a [learning-rate annealer, momentum annealer] pair.
        warmup_phase = [CosineAnnealer(initial_lr, lr_max, warmup_steps),
                        CosineAnnealer(mom_max, mom_min, warmup_steps)]
        decay_phase = [CosineAnnealer(lr_max, final_lr, decay_steps),
                       CosineAnnealer(mom_min, mom_max, decay_steps)]
        self.phases = [warmup_phase, decay_phase]

        # History of the values observed at the start of each batch.
        self.lrs = []
        self.moms = []

    def on_train_begin(self, logs=None):
        """Reset the counters and apply the starting values of phase 1."""
        self.phase = 0
        self.step = 0

        self.set_lr(self.lr_schedule().start)
        self.set_momentum(self.mom_schedule().start)

    def on_train_batch_begin(self, batch, logs=None):
        """Record the learning rate and momentum currently set on the optimizer."""
        self.lrs.append(self.get_lr())
        self.moms.append(self.get_momentum())

    def on_train_batch_end(self, batch, logs=None):
        """Count the batch, switch to phase 2 when due, and apply the next values."""
        self.step += 1
        if self.step >= self.phase_1_steps:
            self.phase = 1

        self.set_lr(self.lr_schedule().step())
        self.set_momentum(self.mom_schedule().step())

    def get_lr(self):
        """Return the optimizer's learning rate, or None if it cannot be read."""
        try:
            return tf.keras.backend.get_value(self.model.optimizer.lr)
        except AttributeError:
            return None

    def get_momentum(self):
        """Return the optimizer's momentum, or None if it cannot be read."""
        try:
            return tf.keras.backend.get_value(self.model.optimizer.momentum)
        except AttributeError:
            return None

    def set_lr(self, lr):
        """Write ``lr`` to the optimizer; an AttributeError is ignored."""
        try:
            tf.keras.backend.set_value(self.model.optimizer.lr, lr)
        except AttributeError:
            pass # ignore

    def set_momentum(self, mom):
        """Write ``mom`` to the optimizer; an AttributeError is ignored."""
        try:
            tf.keras.backend.set_value(self.model.optimizer.momentum, mom)
        except AttributeError:
            pass # ignore

    def lr_schedule(self):
        """Return the learning-rate annealer of the active phase."""
        lr_annealer, _ = self.phases[self.phase]
        return lr_annealer

    def mom_schedule(self):
        """Return the momentum annealer of the active phase."""
        _, mom_annealer = self.phases[self.phase]
        return mom_annealer

    def plot(self):
        """Plot the recorded learning-rate and momentum histories side by side."""
        panels = ((self.lrs, 'Learning Rate'), (self.moms, 'Momentum'))
        for position, (history, title) in enumerate(panels, start=1):
            ax = plt.subplot(1, 2, position)
            ax.plot(history)
            ax.set_title(title)

In [None]:
def get_model():
    """Build the multi-task classifier on top of MARBERTv2.

    The first-token vector of the last hidden state is passed through dropout
    (rate ``DROP_OUT``) and fed to three independent heads, each made of three
    768-unit ReLU layers followed by a softmax layer. The head outputs are
    concatenated as [sentiment (4), speech act (6), sarcasm (2)].

    Returns:
        A ``tf.keras.Model`` taking ``input_ids``, ``token_type_ids`` and
        ``attention_mask`` and producing a 12-wide output.
    """
    def _head(features, layer_names, n_classes):
        # Every name but the last is a hidden ReLU layer; the last is the softmax output.
        hidden = features
        for layer_name in layer_names[:-1]:
            hidden = tf.keras.layers.Dense(768,activation='relu',name=layer_name)(hidden)
        return tf.keras.layers.Dense(n_classes,activation='softmax',name=layer_names[-1])(hidden)

    encoder = TFAutoModelForMaskedLM.from_pretrained("UBC-NLP/MARBERTv2",name='BERT')
    input_ids = tf.keras.Input(shape=(None, ),dtype='int32',name='input_ids')
    token_type_ids = tf.keras.Input(shape=(None, ),dtype='int32',name='token_type_ids')
    attention_mask = tf.keras.Input(shape=(None, ), dtype='int32',name='attention_mask')

    encoded = encoder(input_ids,attention_mask, token_type_ids,output_hidden_states=True)
    # First-token vector of the final hidden layer.
    cls_vector = encoded.hidden_states[-1][:,0,:]
    shared = tf.keras.layers.Dropout(DROP_OUT)(cls_vector)

    # Heads are built in this order so the concatenated layout stays 0:4 / 4:10 / 10:12.
    sentiment = _head(shared, ('sentiment','sentiment2','sentiment3','sentiment_out'), 4)
    speech_act = _head(shared, ('speech_act','speech_act2','speech_act3','speech_act_out'), 6)
    sarcasm = _head(shared, ('sarcasm','sarcasm2','sarcasm3','sarcasm_out'), 2)

    merged = tf.keras.layers.Concatenate(axis = -1)([sentiment,speech_act,sarcasm])

    return tf.keras.Model(inputs=[input_ids,token_type_ids, attention_mask], outputs=merged)

In [None]:
data = pd.read_csv('/kaggle/input/arabic-youtube-comments-by-khalaya/data.csv')

In [None]:
# Normalise misspelled speech-act labels; the pairs are applied in this order.
for misspelled, corrected in (
    ("Recomendation","Recommendation"),
    ("Recommmendation","Recommendation"),
    ("Recommenation","Recommendation"),
    ("Experssion","Expression"),
):
    data['speech_act'] = data.speech_act.str.replace(misspelled, corrected)

In [None]:
speech_act_counts = data.value_counts('speech_act')

In [None]:
# Remove the rows whose speech-act label occurs fewer than 100 times.
# The mask is built from the 'speech_act' column only: calling isin() on the
# whole frame yields a cell-wise mask that blanks matching cells to NaN
# instead of filtering rows.
rare_speech_acts = list(speech_act_counts[speech_act_counts < 100].index)
data = data[~data['speech_act'].isin(rare_speech_acts)]

In [None]:
# Per-class focal-loss weights (1 - class frequency) for speech act.
# value_counts() orders classes by frequency, whereas the loss indexes them by
# the encoder defined in get_ds, so reorder explicitly to that class order.
# A class absent from the data gets frequency 0, i.e. weight 1.
speech_act_alpha = (
    1 - data['speech_act'].value_counts(normalize=True).reindex(
        ['Expression','Assertion','Question','Recommendation','Request','Miscellaneous'],
        fill_value=0,
    )
).tolist()

In [None]:
# Per-class focal-loss weights (1 - class frequency) for sentiment.
# value_counts() orders classes by frequency, whereas the loss indexes them by
# the encoder defined in get_ds, so reorder explicitly to that class order.
# A class absent from the data gets frequency 0, i.e. weight 1.
sentiment_alpha = (
    1 - data['sentiment'].value_counts(normalize=True).reindex(
        ['Positive','Neutral','Negative','Mixed'],
        fill_value=0,
    )
).tolist()

In [None]:
# Per-class focal-loss weights (1 - class frequency) for sarcasm.
# get_ds casts this column to int, so the smaller value (False/0) is class 0.
# sort_index() puts the weights in that class order instead of the
# frequency order that value_counts() returns.
sarcasm_alpha = (1 - data['sarcasm'].value_counts(normalize=True).sort_index()).tolist()

In [None]:
# Weights (1 - relative frequency) for the values of the 'dangerous' column.
# NOTE(review): this list is passed to Categorical_loss, but the loss term
# that would consume it is commented out there, so it currently has no effect.
dangerous_alpha = (1 - (data.value_counts('dangerous') / data.value_counts('dangerous').sum())).tolist()

In [None]:
# Keep only the rows that have a speech-act label.
data = data.dropna(subset=['speech_act'])

In [None]:
# Shuffle all rows. The seed matches the random_state used by the
# train/validation/test splits; without it the row order (and therefore the
# seeded splits) changes on every run.
data = data.sample(n = data.shape[0], random_state=42)

In [None]:
tokenizer = AutoTokenizer.from_pretrained("UBC-NLP/MARBERTv2")

In [None]:
X_train, X_test, y_train, y_test = train_test_split(data['comment'],
                                                    data[['sentiment','speech_act','sarcasm']],
                                                    test_size=0.1,
                                                    random_state=42,
                                                    shuffle=True)
X_train, X_val, y_train, y_val = train_test_split(X_train,
                                                    y_train,
                                                    test_size=0.1,
                                                    random_state=42,
                                                    shuffle=True)

In [None]:
train_tensor = get_ds(pd.concat([X_train,y_train],axis=1),
                      Xcol='comment',
                      ycol=['sentiment','speech_act','sarcasm'],
                      max_padding=MAX_LENGTH,
                      tokenizer=tokenizer,
                      batch_size=BATCH_SIZE)
val_tensor = get_ds(pd.concat([X_val,y_val],axis=1),
                      Xcol='comment',
                      ycol=['sentiment','speech_act','sarcasm'],
                      max_padding=MAX_LENGTH,
                      tokenizer=tokenizer,
                      batch_size=BATCH_SIZE)

In [None]:
# One-cycle schedule spanning every training batch of every epoch.
lr_schedule = OneCycleScheduler(LEARNING_RATE_MAX, len(train_tensor) * EPOCHS,phase_1_pct=PCT)

# Everything that creates variables (the model, the optimizer, and the
# loss/metric objects handed to compile) is built inside the strategy scope so
# that it is placed on the TPU replicas.
with tpu_strategy.scope():
    model = get_model()

    optimizer = tf.keras.optimizers.AdamW(learning_rate=LEARNING_RATE,epsilon=1e-8,beta_1=0.9,beta_2=0.999,weight_decay=WD)
    model.compile(optimizer = optimizer,
                loss = Categorical_loss(speech_act_alpha=speech_act_alpha,
                 sentiment_alpha=sentiment_alpha,
                 sarcasm_alpha=sarcasm_alpha,
                 dangerous_alpha=dangerous_alpha),
                metrics = [f_beta_score,Categorical_loss_speech_act(),Categorical_loss_sentiment(),Categorical_loss_sarcasm()]
                )

# summary() prints the table itself and returns None, so it is not wrapped in print().
model.summary()
model.fit(train_tensor,validation_data = val_tensor,epochs=EPOCHS,callbacks=[lr_schedule])

Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: Unknown node type <gast.gast.ClassDef object at 0x7a64da36d630>
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: Unknown node type <gast.gast.ClassDef object at 0x7a64db0b3370>


TypeError: Exception encountered when calling layer "BERT" (type TFBertForMaskedLM).

in user code:

    File "/usr/local/lib/python3.10/dist-packages/transformers/modeling_tf_utils.py", line 802, in run_call_with_unpacked_inputs  *
        return func(self, **unpacked_inputs)
    File "/usr/local/lib/python3.10/dist-packages/transformers/models/bert/modeling_tf_bert.py", line 888, in call
        use_cache = False

    TypeError: Exception encountered when calling layer 'bert' (type TFBertMainLayer).
    
    in user code:
    
        File "/usr/local/lib/python3.10/dist-packages/transformers/modeling_tf_utils.py", line 802, in run_call_with_unpacked_inputs  *
            return func(self, **unpacked_inputs)
        File "/usr/local/lib/python3.10/dist-packages/transformers/models/bert/modeling_tf_bert.py", line 888, in call  **
            use_cache = False
        File "/usr/local/lib/python3.10/dist-packages/keras/utils/traceback_utils.py", line 70, in error_handler
            
    
        TypeError: Exception encountered when calling layer 'embeddings' (type TFBertEmbeddings).
        
        in user code:
        
        
            TypeError: outer_factory.<locals>.inner_factory.<locals>.tf__build() got an unexpected keyword argument 'position_ids'
        
        
        Call arguments received by layer 'embeddings' (type TFBertEmbeddings):
          • input_ids=tf.Tensor(shape=(None, None), dtype=int32)
          • position_ids=None
          • token_type_ids=tf.Tensor(shape=(None, None), dtype=int32)
          • inputs_embeds=None
          • past_key_values_length=0
          • training=False
    
    
    Call arguments received by layer 'bert' (type TFBertMainLayer):
      • input_ids=tf.Tensor(shape=(None, None), dtype=int32)
      • attention_mask=tf.Tensor(shape=(None, None), dtype=int32)
      • token_type_ids=tf.Tensor(shape=(None, None), dtype=int32)
      • position_ids=None
      • head_mask=None
      • inputs_embeds=None
      • encoder_hidden_states=None
      • encoder_attention_mask=None
      • past_key_values=None
      • use_cache=None
      • output_attentions=False
      • output_hidden_states=True
      • return_dict=True
      • training=False


Call arguments received by layer "BERT" (type TFBertForMaskedLM):
  • input_ids=tf.Tensor(shape=(None, None), dtype=int32)
  • attention_mask=tf.Tensor(shape=(None, None), dtype=int32)
  • token_type_ids=tf.Tensor(shape=(None, None), dtype=int32)
  • position_ids=None
  • head_mask=None
  • inputs_embeds=None
  • output_attentions=None
  • output_hidden_states=True
  • return_dict=None
  • labels=None
  • training=False

In [None]:
test_tensor = get_ds(pd.concat([X_test,y_test],axis=1),
                      Xcol='comment',
                      ycol=['sentiment','speech_act','sarcasm'],
                      max_padding=MAX_LENGTH,
                      tokenizer=tokenizer,
                      batch_size=1)

In [None]:
preds = model.predict(test_tensor)

In [None]:
preds = np.stack(list(preds))

In [None]:
# Each dataset element is a (features, labels) pair; the test dataset is
# batched with batch_size=1, so index 0 extracts the single label row.
test = [element[1].numpy()[0] for element in test_tensor]

In [None]:
test = np.stack(test)

In [None]:
test.shape

In [None]:
sentiment_pred = np.argmax(preds[:,0:4],axis=1)
speech_act_pred = np.argmax(preds[:,4:10],axis=1)
sarcasm_pred = np.argmax(preds[:,10:12],axis=1)
# dangerous_pred = np.argmax(preds[:,12:14],axis=1)

sentiment_test = np.argmax(test[:,0:4],axis=1)
speech_act_test = np.argmax(test[:,4:10],axis=1)
sarcasm_test = np.argmax(test[:,10:12],axis=1)
# dangerous_test = np.argmax(test[:,12:14],axis=1)

In [None]:
print(classification_report(y_true=sentiment_test,y_pred=sentiment_pred))
print(classification_report(y_true=speech_act_test,y_pred=speech_act_pred))
print(classification_report(y_true=sarcasm_test,y_pred=sarcasm_pred))
# print(classification_report(y_true=dangerous_test,y_pred=dangerous_pred))

In [None]:
model.save_weights('model_3L4.h5')

# Import the dataset from kaggle

In [None]:
# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES
# TO THE CORRECT LOCATION (/kaggle/input) IN YOUR NOTEBOOK,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

import os
import sys
from tempfile import NamedTemporaryFile
from urllib.request import urlopen
from urllib.parse import unquote, urlparse
from urllib.error import HTTPError
from zipfile import ZipFile
import tarfile
import shutil

CHUNK_SIZE = 40960
DATA_SOURCE_MAPPING = 'sarcasm-data-arabic-language:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-data-sets%2F4243361%2F7312608%2Fbundle%2Farchive.zip%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20240524%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20240524T231919Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3D8dd7d7cd131c35071db5e9137c4f95ffacd62130b25beab74978c3d4e55f231550bf74c190a25dfc7e5ab91d2d7186774827f5671949d97b0756b83903ee46dc0e94f78d554c57ae7e9f21f5b1af4d98a44eb3b162610f73a5468e2d091581b1c82df0f691d6dcac12d6430391134fd056531fde2ec3038ad23dc2ad7393ae81abe44ea9379be237ef74b07fb130605ba921532f7251e94e47905730739f27c9cec3ff148e9db7cbd993bea59504c620c927f1c04084d73bb43c20cfa7dc3d041d4ee3d2867768c669de064c3bad7fefa48bc5e216bcd23de79945d823f15bb1bdbf4d426ecb57e10879e254c7fad03bb6cb4f5d3d4fff1eb1e3874235b1240b'

KAGGLE_INPUT_PATH='/kaggle/input'
KAGGLE_WORKING_PATH='/kaggle/working'
KAGGLE_SYMLINK='kaggle'

!umount /kaggle/input/ 2> /dev/null
shutil.rmtree('/kaggle/input', ignore_errors=True)
os.makedirs(KAGGLE_INPUT_PATH, 0o777, exist_ok=True)
os.makedirs(KAGGLE_WORKING_PATH, 0o777, exist_ok=True)

try:
  os.symlink(KAGGLE_INPUT_PATH, os.path.join("..", 'input'), target_is_directory=True)
except FileExistsError:
  pass
try:
  os.symlink(KAGGLE_WORKING_PATH, os.path.join("..", 'working'), target_is_directory=True)
except FileExistsError:
  pass

# Download every "<directory>:<percent-encoded URL>" entry of DATA_SOURCE_MAPPING
# into a temporary file and unpack it under KAGGLE_INPUT_PATH/<directory>.
for data_source_mapping in DATA_SOURCE_MAPPING.split(','):
    # Split on the first ':' only, so a stray colon in the URL part cannot
    # break the two-way unpacking.
    directory, download_url_encoded = data_source_mapping.split(':', 1)
    download_url = unquote(download_url_encoded)
    filename = urlparse(download_url).path
    destination_path = os.path.join(KAGGLE_INPUT_PATH, directory)
    try:
        with urlopen(download_url) as fileres, NamedTemporaryFile() as tfile:
            total_length = fileres.headers['content-length']
            print(f'Downloading {directory}, {total_length} bytes compressed')
            # The Content-Length header can be absent; in that case the
            # progress bar simply stays empty instead of raising on int(None).
            expected_bytes = int(total_length) if total_length else 0
            dl = 0
            # Named `chunk` rather than `data` so this loop does not overwrite
            # a notebook-level `data` variable.
            chunk = fileres.read(CHUNK_SIZE)
            while len(chunk) > 0:
                dl += len(chunk)
                tfile.write(chunk)
                done = min(50, int(50 * dl / expected_bytes)) if expected_bytes else 0
                sys.stdout.write(f"\r[{'=' * done}{' ' * (50-done)}] {dl} bytes downloaded")
                sys.stdout.flush()
                chunk = fileres.read(CHUNK_SIZE)
            # tarfile.open() below reopens the file by name, so buffered
            # writes must reach the disk before extraction starts.
            tfile.flush()
            if filename.endswith('.zip'):
                with ZipFile(tfile) as zfile:
                    zfile.extractall(destination_path)
            else:
                # Bind the archive to its own name; reusing `tarfile` here
                # would rebind the imported module for the rest of the session.
                with tarfile.open(tfile.name) as archive:
                    archive.extractall(destination_path)
            print(f'\nDownloaded and uncompressed: {directory}')
    except HTTPError:
        # HTTPError is a subclass of OSError, so it has to be handled first.
        print(f'Failed to load (likely expired) {download_url} to path {destination_path}')
        continue
    except OSError:
        print(f'Failed to load {download_url} to path {destination_path}')
        continue

print('Data source import complete.')


Downloading sarcasm-data-arabic-language, 786381 bytes compressed
Downloaded and uncompressed: sarcasm-data-arabic-language
Data source import complete.


# Import important libraries

In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
from transformers import AutoTokenizer, TFAutoModelForSequenceClassification
from tensorflow.keras import backend as K

# Make use of the TPU

In [None]:
import tensorflow as tf
print("Tensorflow version " + tf.__version__)

try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()  # TPU detection
except ValueError as err:
    # Raise an ordinary Exception subclass (not bare BaseException) so the
    # failure behaves like any other runtime error, and keep the resolver's
    # original error attached as the cause.
    raise RuntimeError('ERROR: Not connected to a TPU runtime; please see the previous cell in this notebook for instructions!') from err

# Connect to the resolved cluster, initialise the TPU system and build the
# distribution strategy that later cells use via `tpu_strategy.scope()`.
tf.config.experimental_connect_to_cluster(tpu)
tf.tpu.experimental.initialize_tpu_system(tpu)
tpu_strategy = tf.distribute.TPUStrategy(tpu)

Tensorflow version 2.12.0


# Load the model and the tokenizer

The model is MARBERTv2, a BERT model trained on the Arabic language by UBC.

This is the model hugging face repo:
https://huggingface.co/UBC-NLP/MARBERTv2

In [None]:
with tpu_strategy.scope():
    model = TFAutoModelForSequenceClassification.from_pretrained('UBC-NLP/MARBERTv2',num_labels=2)
tokenizer = AutoTokenizer.from_pretrained("UBC-NLP/MARBERTv2")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/757 [00:00<?, ?B/s]

tf_model.h5:   0%|          | 0.00/652M [00:00<?, ?B/s]

All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at UBC-NLP/MARBERTv2 and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tokenizer_config.json:   0%|          | 0.00/439 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/1.10M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

# Load and transform data

In [None]:
sarcasm_train = pd.read_csv('/kaggle/input/sarcasm-data-arabic-language/ArSarcasm_train.csv')
sarcasm_test = pd.read_csv('/kaggle/input/sarcasm-data-arabic-language/ArSarcasm_test.csv')

In [None]:
# sarcasm_train = pd.read_csv ('/content/test_data.csv' , encoding='utf-8')
# sarcasm_test = pd.read_csv ('/content/ArSarcasm_test.csv', encoding='cp1256')

In [None]:
sarcasm_train.head()

Unnamed: 0,dialect,sarcasm,sentiment,original_sentiment,tweet,source
0,gulf,False,negative,negative,"""نصيحه ما عمرك اتنزل لعبة سوبر ماريو مش زي ما ...",semeval
1,msa,False,neutral,positive,"""#نادين_نسيب_نجيم ❤️❤️❤️مجلة #ماري_كلير 💭#ملكة...",semeval
2,egypt,False,neutral,neutral,"""@Alito_NBA اتوقع انه بيستمر""",semeval
3,levant,True,neutral,negative,"""@KSA24 يعني ""بموافقتنا"" لأن دمشق صايرة موسكو""",semeval
4,msa,False,neutral,negative,"""RT @alaahmad20: قائد في الحرس يعترف بفقدان ال...",semeval


The target column is sarcasm

Now we need to convert the sarcasm column to a one-hot encoding.

#### Show data distribution

In [None]:
sarcasm_train['sarcasm'].value_counts()

False    7100
True     1337
Name: sarcasm, dtype: int64

#### The data is imbalanced, so we will assign a weight to every class

In [None]:
counts = sarcasm_train['sarcasm'].value_counts()
class_weights = 1 - (counts / counts.sum())

In [None]:
class_weights = class_weights.tolist()

In [None]:
class_weights

[0.15846864999407373, 0.8415313500059263]

#### As we see, we give the class with lower count a high weight

### Transform the data

In [None]:
targer_values = sarcasm_train['sarcasm'].astype(int)

In [None]:
targer_values[0:5]

0    0
1    0
2    0
3    1
4    0
Name: sarcasm, dtype: int64

In [None]:
zero_arr = np.zeros([targer_values.shape[0],2])
targer_values = targer_values.to_numpy()
zero_arr[np.arange(targer_values.shape[0]), targer_values] = 1
y = zero_arr

In [None]:
y[0:5]

array([[1., 0.],
       [1., 0.],
       [1., 0.],
       [0., 1.],
       [1., 0.]])

In [None]:
X = sarcasm_train['tweet']

Now we need to tokenize the text to be ready to input to the model

In [None]:
X_tokens = tokenizer(X.tolist(), padding=True)

Now make a tensorflow dataset and batch it

In [None]:
train_dataset = tf.data.Dataset.from_tensor_slices((
      dict(X_tokens),
      y.tolist()
  ))

In [None]:
train_dataset = train_dataset.batch(32)

# Train the model

#### A function that computes the F1 score, to be used as a metric

In [None]:
def f_beta_score(y_true, y_pred):
    """Batch-level F-score (with beta fixed to 1) computed with Keras backend ops.

    Both tensors are clipped to [0, 1] and rounded before counting, so a
    prediction counts as positive once it rounds to 1. Counts are summed over
    every element of the tensors, and K.epsilon() guards each division.

    Args:
        y_true: ground-truth tensor.
        y_pred: prediction tensor, element-wise compatible with ``y_true``.

    Returns:
        A scalar tensor: (beta + 1) * P * R / (P + R + epsilon).
    """
    beta = 1

    # Counts of true positives, actual positives and predicted positives.
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    actual_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))

    precision = true_positives / (predicted_positives+K.epsilon())
    recall = true_positives / (actual_positives+K.epsilon())

    return (beta+1)*((precision*recall)/(precision+recall+K.epsilon()))

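#### The optimizer is AdamW
#### The intended loss function is **focal cross entropy**, because the data is imbalanced and **focal cross entropy** helps the model learn hard examples. Note that the cell below currently compiles the model with plain `CategoricalCrossentropy`, so the class weights computed above are not applied.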

In [None]:
# The model was created under tpu_strategy.scope(); the optimizer and loss are
# created, and the model compiled, under the same scope so that their
# variables are placed on the TPU replicas as well.
with tpu_strategy.scope():
    optimizer = tf.keras.optimizers.AdamW(learning_rate=2e-5, weight_decay=0.001)

    loss = tf.keras.losses.CategoricalCrossentropy(from_logits=True)  # Set from_logits according to your model's output

    model.compile(optimizer=optimizer, loss=loss, metrics=['accuracy'])

In [None]:
model.fit(train_dataset, epochs=5)

Epoch 1/5


AttributeError: in user code:

    File "/usr/local/lib/python3.10/dist-packages/keras/engine/training.py", line 1284, in train_function  *
        return step_function(self, iterator)
    File "/usr/local/lib/python3.10/dist-packages/keras/engine/training.py", line 1268, in step_function  **
        
    File "/usr/local/lib/python3.10/dist-packages/keras/engine/training.py", line 1249, in run_step  **
        
    File "/usr/local/lib/python3.10/dist-packages/transformers/modeling_tf_utils.py", line 1630, in train_step
        x, y, sample_weight = keras.utils.unpack_x_y_sample_weight(data)

    AttributeError: module 'keras.utils' has no attribute 'unpack_x_y_sample_weight'


# Inference

In [None]:
def predict(text):
    """Classify ``text`` as 'sarcastic' or 'not-sarcastic' with the global model.

    Args:
        text: a single string, or a list of strings.

    Returns:
        The label for a single string, or a list of labels (one per input)
        when a list of strings is given.
    """
    # Truncate to the model's maximum position count so that over-long inputs
    # cannot overflow the position embeddings.
    tokens = tokenizer(text, padding=True, truncation=True,
                       max_length=model.config.max_position_embeddings,
                       return_tensors='tf')
    # The model returns raw logits (not probabilities), one row per input.
    logits = model(**tokens).logits.numpy()
    # argmax along the class axis; a bare argmax() would index the flattened
    # array and be wrong for more than one input.
    class_ids = logits.argmax(axis=-1)
    labels = ['sarcastic' if class_id == 1 else "not-sarcastic" for class_id in class_ids]
    return labels[0] if isinstance(text, str) else labels

In [None]:
text = """
السلام عليكم
"""
predict(text)