In [None]:
import os
import re
import pandas as pd
import tensorflow as tf
from datasets import Dataset
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, TFAutoModelForSequenceClassification
from transformers import DataCollatorWithPadding, create_optimizer
from keras.callbacks import ModelCheckpoint


model = TFAutoModelForSequenceClassification.from_pretrained('Geotrend/bert-base-uk-cased', num_labels=1)
tokenizer = AutoTokenizer.from_pretrained('Geotrend/bert-base-uk-cased')

Executing op VarHandleOp in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op AssignVariableOp in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op VarHandleOp in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op AssignVariableOp in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op VarHandleOp in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op AssignVariableOp in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op _EagerConst in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op _EagerConst in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op StatelessRandomGetKeyCounter in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op StatelessTruncatedNormalV2 in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op Mul in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op AddV2 in device /job:localhost/replica:0/task:0/device:GPU:0
Executi

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertForSequenceClassification: ['bert.embeddings.position_ids']
- This IS expected if you are initializing TFBertForSequenceClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertForSequenceClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['bert.pooler.dense.weight', 'bert.pooler.dense.bias', 'classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
df = pd.read_csv('/content/harmful_cleaned.xlsx - Worksheet(whout memes).csv')
df['label'] = df['label'].astype('category')
df = df.dropna()

df.head()
print(df.dtypes)

text       object
label    category
dtype: object


In [None]:
def clean_text(text):
    if not isinstance(text, str):
        return ""

    text = text.lower()
    text = re.sub(r"http\S+", " ", text)
    text = re.sub(r"@\w+", " ", text)
    text = re.sub(r"#\w+", " ", text)
    text = re.sub(r"[^\w\s]", " ", text)
    text = re.sub(r"([a-z])([а-яєіїґ])", r"\1 \2", text)
    text = re.sub(r"([а-яєіїґ])([a-z])", r"\1 \2", text)
    text = re.sub(r"([a-zа-яєіїґ])(\d)", r"\1 \2", text)
    text = re.sub(r"(\d)([a-zа-яєіїґ])", r"\1 \2", text)
    text = re.sub(r"\s+", " ", text).strip()

    return text

df['text'] = df['text'].apply(clean_text)

df.head()

Unnamed: 0,text,label
0,до 15 шахедів атакують зараз одесу монітори у ...,0.0
1,увага сьогодні може бути масштабна атака шахед...,0.0
2,давно шукаєш для себе авто за вигідною ціною т...,1.0
3,10 річний малий зґвалтував свою 7 річну подруг...,0.0
4,український нардеп подарував годинник richard ...,0.0


In [None]:
x,y = df['text'], df['label']

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0)

print(y_train.head())

1852    0.0
318     1.0
992     0.0
465     0.0
1713    0.0
Name: label, dtype: category
Categories (3, float64): [0.0, 1.0, 840.0]


In [None]:
def tokenize(text, max_length=128):
    return tokenizer(
        text.tolist(),
        max_length=max_length,
        padding='max_length',
        truncation=True,
        return_tensors='tf',
    )


train_features = tokenize(x_train)
val_features = tokenize(x_test)

Executing op _EagerConst in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op _EagerConst in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op _EagerConst in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op _EagerConst in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op _EagerConst in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op _EagerConst in device /job:localhost/replica:0/task:0/device:GPU:0


In [None]:
train_dataset = tf.data.Dataset.from_tensor_slices((
    dict(train_features),
    y_train
)).batch(32)

test_dataset = tf.data.Dataset.from_tensor_slices((
    dict(val_features),
    y_test
)).batch(32)

Executing op _EagerConst in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op TensorSliceDataset in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op BatchDatasetV2 in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op _EagerConst in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op TensorSliceDataset in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op BatchDatasetV2 in device /job:localhost/replica:0/task:0/device:CPU:0


In [None]:
epochs = 5
num_train_steps = len(train_dataset) * epochs
num_warmup_steps = int(0.1 * num_train_steps)


optimizer, lr_schedule = create_optimizer(
    init_lr=3e-5,
    num_train_steps=num_train_steps,
    num_warmup_steps=num_warmup_steps,
    weight_decay_rate=0.01
)


loss = tf.keras.losses.BinaryCrossentropy(from_logits=True)
metrics = [
    tf.keras.metrics.SparseCategoricalAccuracy(name='accuracy'),
    tf.keras.metrics.Precision(name='precision'),
    tf.keras.metrics.Recall(name='recall')
]

model.compile(optimizer=optimizer, loss=loss, metrics=metrics)

os.makedirs('model_checkpoints', exist_ok=True)

checkpoint_callback = ModelCheckpoint(
    filepath='model_checkpoints/model_epoch_{epoch}.keras',
    save_weights_only=False,
    save_freq='epoch',
    monitor='val_loss',
    save_best_only=False
)

Executing op DatasetCardinality in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op VarHandleOp in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op AssignVariableOp in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op ReadVariableOp in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op Cast in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op _EagerConst in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op Cast in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op RealDiv in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op Pow in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op Mul in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op Less in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op VarHandleOp in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op AssignVariableOp in device /job:localhost/replica:0/t

In [None]:
history = model.fit(
    train_dataset,
    validation_data=test_dataset,
    epochs=5,
    verbose=2,
    # callbacks=[checkpoint_callback]
)

Executing op ReadVariableOp in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op LessEqual in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op DatasetCardinality in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op ReadVariableOp in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op Identity in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op DatasetCardinality in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op Equal in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op GreaterEqual in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op ReadVariableOp in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op Identity in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op ReadVariableOp in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op Identity in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op AssignVariableOp 

In [None]:
history

<tf_keras.src.callbacks.History at 0x7f2cb73120d0>

In [None]:
history.history

{'loss': [0.061180468648672104,
  0.03306609392166138,
  0.0239361971616745,
  0.007725817151367664],
 'accuracy': [0.9793103337287903,
  0.9899686574935913,
  0.9924764633178711,
  0.9968652129173279],
 'val_loss': [0.533368706703186,
  0.7226611375808716,
  0.7380644679069519,
  0.7751190066337585],
 'val_accuracy': [0.8220551609992981,
  0.8170425891876221,
  0.8195488452911377,
  0.8220551609992981]}

In [None]:
y_pred = model.predict(test_dataset)

Executing op ReadVariableOp in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op LessEqual in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op DatasetCardinality in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op ReadVariableOp in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op Identity in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op DatasetCardinality in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op Equal in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op GreaterEqual in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op ReadVariableOp in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op Identity in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op ReadVariableOp in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op Identity in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op AssignVariableOp 

In [None]:
predictions = tf.where(tf.math.sigmoid(y_pred.logits) > 0.5, 1, 0)
predictions

Executing op _EagerConst in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op Sigmoid in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op Greater in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op SelectV2 in device /job:localhost/replica:0/task:0/device:GPU:0


<tf.Tensor: shape=(399, 1), dtype=int32, numpy=
array([[1],
       [0],
       [0],
       [0],
       [0],
       [0],
       [1],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [1],
       [0],
       [1],
       [1],
       [1],
       [0],
       [0],
       [1],
       [1],
       [0],
       [1],
       [0],
       [0],
       [0],
       [0],
       [1],
       [0],
       [1],
       [1],
       [1],
       [0],
       [1],
       [1],
       [0],
       [0],
       [1],
       [0],
       [0],
       [1],
       [1],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [1],
       [1],
       [1],
       [0],
       [1],
       [0],
       [1],
       [0],
       [1],
       [0],
       [1],
       [1],
       [0],
       [0],
       [0],
       [1],
       [0],
       [0],
       [1],
       [1],
       [0],
       [0],
       [1],
       [0],
       [1],
       [0],
       [1],
       [1],
    

In [None]:
from sklearn.metrics import classification_report

print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

         0.0       0.85      0.83      0.84       226
         1.0       0.78      0.80      0.79       173

    accuracy                           0.82       399
   macro avg       0.81      0.82      0.81       399
weighted avg       0.82      0.82      0.82       399



In [None]:
model.save_pretrained('./my_trained_model')
tokenizer.save_pretrained('./my_trained_model')


Executing op ReadVariableOp in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op Identity in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op ReadVariableOp in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op Identity in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op ReadVariableOp in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op Identity in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op ReadVariableOp in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op Identity in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op ReadVariableOp in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op Identity in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op ReadVariableOp in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op Identity in device /job:localhost/replica:0/task:0/device:GPU:0
Executing op ReadVariableOp in device /j

('./my_trained_model/tokenizer_config.json',
 './my_trained_model/special_tokens_map.json',
 './my_trained_model/vocab.txt',
 './my_trained_model/added_tokens.json',
 './my_trained_model/tokenizer.json')

In [None]:
!zip -r my_trained_model.zip my_trained_model

  adding: my_trained_model/ (stored 0%)
  adding: my_trained_model/config.json (deflated 53%)
  adding: my_trained_model/vocab.txt (deflated 56%)
  adding: my_trained_model/tokenizer_config.json (deflated 75%)
  adding: my_trained_model/tf_model.h5 (deflated 7%)
  adding: my_trained_model/tokenizer.json (deflated 72%)
  adding: my_trained_model/special_tokens_map.json (deflated 42%)


In [None]:
from google.colab import files
files.download('my_trained_model.zip')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [1]:
!git init

[33mhint: Using 'master' as the name for the initial branch. This default branch name[m
[33mhint: is subject to change. To configure the initial branch name to use in all[m
[33mhint: [m
[33mhint: 	git config --global init.defaultBranch <name>[m
[33mhint: [m
[33mhint: Names commonly chosen instead of 'master' are 'main', 'trunk' and[m
[33mhint: 'development'. The just-created branch can be renamed via this command:[m
[33mhint: [m
[33mhint: 	git branch -m <name>[m
Initialized empty Git repository in /content/.git/


In [3]:
!git add model_training.ipynb

fatal: pathspec 'model_training.ipynb' did not match any files
