In [None]:
import os
# Set before `import tensorflow` further down so the flag is already present
# in the environment when TensorFlow picks its Keras implementation.
os.environ['TF_USE_LEGACY_KERAS']='1'
# NOTE(review): Python reads PYTHONHASHSEED at interpreter start-up, so this
# assignment presumably only affects child processes, not the hashing of the
# already-running interpreter - confirm whether that is the intent.
os.environ['PYTHONHASHSEED']=str(42)
import fasttext
import numpy as np
from gensim.models import KeyedVectors
from kerastuner import HyperParameters
from kerastuner.tuners import BayesianOptimization
import tensorflow as tf
import keras
from tensorflow.keras.layers import Embedding, LSTM, Dense, TextVectorization, Bidirectional
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.metrics import confusion_matrix, classification_report
import re
import morfeusz2
import pandas as pd
from sklearn.model_selection import train_test_split
import random
import tensorflow_hub as hub
import tensorflow_text as text
from official.nlp import optimization
from IPython.display import clear_output
from datetime import timedelta
import time
# Seed every random number generator used in this file with the same value (42)
# before any model is built; the experiments below rely on these global RNGs.
random.seed(42)
np.random.seed(42)
tf.random.set_seed(42)
# NOTE(review): the Keras helper is documented to seed Python, NumPy and the
# backend at once, so the two calls below presumably overlap with the three
# calls above. They are kept for both `tf.keras` and the standalone `keras`.
tf.keras.utils.set_random_seed(42)
keras.utils.set_random_seed(42)

def clean_text(text, stop_words, morf):
    """Produce three progressively more processed variants of *text*.

    Args:
        text: Raw input string.
        stop_words: Container of words to remove from the lemmatised text
            (only used for membership tests).
        morf: Analyser object exposing ``analyse(word)``; the lemma is read
            from ``result[0][2][1]`` of its return value (assumed to be the
            Morfeusz2 result layout - TODO confirm).

    Returns:
        A three-element list:
        ``[normalised, lemmatised, lemmatised_without_stop_words]``.
    """
    # Collapse whitespace runs, drop everything that is neither a word
    # character nor whitespace, trim the ends and lower-case the result.
    normalized = re.sub(r'[^\w\s]', '', re.sub(r'[\s]+', ' ', text)).strip().lower()

    def lemma_of(token):
        # Take the first interpretation only; anything after the first ':'
        # in the lemma field is discarded. Unknown tokens are kept verbatim.
        interpretations = morf.analyse(token)
        if not interpretations:
            return token
        return interpretations[0][2][1].split(':')[0]

    lemmatized = " ".join(map(lemma_of, normalized.split()))
    # Re-split the joined lemmas so the filter sees whitespace-separated words.
    without_stop_words = " ".join(
        filter(lambda candidate: candidate not in stop_words, lemmatized.split())
    )
    return [normalized, lemmatized, without_stop_words]

def LSTM_Model(vectorizer, embedding, architecture, lstm_units, dense_units):
  """Assemble the binary text classifier used by the LSTM experiments.

  The layer stack is: ``vectorizer`` -> ``embedding`` -> (Bi)LSTM ->
  Dense(relu) -> Dense(1, sigmoid).

  Args:
    vectorizer: Layer placed first in the model (a ``TextVectorization``
      layer at the call sites in this file).
    embedding: Embedding layer placed after the vectorizer.
    architecture: ``'LSTM'`` for a unidirectional layer or ``'BILSTM'`` for
      a bidirectional wrapper around the same LSTM.
    lstm_units: Number of units of the LSTM layer.
    dense_units: Number of units of the hidden dense layer.

  Returns:
    The uncompiled ``Sequential`` model.

  Raises:
    ValueError: If ``architecture`` is not one of the supported names.
  """
  # Fail fast: previously an unknown name silently produced a model without
  # any recurrent layer, feeding the embedding sequence straight into Dense.
  if architecture not in ('LSTM', 'BILSTM'):
    raise ValueError(
      f"Unsupported architecture {architecture!r}; expected 'LSTM' or 'BILSTM'")

  model = Sequential()
  model.add(vectorizer)
  model.add(embedding)
  recurrent = LSTM(lstm_units)
  if architecture == 'BILSTM':
    recurrent = Bidirectional(recurrent)
  model.add(recurrent)
  model.add(Dense(dense_units, activation='relu'))
  model.add(Dense(1, activation='sigmoid'))
  return model

def create_embedding_matrix(embedding, vocabulary, embedding_model):
  """Build an initial embedding weight matrix for ``vocabulary``.

  Row ``i`` of the matrix holds the vector of ``vocabulary[i]``. Rows 0 and 1
  are left as zeros (the callers pass a ``TextVectorization`` vocabulary, whose
  first two entries are presumably the padding and unknown tokens - TODO
  confirm). Tokens the model cannot supply a vector for receive a random
  normal vector (std 0.6) drawn from NumPy's global RNG.

  Args:
    embedding: ``'Word2Vec'`` (model queried via ``in`` / ``[]`` and
      ``vector_size``) or ``'FastText'`` (model queried via
      ``get_word_vector`` and ``get_dimension``).
    vocabulary: Sequence of token strings.
    embedding_model: The loaded pretrained model matching ``embedding``.

  Returns:
    ``(matrix, dim)`` where ``matrix`` is a float32 array of shape
    ``(len(vocabulary), dim)``.

  Raises:
    ValueError: If ``embedding`` is not a supported name (previously this
      surfaced as an ``UnboundLocalError`` on the return statement).
  """
  if embedding == 'Word2Vec':
    dim = embedding_model.vector_size

    def lookup(token):
      # Out-of-vocabulary tokens have no stored vector.
      return embedding_model[token] if token in embedding_model else None
  elif embedding == 'FastText':
    dim = embedding_model.get_dimension()

    def lookup(token):
      # Any non-empty token is passed to the model; only an empty string
      # falls back to the random initialisation.
      return embedding_model.get_word_vector(token) if token else None
  else:
    raise ValueError(
      f"Unsupported embedding {embedding!r}; expected 'Word2Vec' or 'FastText'")

  matrix = np.zeros((len(vocabulary), dim), dtype='float32')
  for idx, token in enumerate(vocabulary):
    if idx <= 1:
      continue
    vector = lookup(token)
    if vector is None:
      vector = np.random.normal(scale=0.6, size=dim)
    matrix[idx] = vector
  return matrix, dim

def Model_Train(x_train, y_train, x_test, y_test, x_val, y_val, embedding, embedding_dim, architecture, lstm_units, dense_units, learning_rate, max_tokens, output_sequence_length, epochs, embedding_model = None):
  """Train one (Bi)LSTM classifier and evaluate it on the test split.

  Args:
    x_train, y_train: Training texts and binary labels.
    x_test, y_test: Test texts and labels used for the final report.
    x_val, y_val: Validation texts and labels used for early stopping.
    embedding: ``'None'`` (the string) for a randomly initialised embedding,
      or ``'Word2Vec'`` / ``'FastText'`` to initialise the embedding from
      ``embedding_model``.
    embedding_dim: Embedding width; only used when ``embedding == 'None'``,
      otherwise it is replaced by the pretrained model's dimension.
    architecture: ``'LSTM'`` or ``'BILSTM'`` (forwarded to ``LSTM_Model``).
    lstm_units, dense_units: Layer sizes forwarded to ``LSTM_Model``.
    learning_rate: Adam learning rate.
    max_tokens, output_sequence_length: ``TextVectorization`` settings.
    epochs: Upper bound on training epochs.
    embedding_model: Pretrained model; required unless ``embedding == 'None'``.

  Returns:
    Dict with keys ``'history'`` (Keras History), ``'report'`` (classification
    report as a DataFrame), ``'confusion_matrix'`` (DataFrame), ``'epochs'``
    (number of epochs actually run) and ``'time'`` (fit duration in seconds).

  Raises:
    ValueError: If ``embedding`` is not a supported name, or a pretrained
      embedding is requested without ``embedding_model``.
  """
  # Validate before any expensive work. An unknown name used to reach the
  # pretrained branch with `matrix` unbound and fail with a confusing error.
  if embedding not in ('None', 'Word2Vec', 'FastText'):
    raise ValueError(
      f"Unsupported embedding {embedding!r}; expected 'None', 'Word2Vec' or 'FastText'")
  if embedding != 'None' and embedding_model is None:
    raise ValueError(f"embedding={embedding!r} requires embedding_model")

  # The vocabulary is learned from the training texts only.
  vectorizer = TextVectorization(max_tokens=max_tokens, output_sequence_length=output_sequence_length)
  vectorizer.adapt(x_train)
  vocabulary = vectorizer.get_vocabulary()

  if embedding == 'None':
    embedding_layer = Embedding(input_dim=len(vocabulary), output_dim=embedding_dim, mask_zero=True)
  else:
    # The pretrained model dictates the embedding width.
    matrix, embedding_dim = create_embedding_matrix(embedding, vocabulary, embedding_model)
    embedding_layer = Embedding(input_dim=len(vocabulary),output_dim=embedding_dim, weights=[matrix], mask_zero=True, trainable=True)

  Model = LSTM_Model(vectorizer, embedding_layer, architecture, lstm_units, dense_units)
  Model.compile(optimizer=Adam(learning_rate=learning_rate),loss='binary_crossentropy', metrics=['accuracy'])
  # restore_best_weights=True: without it the model evaluated below carries the
  # weights of the last epoch, i.e. `patience` epochs past the best val_loss.
  early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
  start = time.time()
  history = Model.fit(x_train, y_train, epochs=epochs, validation_data=(x_val, y_val), callbacks=[early_stopping])
  train_time = time.time() - start

  # Threshold the sigmoid outputs at 0.5 to obtain class labels.
  y_pred = Model.predict(x_test, verbose=0).ravel()
  y_pred = (y_pred >= 0.5).astype(int)
  report = classification_report(y_test, y_pred, digits=4, output_dict=True)
  cm = confusion_matrix(y_test, y_pred)
  return {
    'history': history,
    'report': pd.DataFrame(report).T,
    'confusion_matrix': pd.DataFrame(cm),
    'epochs': len(history.history['loss']),
    'time': train_time
    }


def HyperparameterOptimization(x_train, y_train, x_test, y_test, X_val, y_val, max_trials, epochs, embedding_model):
  """Tune the Word2Vec + BiLSTM classifier with Bayesian optimisation.

  Searches over vocabulary size, sequence length, LSTM/dense widths and the
  learning rate, maximising validation accuracy, then evaluates the best
  model found on the test split.

  Args:
    x_train, y_train: Training texts and labels (also used to adapt the
      vectorizer inside every trial).
    x_test, y_test: Test texts and labels for the final report.
    X_val, y_val: Validation texts and labels used during the search.
    max_trials: Number of hyperparameter configurations to try.
    epochs: Upper bound on epochs per trial.
    embedding_model: Pretrained Word2Vec model used to initialise embeddings.

  Returns:
    Dict with keys ``'best_hyperparameters'``, ``'report'`` (DataFrame),
    ``'confusion_matrix'`` (DataFrame) and ``'time'`` (search duration in
    seconds).
  """
  def build_model(hp: HyperParameters):
    # The registration order of the hyperparameters is kept fixed.
    vocab_limit = hp.Int("max_tokens", 10000, 100000, step=10000)
    sequence_length = hp.Int("output_sequence_length", 10, 200, step=30)
    recurrent_width = hp.Int("lstm_units", 32, 512, step=64)
    hidden_width = hp.Int("dense_units", 32, 256, step=32)
    step_size = hp.Float("learning_rate", 1e-5, 1e-2, sampling="log")

    # The vocabulary depends on `max_tokens`, so it is re-adapted per trial.
    text_to_ids = keras.layers.TextVectorization(
      max_tokens=vocab_limit,
      output_sequence_length=sequence_length,
    )
    text_to_ids.adapt(x_train)
    tokens = text_to_ids.get_vocabulary()
    pretrained, width = create_embedding_matrix('Word2Vec', tokens, embedding_model)
    token_embedding = keras.layers.Embedding(
      input_dim=len(tokens),
      output_dim=width,
      weights=[pretrained],
      mask_zero=True,
      trainable=True,
    )

    network = keras.Sequential()
    network.add(text_to_ids)
    network.add(token_embedding)
    network.add(keras.layers.Bidirectional(keras.layers.LSTM(recurrent_width)))
    network.add(keras.layers.Dense(hidden_width, activation='relu'))
    network.add(keras.layers.Dense(1, activation='sigmoid'))
    network.compile(
      optimizer=keras.optimizers.Adam(learning_rate=step_size),
      loss=keras.losses.BinaryCrossentropy(),
      metrics=['accuracy'],
    )
    return network

  tuner = BayesianOptimization(
    build_model,
    objective="val_accuracy",
    max_trials=max_trials,
    seed = 42,
    directory="HyperparameterOptimization",
    project_name="HyperparameterOptimization"
  )

  # Only the search itself is timed; evaluation of the winner is excluded.
  started_at = time.time()
  tuner.search(
    x_train,
    y_train,
    epochs=epochs,
    validation_data=(X_val, y_val),
    verbose=1,
    callbacks=[EarlyStopping(monitor='val_loss', patience=5)],
  )
  elapsed = time.time() - started_at

  top_hyperparameters = tuner.get_best_hyperparameters(num_trials=1)[0]
  top_model = tuner.get_best_models(num_models=1)[0]

  # Threshold the sigmoid outputs at 0.5 to obtain class labels.
  probabilities = top_model.predict(x_test).ravel()
  predicted = (probabilities >= 0.5).astype(int)
  metrics_table = classification_report(y_test, predicted, digits=4, output_dict=True)
  confusion = confusion_matrix(y_test, predicted)
  return {
    'best_hyperparameters': top_hyperparameters,
    'report': pd.DataFrame(metrics_table).T,
    'confusion_matrix': pd.DataFrame(confusion),
    'time': elapsed
  }

def make_ds(texts, labels,BATCH_SIZE, AUTOTUNE, shuffle=False):
    """Wrap ``(texts, labels)`` in a batched, prefetched ``tf.data.Dataset``.

    Args:
        texts: Sequence of input examples (must support ``len``).
        labels: Labels aligned with ``texts``.
        BATCH_SIZE: Batch size passed to ``Dataset.batch``.
        AUTOTUNE: Buffer size passed to ``Dataset.prefetch``.
        shuffle: When true, shuffle with a buffer covering the whole dataset
            and a fixed seed of 42 before batching.

    Returns:
        The resulting dataset.
    """
    dataset = tf.data.Dataset.from_tensor_slices((texts, labels))
    # Shuffling happens before batching so individual examples are mixed.
    dataset = dataset.shuffle(buffer_size=len(texts), seed=42) if shuffle else dataset
    batched = dataset.batch(BATCH_SIZE)
    return batched.prefetch(AUTOTUNE)

def Bert(x_train, y_train, x_test, y_test, x_val, y_val, epochs):
  """Fine-tune multilingual cased BERT from TF Hub as a binary classifier.

  The model is: string input -> Hub preprocessing -> BERT encoder (trainable)
  -> pooled output -> Dropout(0.1) -> Dense(1) producing a logit. It is
  trained with AdamW (initial LR 3e-5, warm-up over the first 10% of
  ``steps_per_epoch * epochs`` steps) and binary cross-entropy on logits.

  Args:
    x_train, y_train: Training texts and binary labels.
    x_test, y_test: Test texts and labels used for the final report.
    x_val, y_val: Validation texts and labels used for early stopping.
    epochs: Upper bound on training epochs; also determines the length of
      the learning-rate schedule.

  Returns:
    Dict with keys ``'history'``, ``'report'`` (DataFrame),
    ``'confusion_matrix'`` (DataFrame), ``'epochs'`` (epochs actually run)
    and ``'time'`` (fit duration in seconds).
  """
  tf.get_logger().setLevel('ERROR')

  # --- model ---------------------------------------------------------------
  text_input = tf.keras.layers.Input(shape=(), dtype=tf.string, name='text')
  preprocessing_layer = hub.KerasLayer('https://tfhub.dev/tensorflow/bert_multi_cased_preprocess/3', name='preprocessing')
  encoder_inputs = preprocessing_layer(text_input)
  encoder = hub.KerasLayer('https://tfhub.dev/tensorflow/bert_multi_cased_L-12_H-768_A-12/3', trainable=True, name='BERT_encoder')
  # NOTE(review): training=False presumably keeps the encoder's internal
  # dropout disabled even while it is being fine-tuned - confirm this is
  # intended (the encoder weights are still trainable).
  outputs = encoder(encoder_inputs, training=False)
  net = outputs['pooled_output']
  net = tf.keras.layers.Dropout(0.1, seed = 42)(net)
  # No activation: the layer emits a logit, matched by from_logits=True below.
  net = tf.keras.layers.Dense(1, activation=None, name='classifier')(net)
  classifier_model = tf.keras.Model(text_input, net)

  # --- data ----------------------------------------------------------------
  AUTOTUNE   = tf.data.AUTOTUNE
  BATCH_SIZE = 32
  train_ds = make_ds(x_train, y_train, BATCH_SIZE, AUTOTUNE, shuffle=True)
  test_ds  = make_ds(x_test, y_test, BATCH_SIZE, AUTOTUNE,  shuffle=False)

  # --- optimizer -----------------------------------------------------------
  steps_per_epoch = tf.data.experimental.cardinality(train_ds).numpy()
  # NOTE(review): the schedule is sized for the full `epochs` budget; if early
  # stopping ends training much sooner, most steps may fall inside warm-up.
  num_train_steps = steps_per_epoch * epochs
  num_warmup_steps = int(0.1*num_train_steps)

  init_lr = 3e-5
  optimizer = optimization.create_optimizer(init_lr=init_lr,
                                            num_train_steps=num_train_steps,
                                            num_warmup_steps=num_warmup_steps,
                                            optimizer_type='adamw')
  loss = tf.keras.losses.BinaryCrossentropy(from_logits=True)
  metrics = tf.metrics.BinaryAccuracy()
  classifier_model.compile(optimizer=optimizer,loss=loss,metrics=metrics)

  # --- training ------------------------------------------------------------
  # restore_best_weights=True: without it the model evaluated below carries the
  # weights of the last epoch, i.e. `patience` epochs past the best val_loss.
  early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
  start = time.time()
  history = classifier_model.fit(x=train_ds, epochs=epochs, validation_data=(x_val, y_val), callbacks=[early_stopping])
  train_time = time.time() - start

  # --- evaluation ----------------------------------------------------------
  y_pred = classifier_model.predict(test_ds)
  # Squeeze only the trailing unit axis so that a single-example test set
  # still yields a 1-D label vector instead of a 0-d scalar.
  y_pred = tf.squeeze(tf.round(tf.sigmoid(y_pred)), axis=-1).numpy().astype(int)
  report = classification_report(y_test, y_pred, digits=4, output_dict=True)
  cm     = confusion_matrix(y_test, y_pred)
  return {
    'history': history,
    'report': pd.DataFrame(report).T,
    'confusion_matrix': pd.DataFrame(cm),
    'epochs': len(history.history['loss']),
    'time': train_time
    }


def main():
  """Run the eight classification experiments on BAN-PL and print a summary.

  Steps:
    1. Load ``BAN-PL.csv`` and derive three text variants per row with
       ``clean_text`` (normalised, lemmatised, lemmatised without stop words).
    2. Split 80/10/10 into train/validation/test, stratified by ``Class``.
    3. Run experiments 1-6 (LSTM variants), 7 (BERT) and 8 (hyperparameter
       search), displaying each report as soon as it is available.
    4. Clear the output and print every result again in one place.
  """
  # `display` is only injected as a builtin inside IPython/Jupyter sessions;
  # import it explicitly so the function also works outside a notebook.
  from IPython.display import display

  results = []

  def record(name, outcome):
    # Show the report immediately and keep the result for the final summary.
    display(outcome['report'])
    results.append((name, outcome))

  def train(splits, embedding, architecture, embedding_model=None):
    # All LSTM experiments share the same sizes and training settings:
    # embedding_dim=128, lstm_units=128, dense_units=64, lr=0.001,
    # max_tokens=30000, output_sequence_length=50, epochs=100.
    (x_tr, y_tr), (x_te, y_te), (x_va, y_va) = splits
    return Model_Train(x_tr, y_tr, x_te, y_te, x_va, y_va, embedding, 128, architecture, 128, 64, 0.001, 30000, 50, 100, embedding_model)

  def drop_empty(features, labels):
    # Remove rows whose stop-word-filtered text ended up missing or empty.
    empty = features.index[(features['cleaned_text2'].isnull()) | (features['cleaned_text2'] == '')]
    return features.drop(index=empty).copy(), labels.drop(index=empty).copy()

  # --- data preparation ----------------------------------------------------
  data = pd.read_csv("BAN-PL.csv", encoding="utf-8")
  text_columns = ['cleaned_text', 'cleaned_text1', 'cleaned_text2']
  with open('polish.stopwords.txt', 'r', encoding='utf-8') as file:
    stop_words = set(line.strip().lower() for line in file if line.strip())
  morf = morfeusz2.Morfeusz()
  cleaned = data["Text"].apply(lambda t: clean_text(t, stop_words, morf)).tolist()
  data[text_columns] = pd.DataFrame(cleaned, columns=text_columns, index=data.index)
  X = data[text_columns]
  y = data['Class']
  # First hold out 20%, then halve the hold-out into validation and test.
  X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
  X_val, X_test, y_val, y_test = train_test_split(X_test, y_test, test_size=0.5, random_state=42, stratify=y_test)

  def column_splits(column):
    return (
      (X_train[column].values, y_train),
      (X_test[column].values, y_test),
      (X_val[column].values, y_val),
    )

  # --- experiments 1-2: unfiltered text variants ---------------------------
  record('Eksperyment1', train(column_splits('cleaned_text'), 'None', 'LSTM'))
  record('Eksperyment2', train(column_splits('cleaned_text1'), 'None', 'LSTM'))

  # --- experiments 3-8: stop-word-filtered text, empty rows removed --------
  X_train_filtered, y_train_filtered = drop_empty(X_train, y_train)
  X_test_filtered, y_test_filtered = drop_empty(X_test, y_test)
  X_val_filtered, y_val_filtered = drop_empty(X_val, y_val)
  filtered = (
    (X_train_filtered['cleaned_text2'].values, y_train_filtered),
    (X_test_filtered['cleaned_text2'].values, y_test_filtered),
    (X_val_filtered['cleaned_text2'].values, y_val_filtered),
  )
  # Flattened as x_train, y_train, x_test, y_test, x_val, y_val.
  filtered_args = [item for pair in filtered for item in pair]

  record('Eksperyment3', train(filtered, 'None', 'LSTM'))

  w2v_model = KeyedVectors.load("w2v/word2vec_100_3_polish.bin")
  record('Eksperyment4', train(filtered, 'Word2Vec', 'LSTM', w2v_model))

  ft_model = fasttext.load_model("cc.pl.300.bin")
  res5 = train(filtered, 'FastText', 'LSTM', ft_model)
  # Drop the reference to the fastText model once it is no longer needed.
  ft_model = None
  record('Eksperyment5', res5)

  record('Eksperyment6', train(filtered, 'Word2Vec', 'BILSTM', w2v_model))
  record('Eksperyment7', Bert(*filtered_args, 100))
  record('Eksperyment8', HyperparameterOptimization(*filtered_args, 20, 100, w2v_model))

  # --- final summary -------------------------------------------------------
  clear_output(wait=True)
  for name, out in results:
    print(f"\n===== {name} =====")
    if name == 'Eksperyment8':
      print("Czas optymalizacji:", timedelta(seconds=out['time']), out['time'])
      print("Najlepsze hiperparametry:")
      for param in ["lstm_units", "dense_units", "learning_rate", "max_tokens", "output_sequence_length"]:
        print(f"{param}: {out['best_hyperparameters'].get(param)}")
    else:
      print("Czas treningu: ", timedelta(seconds=out['time']), out['time'])
      print("Liczba epok: ", out['epochs'])
    print("Raport:")
    display(out['report'])
    print("Confusion matrix:")
    display(out['confusion_matrix'])
# Run the full experiment pipeline only when this file is executed directly
# (or as a notebook cell), not when it is imported as a module.
if __name__=="__main__":
    main()



===== Eksperyment1 =====
Czas treningu:  0:01:37.180032 97.18003249168396
Liczba epok:  6
Raport:


Unnamed: 0,precision,recall,f1-score,support
0,0.827528,0.811667,0.81952,1200.0
1,0.815209,0.830833,0.822947,1200.0
accuracy,0.82125,0.82125,0.82125,0.82125
macro avg,0.821368,0.82125,0.821234,2400.0
weighted avg,0.821368,0.82125,0.821234,2400.0


Confusion matrix:


Unnamed: 0,0,1
0,974,226
1,203,997



===== Eksperyment2 =====
Czas treningu:  0:02:25.688815 145.68881511688232
Liczba epok:  6
Raport:


Unnamed: 0,precision,recall,f1-score,support
0,0.829805,0.849167,0.839374,1200.0
1,0.845563,0.825833,0.835582,1200.0
accuracy,0.8375,0.8375,0.8375,0.8375
macro avg,0.837684,0.8375,0.837478,2400.0
weighted avg,0.837684,0.8375,0.837478,2400.0


Confusion matrix:


Unnamed: 0,0,1
0,1019,181
1,209,991



===== Eksperyment3 =====
Czas treningu:  0:01:43.458985 103.45898485183716
Liczba epok:  6
Raport:


Unnamed: 0,precision,recall,f1-score,support
0,0.864086,0.8,0.830809,1200.0
1,0.813809,0.874167,0.842909,1200.0
accuracy,0.837083,0.837083,0.837083,0.837083
macro avg,0.838948,0.837083,0.836859,2400.0
weighted avg,0.838948,0.837083,0.836859,2400.0


Confusion matrix:


Unnamed: 0,0,1
0,960,240
1,151,1049



===== Eksperyment4 =====
Czas treningu:  0:02:26.530735 146.53073477745056
Liczba epok:  7
Raport:


Unnamed: 0,precision,recall,f1-score,support
0,0.84345,0.88,0.861338,1200.0
1,0.874564,0.836667,0.855196,1200.0
accuracy,0.858333,0.858333,0.858333,0.858333
macro avg,0.859007,0.858333,0.858267,2400.0
weighted avg,0.859007,0.858333,0.858267,2400.0


Confusion matrix:


Unnamed: 0,0,1
0,1056,144
1,196,1004



===== Eksperyment5 =====
Czas treningu:  0:06:54.434251 414.434250831604
Liczba epok:  6
Raport:


Unnamed: 0,precision,recall,f1-score,support
0,0.85025,0.851667,0.850958,1200.0
1,0.851419,0.85,0.850709,1200.0
accuracy,0.850833,0.850833,0.850833,0.850833
macro avg,0.850834,0.850833,0.850833,2400.0
weighted avg,0.850834,0.850833,0.850833,2400.0


Confusion matrix:


Unnamed: 0,0,1
0,1022,178
1,180,1020



===== Eksperyment6 =====
Czas treningu:  0:02:16.478016 136.47801613807678
Liczba epok:  6
Raport:


Unnamed: 0,precision,recall,f1-score,support
0,0.842723,0.8975,0.869249,1200.0
1,0.890374,0.8325,0.860465,1200.0
accuracy,0.865,0.865,0.865,0.865
macro avg,0.866549,0.865,0.864857,2400.0
weighted avg,0.866549,0.865,0.864857,2400.0


Confusion matrix:


Unnamed: 0,0,1
0,1077,123
1,201,999



===== Eksperyment7 =====
Czas treningu:  1:17:57.229126 4677.2291264534
Liczba epok:  8
Raport:


Unnamed: 0,precision,recall,f1-score,support
0,0.877036,0.8975,0.88715,1200.0
1,0.895051,0.874167,0.884486,1200.0
accuracy,0.885833,0.885833,0.885833,0.885833
macro avg,0.886044,0.885833,0.885818,2400.0
weighted avg,0.886044,0.885833,0.885818,2400.0


Confusion matrix:


Unnamed: 0,0,1
0,1077,123
1,151,1049



===== Eksperyment8 =====
Czas optymalizacji: 0:44:04.520423 2644.5204226970673
Najlepsze hiperparametry:
lstm_units: 224
dense_units: 32
learning_rate: 0.01
max_tokens: 40000
output_sequence_length: 160
Raport:


Unnamed: 0,precision,recall,f1-score,support
0,0.862969,0.881667,0.872218,1200.0
1,0.879046,0.86,0.869419,1200.0
accuracy,0.870833,0.870833,0.870833,0.870833
macro avg,0.871008,0.870833,0.870818,2400.0
weighted avg,0.871008,0.870833,0.870818,2400.0


Confusion matrix:


Unnamed: 0,0,1
0,1058,142
1,168,1032
