In [None]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import tensorflow as tf
import pandas as pd
import re
import string
import nltk
import numpy as np

In [None]:
# Locations of the pre-processed train/test CSVs, relative to this notebook.
train_csv_path = "../BERT_FineTune/processed_with_extra_remove_train.csv"
test_csv_path = "../BERT_FineTune/processed_with_extra_remove_test.csv"

# Load both splits into DataFrames.
train = pd.read_csv(train_csv_path)
test = pd.read_csv(test_csv_path)

In [None]:
# Fetch NLTK's stop-word corpus so the Romanian list below can be loaded.
nltk.download('stopwords')

# Keep the stop words in a set: clean_text tests membership once per token.
romanian_stopword_list = nltk.corpus.stopwords.words('romanian')
stop_words = set(romanian_stopword_list)

def clean_text(text, remove_numbers=True, remove_stopwords=True):
    """Normalise one text string.

    Steps, in order: lowercase, delete every character in
    ``string.punctuation``, optionally delete digit runs, collapse
    whitespace runs to a single space and trim the ends, optionally drop
    tokens found in the module-level ``stop_words`` set.

    Args:
        text: the string to clean.
        remove_numbers: when True, runs of digits are deleted.
        remove_stopwords: when True, whitespace-separated tokens present in
            ``stop_words`` are removed.

    Returns:
        The cleaned string.
    """
    punctuation_pattern = f"[{re.escape(string.punctuation)}]"
    cleaned = re.sub(punctuation_pattern, "", text.lower())

    if remove_numbers:
        cleaned = re.sub(r"\d+", "", cleaned)

    # Punctuation/digit removal can leave doubled or edge spaces behind.
    cleaned = re.sub(r"\s+", " ", cleaned).strip()

    if not remove_stopwords:
        return cleaned

    kept_tokens = (token for token in cleaned.split() if token not in stop_words)
    return " ".join(kept_tokens)

# Clean the text column of both frames with clean_text's default options.
# Missing values (NaN/None) are replaced by an empty string first: casting
# them with str() would produce the literal word "nan"/"None", which would
# survive cleaning and be treated as a real token downstream.
train["text"] = train["text"].fillna("").astype(str).apply(clean_text)
test["text"] = test["text"].fillna("").astype(str).apply(clean_text)


In [None]:
# --- Split and vectorisation settings used by the cells below ---
training_size = 65_000   # rows [0, training_size) are fitted on; the rest are held out
vocab_size = 15_000      # Tokenizer num_words and Embedding input dimension
max_length = 100         # every sequence is padded/truncated to this many tokens
embedding_size = 512     # Embedding output dimension
padding_type = trunc_type = 'post'   # pad and truncate at the end of a sequence
oov_tkn = '<OOV>'        # token the Tokenizer uses for out-of-vocabulary words

In [None]:
# Extract the text column and the target column of the training frame as
# plain Python lists (in that order).
sentences, labels = (train[column].to_list() for column in ("text", "class"))

In [None]:
# Sequential hold-out split: the first `training_size` rows are used for
# fitting, everything after them is kept aside as the validation slice.
training_sentences, testing_sentences = sentences[:training_size], sentences[training_size:]
training_labels, testing_labels = labels[:training_size], labels[training_size:]

# Cell output: (number of training sentences, number of held-out labels).
len(training_sentences), len(testing_labels)

In [None]:

# Convert the four Python lists produced by the split into NumPy arrays.
training_sentences, testing_sentences = (
    np.array(texts) for texts in (training_sentences, testing_sentences)
)
training_labels, testing_labels = (
    np.array(targets) for targets in (training_labels, testing_labels)
)

In [None]:
# Build the word index from the training slice only, so the held-out texts
# play no part in the vocabulary.
tokenizer = Tokenizer(
    num_words=vocab_size,
    oov_token=oov_tkn,
    lower=True,
)
tokenizer.fit_on_texts(training_sentences)

In [None]:
# Encode the training texts as integer-id sequences, then pad/truncate each
# one to `max_length` using the configured padding and truncation sides.
training_sequences = tokenizer.texts_to_sequences(training_sentences)
training_padded = pad_sequences(
    training_sequences,
    maxlen=max_length,
    padding=padding_type,
    truncating=trunc_type,
)

In [None]:
# Encode the held-out texts with the tokenizer fitted on the training slice
# and pad/truncate them with the same settings as the training sequences.
testing_sequences = tokenizer.texts_to_sequences(testing_sentences)
testing_padded = pad_sequences(
    testing_sequences,
    maxlen=max_length,
    padding=padding_type,
    truncating=trunc_type,
)

### LSTM Implementation

In [None]:
# Binary text classifier:
# embedding -> bidirectional LSTM(128) -> dropout -> dense(32, relu)
# -> dropout -> single sigmoid output unit.
classifier_layers = [
    tf.keras.layers.Embedding(vocab_size, embedding_size, input_length=max_length),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(128)),
    tf.keras.layers.Dropout(0.6),
    tf.keras.layers.Dense(32, activation='relu'),
    tf.keras.layers.Dropout(0.4),
    tf.keras.layers.Dense(1, activation='sigmoid'),
]
model = tf.keras.models.Sequential(classifier_layers)

# A single sigmoid output is paired with binary cross-entropy; accuracy is
# the only metric tracked.
adam_optimizer = tf.keras.optimizers.Adam(3e-3)
model.compile(
    optimizer=adam_optimizer,
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [None]:
num_epochs = 15

# The validation pair is the held-out slice of the training frame
# (testing_*), not the separate `test` frame loaded from disk.
validation_pair = (testing_padded, testing_labels)

history = model.fit(
    training_padded,
    training_labels,
    batch_size=1500,
    epochs=num_epochs,
    validation_data=validation_pair,
    verbose=2,
)

In [None]:
# Persist the fitted model to a file next to the notebook.
saved_model_path = 'sarcasm_detecting_lstm.keras'
model.save(saved_model_path)

In [None]:
def predict_sarcasm(sequences, threshold=0.5):
    """Label texts with the fitted model.

    The texts are encoded with the module-level ``tokenizer``, padded and
    truncated to ``max_length`` with the same settings used for the training
    data, scored by ``model``, and thresholded.

    Args:
        sequences: texts to classify, as accepted by
            ``tokenizer.texts_to_sequences``.
        threshold: a score greater than or equal to this value is labelled 1,
            anything lower is labelled 0.

    Returns:
        A list of int labels (1 or 0), one per row of the model's output.
    """
    encoded = tokenizer.texts_to_sequences(sequences)
    padded = pad_sequences(encoded, maxlen=max_length, padding=padding_type, truncating=trunc_type)
    scores = model.predict(padded, verbose=0)
    return [1 if score >= threshold else 0 for score in scores]

In [None]:
# Label every row of the test frame's text column with a 0.5 cut-off.
new_sentences = test["text"].astype(str).to_list()
p = predict_sarcasm(new_sentences, threshold=0.5)

In [None]:
# Write the predictions as a two-column CSV: the row position as `id` and the
# label as the text "True" (prediction == 1) or "False" (anything else).
# The context manager closes (and therefore flushes) the file even if a write
# raises part-way through; the previous manual open()/close() pair would have
# leaked the handle in that case.
with open("ans_LSTM7.csv", 'w') as f:
    f.write("id,class\n")
    for i, pp in enumerate(p):
        f.write(f"{i},{'True' if pp == 1 else 'False'}\n")