# Import libraries

In [1]:
import tensorflow as tf
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
import json

In [2]:
from google.colab import drive

# Expose Google Drive inside the Colab VM; every dataset and output path used
# by this notebook lives under this mount point.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


# Load the CSV and extract the text input feature and the target variable y

In [3]:
# Load the preprocessed training collection from Drive.
df = pd.read_csv("/content/drive/MyDrive/BDE/RNN/merged_yt_collection_preprocessedtxt.csv")

# pandas parses empty CSV fields as NaN (a float). A NaN inside the text array
# cannot be fed to a string vectorization layer, so blank documents are turned
# back into empty strings and every entry is coerced to str. Rows are kept
# (not dropped) so that `text` and `y` stay aligned element by element.
text = df["preprocessed_text"].fillna("").astype(str).to_numpy()

# Target labels, one per row of `text`.
y = df["moderationStatus"].to_numpy()

# Text encoder layer definition

In [4]:
# Prefer the stable location of the layer; fall back to the experimental alias
# for older TensorFlow releases that only ship it under that namespace. Newer
# releases no longer provide the experimental path at all.
try:
    TextVectorization = tf.keras.layers.TextVectorization
except AttributeError:
    TextVectorization = tf.keras.layers.experimental.preprocessing.TextVectorization

# Text-to-token-id layer with the library's default settings.
encoder = TextVectorization()

# NOTE(review): the vocabulary is learned from the whole `text` array, which
# includes the rows that the later train/test split holds out for evaluation.
encoder.adapt(text)

# Vocabulary size; used as the Embedding layer's `input_dim`.
len_voc = len(encoder.get_vocabulary())

# Model definition

In [5]:
# Width shared by the embedding vectors, the LSTM units and the hidden dense layer.
latent_dim = 32

# Build each layer up front, in the order it appears in the network.
embedding_layer = tf.keras.layers.Embedding(
    input_dim=len_voc,      # one embedding row per vocabulary entry
    output_dim=latent_dim,
    mask_zero=True,         # ask Keras to mask token id 0 in downstream layers
)
recurrent_layer = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(latent_dim))
hidden_layer = tf.keras.layers.Dense(latent_dim, activation='relu')
output_layer = tf.keras.layers.Dense(1, activation="sigmoid")  # single probability output

# Raw strings go in; the encoder tokenizes them before the embedding lookup.
model = tf.keras.Sequential([
    encoder,
    embedding_layer,
    recurrent_layer,
    hidden_layer,
    output_layer,
])

# Train and test split

In [6]:
# Fraction of the samples held out from training.
test_size = 0.2

# Stratify on the labels so both partitions keep the same class balance;
# the fixed random_state makes the partition repeatable.
split_options = dict(test_size=test_size, random_state=42, stratify=y)
X_train, X_test, Y_Train, Y_test = train_test_split(text, y, **split_options)

# Compile and train the model

In [None]:
# Training hyper-parameters.
epochs = 5
batch_size = 32

# Binary cross-entropy matches the single sigmoid output of the model.
loss_fn = tf.keras.losses.BinaryCrossentropy()
tracked_metrics = ["accuracy", tf.keras.metrics.Recall(), tf.keras.metrics.Precision()]

model.compile(optimizer='adam', loss=loss_fn, metrics=tracked_metrics)

# The held-out split doubles as validation data, so the per-epoch validation
# metrics are computed on the same rows that are later scored with the F1 metric.
model.fit(
    x=X_train,
    y=Y_Train,
    epochs=epochs,
    validation_data=(X_test, Y_test),
    batch_size=batch_size,
)

Epoch 1/5

# Evaluating the model

In [24]:
# Sigmoid outputs of the model for the held-out texts.
pred = model.predict(X_test)

# Turn the outputs into hard 0/1 labels (stored as float32) with a 0.5 cut-off.
y_pred = (pred >= 0.5).astype(np.float32)

# Macro averaging weights both classes equally regardless of their frequency.
macro_f1_score = f1_score(Y_test, y_pred, average="macro")
print(f"Macro f1-score: {macro_f1_score}")

Macro f1-score: 0.6497354506249216


# Save model

In [29]:
# The achieved macro F1 score is embedded in the file name so that different
# training runs do not overwrite each other's weights.
path_weights = (
    f"/content/drive/MyDrive/BDE/Models/weights_RNN_{macro_f1_score}.tf"
)
model.save_weights(filepath=path_weights)

# Predict on test set

In [30]:
# Locations of the test collection (JSON) and of the CSV that receives predictions.
path_test = "/content/drive/MyDrive/BDE/Filtered_collections/y_test_collection.json"
csv_test_path = "/content/drive/MyDrive/BDE/Predictions/y_test.csv"

# Parse the JSON test collection into Python objects.
with open(path_test) as test_file:
    test_json = json.loads(test_file.read())

# Frame whose "moderationStatus" column is filled in with the model's predictions.
df_test = pd.read_csv(csv_test_path)

In [31]:
# Load the preprocessed text of the test collection.
df_test_text = pd.read_csv("/content/drive/MyDrive/BDE/RNN/merged_yt_collection_test_preprocessedtxt.csv")

# Empty CSV fields are parsed as NaN (a float), which a string vectorization
# layer cannot consume. Replace them with empty strings instead of dropping
# the rows, so that the predictions keep the same length and order as the
# rows of the test file.
text_test = df_test_text["preprocessed_text"].fillna("").astype(str).to_numpy()

In [32]:
# Sigmoid outputs of the model for the test texts.
pred = model.predict(text_test)

# Hard 0/1 labels (stored as float32) using a 0.5 decision threshold.
y_pred = (pred >= 0.5).astype(np.float32)



In [33]:
# Map the numeric predictions to the label strings used in the output file:
# any non-zero prediction is "moderated", zero is "not moderated".
df_test["moderationStatus"] = np.where(y_pred != 0, "moderated", "not moderated")

# The validation macro F1 (10 decimals) is part of the file name so each run
# writes its own prediction file.
df_test.to_csv(
    f"/content/drive/MyDrive/BDE/Predictions/RNN_model_{macro_f1_score:.10f}_y.csv",
    index=False,
)