In [None]:
# Mount Google Drive at /content/drive; the CSV and model paths used in this
# notebook all sit under that mount point.
from google.colab import drive

drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)

Mounted at /content/drive


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelBinarizer
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, GRU, Dense

In [None]:
# Read the training CSV from the mounted Drive folder into a DataFrame.
train_csv_path = '/content/drive/MyDrive/FYP 2024/DATASET/women_final.csv'
df = pd.read_csv(train_csv_path)

In [None]:
# Targets: every column that remains once 'id' and 'text' are dropped.
# NOTE(review): presumably these are 0/1 label columns -- confirm against the CSV.
label_frame = df.drop(columns=['id', 'text'])
y = label_frame.values

# Model input: the raw strings from the 'text' column.
X = df['text'].values

In [None]:
# Hold out 20% of the rows; the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Fit the binarizer on the training labels only, then encode them with it.
# The fitted object is reused later to encode the external test labels.
label_binarizer = LabelBinarizer()
label_binarizer.fit(y_train)
y_train_binary = label_binarizer.transform(y_train)

In [None]:
# Build the vocabulary from the training texts only, then map both splits
# to integer sequences with that same vocabulary.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)
X_train_tokenized, X_test_tokenized = (
    tokenizer.texts_to_sequences(texts) for texts in (X_train, X_test)
)

In [None]:
max_length = 100  # fixed sequence length fed to the model

# Both splits share the same settings: zeros are appended ('post') until each
# sequence has max_length entries.
padding_options = {'maxlen': max_length, 'padding': 'post'}
X_train_padded = pad_sequences(X_train_tokenized, **padding_options)
X_test_padded = pad_sequences(X_test_tokenized, **padding_options)

In [None]:
embedding_dim = 50  # size of each learned word vector

# Tokenizer word indices start at 1, so +1 leaves index 0 free for padding.
vocab_size = len(tokenizer.word_index) + 1

model = Sequential()
# mask_zero=True marks index 0 (the value pad_sequences fills with) as padding,
# so the GRU skips those timesteps. Without it, post-padded inputs make the GRU
# run over the trailing zeros and its final state is dominated by padding.
# The deprecated `input_length` argument is dropped: Keras 3 warns on it and
# the layer infers the sequence length from the data.
model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim, mask_zero=True))
# Single recurrent layer; dropout is applied to both inputs and recurrent state.
model.add(GRU(units=128, dropout=0.2, recurrent_dropout=0.2))
# One sigmoid unit per column of the binarized training labels.
model.add(Dense(units=y_train_binary.shape[1], activation='sigmoid'))

In [None]:
# Binary cross-entropy pairs with the sigmoid output layer; accuracy is
# tracked as the reported metric.
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [None]:
# Train for 10 epochs in batches of 32, holding back 10% of the training
# data for validation; the returned History object is kept for inspection.
history = model.fit(
    x=X_train_padded,
    y=y_train_binary,
    batch_size=32,
    epochs=10,
    validation_split=0.1,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [None]:
# Score the model on the padded training data itself. evaluate() returns the
# loss followed by the compiled metric.
train_metrics = model.evaluate(X_train_padded, y_train_binary)
train_loss, train_accuracy = train_metrics
print('Train Accuracy:', train_accuracy)

Train Accuracy: 0.6633184552192688


In [None]:
# Read the separate test CSV from Drive.
test_csv_path = '/content/drive/MyDrive/FYP 2024/DATASET/women_test.csv'
test_df = pd.read_csv(test_csv_path)

# Split it the same way as the training frame: 'text' is the input and the
# columns other than 'id' and 'text' are the labels.
y_test_data = test_df.drop(columns=['id', 'text']).values
X_test_data = test_df['text'].values

# Reuse the tokenizer fitted on the training texts and the same padding length.
X_test_data_tokenized = tokenizer.texts_to_sequences(X_test_data)
X_test_data_padded = pad_sequences(
    X_test_data_tokenized,
    maxlen=max_length,
    padding='post',
)

# Encode the labels with the already-fitted binarizer, then evaluate.
y_test_data_binary = label_binarizer.transform(y_test_data)
test_loss, test_accuracy = model.evaluate(X_test_data_padded, y_test_data_binary)
print('Test Accuracy:', test_accuracy)

Test Accuracy: 0.5565550923347473


In [None]:
from tensorflow.keras.models import save_model

# Save the trained model to Drive.
# The file name must end in '.keras' (native format) or '.h5': Keras 3 raises
# ValueError from model.save() for any other extension. The previous '.pth'
# suffix is a PyTorch convention and is not a format Keras writes.
model.save('/content/drive/MyDrive/FYP 2024/final/GRU_ILSI(2).keras')
