In [1]:
!pip install --upgrade transformers tensorflow

Collecting tensorflow
  Downloading tensorflow-2.19.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.1 kB)
Collecting tensorboard~=2.19.0 (from tensorflow)
  Downloading tensorboard-2.19.0-py3-none-any.whl.metadata (1.8 kB)
Collecting ml-dtypes<1.0.0,>=0.5.1 (from tensorflow)
  Downloading ml_dtypes-0.5.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (21 kB)
Downloading tensorflow-2.19.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (644.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m644.9/644.9 MB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading ml_dtypes-0.5.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (4.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.7/4.7 MB[0m [31m32.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading tensorboard-2.19.0-py3-none-any.whl (5.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.5/5.5 MB[0m [31m64.5 MB/s

In [2]:
import os
import pickle

import numpy as np
import pandas as pd
import tensorflow as tf
from google.colab import drive
from tensorflow.keras.layers import Input, Dense, Dropout, Concatenate, Embedding, LSTM
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from transformers import BertTokenizer

In [3]:
# Attach Google Drive to the Colab filesystem; both CSVs below are read from it.
drive.mount('/content/drive')

# Locations of the train and test CSV files on the mounted Drive.
train_csv_path = "/content/drive/MyDrive/IELTS-writing-task-2/datasets/train.csv"
test_csv_path = "/content/drive/MyDrive/IELTS-writing-task-2/datasets/test.csv"

# Load each file into its own DataFrame (train first, then test).
train_data, test_data = (
    pd.read_csv(csv_path) for csv_path in (train_csv_path, test_csv_path)
)

Mounted at /content/drive


In [4]:
# Parameters
MAX_LEN = 512      # every tokenized text is padded/truncated to this many token ids
NUM_CLASSES = 19   # width of each one-hot label vector and of each softmax head
                   # NOTE(review): presumably IELTS bands 0-9 in 0.5 steps -- confirm
EMBED_DIM = 128    # size of the learned token embedding vectors
BATCH_SIZE = 16
EPOCHS = 10


# Input texts. Each variable reads the column it is named after: the essay
# text comes from "essay" and the task prompt from "prompt". (These two column
# names were previously swapped, so the "essay" variable held the prompts.)
X_essay = train_data.loc[:, "essay"].values
X_prompt = train_data.loc[:, "prompt"].values

# One target column per scoring criterion.
# The last column name is spelled exactly as it is read here ("Grammatica_...").
y_ta = train_data.loc[:, "Task_Achievement"].values
y_cc = train_data.loc[:, "Coherence_and_Cohesion"].values
y_lr = train_data.loc[:, "Lexical_Resource"].values
y_gra = train_data.loc[:, "Grammatica_Range_and_Accuracy"].values

# Convert labels to one-hot encoded format.
# to_categorical treats its input as integer class indices in [0, NUM_CLASSES).
# NOTE(review): confirm the four columns already hold such indices and not raw
# band scores (e.g. 6.5), which would not map to distinct classes.
y_ta = tf.keras.utils.to_categorical(y_ta, NUM_CLASSES)
y_cc = tf.keras.utils.to_categorical(y_cc, NUM_CLASSES)
y_lr = tf.keras.utils.to_categorical(y_lr, NUM_CLASSES)
y_gra = tf.keras.utils.to_categorical(y_gra, NUM_CLASSES)




In [5]:
# Load the pretrained 'bert-base-uncased' tokenizer.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Both text arrays are encoded with the same settings: pad up to MAX_LEN and
# truncate anything longer, so every row ends up with exactly MAX_LEN ids.
encode_options = dict(padding='max_length', max_length=MAX_LEN, truncation=True)

encoded_essays = tokenizer(X_essay.tolist(), **encode_options)
encoded_prompts = tokenizer(X_prompt.tolist(), **encode_options)

# Keep only the token ids, as 2-D integer arrays (one row per text).
padded_sequences_1 = np.array(encoded_essays['input_ids'])
padded_sequences_2 = np.array(encoded_prompts['input_ids'])

# The embedding layer needs one row per token id the tokenizer can produce.
VOCAB_SIZE = tokenizer.vocab_size


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [6]:
# Option 1: Simple LSTM-based model with custom embeddings
def build_lstm_model():
    """Build the two-input, four-output LSTM classifier (uncompiled).

    Two token-id sequences of length ``MAX_LEN`` (``essay_input`` and
    ``prompt_input``) are passed through one shared ``Embedding`` layer and one
    shared ``LSTM`` layer. The two resulting vectors are concatenated, sent
    through dropout -> Dense(256, relu) -> dropout, and then fed to four
    independent softmax heads of width ``NUM_CLASSES``.

    Reads the module-level constants ``MAX_LEN``, ``VOCAB_SIZE``,
    ``EMBED_DIM`` and ``NUM_CLASSES`` at call time.

    Returns:
        A Keras ``Model`` with inputs ``[essay_input, prompt_input]`` and
        outputs ``[task_achievement, coherence_cohesion, lexical_resource,
        grammatical_range]``, in that order.
    """
    # Inputs
    essay_input = Input(shape=(MAX_LEN,), name='essay_input')
    prompt_input = Input(shape=(MAX_LEN,), name='prompt_input')

    # Embedding layer, shared by both inputs (same weights for essay and prompt).
    # `input_length` is deliberately not passed: it is deprecated in Keras 3 and
    # the sequence length is already fixed by the Input shapes above.
    # NOTE(review): mask_zero is left at its default, so padded positions are
    # not masked and the LSTM also consumes them -- confirm this is intended.
    embedding_layer = Embedding(VOCAB_SIZE, EMBED_DIM)
    essay_embed = embedding_layer(essay_input)
    prompt_embed = embedding_layer(prompt_input)

    # LSTM layer, also shared; return_sequences=False keeps only the final
    # output vector of each sequence.
    lstm_layer = LSTM(128, return_sequences=False)
    essay_lstm = lstm_layer(essay_embed)
    prompt_lstm = lstm_layer(prompt_embed)

    # Concatenate features
    combined = Concatenate()([essay_lstm, prompt_lstm])
    combined = Dropout(0.3)(combined)
    combined = Dense(256, activation='relu')(combined)
    combined = Dropout(0.3)(combined)

    # Output layers (one for each criterion)
    out_ta = Dense(NUM_CLASSES, activation='softmax', name='task_achievement')(combined)
    out_cc = Dense(NUM_CLASSES, activation='softmax', name='coherence_cohesion')(combined)
    out_lr = Dense(NUM_CLASSES, activation='softmax', name='lexical_resource')(combined)
    out_gra = Dense(NUM_CLASSES, activation='softmax', name='grammatical_range')(combined)

    # Build model
    model = Model(inputs=[essay_input, prompt_input],
                  outputs=[out_ta, out_cc, out_lr, out_gra])
    return model

In [7]:
# Build, compile, train and save the LSTM model returned by build_lstm_model().

# Folder that receives the trained model and the tokenizer. It is created up
# front so that a missing directory cannot make the save step fail after the
# whole training run has already been paid for.
MODEL_DIR = '/content/drive/MyDrive/IELTS-writing-task-2/models'
os.makedirs(MODEL_DIR, exist_ok=True)

model = build_lstm_model()
# One loss string applied to every output; the metrics list has four entries,
# matching the model's four outputs.
model.compile(optimizer=Adam(learning_rate=2e-5),
              loss='categorical_crossentropy',
              metrics=['accuracy','accuracy','accuracy','accuracy'])

# Print model summary
model.summary()

# Inputs and targets are passed in the same order as the model's inputs and
# outputs; 20% of the rows are held out for validation via validation_split.
history = model.fit(
    x=[padded_sequences_1,padded_sequences_2],
    y=[y_ta, y_cc,  y_lr,  y_gra],
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    validation_split=0.2
)

# Save model
model.save(os.path.join(MODEL_DIR, 'ielts_scoring_model_02.keras'))
# NOTE(review): the tokenizer is stored with pickle; only unpickle this file
# from a trusted location. `tokenizer.save_pretrained(...)` would be the
# library-native alternative if downstream loaders can be changed.
with open(os.path.join(MODEL_DIR, 'tokenizer_02.pickle'), 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)



Epoch 1/10
[1m491/491[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m33s[0m 53ms/step - coherence_cohesion_accuracy: 0.1296 - coherence_cohesion_loss: 2.7143 - grammatical_range_accuracy: 0.1469 - grammatical_range_loss: 2.7326 - lexical_resource_accuracy: 0.2695 - lexical_resource_loss: 2.6485 - loss: 10.7743 - task_achievement_accuracy: 0.1630 - task_achievement_loss: 2.6789 - val_coherence_cohesion_accuracy: 0.2923 - val_coherence_cohesion_loss: 1.9479 - val_grammatical_range_accuracy: 0.3457 - val_grammatical_range_loss: 1.9053 - val_lexical_resource_accuracy: 0.3473 - val_lexical_resource_loss: 1.8374 - val_loss: 7.4991 - val_task_achievement_accuracy: 0.3350 - val_task_achievement_loss: 1.8090
Epoch 2/10
[1m491/491[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m38s[0m 53ms/step - coherence_cohesion_accuracy: 0.2369 - coherence_cohesion_loss: 2.0451 - grammatical_range_accuracy: 0.3032 - grammatical_range_loss: 1.9960 - lexical_resource_accuracy: 0.3125 - lexical_resource_loss

In [8]:
# Score the fitted model on the same arrays that were passed to fit(), i.e.
# data derived from train_data -- this is not a held-out evaluation.
# NOTE(review): test_data is loaded earlier but is not used by this call.
eval_inputs = [padded_sequences_1, padded_sequences_2]
eval_targets = [y_ta, y_cc, y_lr, y_gra]
model.evaluate(eval_inputs, eval_targets)

[1m307/307[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 19ms/step - coherence_cohesion_accuracy: 0.2980 - coherence_cohesion_loss: 1.8566 - grammatical_range_accuracy: 0.3582 - grammatical_range_loss: 1.7921 - lexical_resource_accuracy: 0.3591 - lexical_resource_loss: 1.7514 - loss: 7.1341 - task_achievement_accuracy: 0.3495 - task_achievement_loss: 1.7340


[7.162295818328857,
 1.7400093078613281,
 1.8676778078079224,
 1.7587801218032837,
 1.7962185144424438,
 0.29347050189971924,
 0.36100640892982483,
 0.3604970872402191,
 0.34572678804397583]