In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
# The joined path used to be built and thrown away, so nothing was ever listed.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Preprocessing data for generating IELTS Writing Task 2 sample essays
## Step 1: Read the CSV files of the 3 different datasets
## Step 2: Clean each of them for NaN or inconsistent data (average of the criterion scores differing from the band)
## Step 3: Make column names and features consistent, then combine them all

In [None]:
human_dataset = pd.read_csv('/kaggle/input/engnovate-ielts/human_rated_ielts_buddies.csv')

In [None]:
human_dataset

In [None]:
human_dataset.shape
# 17 lines, 11 cols => cannot used to be generating a good evaluation

In [None]:
human_dataset = human_dataset[['Topic', 'Essay', 'Overall Score (Band)', 'Task Response Score', 'Coherence & Cohesion Score', 'Lexical Resource Score', 'Grammatical Range and Accuracy Score']]

In [None]:
human_dataset

In [None]:
human_dataset = human_dataset.rename(columns = {'Overall Score (Band)': 'Band', 'Task Response Score': 'TR', 'Coherence & Cohesion Score': 'CC', 'Lexical Resource Score': 'LR', 'Grammatical Range and Accuracy Score': 'GR'})

In [None]:
human_dataset

In [None]:
augmented_dataset = pd.read_csv('/kaggle/input/engnovate-ielts/Augmented_Dataset.csv')

In [None]:
augmented_dataset

In [None]:
augmented_dataset['Band'] = augmented_dataset[['Task Response', 'Coherence and Cohesion', 'Lexical Resource', 'Grammatical Range and Accuracy']].mean(axis = 1)

In [None]:
def ielts_round(score):
    """Round a raw average onto the IELTS half-band scale.

    Fractions below .25 round down to the whole band, fractions from .25 up to
    (but not including) .75 become the half band, and fractions of .75 or more
    round up to the next whole band.

    Args:
        score: Numeric average score.

    Returns:
        The rounded band, always as a float. NaN is returned unchanged so that
        rows with missing scores pass through ``Series.apply`` instead of
        raising ``ValueError`` from ``int(nan)``.
    """
    import math  # local import keeps this cell runnable on its own

    if math.isnan(score):
        return float('nan')

    # floor (not int truncation) keeps the fraction in [0, 1) for any sign.
    whole = math.floor(score)
    fraction = score - whole
    if fraction < 0.25:
        return float(whole)
    if fraction < 0.75:
        return whole + 0.5
    return whole + 1.0

In [None]:
augmented_dataset

In [None]:
augmented_dataset['Band']  = augmented_dataset['Band'].apply(ielts_round)

In [None]:
augmented_dataset = augmented_dataset.rename(columns = {'prompt': 'Topic', 'essay': 'Essay', 'Task Response': 'TR', 'Coherence and Cohesion': 'CC', 'Lexical Resource': 'LR', 'Grammatical Range and Accuracy': 'GR'})

In [None]:
augmented_dataset

In [None]:
augmented_dataset = augmented_dataset.drop(columns = ['prompt_id'])
augmented_dataset

In [None]:
new_version_dataset = pd.read_csv('/kaggle/input/engnovate-ielts/writing9_2025edition.csv')

In [None]:
new_version_dataset

In [None]:
new_version_dataset = new_version_dataset.rename(columns = {'topic': 'Topic', 'essay': 'Essay', 'Task Response': 'TR', 'Coherence and Cohesion': 'CC', 'Lexical Resource': 'LR', 'Grammatical Range and Accuracy':'GR'})

In [None]:
new_version_dataset

In [None]:
# combined dataset now
df = pd.concat([human_dataset, augmented_dataset, new_version_dataset], ignore_index = True)


In [None]:
df

In [None]:
# check for inconsistency
df['computed_avg'] = df[['TR', 'CC', 'LR', 'GR']].mean(axis=1).apply(ielts_round)
df['is_consistency'] = ((df['computed_avg'] - df['Band']).abs() <= 1)

In [None]:
df = df[df['is_consistency']] 

In [None]:
df

In [None]:
# drop 2 columns used to check consistency
df = df.drop(columns = ['computed_avg', 'is_consistency'])

In [None]:
df

# 2. Split the data into 3 new custom datasets:
- Generating IELTS essays
- Predicting the overall band score
- Predicting the detailed (per-criterion) scores


In [None]:
df_for_generating_essay = df[['Topic', 'Essay', 'Band']]
df_for_evaluating_single_score = df_for_generating_essay
df_for_evaluating_detail_score = df[['Topic', 'Essay', 'TR', 'CC', 'LR', 'GR']]


In [None]:
df_for_generating_essay
df_for_generating_essay.to_csv('IELTS-gen_dataset.csv')

In [None]:
df_for_evaluating_single_score
df_for_evaluating_single_score.to_csv('IELTS-predictBand_dataset.csv')

In [None]:
df_for_evaluating_detail_score
df_for_evaluating_detail_score.to_csv('IELTS-predictDetailed_dataset.csv')

# Generative AI model for generating essays  

In [None]:
df_for_generating_essay

In [None]:
average_band_score_dataset = df_for_generating_essay['Band'].mean()
average_band_score_dataset

In [None]:
# we will use gpt2-neo instruction 125M parameter + customed eos tokenizer for better stopping
from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained("uoe-nlp/gpt-neo-125m_instruction-tuned_sni")
model = AutoModelForCausalLM.from_pretrained("uoe-nlp/gpt-neo-125m_instruction-tuned_sni")

In [None]:
tokenizer.add_tokens(['[END]'])
model.resize_token_embeddings(len(tokenizer))
tokenizer.pad_token = tokenizer.eos_token
model.config.pad_token_id = model.config.eos_token_id
print("Load model successfully")
# embedding of gpt: word_size = 50258,embed_dim = 768

In [None]:
# Divide data into train / test / val = 0.7 / 0.2 / 0.1
from sklearn.model_selection import train_test_split

# Hold out 20% of all rows for testing.
train_df, test_df = train_test_split(df_for_generating_essay, test_size=0.2, random_state=42)
# Validation is carved out of the remaining 80%, so 0.125 * 0.8 = 0.1 of the
# full data. test_size=0.1 here produced a 0.72 / 0.2 / 0.08 split instead.
train_df, val_df = train_test_split(train_df, test_size=0.125, random_state=42)

In [None]:
len(train_df)

In [None]:
len(val_df)

In [None]:
len(test_df)

In [None]:
from datasets import Dataset
train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)
val_dataset = Dataset.from_pandas(val_df)

In [None]:
train_df

In [None]:
tokenizer

In [None]:
train_dataset

In [None]:
def preprocess_function_causal(example, max_length=512):
    """Tokenize one (Topic, Essay, Band) row into a fixed-length causal-LM sample.

    The text is the instruction prompt followed by the essay and a trailing
    '[END]' marker. ``labels`` mirror ``input_ids`` except that prompt tokens
    and padding positions are set to -100 (the ignore index of the loss), so
    only essay tokens are learned.

    Args:
        example: Mapping with "Topic", "Essay" and "Band" entries.
        max_length: Length every sample is truncated or padded to.

    Returns:
        Dict with "input_ids", "attention_mask" and "labels", each a list of
        ``max_length`` ints.

    Note:
        Texts longer than ``max_length`` tokens are truncated, which also cuts
        off their '[END]' marker.
    """
    prompt = example["Topic"]
    essay = example["Essay"] + '[END]'  # custom stop marker appended to every essay
    score = example["Band"]

    prompt_text = f"From the topic: {prompt}, please write an IELTS essay that can achieve band {score}:\n Essay: \n"
    full_text = f"{prompt_text} {essay}"

    full_enc = tokenizer(
        full_text,
        max_length=max_length,
        truncation=True,
        padding="max_length",
        return_attention_mask=True
    )
    input_ids = full_enc["input_ids"]
    attention_mask = full_enc["attention_mask"]

    # Tokenize the prompt alone only to learn how many leading tokens to mask.
    prompt_enc = tokenizer(
        prompt_text,
        add_special_tokens=False,
        padding=False,
        truncation=False
    )
    prompt_len = len(prompt_enc["input_ids"])

    # Keep a label only for real (attended) tokens that come after the prompt.
    # Padding was previously copied into the labels and would have been trained on.
    labels = [
        token_id if (position >= prompt_len and mask == 1) else -100
        for position, (token_id, mask) in enumerate(zip(input_ids, attention_mask))
    ]

    return {
        "input_ids": input_ids,                 # [max_length]
        "attention_mask": attention_mask,       # [max_length]
        "labels": labels                        # [max_length]
    }

In [None]:
# Tokenize every split row by row with the causal-LM preprocessing function.
train_dataset, val_dataset, test_dataset = (
    split.map(preprocess_function_causal, batched=False)
    for split in (train_dataset, val_dataset, test_dataset)
)

In [None]:
from transformers import Trainer, TrainingArguments, default_data_collator


def data_collator(features):
    """Batch pre-tokenized samples while keeping the labels built in preprocessing.

    DataCollatorForLanguageModeling(mlm=False) rebuilds ``labels`` from
    ``input_ids``, which throws away the -100 prompt mask that
    preprocess_function_causal writes. The samples are already padded to a
    fixed length, so the default collator (plain tensor stacking) is enough.

    Args:
        features: List of dicts with "input_ids", "attention_mask" and "labels".

    Returns:
        Dict of stacked tensors, with ``labels`` set to -100 wherever
        ``attention_mask`` is 0 so padding never contributes to the loss.
    """
    batch = default_data_collator(features)
    batch["labels"] = batch["labels"].masked_fill(batch["attention_mask"] == 0, -100)
    return batch

In [None]:
# !pip install peft


In [None]:
# from peft import get_peft_model, LoraConfig, TaskType

# # LoRA config
# peft_config = LoraConfig(
#     r=8,
#     lora_alpha=16,
#     task_type=TaskType.CAUSAL_LM,
#     lora_dropout=0.1,
#     bias="none",
#     inference_mode=False
# )

# # Wrap model with LoRA
# model = get_peft_model(model, peft_config)
# model.print_trainable_parameters()  # confirm which params are tunable


In [None]:
training_args = TrainingArguments(
    # Output locations and logging
    output_dir="./gpt_ielts",
    logging_dir="./logs",
    logging_steps=10,
    report_to="none",
    # Optimisation
    num_train_epochs=5,
    learning_rate=5e-5,
    weight_decay=0.05,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=1,
    seed=42,
    # Evaluation and checkpointing: both run once per epoch, keep a single
    # checkpoint, and reload the one with the best loss at the end.
    eval_strategy="epoch",
    eval_steps=500,
    save_strategy="epoch",
    save_total_limit=1,
    load_best_model_at_end=True,
    metric_for_best_model='loss',
)

In [None]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,  
    data_collator=data_collator,
    tokenizer = tokenizer
)

In [None]:
trainer.train()

In [None]:
import torch
# Use the GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
model = model.to(device)

def generate_ielts_essay(question, overall, max_length=512):
    """Generate an essay for ``question`` conditioned on a target band.

    Args:
        question: Essay topic, inserted into the same prompt template used
            during fine-tuning.
        overall: Target band, formatted into the prompt as given.
        max_length: Upper bound on prompt plus generated tokens.

    Returns:
        The decoded text (prompt followed by the essay) with the '[END]' stop
        marker removed.
    """
    model.eval()
    input_text = f"From the topic: {question}, please write an IELTS essay that can achieve band {overall}:\n Essay: \n"

    # Move the inputs to whatever device the model is on; the hard-coded
    # 'cuda' crashed on CPU-only sessions.
    encoded = tokenizer(input_text, return_tensors="pt").to(model.device)

    output_ids = model.generate(
        input_ids=encoded["input_ids"],
        # Passed explicitly because pad and eos share one id, so the mask
        # cannot be inferred reliably from the ids alone.
        attention_mask=encoded["attention_mask"],
        eos_token_id=tokenizer.convert_tokens_to_ids('[END]'),
        pad_token_id=tokenizer.pad_token_id,
        max_length=max_length,
        num_beams=5,
        no_repeat_ngram_size=3,  # To prevent repetition
        early_stopping=True,
        temperature=0.3,  # For controlled creativity
        top_p=0.85,        # Top-p sampling
        top_k=50,         # Top-k sampling
        do_sample=True    # Enable sampling
    )

    generated_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
    # '[END]' was added with add_tokens as a regular token, so
    # skip_special_tokens does not drop it; strip it by hand.
    return generated_text.replace('[END]', '').rstrip()

# Example usage: ask for an essay on a public-transport prompt.
sample_question = (
    "Some people believe that governments should invest more in public transport. "
    "To what extent do you agree or disagree?"
)
sample_overall = "7.0"  # target band to condition on, e.g. 7

print(
    "Generated Essay:\n",
    generate_ielts_essay(sample_question, sample_overall),
)
