# Notebook: IELTS Writing Task 2 Scoring with Qwen2-7B (Kaggle)

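This notebook demonstrates how to use a Qwen2-7B model to automatically score IELTS Writing Task 2 essays. Note that Section 2 loads the base checkpoint `Qwen/Qwen2-7B`; set `MODEL_NAME` to `Qwen/Qwen2-7B-Instruct` there if you want the instruction-tuned variant. The pipeline follows the reference system: a standardized request schema, input validation, a scoring prompt, and batch evaluation.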

In [None]:
# Section 1: Install and import required libraries
!pip install --upgrade pip
!pip install -q transformers accelerate torch jsonschema

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
import json
import base64
from jsonschema import validate, ValidationError

In [None]:
# Section 2: Load Qwen2-7B model with local cache
import os

MODEL_NAME = 'Qwen/Qwen2-7B'
MODEL_DIR = '/kaggle/working/qwen2-7b-cache'
TOKENIZER_DIR = '/kaggle/working/qwen2-7b-tokenizer-cache'

# The local copies are only used when BOTH cache directories exist.
cache_available = os.path.exists(MODEL_DIR) and os.path.exists(TOKENIZER_DIR)
if cache_available:
    print('Loading model and tokenizer from local cache...')
    tokenizer_source, model_source = TOKENIZER_DIR, MODEL_DIR
else:
    print('Downloading model and tokenizer from Hugging Face...')
    tokenizer_source = model_source = MODEL_NAME

# Same loading arguments regardless of where the files come from.
tokenizer = AutoTokenizer.from_pretrained(tokenizer_source, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    model_source,
    device_map='auto',
    torch_dtype=torch.float16,
    trust_remote_code=True,
)

if not cache_available:
    # Persist the freshly downloaded files so later runs can load them locally.
    tokenizer.save_pretrained(TOKENIZER_DIR)
    model.save_pretrained(MODEL_DIR)

model.eval()

In [None]:
# Section 3: Define input schema based on score_request.v1.json
# Optional tuning knobs: only `max_evidence` (an integer from 1 to 3) is accepted.
_options_schema = {
    'type': 'object',
    'additionalProperties': False,
    'properties': {
        'max_evidence': {'type': 'integer', 'minimum': 1, 'maximum': 3},
    },
}

# Conditional rule that applies when task_type is 'task1'.
# NOTE(review): 'essay' is already required at the top level, so this rule adds
# no constraint as written -- confirm against score_request.v1.json whether a
# different field was meant to be required for task1.
_task1_rule = {
    'if': {'properties': {'task_type': {'const': 'task1'}}},
    'then': {'required': ['essay']},
}

# JSON Schema (draft 2020-12) for a scoring request. Unknown top-level keys are
# rejected; only `task_type` and `essay` are mandatory.
score_request_schema = {
    '$schema': 'https://json-schema.org/draft/2020-12/schema',
    '$id': 'score_request.v1.json',
    'title': 'ScoreRequestV1',
    'type': 'object',
    'additionalProperties': False,
    'properties': {
        'task_type': {'type': 'string', 'enum': ['task1', 'task2']},
        'essay': {'type': 'string', 'minLength': 1, 'maxLength': 20000},
        'question': {'type': 'string', 'minLength': 5, 'maxLength': 1000},
        'image_base64': {'type': 'string'},
        'options': _options_schema,
    },
    'required': ['task_type', 'essay'],
    'allOf': [_task1_rule],
}

def validate_score_request(data):
    """Check a request payload against ``score_request_schema``.

    Prints a confirmation when the payload conforms, or the validation error
    when it does not.

    Args:
        data: The request payload to check.

    Returns:
        True if ``data`` satisfies the schema, False if jsonschema raised a
        ``ValidationError``.
    """
    try:
        validate(instance=data, schema=score_request_schema)
    except ValidationError as err:
        print(f'Schema error: {err}')
        return False
    else:
        print('Input is valid according to schema.')
        return True

In [None]:
# Section 4: Preprocess input data (essay, question, image_base64)
def preprocess_input(data):
    """Pull the essay, question and decoded image out of a request payload.

    Args:
        data: Mapping that may contain 'essay', 'question' and 'image_base64'.

    Returns:
        A tuple ``(essay, question, image)``. ``essay`` and ``question``
        default to the empty string when absent. ``image`` is the
        base64-decoded bytes of 'image_base64', or None when that key is
        missing, empty, or cannot be decoded.
    """
    encoded_image = data.get('image_base64')
    decoded_image = None
    if encoded_image:
        try:
            decoded_image = base64.b64decode(encoded_image)
        except Exception:
            # Best effort: an undecodable image is treated as "no image".
            decoded_image = None
    return data.get('essay', ''), data.get('question', ''), decoded_image

In [None]:
# Section 5: Scoring function using Qwen2-7B
def get_standard_task2_prompt(essay, question):
    """Build the examiner prompt for one IELTS Writing Task 2 essay.

    Args:
        essay: Essay text to embed in the prompt.
        question: Task question to embed in the prompt.

    Returns:
        A newline-separated prompt: the examiner role, the scoring
        instruction, the question, the essay, and the requested JSON keys.
    """
    prompt_lines = [
        'You are an IELTS Writing Task 2 examiner.',
        'Score the following essay according to the IELTS rubric (0-9) and provide feedback for each criterion (Task Response, Coherence and Cohesion, Lexical Resource, Grammatical Range and Accuracy).',
        f'Question: {question}',
        f'Essay: {essay}',
        'Return a JSON object with keys: overall, per_criterion (dict), feedback (dict).',
    ]
    return '\n'.join(prompt_lines)

def score_ielts_task2_qwen2(essay, question, model, tokenizer, max_new_tokens=128):  # reduced for Kaggle
    """Score one Task 2 essay with the model and return its parsed JSON answer.

    Args:
        essay: Essay text to score.
        question: Task question the essay responds to.
        model: Causal language model exposing ``generate`` and ``device``.
        tokenizer: Tokenizer matching ``model``.
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        The first JSON object found in the generated text, as a dict.
        If the text has no ``{...}`` span, returns
        ``{'error': 'No JSON found in model output.'}``; if the span does not
        start with a valid JSON object, returns
        ``{'error': 'Could not parse JSON from model output.'}``.
    """
    # Local import kept at the top of the function. `torch` must NOT be
    # imported in here: a function-level `import torch` makes the name local
    # and breaks the `torch.no_grad()` call below with UnboundLocalError.
    import re

    prompt = get_standard_task2_prompt(essay, question)
    inputs = tokenizer(prompt, return_tensors='pt').to(model.device)
    with torch.no_grad():
        outputs = model.generate(**inputs, max_new_tokens=max_new_tokens, do_sample=False)

    # generate() returns prompt tokens followed by the completion. Decode only
    # the completion so braces in the essay or question are never mistaken for
    # the model's JSON answer.
    prompt_length = inputs['input_ids'].shape[-1]
    completion = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)

    if torch.cuda.is_available():
        # Release cached GPU memory between essays.
        torch.cuda.empty_cache()

    match = re.search(r'\{.*\}', completion, re.DOTALL)
    if match is None:
        return {'error': 'No JSON found in model output.'}
    try:
        # raw_decode parses the object that starts at the first '{' and ignores
        # trailing text, so extra commentary after the JSON does not break it.
        parsed, _ = json.JSONDecoder().raw_decode(match.group(0))
    except (ValueError, RecursionError):
        return {'error': 'Could not parse JSON from model output.'}
    return parsed

In [None]:
# Section 6: Test scoring with sample data
# A single hand-written request used as a smoke test for the pipeline.
sample_data = dict(
    task_type='task2',
    essay='In today’s world, technology has become an integral part of our lives. Some people believe that it has improved our quality of life, while others think it has made life more complicated. Discuss both views and give your own opinion.',
    question='Some people believe that technology has improved our quality of life, while others think it has made life more complicated. Discuss both views and give your own opinion.',
    options=dict(max_evidence=2),
)

# Only schema-valid requests are sent to the model.
if not validate_score_request(sample_data):
    result = {'error': 'Invalid input.'}
else:
    essay, question, _ = preprocess_input(sample_data)
    result = score_ielts_task2_qwen2(essay, question, model, tokenizer)

In [None]:
# Section 7: Display scoring result
import pprint

# Pretty-print the scoring result with the default printer settings.
pprint.PrettyPrinter().pprint(result)

In [None]:
# Section 8: Batch test with sample dataset (simulated)
import pandas as pd

# Simulated sample dataset (replace with real file if available).
# Each entry is a complete scoring request in the Section 3 schema.
dataset = [
    dict(
        task_type='task2',
        essay='Some people think that the best way to increase road safety is to increase the minimum legal age for driving cars or riding motorbikes. To what extent do you agree or disagree?',
        question='Some people think that the best way to increase road safety is to increase the minimum legal age for driving cars or riding motorbikes. To what extent do you agree or disagree?',
        options={'max_evidence': 2},
    ),
    dict(
        task_type='task2',
        essay='Many people believe that social networking sites have a huge negative impact on both individuals and society. To what extent do you agree or disagree?',
        question='Many people believe that social networking sites have a huge negative impact on both individuals and society. To what extent do you agree or disagree?',
        options={'max_evidence': 2},
    ),
]

# One record per request, keyed by its position in `dataset`.
results = []
for i, row in enumerate(dataset):
    if not validate_score_request(row):
        # Invalid requests are recorded with an error instead of being scored.
        results.append({'index': i, 'result': {'error': 'Invalid input.'}})
        continue
    essay, question, _ = preprocess_input(row)
    result = score_ielts_task2_qwen2(essay, question, model, tokenizer)
    results.append({'index': i, 'result': result})

# Display batch results
df_results = pd.DataFrame(results)
df_results

In [None]:
# Section 9: Ensure output matches score_response.v1.json format
def validate_score_response_format(response):
    """Check that a scoring result has the expected top-level shape.

    The result must be an object with a numeric 'overall' and object-valued
    'per_criterion' and 'feedback'; all three keys are required. A mismatch is
    printed.

    Args:
        response: The scoring result to check.

    Returns:
        True if ``response`` matches the shape, False if jsonschema raised a
        ``ValidationError``.
    """
    required_keys = ['overall', 'per_criterion', 'feedback']
    response_schema = {
        'type': 'object',
        'properties': {
            'overall': {'type': 'number'},
            'per_criterion': {'type': 'object'},
            'feedback': {'type': 'object'},
        },
        'required': required_keys,
    }
    try:
        validate(instance=response, schema=response_schema)
    except ValidationError as err:
        print(f'Output does not match required format: {err}')
        return False
    return True

# Check batch results: one line per record with its index and format verdict.
for entry in results:
    label = f'Index {entry["index"]}:'
    is_well_formed = validate_score_response_format(entry['result'])
    print(label, is_well_formed)

In [None]:
# Section 10: Load and normalize chillies/IELTS-writing-task-2-evaluation dataset
from datasets import load_dataset
import pandas as pd

dataset_name = 'chillies/IELTS-writing-task-2-evaluation'
df = load_dataset(dataset_name, split='test').to_pandas()

# Normalize columns.
# rename() ignores keys that are not present, so this is a no-op without 'band'.
df = df.rename(columns={'band': 'band_true'})
if 'id' not in df.columns:
    # Fall back to the row position as the identifier.
    df = df.assign(id=range(len(df)))
# Whitespace-separated token count of each essay.
df = df.assign(word_count=df['essay'].astype(str).str.split().map(len))

# Keep only the columns used by the later sections, in a fixed order.
df = df[['id', 'prompt', 'essay', 'band_true', 'word_count']]
df.head()

In [None]:
# Section 11: Batch scoring on the full test set with Qwen2-7B
from tqdm import tqdm

# One record per essay: its id, the reference band and the predicted band.
results = []
for i, row in tqdm(df.iterrows(), total=len(df)):
    data = dict(
        task_type='task2',
        essay=row['essay'],
        question=row['prompt'],
        options={'max_evidence': 2},
    )
    # Stays None when the request is invalid or the result has no 'overall'.
    band_pred = None
    if validate_score_request(data):
        essay, question, _ = preprocess_input(data)
        result = score_ielts_task2_qwen2(essay, question, model, tokenizer)
        band_pred = result.get('overall', None)
    results.append(dict(id=row['id'], band_true=row['band_true'], band_pred=band_pred))

df_pred = pd.DataFrame(results)
df_pred.head()

In [None]:
# Section 12: Compute evaluation metrics MAE, RMSE, QWK, Pearson, Spearman
from sklearn.metrics import mean_absolute_error, mean_squared_error
from scipy.stats import pearsonr, spearmanr
from sklearn.metrics import cohen_kappa_score
import numpy as np


def safe_round(x):
    """Round ``x`` to the nearest integer, or return NaN if it is not a finite number."""
    try:
        return round(float(x))
    except (TypeError, ValueError, OverflowError):
        return np.nan


def _to_finite_float(x):
    """Convert ``x`` to a finite float, or NaN when that is not possible."""
    try:
        value = float(x)
    except (TypeError, ValueError):
        return np.nan
    return value if np.isfinite(value) else np.nan


def _to_half_band_steps(bands):
    """Map band scores to integer half-band steps (0..18) for the 0-9 scale."""
    return (bands.clip(0, 9) * 2).round().astype(int)


# Both columns may hold non-numeric values (strings in the dataset, arbitrary
# JSON values from the model), so coerce them and drop rows that cannot be used.
df_pred = df_pred.copy()
for band_column in ('band_true', 'band_pred'):
    df_pred[band_column] = df_pred[band_column].map(_to_finite_float).astype(float)
df_pred = df_pred.dropna(subset=['band_true', 'band_pred']).copy()
if len(df_pred) < 2:
    raise ValueError('Need at least two essays with numeric band_true and band_pred to compute metrics.')

df_pred['band_pred_round'] = df_pred['band_pred'].map(safe_round)

mae = mean_absolute_error(df_pred['band_true'], df_pred['band_pred'])
# sqrt(MSE) instead of mean_squared_error(..., squared=False): the `squared`
# argument is deprecated and removed in newer scikit-learn releases.
rmse = float(np.sqrt(mean_squared_error(df_pred['band_true'], df_pred['band_pred'])))
# cohen_kappa_score needs discrete labels on both sides and weights by label
# position, so both columns are expressed in half-band steps and the full
# 0..18 label list is passed to make the quadratic weights reflect band distance.
qwk = cohen_kappa_score(
    _to_half_band_steps(df_pred['band_true']),
    _to_half_band_steps(df_pred['band_pred']),
    labels=list(range(19)),
    weights='quadratic',
)
pearson = pearsonr(df_pred['band_true'], df_pred['band_pred'])[0]
spearman = spearmanr(df_pred['band_true'], df_pred['band_pred'])[0]

print(f'MAE: {mae:.4f}')
print(f'RMSE: {rmse:.4f}')
print(f'QWK: {qwk:.4f}')
print(f'Pearson: {pearson:.4f}')
print(f'Spearman: {spearman:.4f}')

In [None]:
# Section 13: Display comparison table of predictions and ground truth
# First 20 rows, restricted to the identifier and the three band columns.
df_pred.head(20)[['id', 'band_true', 'band_pred', 'band_pred_round']]

In [None]:
# Section 14: Save test set prediction results to file
output_path = '/kaggle/working/ielts_task2_predictions.csv'

# Write the prediction table without the DataFrame index column.
df_pred.to_csv(path_or_buf=output_path, index=False)
print(f"Saved predictions to {output_path}")