# Notebook: IELTS Writing Task 2 Scoring with Mistral-7B-Instruct-v0.2 (Kaggle)

This notebook demonstrates how to use the Mistral-7B-Instruct-v0.2 model to automatically score IELTS Writing Task 2 essays. The pipeline uses standardized schemas, validation, prompts, and batch evaluation as in the reference system.

In [None]:
# Section 1: Install and import required libraries
!pip install --upgrade pip
!pip install -q transformers accelerate torch jsonschema

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
import json
import base64
from jsonschema import validate, ValidationError

In [None]:
# Section 2: Load Mistral-7B-Instruct model directly from Hugging Face
import os

MODEL_NAME = 'mistralai/Mistral-7B-Instruct-v0.2'
TOKENIZER_DIR = '/kaggle/working/mistral-7b-instruct-v0.2-tokenizer-cache'
MODEL_DIR = '/kaggle/working/mistral-7b-instruct-v0.2-cache'

from kaggle_secrets import UserSecretsClient
user_secrets = UserSecretsClient()
hf_token = user_secrets.get_secret("HF_Token")

# The token is read from Kaggle's secret store (not an environment variable),
# so the error message points the user at the place they actually need to edit.
if not hf_token:
    raise ValueError('Please add a Kaggle secret named "HF_Token" containing your Hugging Face access token.')

# Resolve each artifact independently. The model weights are deliberately not
# saved (see the commented-out line below), so requiring BOTH directories would
# mean the saved tokenizer cache is never used.
tokenizer_source = TOKENIZER_DIR if os.path.exists(TOKENIZER_DIR) else MODEL_NAME
model_source = MODEL_DIR if os.path.exists(MODEL_DIR) else MODEL_NAME

print(f"Loading tokenizer from {'local cache' if tokenizer_source == TOKENIZER_DIR else 'Hugging Face'}...")
# `token=` replaces the deprecated `use_auth_token=` keyword.
# `trust_remote_code` is not passed: Mistral is a built-in transformers
# architecture, so no repository-supplied Python code needs to be executed.
tokenizer = AutoTokenizer.from_pretrained(tokenizer_source, token=hf_token)

print(f"Loading model from {'local cache' if model_source == MODEL_DIR else 'Hugging Face'}...")
# NOTE(review): no dtype is requested, so the library default precision is used;
# confirm that this fits the accelerator memory available in the session.
model = AutoModelForCausalLM.from_pretrained(
    model_source,
    device_map='auto',
    token=hf_token
)

if tokenizer_source == MODEL_NAME:
    # Cache the (small) tokenizer files for subsequent runs in this session.
    tokenizer.save_pretrained(TOKENIZER_DIR)
# model.save_pretrained(MODEL_DIR)  # Skipped to avoid disk full error on Kaggle

# Inference only: switch off dropout and other training-time behaviour.
model.eval()

In [None]:
# Section X: Call model with fixed IELTS Task 2 prompt (using loaded model) (for debugging)
# This cell hard-codes one question/essay pair so the raw model behaviour can be
# inspected before the reusable prompt/scoring helpers are defined further down.

# The prompt is assembled from adjacent string literals: instructions, a JSON
# template that uses <float>/<string> placeholders, then the question and essay.
fixed_prompt = (
    "You are an IELTS Writing Task 2 examiner. "
    "Score the following essay (0-9) and provide feedback for each criterion: Task Response, Coherence and Cohesion, Lexical Resource, Grammatical Range and Accuracy. "
    "Return ONLY a valid JSON object with this structure (do not copy the values):\n"
    "{\n"
    "  \"overall\": <float>,\n"
    "  \"per_criterion\": {\n"
    "    \"Task Response\": <float>,\n"
    "    \"Coherence and Cohesion\": <float>,\n"
    "    \"Lexical Resource\": <float>,\n"
    "    \"Grammatical Range and Accuracy\": <float>\n"
    "  },\n"
    "  \"feedback\": {\n"
    "    \"Task Response\": <string>,\n"
    "    \"Coherence and Cohesion\": <string>,\n"
    "    \"Lexical Resource\": <string>,\n"
    "    \"Grammatical Range and Accuracy\": <string>\n"
    "  }\n"
    "}\n"
    "Now score the following essay and generate your own scores and detailed feedback (do not leave any feedback field empty):\n"
    "Question: Many people believe that social networking sites have a huge negative impact on both individuals and society. To what extent do you agree or disagree?\n"
    "Essay: Many people believe that social networking sites, such as Facebook, have had a negative impact on individuals and society. While these platforms offer opportunities for communication and information sharing, I agree that their overall effect has been more harmful than beneficial.\n\nFirstly, social networking sites can lead to addiction and reduced productivity. Many users spend hours scrolling through feeds, which distracts them from work, study, or real-life relationships. This excessive use can result in poor academic or job performance and even mental health issues such as anxiety and depression.\n\nSecondly, these platforms often facilitate the spread of misinformation and cyberbullying. False news can go viral quickly, misleading large numbers of people. Moreover, the anonymity provided by social media allows some individuals to harass or bully others without facing real consequences.\n\nHowever, it is important to acknowledge that social networking sites also have positive aspects. They help people stay connected with friends and family, especially those living far away, and can be valuable tools for professional networking and learning.\n\nIn conclusion, although social networking sites have some benefits, I believe their negative impacts on productivity, mental health, and social harmony outweigh the positives. Therefore, individuals and governments should take steps to minimize these harms."
)

# Tokenize the prompt and move the tensors to the device the model lives on.
inputs = tokenizer(fixed_prompt, return_tensors='pt').to(model.device)
# Greedy decoding (do_sample=False) with gradients disabled; at most 512 new tokens.
with torch.no_grad():
    outputs = model.generate(**inputs, max_new_tokens=512, do_sample=False)
# The whole first returned sequence is decoded.
# NOTE(review): presumably this includes the echoed prompt as well as the
# completion — confirm against the printed output.
raw_result = tokenizer.decode(outputs[0], skip_special_tokens=True)
print("Raw model output:\n", raw_result)

import json
import re
from jsonschema import validate, ValidationError

# Minimal JSON Schema for a scoring response: an object that must carry a
# numeric "overall" band plus "per_criterion" and "feedback" objects.
schema = {
    "type": "object",
    "required": ["overall", "per_criterion", "feedback"],
    "properties": {
        "overall": {"type": "number"},
        "per_criterion": {"type": "object"},
        "feedback": {"type": "object"},
    },
}

def extract_final_json_block(text):
    """
    Extract the final IELTS scoring JSON object from a long string.

    The text is scanned left to right. At every '{' the JSON decoder is asked
    to parse one complete value starting there; this handles nested objects
    and braces inside string values. The previous implementation relied on the
    recursive regex construct ``(?R)``, which the standard ``re`` module does
    not support, so it raised ``re.error`` on every call.

    Placeholder templates such as ``{"overall": <float>, ...}`` are not valid
    JSON and are therefore rejected by the parser itself; no textual
    "<" / ">" filter is applied, so feedback text may contain those characters.

    Args:
        text (str): The full string containing instructions, essay, and JSON.

    Returns:
        dict: The last top-level JSON object in ``text`` that has the keys
        'overall', 'per_criterion', and 'feedback'.

    Raises:
        ValueError: If no valid JSON block is found.
    """
    required_keys = ("overall", "per_criterion", "feedback")
    decoder = json.JSONDecoder()

    last_match = None
    pos = text.find("{")
    while pos != -1:
        try:
            parsed, end = decoder.raw_decode(text, pos)
        except json.JSONDecodeError:
            # Not a parseable object at this brace; try the next opening brace
            # (which may be an object nested inside an invalid outer block).
            pos = text.find("{", pos + 1)
            continue
        if isinstance(parsed, dict) and all(k in parsed for k in required_keys):
            # Keep overwriting so the final qualifying block wins.
            last_match = parsed
        # Resume after the parsed object so its nested objects are not rescanned.
        pos = text.find("{", end)

    if last_match is None:
        raise ValueError("No valid JSON block found.")
    return last_match

# Pull the scoring object out of the raw generation.
actual_result = extract_final_json_block(raw_result)
# Check it against the response schema defined earlier in this cell so that the
# "validated" wording below is accurate; raises ValidationError on mismatch.
validate(instance=actual_result, schema=schema)
print("Parsed and validated JSON result:\n", actual_result)

In [None]:
# Section 3: Define input schema based on score_request.v1.json

# Optional tuning knobs; unknown option keys are rejected.
_score_request_options = {
    "type": "object",
    "additionalProperties": False,
    "properties": {
        "max_evidence": {"type": "integer", "minimum": 1, "maximum": 3},
    },
}

# Conditional rule applied when task_type is "task1".
# NOTE(review): it only re-requires "essay", which is already required at the
# top level, so it adds no constraint as written — confirm the intended rule.
_task1_rule = {
    "if": {"properties": {"task_type": {"const": "task1"}}},
    "then": {"required": ["essay"]},
}

score_request_schema = {
    "$schema": "https://json-schema.org/draft/2020-12/schema",
    "$id": "score_request.v1.json",
    "title": "ScoreRequestV1",
    "type": "object",
    "additionalProperties": False,
    "required": ["task_type", "essay"],
    "properties": {
        "task_type": {"type": "string", "enum": ["task1", "task2"]},
        "essay": {"type": "string", "minLength": 1, "maxLength": 20000},
        "question": {"type": "string", "minLength": 5, "maxLength": 1000},
        "image_base64": {"type": "string"},
        "options": _score_request_options,
    },
    "allOf": [_task1_rule],
}

def validate_score_request(data):
    """Check a scoring request against ``score_request_schema``.

    Prints a confirmation line when the request conforms, or the schema error
    when it does not.

    Args:
        data: The request object to validate.

    Returns:
        bool: True if ``data`` satisfies the schema, False otherwise.
    """
    try:
        validate(instance=data, schema=score_request_schema)
    except ValidationError as err:
        print(f'Schema error: {err}')
        return False
    print('Input is valid according to schema.')
    return True

In [None]:
# Section 4: Preprocess input data (essay, question, image_base64)
def preprocess_input(data):
    """Split a request mapping into its essay, question and decoded image.

    Args:
        data: Mapping that may contain 'essay', 'question' and 'image_base64'.

    Returns:
        tuple: ``(essay, question, image)``. ``essay`` and ``question`` default
        to the empty string when absent. ``image`` is the base64-decoded bytes,
        or None when no image was supplied or decoding failed.
    """
    essay_text = data.get('essay', '')
    question_text = data.get('question', '')

    encoded_image = data.get('image_base64')
    if not encoded_image:
        # Missing or empty image field: nothing to decode.
        return essay_text, question_text, None

    try:
        image_bytes = base64.b64decode(encoded_image)
    except Exception:
        # Best-effort: an undecodable image is treated as "no image".
        image_bytes = None
    return essay_text, question_text, image_bytes


In [None]:
def get_standard_task2_prompt(essay, question):
    """Build the examiner prompt for one IELTS Writing Task 2 essay.

    The prompt consists of the examiner instructions, a JSON template that uses
    <float>/<string> placeholders for each of the four criteria, and finally
    the question and essay text.

    Args:
        essay (str): The candidate's essay.
        question (str): The Task 2 question the essay answers.

    Returns:
        str: The complete prompt text.
    """
    criteria = (
        'Task Response',
        'Coherence and Cohesion',
        'Lexical Resource',
        'Grammatical Range and Accuracy',
    )

    # One indented `"<criterion>": <placeholder>` line per criterion, comma-separated.
    score_fields = ',\n'.join('    "' + name + '": <float>' for name in criteria)
    feedback_fields = ',\n'.join('    "' + name + '": <string>' for name in criteria)

    instructions = (
        'You are an IELTS Writing Task 2 examiner. '
        'Score the following essay (0-9) and provide feedback for each criterion: '
        + ', '.join(criteria) + '. '
        'Return ONLY a valid JSON object with this structure (replace <float> and <string> with your own values, do not copy them):\n'
    )
    json_template = (
        '{\n'
        '  "overall": <float>,\n'
        '  "per_criterion": {\n'
        + score_fields + '\n'
        '  },\n'
        '  "feedback": {\n'
        + feedback_fields + '\n'
        '  }\n'
        '}\n'
    )
    closing = 'Now score the following essay and generate your own scores and detailed feedback (do not leave any feedback field empty):\n'

    return instructions + json_template + closing + 'Question: ' + question + '\nEssay: ' + essay


In [None]:
def score_ielts_task2_qwen2(essay, question, model, tokenizer, max_new_tokens=512):
    """Score one Task 2 essay with a causal LM and return the parsed JSON result.

    Despite the historical "qwen2" suffix, the function works with whatever
    ``model`` / ``tokenizer`` pair is passed in.

    Only the newly generated tokens are decoded and parsed. Decoding the whole
    sequence would also include the prompt, so JSON-like text in the prompt or
    in the essay itself could be mistaken for the model's answer.

    Args:
        essay (str): The candidate's essay.
        question (str): The Task 2 question.
        model: Loaded causal language model exposing ``generate`` and ``device``.
        tokenizer: Tokenizer matching ``model``.
        max_new_tokens (int): Upper bound on generated tokens.

    Returns:
        dict: Parsed scoring object with 'overall', 'per_criterion', 'feedback'.

    Raises:
        ValueError: If the completion contains no valid scoring JSON block.
    """
    import torch

    # NOTE(review): the prompt is sent as plain text; instruction-tuned models
    # usually expect their chat template — consider tokenizer.apply_chat_template.
    prompt = get_standard_task2_prompt(essay, question)
    inputs = tokenizer(prompt, return_tensors='pt').to(model.device)
    prompt_length = inputs['input_ids'].shape[-1]

    # Greedy decoding without gradient tracking.
    with torch.no_grad():
        outputs = model.generate(**inputs, max_new_tokens=max_new_tokens, do_sample=False)

    # Drop the echoed prompt tokens; keep only what the model generated.
    completion = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)

    # Release cached GPU memory between essays during long batch runs.
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
    return extract_final_json_block(completion)


In [None]:
# Section 6: Test scoring with sample data
# A single hand-written Task 2 request used as a smoke test. Its keys match the
# properties declared in score_request_schema (task_type, essay, question, options).
sample_data = {
    'task_type': 'task2',
    'essay': """Many people believe that social networking sites, such as Facebook, have had a negative impact on individuals and society. While these platforms offer opportunities for communication and information sharing, I agree that their overall effect has been more harmful than beneficial.\n\nFirstly, social networking sites can lead to addiction and reduced productivity. Many users spend hours scrolling through feeds, which distracts them from work, study, or real-life relationships. This excessive use can result in poor academic or job performance and even mental health issues such as anxiety and depression.\n\nSecondly, these platforms often facilitate the spread of misinformation and cyberbullying. False news can go viral quickly, misleading large numbers of people. Moreover, the anonymity provided by social media allows some individuals to harass or bully others without facing real consequences.\n\nHowever, it is important to acknowledge that social networking sites also have positive aspects. They help people stay connected with friends and family, especially those living far away, and can be valuable tools for professional networking and learning.\n\nIn conclusion, although social networking sites have some benefits, I believe their negative impacts on productivity, mental health, and social harmony outweigh the positives. Therefore, individuals and governments should take steps to minimize these harms.""",
    'question': 'Many people believe that social networking sites have a huge negative impact on both individuals and society. To what extent do you agree or disagree?',
    'options': {'max_evidence': 2}
}

# Validate first, then score. `result` is always assigned (to a score dict or
# an {'error': ...} dict) so the following display cell works in every case.
if validate_score_request(sample_data):
    essay, question, _ = preprocess_input(sample_data)
    try:
        result = score_ielts_task2_qwen2(essay, question, model, tokenizer)
    except ValueError as exc:
        # The model produced no parseable scoring JSON; report instead of crashing.
        result = {'error': f'Invalid model output: {exc}'}
        print(result)
    else:
        print("Scoring result:")
        print(result)
else:
    result = {'error': 'Invalid input.'}
    print(result)

In [None]:
# Section 7: Display scoring result
import pprint
# Pretty-print the `result` object left behind by the Section 6 cell.
pprint.pprint(result)

In [None]:
# Section 8: Batch test with sample dataset (simulated)
import pandas as pd

# Simulated sample dataset
# Two hand-written Task 2 requests with the same keys as `sample_data`
# (task_type, essay, question, options); used to exercise the batch loop below.
dataset = [
    {
        'task_type': 'task2',
        'essay': """Some people think that the best way to increase road safety is to increase the minimum legal age for driving cars or riding motorbikes. While raising the age limit may reduce accidents among young drivers, I believe that education and stricter law enforcement are more effective solutions.\n\nFirstly, increasing the legal age does not guarantee that all new drivers will be responsible. Many accidents are caused by reckless behavior, which can occur at any age. Instead, comprehensive driver education programs can teach young people about the dangers of speeding, drunk driving, and distracted driving.\n\nSecondly, strict enforcement of traffic laws, such as penalties for using mobile phones while driving or not wearing seat belts, can deter dangerous behavior. Regular road safety campaigns and random checks can also raise awareness and encourage compliance.\n\nIn conclusion, while raising the minimum driving age might have some effect, I believe that education and law enforcement are more effective ways to improve road safety for everyone.""",
        'question': 'Some people think that the best way to increase road safety is to increase the minimum legal age for driving cars or riding motorbikes. To what extent do you agree or disagree?',
        'options': {'max_evidence': 2},
    },
    {
        'task_type': 'task2',
        'essay': """Some people believe that unpaid community service should be a compulsory part of high school programmes. I agree that students can benefit greatly from such experiences, but I do not think it should be mandatory.\n\nOn the one hand, volunteering helps students develop important life skills such as teamwork, communication, and empathy. It also allows them to contribute positively to their communities and gain a sense of responsibility.\n\nOn the other hand, making community service compulsory may lead to resentment among students who are not genuinely interested. It could also place additional pressure on those who are already struggling with academic demands.\n\nIn conclusion, while community service is valuable, I believe it should be encouraged rather than required in high school programmes.""",
        'question': 'Some people believe that unpaid community service should be a compulsory part of high school programmes (for example, working for a charity, improving the neighbourhood or teaching sports to younger children). To what extent do you agree or disagree?',
        'options': {'max_evidence': 2},
    }
]

# Score every simulated request. Each entry of `results` is
# {'index': i, 'result': <score dict or {'error': ...}>}, so a single bad
# sample (invalid input or unparseable model output) never aborts the batch.
results = []
for i, row in enumerate(dataset):
    if validate_score_request(row):
        essay, question, _ = preprocess_input(row)
        try:
            result = score_ielts_task2_qwen2(essay, question, model, tokenizer)
        except ValueError as exc:
            # No parseable scoring JSON in the model output for this sample.
            result = {'error': f'Invalid model output: {exc}'}
        print(f"Result for sample {i}:")
        print(result)
        results.append({'index': i, 'result': result})
    else:
        print(f"Result for sample {i}: Invalid input.")
        results.append({'index': i, 'result': {'error': 'Invalid input.'}})

# Display batch results
df_results = pd.DataFrame(results)
df_results

In [None]:
# # Section 9: Ensure output matches score_response.v1.json format
# def validate_score_response_format(response):
#     schema = {
#         'type': 'object',
#         'properties': {
#             'overall': {'type': 'number'},
#             'per_criterion': {'type': 'object'},
#             'feedback': {'type': 'object'}
#         },
#         'required': ['overall', 'per_criterion', 'feedback']
#     }
#     try:
#         validate(instance=response, schema=schema)
#         return True
#     except ValidationError as e:
#         print(f'Output does not match required format: {e}')
#         return False

# # Check batch results
# for r in results:
#     if 'error' in r['result']:
#         print(f"Index {r['index']}: Invalid model output: {r['result']['error']}")
#     else:
#         print(f"Index {r['index']}:", validate_score_response_format(r['result']))

In [None]:
# Section 10: Load and normalize chillies/IELTS-writing-task-2-evaluation dataset
from datasets import load_dataset
import pandas as pd

dataset_name = 'chillies/IELTS-writing-task-2-evaluation'
df = load_dataset(dataset_name, split='test').to_pandas()

# Normalize columns
if 'band' in df.columns:
    df = df.rename(columns={'band': 'band_true'})
if 'id' not in df.columns:
    # Assign ids before any filtering so they keep pointing at the original rows.
    df['id'] = range(len(df))

# The metric cells do arithmetic on band_true, so make sure it is numeric.
# Labels that cannot be parsed as a number become NaN and are dropped.
# NOTE(review): the raw dtype/format of the band column is not visible here —
# check the reported drop count against the dataset card.
df['band_true'] = pd.to_numeric(df['band_true'].astype(str).str.strip(), errors='coerce')
n_unparsed = int(df['band_true'].isna().sum())
if n_unparsed:
    print(f'Dropping {n_unparsed} rows whose band label is not numeric.')
df = df.dropna(subset=['band_true']).reset_index(drop=True)

df['word_count'] = df['essay'].astype(str).str.split().map(len)
df = df[['id', 'prompt', 'essay', 'band_true', 'word_count']]
df.head()

In [None]:
# Section 11: Batch scoring on the full test set with the loaded Mistral-7B-Instruct model
from tqdm import tqdm

# One record per essay: {'id', 'band_true', 'band_pred'}. band_pred stays None
# when the request is invalid or the model output cannot be parsed, so a single
# bad generation does not throw away the predictions collected so far.
results = []
n_unparsed = 0
for i, row in tqdm(df.iterrows(), total=len(df)):
    data = {
        'task_type': 'task2',
        'essay': row['essay'],
        'question': row['prompt'],
        'options': {'max_evidence': 2}
    }
    band_pred = None
    if validate_score_request(data):
        essay, question, _ = preprocess_input(data)
        try:
            result = score_ielts_task2_qwen2(essay, question, model, tokenizer)
        except ValueError:
            # No valid scoring JSON in the model output for this essay.
            n_unparsed += 1
        else:
            band_pred = result.get('overall', None)
    results.append({'id': row['id'], 'band_true': row['band_true'], 'band_pred': band_pred})

print(f'{n_unparsed} of {len(df)} essays produced no parseable model output.')

df_pred = pd.DataFrame(results)
df_pred.head()

In [None]:
# Section 12: Compute evaluation metrics MAE, RMSE, QWK, Pearson, Spearman
from sklearn.metrics import mean_absolute_error, mean_squared_error
from scipy.stats import pearsonr, spearmanr
from sklearn.metrics import cohen_kappa_score
import numpy as np

def safe_round(x):
    """Round a band score to the nearest whole number, tolerating bad input.

    Uses Python's built-in ``round``, so exact halves go to the nearest even
    integer (6.5 -> 6, 7.5 -> 8).

    Args:
        x: Anything ``float()`` may accept (number or numeric string).

    Returns:
        int: The rounded value, or ``np.nan`` when ``x`` cannot be converted
        (None, non-numeric text) or is not finite (NaN, +/-inf).
    """
    try:
        return round(float(x))
    except (TypeError, ValueError, OverflowError):
        # Only conversion/rounding failures are absorbed; a bare `except` would
        # also swallow KeyboardInterrupt and SystemExit.
        return np.nan

# Keep only essays that received a prediction; .copy() so the new column is
# added to an independent frame rather than a view.
df_pred = df_pred.dropna(subset=['band_pred']).copy()
df_pred['band_pred_round'] = df_pred['band_pred'].map(safe_round)

mae = mean_absolute_error(df_pred['band_true'], df_pred['band_pred'])
# RMSE as sqrt(MSE): the `squared=False` keyword of mean_squared_error is
# deprecated and no longer accepted by recent scikit-learn releases.
rmse = float(np.sqrt(mean_squared_error(df_pred['band_true'], df_pred['band_pred'])))
# Quadratic-weighted kappa treats bands as class labels.
# NOTE(review): this presumably requires band_true to hold whole-number bands —
# confirm against the dataset before relying on the QWK value.
qwk = cohen_kappa_score(df_pred['band_true'], df_pred['band_pred_round'], weights='quadratic')
pearson = pearsonr(df_pred['band_true'], df_pred['band_pred'])[0]
spearman = spearmanr(df_pred['band_true'], df_pred['band_pred'])[0]

print(f'MAE: {mae:.4f}')
print(f'RMSE: {rmse:.4f}')
print(f'QWK: {qwk:.4f}')
print(f'Pearson: {pearson:.4f}')
print(f'Spearman: {spearman:.4f}')

In [None]:
# Section 13: Display comparison table and plots of predictions and ground truth
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix
# Show table
subset = df_pred[['id', 'band_true', 'band_pred', 'band_pred_round']].head(20)
print(subset)
# Bar plot: predicted vs. true band scores (bars offset by +/-0.2 around each id)
plt.figure(figsize=(12,5))
plt.bar(subset['id']-0.2, subset['band_true'], width=0.4, label='True Band', color='skyblue')
plt.bar(subset['id']+0.2, subset['band_pred_round'], width=0.4, label='Predicted Band', color='orange')
plt.xlabel('Sample ID')
plt.ylabel('Band Score')
plt.title('True vs. Predicted Band Scores (First 20 Samples)')
plt.legend()
plt.show()
# Scatter plot: all predictions, with the y = x reference line in red
plt.figure(figsize=(6,6))
sns.scatterplot(x='band_true', y='band_pred', data=df_pred, alpha=0.5)
plt.plot([df_pred['band_true'].min(), df_pred['band_true'].max()], [df_pred['band_true'].min(), df_pred['band_true'].max()], 'r--')
plt.xlabel('True Band')
plt.ylabel('Predicted Band')
plt.title('Scatter Plot: True vs. Predicted Band (All Samples)')
plt.show()
# Confusion matrix (rounded)
# The matrix axes must cover every band that appears in EITHER column. Using
# only the true bands for tick labels mislabels (or mismatches the size of) the
# axes whenever the model predicts a band that never occurs in the ground truth.
band_labels = sorted(set(df_pred['band_true'].unique()) | set(df_pred['band_pred_round'].unique()))
cm = confusion_matrix(df_pred['band_true'], df_pred['band_pred_round'], labels=band_labels)
plt.figure(figsize=(8,6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=band_labels, yticklabels=band_labels)
plt.xlabel('Predicted Band (rounded)')
plt.ylabel('True Band')
plt.title('Confusion Matrix: True vs. Predicted Band (Rounded)')
plt.show()
# Summary statistics
print('Summary statistics:')
print(df_pred[['band_true', 'band_pred']].describe())
# Table of evaluation metrics
import pandas as pd
metrics_table = pd.DataFrame({
    'Metric': ['MAE', 'RMSE', 'QWK', 'Pearson', 'Spearman'],
    'Value': [mae, rmse, qwk, pearson, spearman]
})
print('Evaluation Metrics:')
print(metrics_table)

In [None]:
# Section 14: Save test set prediction results to file
# Writes the current df_pred frame to the Kaggle working directory as CSV,
# without the pandas index column.
output_path = '/kaggle/working/ielts_task2_predictions.csv'
df_pred.to_csv(output_path, index=False)
print(f"Saved predictions to {output_path}")