### The following code is used to choose the optimal CEFR predictor for the CEFR-SP corpus

#### Loading packages

In [None]:
import numpy as np
import pandas as pd
from easse.fkgl import corpus_fkgl
from scipy.stats import pearsonr
from transformers import BertForMaskedLM, AutoTokenizer, AutoModelForMaskedLM
import torch
import torch.nn.functional as F
import math
import re
import argparse
from tqdm import tqdm
import nltk
import os
from nltk.tokenize import sent_tokenize, word_tokenize
nltk.download('punkt')

In [None]:
# Runtime configuration: path of the masked language model and the device to run it on.
parser = argparse.ArgumentParser()
parser.add_argument('--model_path', default='C:/PLMs/xlm-roberta-base')
parser.add_argument('--device', default='cuda')
# parse_known_args() ignores arguments it does not recognise. A notebook kernel
# puts its own "-f <connection file>" into sys.argv, which parse_args() would
# reject by exiting the interpreter.
args, _ = parser.parse_known_args()
# Fall back to the CPU instead of failing when CUDA was requested but is not available.
if str(args.device).startswith('cuda') and not torch.cuda.is_available():
    args.device = 'cpu'
model = AutoModelForMaskedLM.from_pretrained(args.model_path).to(args.device)
tokenizer = AutoTokenizer.from_pretrained(args.model_path)
# Switch to inference mode (disables dropout and similar training-only layers).
model.eval()

In [None]:
def compute_ari(text):
    """
    Compute the Automated Readability Index (ARI) of a text.

    Sentences are delimited by runs of '.', '!' or '?'. Words are the
    whitespace-separated pieces of each sentence, and every character of a
    word (letters, digits, remaining punctuation) counts towards its length.

    :param text: input text (string)
    :return: ARI value (float)
    :raises ValueError: if the text contains no sentence or no word
    """
    # Split on sentence-ending punctuation and drop fragments that are blank.
    stripped = (fragment.strip() for fragment in re.split(r'[.!?]+', text))
    sentences = [fragment for fragment in stripped if fragment]
    if len(sentences) == 0:
        raise ValueError("文本中没有有效的句子")

    # Collect the words of every sentence in one flat list.
    tokens = [token for sentence in sentences for token in sentence.split()]
    word_count = len(tokens)
    if word_count == 0:
        raise ValueError("文本中没有有效的单词")
    char_count = sum(len(token) for token in tokens)

    # Average word length (characters per word) and average sentence length
    # (words per sentence) are the two inputs of the ARI formula.
    chars_per_word = char_count / word_count
    words_per_sentence = word_count / len(sentences)
    return 4.71 * chars_per_word + 0.5 * words_per_sentence - 21.43

In [None]:
def compute_wnll(yt, yp):
    """Return the negative log-likelihood of target vector ``yt`` under ``yp``.

    Computes ``-(yt . log(yp) + (1 - yt) . log(1 - yp))`` with 1-D dot
    products. When the result is NaN, the constant ``1e-10`` is returned
    instead.

    :param yt: 1-D target tensor
    :param yp: 1-D tensor of predicted probabilities, same length as ``yt``
    :return: the loss as a Python float
    """
    target_term = torch.dot(yt, torch.log(yp))
    complement_term = torch.dot(1 - yt, torch.log(1 - yp))
    loss = -(target_term + complement_term)
    # A NaN loss is replaced by a tiny constant rather than propagated.
    return 1e-10 if torch.isnan(loss) else float(loss)


def compute_rsrs(wnll):
    """Aggregate per-word losses into a single rank-weighted score.

    The losses are ranked in ascending order, and the loss at 1-based rank
    ``r`` is weighted by ``sqrt(r)``. The weighted sum is divided by the
    number of losses.

    The input is not modified: a sorted copy is used, so the caller's list
    keeps its original order.

    :param wnll: sized iterable of per-word loss values (floats)
    :return: the rank-weighted average
    :raises ZeroDivisionError: if ``wnll`` is empty
    """
    ranked = sorted(wnll)  # sorted copy; avoids reordering the caller's list
    rsrs = 0
    for rank, loss in enumerate(ranked, start=1):
        rsrs += math.sqrt(rank) * loss
    return rsrs / len(ranked)


def compute_readability(sentence):
    """Score a sentence with the masked language model.

    The sentence is split into words with ``word_tokenize``. For every word,
    a copy of the sentence with that word replaced by the mask token is run
    through the model, and the predicted distribution at the mask position
    is compared (via ``compute_wnll``) with a target vector that is 1 at
    every sub-token id of the original word. The per-word losses are combined
    by ``compute_rsrs``.

    Uses the module-level ``tokenizer``, ``model`` and ``args.device``.

    :param sentence: input sentence (string)
    :return: the aggregated score returned by ``compute_rsrs``
    :raises ValueError: if the sentence yields no words
    """
    words = word_tokenize(sentence)
    if not words:
        raise ValueError("sentence contains no tokens to score")

    # One masked variant of the sentence per word, scored together as a batch.
    mask_token = tokenizer.mask_token
    masked_batch = [
        " ".join(words[:pos] + [mask_token] + words[pos + 1:])
        for pos, _ in enumerate(words)
    ]

    inputs = tokenizer(masked_batch, return_tensors='pt', padding=True).to(args.device)
    # Inference only: skip building the autograd graph to save memory.
    with torch.no_grad():
        logits = model(**inputs).logits

    # Batch row and sequence position of every mask token.
    row_indices, col_indices = torch.where(inputs.input_ids == tokenizer.mask_token_id)
    # Exactly one mask is expected per masked variant.
    assert len(col_indices) == len(words)

    scores = []
    for word, row, col in zip(words, row_indices, col_indices):
        yp = F.softmax(logits[row, col, :], dim=0)  # distribution at the mask position
        # Size the target from the model output so the dot product always matches.
        yt = torch.zeros_like(yp)
        yt[tokenizer.encode(word, add_special_tokens=False)] = 1
        scores.append(compute_wnll(yt, yp))
    return compute_rsrs(scores)

### Testing CEFR-SP corpus

In [None]:
# Choose which CSV file to evaluate on; the flag switches between the
# "whole" and the "part" test files.
USING_WHOLE = False
csv_path = "CEFR-SP_test_whole.csv" if USING_WHOLE else "CEFR-SP_test_part.csv"
df = pd.read_csv(csv_path)

# Sentences stay a plain list; the two label columns become NumPy arrays.
Sentence = df["Sentence"].tolist()
Label1, Label2 = (np.array(df[column].tolist()) for column in ("Label1", "Label2"))

#### calculating FKGL and analyzing correlation

In [None]:
# FKGL is computed per sentence by passing each one as a single-item corpus.
fkgl_scores = []
for sent in Sentence:
    fkgl_scores.append(corpus_fkgl([sent]))
prediction = np.array(fkgl_scores)

# Pearson correlation of the scores against each of the two label columns.
corr1, _ = pearsonr(Label1, prediction)
corr2, _ = pearsonr(Label2, prediction)
# taking the higher correlation-value
print("Corr1: ", round(corr1, 3))
print("Corr2: ", round(corr2, 3))

#### calculating ARI and analyzing correlation

In [None]:
# compute_ari expects a string (it calls re.split on its argument), so each
# sentence is passed directly rather than wrapped in a list.
prediction = [compute_ari(sent) for sent in Sentence]
prediction = np.array(prediction)

# Pearson correlation of the ARI scores against each of the two label columns.
corr1, _ = pearsonr(Label1, prediction)
corr2, _ = pearsonr(Label2, prediction)
print("Corr1: ", round(corr1, 3))   # taking the higher correlation-value
print("Corr2: ", round(corr2, 3))

#### calculating SL and analyzing correlation

In [None]:
# Sentence length (SL) as a predictor. len([sent]) is always 1, which gives a
# constant prediction and an undefined correlation; measure the sentence itself.
# NOTE(review): length is counted in word_tokenize tokens, matching the
# tokenization used by compute_readability — confirm this is the intended unit.
prediction = [len(word_tokenize(sent)) for sent in Sentence]
prediction = np.array(prediction)

# Pearson correlation of the lengths against each of the two label columns.
corr1, _ = pearsonr(Label1, prediction)
corr2, _ = pearsonr(Label2, prediction)
print("Corr1: ", round(corr1, 3))   # taking the higher correlation-value
print("Corr2: ", round(corr2, 3))

#### calculating RSRS and analyzing correlation

In [None]:
# Score every sentence with the masked-LM readability function; tqdm shows progress.
rsrs_scores = []
for sent in tqdm(Sentence):
    rsrs_scores.append(compute_readability(sent))
prediction = np.array(rsrs_scores)

# Pearson correlation of the scores against each of the two label columns.
corr1, _ = pearsonr(Label1, prediction)
corr2, _ = pearsonr(Label2, prediction)
# taking the higher correlation-value
print("Corr1: ", round(corr1, 3))
print("Corr2: ", round(corr2, 3))