## Imports

In [3]:
from datasets import load_dataset
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoModelForCausalLM
import torch
import numpy as np
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, accuracy_score, roc_auc_score
from scipy.stats import pearsonr, spearmanr
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split

  from .autonotebook import tqdm as notebook_tqdm


## Datasets

### 1. Pavlick and Tetreault Formality Scores (0 -> informal, 1 -> formal)

In [4]:
dataset = load_dataset("osyvokon/pavlick-formality-scores")

train_data = dataset["train"]
test_data = dataset["test"] 

df_train = pd.DataFrame(train_data.to_pandas())  
df_test = pd.DataFrame(test_data.to_pandas())

In [5]:
df_train.head(10)

Unnamed: 0,domain,avg_score,sentence
0,news,-0.6,Tang was employed at private-equity firm Fried...
1,news,1.0,San Francisco Mayor Gavin Newsom's withdrawal ...
2,answers,-2.8,lol nothing worrying about that.
3,news,0.0,She told Price she wanted to join the Police E...
4,news,1.8,The prime minister is keen to use the autumn p...
5,blog,1.0,Those competencies include mastering fundament...
6,news,0.8,His platform contains plans to fund drainage p...
7,answers,-1.8,"""It's a start."""
8,news,0.6,"""She is not asking for anything over the top, ..."
9,news,0.8,Justice Dinakaran had maintained that he had n...


In [6]:
# data exploration of avg_score
df_train["avg_score"].describe()

count    9274.000000
mean       -0.044080
std         1.349061
min        -3.000000
25%        -1.000000
50%         0.000000
75%         1.000000
max         3.000000
Name: avg_score, dtype: float64

In [7]:
df_train["normalized_score"] = (df_train["avg_score"] + 3) / 6
df_test["normalized_score"] = (df_test["avg_score"] + 3) / 6

In [8]:
# concatenate the train and test data, remove the domain column and rename the normalized_score column to formality
data1 = pd.concat([df_train, df_test])
data1 = data1.drop(columns=["domain", "avg_score"])
data1 = data1.rename(columns={"normalized_score": "formality"})
print(len(data1))
data1.head(10)


11274


Unnamed: 0,sentence,formality
0,Tang was employed at private-equity firm Fried...,0.4
1,San Francisco Mayor Gavin Newsom's withdrawal ...,0.666667
2,lol nothing worrying about that.,0.033333
3,She told Price she wanted to join the Police E...,0.5
4,The prime minister is keen to use the autumn p...,0.8
5,Those competencies include mastering fundament...,0.666667
6,His platform contains plans to fund drainage p...,0.633333
7,"""It's a start.""",0.2
8,"""She is not asking for anything over the top, ...",0.6
9,Justice Dinakaran had maintained that he had n...,0.633333


### 2. FAME-MT Dataset (0 -> informal, 1 -> formal)

In [10]:
# read tsv file
formal = pd.read_csv("it-en.formal.tsv", sep="\t", on_bad_lines="skip")
formal.columns = ["italian", "english"]
informal = pd.read_csv("it-en.informal.tsv", sep="\t", on_bad_lines="skip")
informal.columns = ["italian", "english"]

# drop rows with NaN values
formal = formal.dropna()
informal = informal.dropna()

In [11]:
informal.head(10)

Unnamed: 0,italian,english
0,"Ma è uscito dal corpo, adesso.",But it's out of the body now.
1,"Ho paura, Elliot.","- I'm scared, Elliot."
2,Bella carta da parati per il desktop scaricare...,Charming desktop wallpapers backgrounds - 1440...
3,- Felice di conoscervi.,- Pleased to meet you.
4,"E sicuramente, sei un cazzone di un ufficiale ...","And sure enough, you've been an official assho..."
5,Ma c'e' ancora un sospetto di omicidio e seque...,But we still have a murder and kidnapping susp...
6,Acquista i biglietti per il Gabba,Grab tickets for the Gabba
7,Lei vive in un villaggio carino dove ci sono u...,She lives in a cute village where there are lo...
8,Quindi il tuo elisir funziona.,So your elixir works.
9,Gokudo Video Recensioni Commenti Maggiori info...,Gokudo Videos Reviews Comments More Info


In [12]:
# concatenate the formal and informal data, add a column formality with values 1 for formal and 0 for informal and shuffle the data
formal["formality"] = 1
informal["formality"] = 0
data2 = pd.concat([formal, informal])
data2 = data2.sample(frac=1, random_state=42).reset_index(drop=True)
data2 = data2.drop(columns=["italian"])
data2 = data2.rename(columns={"english": "sentence"})
print(len(data2))
data2.head(10)

96735


Unnamed: 0,sentence,formality
0,Listed are cams in category Neu | Show all Cat...,0
1,Weather forecast Arraial d Ajuda this week Sunday,0
2,"Yeah, you know, I'm not one for sentimental crap.",0
3,"For the same reasons, and taking into account ...",1
4,Restaurants West End Village,0
5,"Amazingly enough, this program can eliminate t...",1
6,"Sorry, Mariana called, then Jude needed me, an...",0
7,What Wikipedia say about CContent of green cof...,0
8,Download Underground Tour game...,0
9,Will you...,0


## Models 

### Pre-trained

In [14]:
model_name_mdeberta = "s-nlp/mdeberta-base-formality-ranker"
tokenizer_mdeberta = AutoTokenizer.from_pretrained(model_name_mdeberta)
model_mdeberta = AutoModelForSequenceClassification.from_pretrained(model_name_mdeberta)

model_name_xlmr = "s-nlp/xlmr_formality_classifier"
tokenizer_xlmr = AutoTokenizer.from_pretrained(model_name_xlmr)
model_xlmr = AutoModelForSequenceClassification.from_pretrained(model_name_xlmr)

In [16]:
def get_formality_probability(text, tokenizer, model):
    """
    Given a text, this function tokenizes the input,
    runs inference through the model, and returns the softmax probability
    for the "formal" label (assumed label 1).
    """
    inputs = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        padding="max_length",
        max_length=128
    )
    # Run inference; ensure tensors are on the same device as model
    outputs = model(**inputs)
    # Compute softmax over logits to get probabilities
    probabilities = torch.nn.functional.softmax(outputs.logits, dim=1)
    # Return the probability for label 1 ("formal")
    return probabilities[0][1].item()

### trained and test n-gram model

In [17]:
texts_1 = data1["sentence"]
labels_1 = [1 if i > 0.5 else 0 for i in data1["formality"]]

char_vectorizer_1 = CountVectorizer(analyzer='char', ngram_range=(2,6), min_df=1, max_df=1.0)

# Create a pipeline with the vectorizer and a classifier
pipeline_1 = make_pipeline(char_vectorizer_1, LogisticRegression(max_iter=1000))

# Split data for training and testing (example split)
X_train_1, X_test_1, y_train_1, y_test_1 = train_test_split(texts_1, labels_1, test_size=0.2, random_state=42)

# Train the classifier
pipeline_1.fit(X_train_1, y_train_1)

# Evaluate the classifier
predictions_1 = pipeline_1.predict(X_test_1)
print("Accuracy:", accuracy_score(y_test_1, predictions_1))

Accuracy: 0.7521064301552106


In [None]:
prec = precision_score(y_test_1, predictions_1, average='binary')  
rec = recall_score(y_test_1, predictions_1, average='binary')
f1 = f1_score(y_test_1, predictions_1, average='binary')
conf_mat = confusion_matrix(y_test_1, predictions_1)
roc_auc = roc_auc_score(y_test_1, predictions_1)

print("Precision:", prec)
print("Recall:", rec)
print("F1-Score:", f1)
print("Confusion Matrix:\n", conf_mat)
print("ROC-AUC:", roc_auc)

Precision: 0.7704315886134068
Recall: 0.730836236933798
F1-Score: 0.7501117568171658
Confusion Matrix:
 [[857 250]
 [309 839]]
ROC-AUC: 0.7525003226222738


In [19]:
texts = data2["sentence"]
labels = data2["formality"]

char_vectorizer = CountVectorizer(analyzer='char', ngram_range=(2,6), min_df=1, max_df=1.0)

# Create a pipeline with the vectorizer and a classifier
pipeline = make_pipeline(char_vectorizer, LogisticRegression(max_iter=1000))

# Split data for training and testing (example split)
X_train, X_test, y_train, y_test = train_test_split(texts, labels, test_size=0.2, random_state=42)

# Train the classifier
pipeline.fit(X_train, y_train)

# Evaluate the classifier
predictions = pipeline.predict(X_test)
print("Accuracy:", accuracy_score(y_test, predictions))

Accuracy: 0.9331679330128703


In [20]:
prec = precision_score(y_test, predictions, average='binary')  # adjust 'binary' or 'macro' based on your data
rec = recall_score(y_test, predictions, average='binary')
f1 = f1_score(y_test, predictions, average='binary')
conf_mat = confusion_matrix(y_test, predictions)
roc_auc = roc_auc_score(y_test, predictions)

print("Precision:", prec)
print("Recall:", rec)
print("F1-Score:", f1)
print("Confusion Matrix:\n", conf_mat)
print("ROC-AUC:", roc_auc)

Precision: 0.9456971452213487
Recall: 0.9225103420441934
F1-Score: 0.9339598549466265
Confusion Matrix:
 [[8911  525]
 [ 768 9143]]
ROC-AUC: 0.9334361799241738


### LLM

In [141]:
model_name_LLM = "EleutherAI/gpt-neo-1.3B"
tokenizer_LLM = AutoTokenizer.from_pretrained(model_name_LLM)
model_LLM = AutoModelForCausalLM.from_pretrained(model_name_LLM)

In [127]:
def predict_formality(text, tokenizer, model):
    prompt = (f"Please rate the following sentence's formality on a scale from 1 (very informal) to 5 (very formal).\n\n"
              f"Sentence: \"{text}\"\nFormality rating:")
    
    # Encode the prompt
    inputs = tokenizer(prompt, return_tensors="pt")
    
    # Generate output (adjust max_new_tokens as needed)
    outputs = model.generate(**inputs, max_new_tokens=5, temperature=0.0, do_sample=False)
    
    # Decode and extract the rating
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    # Extract the rating number from the response (simple approach)
    rating = response.split("Formality rating:")[-1].strip().split()[0]
    return rating

# Example usage:
sample_text = "Dear Sir or Madam,"
rating = predict_formality(sample_text, tokenizer_LLM, model_LLM)
print("Predicted formality rating:", rating)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Predicted formality rating: 1


In [132]:
def chat(text, tokenizer, model):
    # Combine conversation history with the new input
    prompt = text
    
    # Tokenize the prompt
    inputs = tokenizer(prompt, return_tensors="pt")
    
    # Generate output with sampling and a set maximum of new tokens
    outputs = model.generate(
        **inputs, 
        max_new_tokens=50,  
        do_sample=False,
        top_k=50,  # optionally, control diversity with top_k sampling
        pad_token_id=tokenizer.eos_token_id
    )
    
    # Decode the output and extract the response (remove the prompt)
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    # Optionally, remove the original prompt from the response
    response = response.replace(prompt, "").strip()
    
    return response

In [146]:
user_input = "who are you?"
response = chat(user_input, tokenizer_LLM, model_LLM)
print(response)

I am a writer, a mother, a wife, a daughter, a sister, a friend, a lover, a friend of Jesus, a friend of the world, a friend of the poor, a friend of the world’s


## Inference

In [32]:
sample_sentences1 = data1["sentence"].tolist()[:100]
sample_sentences2 = data2["sentence"].tolist()[:100]

In [37]:
results1 = []
for i, sent in enumerate(sample_sentences1):
    prob_xlmr = get_formality_probability(sent, tokenizer_xlmr, model_xlmr)
    prob_mdberta = get_formality_probability(sent, tokenizer_mdeberta, model_mdeberta)
    results1.append({
        "sentence": sent,
        "xlmr_formality_prob": prob_xlmr,
        "mdeberta-base-formality-ranker": prob_mdberta})

df_results1 = pd.DataFrame(results1)

# invert the one with zeros
df_results1["xlmr_formality_prob"] = 1 - df_results1["xlmr_formality_prob"]
df_results1["mdeberta-base-formality-ranker"] = 1 - df_results1["mdeberta-base-formality-ranker"]

In [48]:
get_formality_probability("Congratulations on achieving this ambitious objective.", tokenizer_xlmr, model_xlmr), get_formality_probability("I am a student", tokenizer_mdeberta, model_mdeberta)

(0.0013433765852823853, 0.9915924072265625)

In [38]:
results2 = []
for i, sent in enumerate(sample_sentences2):
    prob_xlmr = get_formality_probability(sent, tokenizer_xlmr, model_xlmr)
    prob_mdberta = get_formality_probability(sent, tokenizer_mdeberta, model_mdeberta)
    results2.append({
        "sentence": sent,
        "xlmr_formality_prob": prob_xlmr,
        "mdeberta-base-formality-ranker": prob_mdberta})
df_results2 = pd.DataFrame(results2)

# invert the one with zeros
df_results2["xlmr_formality_prob"] = 1 - df_results2["xlmr_formality_prob"]
df_results2["mdeberta-base-formality-ranker"] = 1 - df_results2["mdeberta-base-formality-ranker"]

Processed 0 sentences
Processed 10 sentences
Processed 20 sentences
Processed 30 sentences
Processed 40 sentences
Processed 50 sentences
Processed 60 sentences
Processed 70 sentences
Processed 80 sentences
Processed 90 sentences


In [44]:
# analyze the results
df_results1["xlmr_formality_prob"].describe(), df_results1["mdeberta-base-formality-ranker"].describe(), df_results2["xlmr_formality_prob"].describe(), df_results2["mdeberta-base-formality-ranker"].describe()

(count    100.000000
 mean       0.781186
 std        0.385278
 min        0.002608
 25%        0.840318
 50%        0.998527
 75%        0.998639
 max        0.998666
 Name: xlmr_formality_prob, dtype: float64,
 count    100.000000
 mean       0.013183
 std        0.034896
 min        0.000891
 25%        0.002557
 50%        0.004460
 75%        0.007833
 max        0.307690
 Name: mdeberta-base-formality-ranker, dtype: float64,
 count    100.000000
 mean       0.715734
 std        0.420436
 min        0.003262
 25%        0.185834
 50%        0.997417
 75%        0.998625
 max        0.998677
 Name: xlmr_formality_prob, dtype: float64,
 count    100.000000
 mean       0.030187
 std        0.102882
 min        0.000832
 25%        0.002577
 50%        0.004249
 75%        0.011433
 max        0.738821
 Name: mdeberta-base-formality-ranker, dtype: float64)

In [39]:
# Merge the results with the human normalized scores (using the same order as sample_sentences)
df_sample = df_test.head(100).copy()
df_sample = df_sample.reset_index(drop=True)
df_sample = pd.concat([df_sample, df_results1.drop(columns="sentence")], axis=1)

print("\nComparison of Predictions:")
df_sample[["sentence", "normalized_score", "xlmr_formality_prob", "mdeberta-base-formality-ranker"]]


Comparison of Predictions:


Unnamed: 0,sentence,normalized_score,xlmr_formality_prob,mdeberta-base-formality-ranker
0,Saleh said the detainees told interrogators th...,0.666667,0.998518,0.001700
1,"i own this board, now.",0.125000,0.998631,0.005246
2,will lead you into blind alleys.,0.166667,0.002674,0.001454
3,If you have any questions or wish to speak fur...,0.866667,0.998608,0.002493
4,"However, your case may be different.",0.600000,0.998626,0.036147
...,...,...,...,...
95,There is much more that happened behind the sc...,0.791667,0.998574,0.138463
96,"I realise that, as a libertarian, I might well...",0.533333,0.996865,0.008894
97,"If you go below that, they can sue you.",0.366667,0.998665,0.007308
98,Engadget assumes no responsibility for injury ...,0.766667,0.998641,0.002122


## Evaluation

In [40]:
y = df_sample["normalized_score"]
x = df_sample["xlmr_formality_prob"]

x = [1 if i > 0.5 else 0 for i in x]
y = [1 if i > 0.5 else 0 for i in y] 

In [41]:
binary_true = y  # ground truth binary labels
binary_pred = x  # model binary predictions

# Compute metrics:
acc = accuracy_score(binary_true, binary_pred)
prec = precision_score(binary_true, binary_pred)
rec = recall_score(binary_true, binary_pred)
f1 = f1_score(binary_true, binary_pred)
cm = confusion_matrix(binary_true, binary_pred)

print("Binary Evaluation Metrics:")
print(f"Accuracy:  {acc:.3f}")
print(f"Precision: {prec:.3f}")
print(f"Recall:    {rec:.3f}")
print(f"F1 Score:  {f1:.3f}")
print("Confusion Matrix:")
print(cm)

Binary Evaluation Metrics:
Accuracy:  0.500
Precision: 0.500
Recall:    0.780
F1 Score:  0.609
Confusion Matrix:
[[11 39]
 [11 39]]


In [17]:
# Assume continuous_true and continuous_pred are the ground truth and model predictions,
# with values between 0 and 1. For example:
continuous_true = df_sample["normalized_score"].tolist()
continuous_pred = df_sample["xlmr_formality_prob"].tolist()

# Calculate error metrics:
mae = mean_absolute_error(continuous_true, continuous_pred)
mse = mean_squared_error(continuous_true, continuous_pred)
r2 = r2_score(continuous_true, continuous_pred)

# Calculate correlation coefficients:
pearson_corr, _ = pearsonr(continuous_true, continuous_pred)
spearman_corr, _ = spearmanr(continuous_true, continuous_pred)

print("\nContinuous Evaluation Metrics:")
print(f"Mean Absolute Error (MAE):  {mae:.3f}")
print(f"Mean Squared Error (MSE):   {mse:.3f}")
print(f"R² Score:                   {r2:.3f}")
print(f"Pearson Correlation:        {pearson_corr:.3f}")
print(f"Spearman Correlation:       {spearman_corr:.3f}")


Continuous Evaluation Metrics:
Mean Absolute Error (MAE):  0.368
Mean Squared Error (MSE):   0.177
R² Score:                   -2.071
Pearson Correlation:        0.600
Spearman Correlation:       0.763


## Results

Binary Evaluation Metrics:
Accuracy:  0.670
Precision: 0.605
Recall:    0.980
F1 Score:  0.748
Confusion Matrix:
[[18 32]
 [ 1 49]]

Continuous Evaluation Metrics:
Mean Absolute Error (MAE):  0.368
Mean Squared Error (MSE):   0.177
R² Score:                   -2.071
Pearson Correlation:        0.600
Spearman Correlation:       0.763