# Paraphrase Italian Dataset

In [None]:
import csv
import warnings

import numpy as np
import pandas as pd

In [None]:
# Read the tab-separated file 'PACCSS-IT.txt', keeping only its first seven columns.
# csv.QUOTE_NONE (the named form of the former magic value 3) disables quote handling,
# so quote characters inside sentences are read as ordinary text.
# Note: no rows are skipped here; malformed lines would still raise a parser error.
df = pd.read_csv('PACCSS-IT.txt', sep='\t', usecols=range(7), quoting=csv.QUOTE_NONE)
df.head()

In [None]:
# Observed value ranges of the two score columns.
confidence_scores = df['Confidence']
cosine_scores = df['Cosine_Similarity']
min_confidence, max_confidence = confidence_scores.min(), confidence_scores.max()
min_cosine, max_cosine = cosine_scores.min(), cosine_scores.max()

# Report both ranges
print(f"Confidence range: {min_confidence} to {max_confidence}")
print(f"Cosine Similarity range: {min_cosine} to {max_cosine}")

In [None]:
# Keep only the four columns the rest of the notebook uses (this drops the last three
# of the seven columns that were read). Selecting by name instead of by position makes
# the cell idempotent: re-running it no longer strips three more columns each time.
# The original column order is preserved.
kept_columns = ['Sentence_1', 'Sentence_2', 'Cosine_Similarity', 'Confidence']
df = df.loc[:, df.columns.isin(kept_columns)]
df.head()

Set minimum thresholds for cosine similarity and confidence; they are used to filter the sentence pairs.

In [None]:
# Minimum scores a pair must reach to be kept as a paraphrase.
# (The spelling `confidence_treshold` is kept because later cells reference this name.)
confidence_treshold, cosine_threshold = 0.90, 0.80

In [None]:
# Keep only the pairs that reach both minimum scores
meets_confidence = df['Confidence'] >= confidence_treshold
meets_cosine = df['Cosine_Similarity'] >= cosine_threshold
filtered_df = df[meets_confidence & meets_cosine]

len(filtered_df)

The pairs that pass both thresholds are our positive samples.

In [None]:
# Mark every filtered pair as a positive example (label = 1).
# assign() returns a new DataFrame, so nothing is written into a slice of `df`;
# the previous in-place `.loc` write on the filtered frame can raise pandas'
# SettingWithCopyWarning. The cell also stays safe to re-run.
filtered_df = filtered_df.assign(label=1)

The following code splits our dataset into train, validation, and test sets, checks whether any sentences are shared between the sets, and removes the offending rows until the sets are disjoint.

In [None]:
from sklearn.model_selection import train_test_split

# 60/20/20 split: hold out 20% for test first, then take 25% of the remaining 80%
# (i.e. 20% of the whole) as the validation set.
train_val_df, test_df = train_test_split(
    filtered_df, test_size=0.2, random_state=42
)
train_df, val_df = train_test_split(
    train_val_df, test_size=0.25, random_state=42
)

# check that sentences (not just pairs) are disjoint between train/val/test; if not, remove offending rows

def get_sentences(df):
    """Return the set of all distinct sentences found in either sentence column of `df`."""
    sentences = set(df['Sentence_1'])
    sentences.update(df['Sentence_2'])
    return sentences

# Sentences appearing in each split (in either column)
train_sentences, val_sentences, test_sentences = [
    get_sentences(split) for split in (train_df, val_df, test_df)
]

# Sentences shared by each pair of splits
overlap_train_val = train_sentences.intersection(val_sentences)
overlap_train_test = train_sentences.intersection(test_sentences)
overlap_val_test = val_sentences.intersection(test_sentences)

# remove rows with overlapping sentences
def remove_overlaps(df, forbidden_sentences):
    """Return the rows of `df` in which neither sentence belongs to `forbidden_sentences`."""
    touches_forbidden = (
        df['Sentence_1'].isin(forbidden_sentences)
        | df['Sentence_2'].isin(forbidden_sentences)
    )
    return df[~touches_forbidden]

# Repeat until no sentence is shared between any two splits. Each pass recomputes the
# three pairwise overlaps and drops, from BOTH splits involved, every row that contains
# a shared sentence.
while True:
    train_sentences = get_sentences(train_df)
    val_sentences = get_sentences(val_df)
    test_sentences = get_sentences(test_df)

    overlap_train_val = train_sentences & val_sentences
    overlap_train_test = train_sentences & test_sentences
    overlap_val_test = val_sentences & test_sentences

    # All three overlaps empty -> the splits are disjoint, we are done
    if not any((overlap_train_val, overlap_train_test, overlap_val_test)):
        break

    if overlap_train_val:
        train_df = remove_overlaps(train_df, overlap_train_val)
        val_df = remove_overlaps(val_df, overlap_train_val)
    if overlap_train_test:
        train_df = remove_overlaps(train_df, overlap_train_test)
        test_df = remove_overlaps(test_df, overlap_train_test)
    if overlap_val_test:
        val_df = remove_overlaps(val_df, overlap_val_test)
        test_df = remove_overlaps(test_df, overlap_val_test)

# Renumber the rows of each split from 0 after the removals
train_df, val_df, test_df = [
    split.reset_index(drop=True) for split in (train_df, val_df, test_df)
]

# Report how many rows are left in each split
train_size, val_size, test_size = len(train_df), len(val_df), len(test_df)
print(f"Train size: {train_size}")
print(f"Validation size: {val_size}")
print(f"Test size: {test_size}")


We generate the negative samples by randomly pairing sentences drawn from the same split and keeping only the pairs whose embedding cosine similarity falls below a fixed threshold.

In [None]:
from sentence_transformers import SentenceTransformer
from numpy import dot
from numpy.linalg import norm
from sklearn.utils import shuffle

# A candidate negative pair is accepted only if its cosine similarity is below this value.
threshold = 0.45

# Sentence encoder used to score candidate negative pairs.
# NOTE(review): this checkpoint is presumably English-centric; confirm it is adequate
# for Italian sentences, or consider a multilingual encoder.
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

def generate_negative_samples(split_df, random_state=42, max_attempts=None):
    """Generate non-paraphrase (label 0) sentence pairs for one split.

    Candidates are built by drawing one sentence at random from the split's
    unique ``Sentence_1`` values and one from its unique ``Sentence_2`` values.
    A candidate is kept only if the two sentences differ, the pair is not
    already a row of ``split_df`` (in either order), it was not drawn before,
    and the cosine similarity of the two embeddings produced by the
    module-level ``model`` is below the module-level ``threshold``.
    Generation targets two negatives per positive (label 1) row.

    Args:
        split_df: DataFrame with ``Sentence_1``, ``Sentence_2`` and ``label`` columns.
        random_state: Seed for the sampling RNG, so repeated runs draw the same
            candidates. Pass ``None`` for non-deterministic sampling.
        max_attempts: Maximum number of candidate draws. Defaults to
            ``max(10_000, 200 * negatives_needed)``. This bounds the loop when
            the split cannot supply enough valid pairs (previously it never ended).

    Returns:
        DataFrame with columns ``Sentence_1``, ``Sentence_2``,
        ``Cosine_Similarity`` (plain float) and ``label`` (always 0). It may
        hold fewer rows than targeted if ``max_attempts`` is reached, in which
        case a ``RuntimeWarning`` is emitted.
    """
    columns = ['Sentence_1', 'Sentence_2', 'Cosine_Similarity', 'label']
    pool_1 = split_df['Sentence_1'].unique().tolist()
    pool_2 = split_df['Sentence_2'].unique().tolist()

    # Generate negative samples to achieve a 1:2 positive:negative ratio
    num_positive = int((split_df['label'] == 1).sum())
    num_negative_needed = num_positive * 2
    if max_attempts is None:
        max_attempts = max(10_000, 200 * num_negative_needed)

    # Unordered pairs that must not be produced: every pair already in the split,
    # plus (as the loop runs) every candidate that was already scored.
    seen_pairs = {
        frozenset(pair)
        for pair in zip(split_df['Sentence_1'], split_df['Sentence_2'])
    }

    # Each sentence is encoded at most once, however often it is drawn.
    embeddings = {}

    def _embed(sentence):
        if sentence not in embeddings:
            embeddings[sentence] = model.encode(sentence)
        return embeddings[sentence]

    rng = np.random.default_rng(random_state)
    records = []  # collected as plain dicts; the DataFrame is built once at the end
    attempts = 0
    while len(records) < num_negative_needed:
        if attempts >= max_attempts:
            warnings.warn(
                f"generate_negative_samples stopped after {attempts} attempts with "
                f"{len(records)} of {num_negative_needed} negative pairs",
                RuntimeWarning,
            )
            break
        attempts += 1

        s1 = pool_1[rng.integers(len(pool_1))]
        s2 = pool_2[rng.integers(len(pool_2))]
        if s1 == s2:
            continue

        pair = frozenset((s1, s2))
        if pair in seen_pairs:
            continue
        # Remember the pair whether or not it is accepted: its score cannot change,
        # so scoring it again would be wasted work.
        seen_pairs.add(pair)

        # model.similarity returns a single-element tensor; store a plain float instead.
        cosine_similarity = float(model.similarity(_embed(s1), _embed(s2)))
        if cosine_similarity < threshold:
            records.append({
                'Sentence_1': s1,
                'Sentence_2': s2,
                'Cosine_Similarity': cosine_similarity,
                'label': 0,
            })

    return pd.DataFrame(records, columns=columns)

def _with_negatives(split_df):
    """Append freshly generated negative pairs to a split, then shuffle the rows (seed 42)."""
    combined = pd.concat([split_df, generate_negative_samples(split_df)], ignore_index=True)
    return shuffle(combined, random_state=42).reset_index(drop=True)


train_df = _with_negatives(train_df)
val_df = _with_negatives(val_df)
test_df = _with_negatives(test_df)

# Keep only the Sentence_1, Sentence_2 and label columns
output_columns = ['Sentence_1', 'Sentence_2', 'label']
train_df, val_df, test_df = (
    train_df[output_columns],
    val_df[output_columns],
    test_df[output_columns],
)

# Write each split to its own CSV file, without the index column
train_df.to_csv('train.csv', index=False)
val_df.to_csv('val.csv', index=False)
test_df.to_csv('test.csv', index=False)

Final check for any remaining overlaps between the splits.

In [None]:
import pandas as pd

def load(path):
    """Read a split CSV and return (DataFrame, ordered pair set, unordered pair set)."""
    frame = pd.read_csv(path)
    # Build the ordered pairs, then their order-insensitive counterparts
    ordered_pairs = set(zip(frame['Sentence_1'], frame['Sentence_2']))
    unordered_pairs = {frozenset(pair) for pair in ordered_pairs}
    return frame, ordered_pairs, unordered_pairs

# Reload each saved split together with its ordered and unordered pair sets
train, train_pairs, train_up = load("train.csv")
val, val_pairs, val_up = load("val.csv")
test, test_pairs, test_up = load("test.csv")

def inter(a, b):
    """Return how many elements the sets `a` and `b` have in common."""
    common = a & b
    return len(common)

# Count pairs shared between splits, first respecting sentence order, then ignoring it
ordered_train_test = inter(train_pairs, test_pairs)
ordered_train_val = inter(train_pairs, val_pairs)
ordered_val_test = inter(val_pairs, test_pairs)
unordered_train_test = inter(train_up, test_up)
unordered_train_val = inter(train_up, val_up)
unordered_val_test = inter(val_up, test_up)

print("== Pair-level overlap ==")
print(f"train ∩ test  (ordered):     {ordered_train_test}")
print(f"train ∩ val   (ordered):     {ordered_train_val}")
print(f"val   ∩ test  (ordered):     {ordered_val_test}")

print(f"train ∩ test  (unordered):   {unordered_train_test}")
print(f"train ∩ val   (unordered):   {unordered_train_val}")
print(f"val   ∩ test  (unordered):   {unordered_val_test}")

# Count individual sentences (from either column) that appear in more than one split
print("\n== Sentence-level overlap ==")
s_train = set(train.Sentence_1) | set(train.Sentence_2)
s_val = set(val.Sentence_1) | set(val.Sentence_2)
s_test = set(test.Sentence_1) | set(test.Sentence_2)
shared_train_test = s_train & s_test
shared_train_val = s_train & s_val
shared_val_test = s_val & s_test
print(f"sentences train ∩ test:       {len(shared_train_test)}")
print(f"sentences train ∩ val:        {len(shared_train_val)}")
print(f"sentences val   ∩ test:       {len(shared_val_test)}")

# Rows beyond the number of distinct ordered pairs are duplicates inside a split
train_duplicates = len(train) - len(train_pairs)
val_duplicates = len(val) - len(val_pairs)
test_duplicates = len(test) - len(test_pairs)

print("\n== Duplicati interni ==")
print(f"train dup ordered:   {train_duplicates}")
print(f"val   dup ordered:   {val_duplicates}")
print(f"test  dup ordered:   {test_duplicates}")

# Show the share of each label in every split.
# The loop variable is deliberately NOT called `df`: that name holds the source dataset
# loaded at the top of the notebook, and rebinding it here would silently break any
# earlier cell that is re-run afterwards.
print("\n== Label distribution ==")
for name, split_frame in [("train", train), ("val", val), ("test", test)]:
    print(f"{name}:")
    print(split_frame.label.value_counts(normalize=True), "\n")

The following section trains a logistic regression classifier on TF–IDF features. We use it as a simple baseline to test the hypothesis that our dataset is too easy to warrant a model like UmBERTo.

In [2]:
import pandas as pd

# Reload the three saved splits from disk
train, val, test = (
    pd.read_csv(csv_path) for csv_path in ("train.csv", "val.csv", "test.csv")
)

In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.pipeline import Pipeline
from sklearn.pipeline import FeatureUnion
from sklearn.model_selection import GridSearchCV
import numpy as np

def _join_pair(frame):
    """Concatenate the two sentences of every row, separated by a single space."""
    return frame['Sentence_1'] + ' ' + frame['Sentence_2']


# Model inputs are the joined sentence pairs; targets are the labels
X_train_text, y_train = _join_pair(train), train['label']
X_test_text, y_test = _join_pair(test), test['label']

# Strengthened TF-IDF features: word n-grams combined with character n-grams
shared_tfidf_options = {'max_df': 0.9, 'sublinear_tf': True}
word_tfidf = TfidfVectorizer(
    analyzer='word', ngram_range=(1, 3), min_df=2, **shared_tfidf_options
)
char_tfidf = TfidfVectorizer(
    analyzer='char_wb', ngram_range=(2, 5), min_df=1, **shared_tfidf_options
)
feats = FeatureUnion(transformer_list=[('word', word_tfidf), ('char', char_tfidf)])

# Pipeline: the combined TF-IDF features feed a logistic regression classifier
classifier = LogisticRegression(class_weight='balanced', max_iter=2000)
pipe = Pipeline(steps=[('feats', feats), ('lr', classifier)])

# Hyper-parameters searched for the word-level TF-IDF vectorizer ...
word_tfidf_grid = {
    'feats__word__ngram_range': [(1, 2), (1, 3)],
    'feats__word__min_df': [1, 2],
    'feats__word__max_df': [0.9],
    'feats__word__sublinear_tf': [True],
}
# ... and for the logistic regression step
logreg_grid = {
    'lr__C': [0.1, 1, 10],
    'lr__penalty': ['l2'],
    'lr__solver': ['liblinear'],
}
param_grid = {**word_tfidf_grid, **logreg_grid}

# Exhaustive search over param_grid, scored by F1 with 3-fold cross-validation on the
# training data (the separate validation split is not used in this cell).
grid = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    scoring='f1',
    cv=3,
    n_jobs=-1,
    verbose=1,
)

grid.fit(X_train_text, y_train)

# Show the selected hyper-parameters, then evaluate the refit best model on the test split
best_params = grid.best_params_
print('best params:', best_params)
y_pred = grid.predict(X_test_text)
report = classification_report(y_test, y_pred)
print(report)

Fitting 3 folds for each of 12 candidates, totalling 36 fits
best params: {'feats__word__max_df': 0.9, 'feats__word__min_df': 2, 'feats__word__ngram_range': (1, 2), 'feats__word__sublinear_tf': True, 'lr__C': 10, 'lr__penalty': 'l2', 'lr__solver': 'liblinear'}
              precision    recall  f1-score   support

           0       0.93      0.88      0.90      1530
           1       0.89      0.93      0.91      1530

    accuracy                           0.91      3060
   macro avg       0.91      0.91      0.91      3060
weighted avg       0.91      0.91      0.91      3060

