# Paraphrase Italian Dataset

In [13]:
import pandas as pd
import numpy as np

In [14]:
# Use 'skip' to skip problematic lines and continue parsing
df = pd.read_csv('PACCSS-IT.txt', sep='\t', usecols=range(7), quoting=3)
df.head()

Unnamed: 0,Sentence_1,Sentence_2,Cosine_Similarity,Confidence,Readability_1,Readability_2,(Readability_1-Readability_2)
0,Ma questo a cosa servirebbe ?,A che servono queste cose ?,0.833333,0.803743,0.994767,0.013161,0.981605
1,"Salve, avrei bisogno di una informazione piutt...",Ho bisogno di una informazione urgente .,0.83666,0.811129,0.982944,0.012627,0.970317
2,Ciao a tutti avrei bisogno di un consiglio .,Ho bisogno di un suo consiglio .,0.755929,0.811662,0.984332,0.01442,0.969912
3,Possibilmente uno che avesse bisogno dell' aiu...,Ho bisogno di un vostro aiuto .,0.801784,0.82683,0.982555,0.014647,0.967908
4,Questa sarebbe una cosa positiva.,Questa era una nuova cosa .,0.771517,0.800786,0.997256,0.030448,0.966808


In [15]:
min_confidence = df['Confidence'].min()
max_confidence = df['Confidence'].max()
min_cosine = df['Cosine_Similarity'].min()
max_cosine = df['Cosine_Similarity'].max()

# Display the results
print(f"Confidence range: {min_confidence} to {max_confidence}")
print(f"Cosine Similarity range: {min_cosine} to {max_cosine}")

Confidence range: 0.800002575466 to 0.9999999
Cosine Similarity range: 0.654653670708 to 0.949288905069


## Positive Class Creation

In [16]:
# drop the last three columns
df = df.iloc[:, :-3]
df.head()

Unnamed: 0,Sentence_1,Sentence_2,Cosine_Similarity,Confidence
0,Ma questo a cosa servirebbe ?,A che servono queste cose ?,0.833333,0.803743
1,"Salve, avrei bisogno di una informazione piutt...",Ho bisogno di una informazione urgente .,0.83666,0.811129
2,Ciao a tutti avrei bisogno di un consiglio .,Ho bisogno di un suo consiglio .,0.755929,0.811662
3,Possibilmente uno che avesse bisogno dell' aiu...,Ho bisogno di un vostro aiuto .,0.801784,0.82683
4,Questa sarebbe una cosa positiva.,Questa era una nuova cosa .,0.771517,0.800786


In [17]:
confidence_treshold = 0.90
cosine_threshold = 0.80

In [18]:
# Filter the DataFrame based on the thresholds
filtered_df = df[(df['Confidence'] >= confidence_treshold) & (df['Cosine_Similarity'] >= cosine_threshold)]

len(filtered_df)

14545

In [19]:
# add a column label with value 1
filtered_df.loc[:, 'label'] = 1

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df.loc[:, 'label'] = 1


In [None]:
from sklearn.model_selection import train_test_split

# train val test split 60/20/20
train_val_df, test_df = train_test_split(filtered_df, test_size=0.2, random_state=42)
train_df, val_df = train_test_split(train_val_df, test_size=0.25, random_state=42)

# check that sentences (not just pairs) are disjoint between train/val/test; if not, remove offending rows

def get_sentences(df):
    return set(df['Sentence_1']).union(set(df['Sentence_2']))

train_sentences = get_sentences(train_df)
val_sentences = get_sentences(val_df)
test_sentences = get_sentences(test_df)

# find overlaps
overlap_train_val = train_sentences & val_sentences
overlap_train_test = train_sentences & test_sentences
overlap_val_test = val_sentences & test_sentences

# remove rows with overlapping sentences
def remove_overlaps(df, forbidden_sentences):
    mask = ~df['Sentence_1'].isin(forbidden_sentences) & ~df['Sentence_2'].isin(forbidden_sentences)
    return df[mask]

# iteratively remove overlaps until all sets are disjoint
while True:
    train_sentences = get_sentences(train_df)
    val_sentences = get_sentences(val_df)
    test_sentences = get_sentences(test_df)

    overlap_train_val = train_sentences & val_sentences
    overlap_train_test = train_sentences & test_sentences
    overlap_val_test = val_sentences & test_sentences

    if not (overlap_train_val or overlap_train_test or overlap_val_test):
        break

    if overlap_train_val:
        val_df = remove_overlaps(val_df, overlap_train_val)
        train_df = remove_overlaps(train_df, overlap_train_val)
    if overlap_train_test:
        test_df = remove_overlaps(test_df, overlap_train_test)
        train_df = remove_overlaps(train_df, overlap_train_test)
    if overlap_val_test:
        test_df = remove_overlaps(test_df, overlap_val_test)
        val_df = remove_overlaps(val_df, overlap_val_test)

# reset indices
train_df = train_df.reset_index(drop=True)
val_df = val_df.reset_index(drop=True)
test_df = test_df.reset_index(drop=True)

# recalculate the split sizes
train_size = len(train_df)
val_size = len(val_df)
test_size = len(test_df)
print(f"Train size: {train_size}")
print(f"Validation size: {val_size}")
print(f"Test size: {test_size}")


Train size: 4128
Validation size: 293
Test size: 708


In [2]:
from sentence_transformers import SentenceTransformer
from numpy import dot
from numpy.linalg import norm
from sklearn.utils import shuffle

model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
threshold = 0.45

def generate_negative_samples(split_df):
    sentences_1 = split_df['Sentence_1'].unique()
    sentences_2 = split_df['Sentence_2'].unique()
    negative_df = pd.DataFrame(columns=['Sentence_1', 'Sentence_2', 'Cosine_Similarity', 'label'])
    # Generate negative samples to achieve a 1:2 positive:negative ratio
    num_positive = len(split_df[split_df['label'] == 1])
    num_negative_needed = num_positive * 2
    while len(negative_df) < num_negative_needed:
        s1 = np.random.choice(sentences_1)
        s2 = np.random.choice(sentences_2)
        if s1 == s2:
            continue
        # Check if pair exists in positive samples
        if not (((split_df['Sentence_1'] == s1) & (split_df['Sentence_2'] == s2)).any() or
                ((split_df['Sentence_1'] == s2) & (split_df['Sentence_2'] == s1)).any()):
            # Check if pair already generated
            if not (((negative_df['Sentence_1'] == s1) & (negative_df['Sentence_2'] == s2)).any() or
                    ((negative_df['Sentence_1'] == s2) & (negative_df['Sentence_2'] == s1)).any()):
                emb_s1 = model.encode(s1)
                emb_s2 = model.encode(s2)
                cosine_similarity = model.similarity(emb_s1, emb_s2)
                if cosine_similarity < threshold:
                    row = pd.DataFrame([{'Sentence_1': s1, 'Sentence_2': s2, 'Cosine_Similarity': cosine_similarity, 'label': 0}])
                    negative_df = pd.concat([negative_df, row], ignore_index=True)
    return negative_df

train_df = pd.concat([train_df, generate_negative_samples(train_df)], ignore_index=True)
train_df = shuffle(train_df, random_state=42).reset_index(drop=True)

val_df = pd.concat([val_df, generate_negative_samples(val_df)], ignore_index=True)
val_df = shuffle(val_df, random_state=42).reset_index(drop=True)

test_df = pd.concat([test_df, generate_negative_samples(test_df)], ignore_index=True)
test_df = shuffle(test_df, random_state=42).reset_index(drop=True)

# mantain only Sentence_1, Sentence_2, label columns
train_df = train_df[['Sentence_1', 'Sentence_2', 'label']]
val_df = val_df[['Sentence_1', 'Sentence_2', 'label']]
test_df = test_df[['Sentence_1', 'Sentence_2', 'label']]

# save the dataframes to csv
train_df.to_csv('train.csv', index=False)
val_df.to_csv('val.csv', index=False)
test_df.to_csv('test.csv', index=False)



KeyboardInterrupt: 

In [1]:
import pandas as pd

def load(path):  
    df = pd.read_csv(path)  
    # crea coppie ordinate e unordered
    pairs = set(zip(df.Sentence_1, df.Sentence_2))
    upairs = set(frozenset([s1,s2]) for s1,s2 in pairs)
    return df, pairs, upairs

train, train_pairs, train_up = load("train.csv")
val,   val_pairs,   val_up   = load("val.csv")
test,  test_pairs,  test_up  = load("test.csv")

def inter(a, b):
    return len(a & b)

print("== Pair-level overlap ==")
print(f"train ∩ test  (ordered):     {inter(train_pairs, test_pairs)}")
print(f"train ∩ val   (ordered):     {inter(train_pairs, val_pairs)}")
print(f"val   ∩ test  (ordered):     {inter(val_pairs, test_pairs)}")

print(f"train ∩ test  (unordered):   {inter(train_up, test_up)}")
print(f"train ∩ val   (unordered):   {inter(train_up, val_up)}")
print(f"val   ∩ test  (unordered):   {inter(val_up, test_up)}")

print("\n== Sentence-level overlap ==")
s_train = set(train.Sentence_1).union(train.Sentence_2)
s_val   = set(val.Sentence_1).union(val.Sentence_2)
s_test  = set(test.Sentence_1).union(test.Sentence_2)
print(f"sentences train ∩ test:       {len(s_train & s_test)}")
print(f"sentences train ∩ val:        {len(s_train & s_val)}")
print(f"sentences val   ∩ test:       {len(s_val   & s_test)}")

print("\n== Duplicati interni ==")
print(f"train dup ordered:   {len(train) - len(train_pairs)}")
print(f"val   dup ordered:   {len(val)   - len(val_pairs)}")
print(f"test  dup ordered:   {len(test)  - len(test_pairs)}")

print("\n== Label distribution ==")
for name, df in [("train", train), ("val", val), ("test", test)]:
    print(f"{name}:")
    print(df.label.value_counts(normalize=True), "\n")

== Pair-level overlap ==
train ∩ test  (ordered):     0
train ∩ val   (ordered):     0
val   ∩ test  (ordered):     0
train ∩ test  (unordered):   0
train ∩ val   (unordered):   0
val   ∩ test  (unordered):   0

== Sentence-level overlap ==
sentences train ∩ test:       0
sentences train ∩ val:        0
sentences val   ∩ test:       0

== Duplicati interni ==
train dup ordered:   0
val   dup ordered:   0
test  dup ordered:   0

== Label distribution ==
train:
label
1    0.5
0    0.5
Name: proportion, dtype: float64 

val:
label
0    0.5
1    0.5
Name: proportion, dtype: float64 

test:
label
0    0.5
1    0.5
Name: proportion, dtype: float64 



In [9]:
train = pd.read_csv("train.csv")
val = pd.read_csv("val.csv")
test = pd.read_csv("test.csv")

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
import numpy as np

# prepare text data by concatenating the two sentences
X_train_text = train['Sentence_1'] + ' ' + train['Sentence_2']
y_train       = train['label']
X_test_text  = test['Sentence_1'] + ' ' + test['Sentence_2']
y_test        = test['label']

# potenziamento TF–IDF con word n-gram 1-3, min/max df, sublinear tf
tfidf = TfidfVectorizer(
    analyzer='word',
    ngram_range=(1, 3),
    min_df=2,
    max_df=0.9,
    sublinear_tf=True,
    norm='l2'
    )

# pipeline con LogisticRegression bilanciata
pipe = Pipeline([
    ('tfidf', tfidf),
    ('lr', LogisticRegression(
        max_iter=2000,
        class_weight='balanced'
    ))
])

param_grid = {
    'tfidf__ngram_range':    [(1,1), (1,2), (1,3)],
    'tfidf__min_df':         [1, 2],
    'tfidf__max_df':         [0.85, 0.9, 1.0],
    'tfidf__sublinear_tf':   [True, False],
    'lr__C':                 np.logspace(-3, 3, 7),
    'lr__penalty':           ['l2'],
    'lr__solver':            ['liblinear', 'saga']
}

grid = GridSearchCV(
    pipe,
    param_grid,
    cv=3,
    scoring='f1',
    n_jobs=-1,
    verbose=1
)
grid.fit(X_train_text, y_train)

print('best params:', grid.best_params_)
y_pred = grid.predict(X_test_text)
print(classification_report(y_test, y_pred))



Fitting 3 folds for each of 504 candidates, totalling 1512 fits


KeyboardInterrupt: 

In [7]:
# HARD TEST MINI
ht_df = pd.read_csv('../hard_test_mini.csv')
hf_preditions = grid.predict(ht_df['Sentence_1'] + ' ' + ht_df['Sentence_2'])
print(classification_report(ht_df['label'], hf_preditions))

              precision    recall  f1-score   support

           0       0.43      0.20      0.27        15
           1       0.33      0.60      0.43        10

    accuracy                           0.36        25
   macro avg       0.38      0.40      0.35        25
weighted avg       0.39      0.36      0.34        25



In [None]:
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV

svm_param_grid = {
    'tfidf__analyzer':         ['word', 'char_wb'],
    'tfidf__ngram_range':      [(1,1), (1,2), (1,3), (2,3)],
    'tfidf__min_df':           [1, 2, 3],
    'tfidf__max_df':           [0.8, 0.9, 1.0],
    'tfidf__sublinear_tf':     [True, False],
    'svc__C':                  np.logspace(-3, 3, 7),
    'svc__max_iter':           [1000, 2000, 5000]
}

# Prepare features for SVM: concatenate the two sentences for train and val
# (X_train, y_train, X_val, y_val already defined)

svm_pipe = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('svc', LinearSVC())
])

svm_grid = GridSearchCV(svm_pipe, svm_param_grid, cv=3, scoring='f1', n_jobs=-1, verbose=1)
svm_grid.fit(X_train_text, y_train)

print("Best SVM params:", svm_grid.best_params_)
y_pred_svm = svm_grid.predict(X_test_text)
print(classification_report(y_test, y_pred_svm))

Fitting 3 folds for each of 48 candidates, totalling 144 fits
Best SVM params: {'svc__C': 10, 'svc__max_iter': 1000, 'tfidf__max_df': 0.75, 'tfidf__ngram_range': (1, 1)}
              precision    recall  f1-score   support

           0       0.87      0.68      0.76      1530
           1       0.74      0.90      0.81      1530

    accuracy                           0.79      3060
   macro avg       0.80      0.79      0.79      3060
weighted avg       0.80      0.79      0.79      3060

