# Paraphrase Italian Dataset

In [13]:
import pandas as pd
import numpy as np

In [14]:
# Use 'skip' to skip problematic lines and continue parsing
df = pd.read_csv('PACCSS-IT.txt', sep='\t', usecols=range(7), quoting=3)
df.head()

Unnamed: 0,Sentence_1,Sentence_2,Cosine_Similarity,Confidence,Readability_1,Readability_2,(Readability_1-Readability_2)
0,Ma questo a cosa servirebbe ?,A che servono queste cose ?,0.833333,0.803743,0.994767,0.013161,0.981605
1,"Salve, avrei bisogno di una informazione piutt...",Ho bisogno di una informazione urgente .,0.83666,0.811129,0.982944,0.012627,0.970317
2,Ciao a tutti avrei bisogno di un consiglio .,Ho bisogno di un suo consiglio .,0.755929,0.811662,0.984332,0.01442,0.969912
3,Possibilmente uno che avesse bisogno dell' aiu...,Ho bisogno di un vostro aiuto .,0.801784,0.82683,0.982555,0.014647,0.967908
4,Questa sarebbe una cosa positiva.,Questa era una nuova cosa .,0.771517,0.800786,0.997256,0.030448,0.966808


In [15]:
min_confidence = df['Confidence'].min()
max_confidence = df['Confidence'].max()
min_cosine = df['Cosine_Similarity'].min()
max_cosine = df['Cosine_Similarity'].max()

# Display the results
print(f"Confidence range: {min_confidence} to {max_confidence}")
print(f"Cosine Similarity range: {min_cosine} to {max_cosine}")

Confidence range: 0.800002575466 to 0.9999999
Cosine Similarity range: 0.654653670708 to 0.949288905069


## Positive Class Creation

In [16]:
# drop the last three columns
df = df.iloc[:, :-3]
df.head()

Unnamed: 0,Sentence_1,Sentence_2,Cosine_Similarity,Confidence
0,Ma questo a cosa servirebbe ?,A che servono queste cose ?,0.833333,0.803743
1,"Salve, avrei bisogno di una informazione piutt...",Ho bisogno di una informazione urgente .,0.83666,0.811129
2,Ciao a tutti avrei bisogno di un consiglio .,Ho bisogno di un suo consiglio .,0.755929,0.811662
3,Possibilmente uno che avesse bisogno dell' aiu...,Ho bisogno di un vostro aiuto .,0.801784,0.82683
4,Questa sarebbe una cosa positiva.,Questa era una nuova cosa .,0.771517,0.800786


In [17]:
confidence_treshold = 0.90
cosine_threshold = 0.80

In [18]:
# Filter the DataFrame based on the thresholds
filtered_df = df[(df['Confidence'] >= confidence_treshold) & (df['Cosine_Similarity'] >= cosine_threshold)]

len(filtered_df)

14545

In [19]:
# add a column label with value 1
filtered_df.loc[:, 'label'] = 1

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df.loc[:, 'label'] = 1


In [20]:
from sklearn.model_selection import train_test_split

# train val test split 70/20/10
train_val_df, test_df = train_test_split(filtered_df, test_size=0.2, random_state=42)
train_df, val_df = train_test_split(train_val_df, test_size=10/80, random_state=42)

# check that sentences (not just pairs) are disjoint between train/val/test; if not, remove offending rows

def get_sentences(df):
    return set(df['Sentence_1']).union(set(df['Sentence_2']))

train_sentences = get_sentences(train_df)
val_sentences = get_sentences(val_df)
test_sentences = get_sentences(test_df)

# find overlaps
overlap_train_val = train_sentences & val_sentences
overlap_train_test = train_sentences & test_sentences
overlap_val_test = val_sentences & test_sentences

# remove rows with overlapping sentences
def remove_overlaps(df, forbidden_sentences):
    mask = ~df['Sentence_1'].isin(forbidden_sentences) & ~df['Sentence_2'].isin(forbidden_sentences)
    return df[mask]

# iteratively remove overlaps until all sets are disjoint
while True:
    train_sentences = get_sentences(train_df)
    val_sentences = get_sentences(val_df)
    test_sentences = get_sentences(test_df)

    overlap_train_val = train_sentences & val_sentences
    overlap_train_test = train_sentences & test_sentences
    overlap_val_test = val_sentences & test_sentences

    if not (overlap_train_val or overlap_train_test or overlap_val_test):
        break

    if overlap_train_val:
        val_df = remove_overlaps(val_df, overlap_train_val)
        train_df = remove_overlaps(train_df, overlap_train_val)
    if overlap_train_test:
        test_df = remove_overlaps(test_df, overlap_train_test)
        train_df = remove_overlaps(train_df, overlap_train_test)
    if overlap_val_test:
        test_df = remove_overlaps(test_df, overlap_val_test)
        val_df = remove_overlaps(val_df, overlap_val_test)

# reset indices
train_df = train_df.reset_index(drop=True)
val_df = val_df.reset_index(drop=True)
test_df = test_df.reset_index(drop=True)

# recalculate the split sizes
train_size = len(train_df)
val_size = len(val_df)
test_size = len(test_df)
print(f"Train size: {train_size}")
print(f"Validation size: {val_size}")
print(f"Test size: {test_size}")


Train size: 4128
Validation size: 293
Test size: 708


In [21]:
def generate_random_sentence_pair():
    # pick a random Sentence_1
    s1 = df.sample(n=1)['Sentence_1'].values[0]
    # pick a random Sentence_2
    s2 = df.sample(n=1)['Sentence_2'].values[0]
    return s1, s2


In [23]:
from sentence_transformers import SentenceTransformer
from numpy import dot
from numpy.linalg import norm
from sklearn.utils import shuffle

model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
threshold = 0.70

def generate_negative_samples(split_df):
    sentences_1 = split_df['Sentence_1'].unique()
    sentences_2 = split_df['Sentence_2'].unique()
    negative_df = pd.DataFrame(columns=['Sentence_1', 'Sentence_2', 'Cosine_Similarity', 'label'])
    while len(negative_df) < len(split_df):
        s1 = np.random.choice(sentences_1)
        s2 = np.random.choice(sentences_2)
        if s1 == s2:
            continue
        # Check if pair exists in positive samples
        if not (((split_df['Sentence_1'] == s1) & (split_df['Sentence_2'] == s2)).any() or
                ((split_df['Sentence_1'] == s2) & (split_df['Sentence_2'] == s1)).any()):
            # Check if pair already generated
            if not (((negative_df['Sentence_1'] == s1) & (negative_df['Sentence_2'] == s2)).any() or
                    ((negative_df['Sentence_1'] == s2) & (negative_df['Sentence_2'] == s1)).any()):
                emb_s1 = model.encode(s1)
                emb_s2 = model.encode(s2)
                cosine_similarity = model.similarity(emb_s1, emb_s2)
                if cosine_similarity < threshold:
                    row = pd.DataFrame([{'Sentence_1': s1, 'Sentence_2': s2, 'Cosine_Similarity': cosine_similarity, 'label': 0}])
                    negative_df = pd.concat([negative_df, row], ignore_index=True)
    return negative_df

train_df = pd.concat([train_df, generate_negative_samples(train_df)], ignore_index=True)
train_df = shuffle(train_df, random_state=42).reset_index(drop=True)

val_df = pd.concat([val_df, generate_negative_samples(val_df)], ignore_index=True)
val_df = shuffle(val_df, random_state=42).reset_index(drop=True)

test_df = pd.concat([test_df, generate_negative_samples(test_df)], ignore_index=True)
test_df = shuffle(test_df, random_state=42).reset_index(drop=True)

# mantain only Sentence_1, Sentence_2, label columns
train_df = train_df[['Sentence_1', 'Sentence_2', 'label']]
val_df = val_df[['Sentence_1', 'Sentence_2', 'label']]
test_df = test_df[['Sentence_1', 'Sentence_2', 'label']]

# save the dataframes to csv
train_df.to_csv('train.csv', index=False)
val_df.to_csv('val.csv', index=False)
test_df.to_csv('test.csv', index=False)