# **1. Introduction**

In [None]:
# Install the necessary libraries
%pip install pandas
%pip install nltk

In [1]:
# Import the required libraries

# Standard Libraries
import ast
import os
import string

# Data manipulation and analysis
import pandas as pd

# Natural Language Processing
import nltk
from nltk.corpus import stopwords

# Machine Learning
import joblib

# Fetch every NLTK resource the preprocessing steps rely on
for nltk_package in ('wordnet', 'punkt_tab', 'averaged_perceptron_tagger_eng', 'stopwords'):
    nltk.download(nltk_package)

# NOTE: this rebinds `stopwords` from the imported corpus module to a plain set of
# English stop words; later cells test membership against this set.
stopwords = set(stopwords.words('english'))

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\joanc\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\joanc\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     C:\Users\joanc\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\joanc\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# **2. Data Loading**

In this section, we load the training and test datasets along with their corresponding gold standard similarity scores from various corpora. We begin by installing and importing the necessary libraries and packages. Next, we define a function to load and process the datasets, and we transform them into structured DataFrames for further analysis.

In [2]:
from scripts.load_dataset import load_dataset

data_dir = 'datasets'

# Both splits share the same schema
columns = ['sentence_0', 'sentence_1', 'score', 'dataset_name']

# Load Training Data
train_data = pd.DataFrame(load_dataset(data_dir, dataset_category='train'), columns=columns)

# Load Test Data
test_data = pd.DataFrame(load_dataset(data_dir, dataset_category='test'), columns=columns)

# Display the number of samples in each dataset
print(f"Number of training samples: {len(train_data)}")
print(f"Number of test samples: {len(test_data)}")

Number of training samples: 2234
Number of test samples: 3108


# **3. Data Preprocessing**

Once the data is loaded, we apply preprocessing steps to prepare the datasets for feature extraction. Specifically, we:

1. Tokenize Sentences: Use NLTK's `word_tokenize` to split sentences into individual tokens.
2. Remove Punctuation: Eliminate punctuation tokens to retain only meaningful words.
3. Replace Contractions: Replace contractions based on a custom dictionary, inspired by the TakeLab paper.
4. Lemmatize Words: Apply NLTK's `WordNetLemmatizer` with Part-of-Speech (POS) tagging to reduce words to their base forms.

Tokenize Sentences

In [67]:
# Tokenize both sentences of every pair, in the training and the test split alike
for frame in (train_data, test_data):
    for side in (0, 1):
        frame[f'tokens_{side}'] = frame[f'sentence_{side}'].apply(nltk.word_tokenize)

Remove Punctuation

In [68]:
# Keep only tokens that contain at least one alphanumeric character,
# which drops tokens made purely of punctuation.
for frame in (train_data, test_data):
    for column in ('tokens_0', 'tokens_1'):
        frame[column] = frame[column].apply(
            lambda tokens: [token for token in tokens if any(ch.isalnum() for ch in token)]
        )

Replace Contractions

In [None]:
from scripts.preprocessing import replace_contractions

# Apply the contraction replacement to both token columns of both splits
for frame in (train_data, test_data):
    for column in ('tokens_0', 'tokens_1'):
        frame[column] = frame[column].apply(replace_contractions)

Lemmatize Words

In [None]:
from scripts.preprocessing import lemmatize

for frame in (train_data, test_data):
    # Obtain Lemmatized Words
    for side in ("0", "1"):
        token_column = f"tokens_{side}"
        frame[f"lemmas_{side}"] = frame.apply(
            lambda row: lemmatize(row, token_column, True), axis=1
        )

    # Join Lemmatized Words
    for side in ("0", "1"):
        frame[f"sentence_lemmas_{side}"] = frame[f"lemmas_{side}"].apply(" ".join)

Save and Load Preprocessed Datasets

In [4]:
# Save Training and Test Data
# index=False keeps the row index out of the CSV; otherwise reloading with
# pd.read_csv adds a spurious 'Unnamed: 0' column. This also matches how the
# feature CSVs are written.
train_data.to_csv('datasets/train_preprocessed.csv', index=False)
test_data.to_csv('datasets/test_preprocessed.csv', index=False)

In [3]:
# Load Training and Test Data
# The token/lemma columns hold Python lists. CSV stores them as their string
# representation (e.g. "['a', 'b']"), so they are parsed back into real lists
# here; otherwise iterating over a "list" downstream would walk over characters.
list_columns = ['tokens_0', 'tokens_1', 'lemmas_0', 'lemmas_1']
list_converters = {column: ast.literal_eval for column in list_columns}

train_data = pd.read_csv('datasets/train_preprocessed.csv', converters=list_converters)
test_data = pd.read_csv('datasets/test_preprocessed.csv', converters=list_converters)

# **4. Feature Extraction**

# **5. Feature Computation**

In [None]:
from scripts.features import harmonic_mean, P_WN

def compute_features(data):
    """
    Computes a comprehensive set of similarity features for each pair of sentences in the dataset.

    Parameters:
        data (pd.DataFrame): DataFrame containing the 'sentence_lemmas_0' / 'sentence_lemmas_1'
            columns (lemmatized sentences as strings) and the 'lemmas_0' / 'lemmas_1'
            columns (iterables of lemma strings).

    Returns:
        pd.DataFrame: DataFrame with one column per computed similarity feature.

    Side effects:
        Adds the columns 'lemmas_with_disambiguation_0' and 'lemmas_with_disambiguation_1'
        to `data` in place.

    Notes:
        Relies on the module-level `stopwords` set and on similarity helpers
        (longest_common_substring, similarity_char_ngrams, ...) being in scope.
        NOTE(review): those helpers are not imported in the visible cells -
        presumably they come from scripts.features; confirm.
    """
    features = pd.DataFrame()

    # Similarity Features
    features['longest_common_substring'] = data.apply(lambda row: longest_common_substring(row["sentence_lemmas_0"], row["sentence_lemmas_1"]), axis=1)
    features['longest_common_subsequence'] = data.apply(lambda row: longest_common_subsequence(row["sentence_lemmas_0"], row["sentence_lemmas_1"]), axis=1)
    features['greedy_string_tiling'] = data.apply(lambda row: optimized_gst(row["sentence_lemmas_0"], row["sentence_lemmas_1"], min_match_length=1), axis=1)

    # Character n-gram Similarity Features
    features['2_gram_char'] = data.apply(lambda row: similarity_char_ngrams(row["lemmas_0"], row["lemmas_1"], 2), axis=1)
    features['3_gram_char'] = data.apply(lambda row: similarity_char_ngrams(row["lemmas_0"], row["lemmas_1"], 3), axis=1)
    features['4_gram_char'] = data.apply(lambda row: similarity_char_ngrams(row["lemmas_0"], row["lemmas_1"], 4), axis=1)

    # Word n-gram Jaccard Similarity Features
    features['1_gram_word_Jaccard'] = data.apply(lambda row: similarity_words_ngrams_jaccard(row["lemmas_0"], row["lemmas_1"], 1), axis=1)
    features['3_gram_word_Jaccard'] = data.apply(lambda row: similarity_words_ngrams_jaccard(row["lemmas_0"], row["lemmas_1"], 3), axis=1)
    features['4_gram_word_Jaccard'] = data.apply(lambda row: similarity_words_ngrams_jaccard(row["lemmas_0"], row["lemmas_1"], 4), axis=1)

    # Word n-gram Jaccard Similarity Features without Stopwords
    # NOTE(review): the columns are named "without_SW" yet pass use_stopwords=True -
    # presumably the flag enables stop-word filtering; verify against the helper.
    features['2_gram_word_Jaccard_without_SW'] = data.apply(lambda row: similarity_words_ngrams_jaccard(row["lemmas_0"], row["lemmas_1"], 2, use_stopwords=True), axis=1)
    features['4_gram_word_Jaccard_without_SW'] = data.apply(lambda row: similarity_words_ngrams_jaccard(row["lemmas_0"], row["lemmas_1"], 4, use_stopwords=True), axis=1)

    # Word n-gram Containment Similarity Features without Stopwords (a)
    features['1_gram_word_Containment_without_SW_a'] = data.apply(lambda row: similarity_words_ngrams_containment(row["lemmas_0"], row["lemmas_1"], 1, use_stopwords=True), axis=1)
    features['2_gram_word_Containment_without_SW_a'] = data.apply(lambda row: similarity_words_ngrams_containment(row["lemmas_0"], row["lemmas_1"], 2, use_stopwords=True), axis=1)

    # Word n-gram Containment Similarity Features without Stopwords (b)
    # Same measure as (a) with the two sentences swapped.
    features['1_gram_word_Containment_without_SW_b'] = data.apply(lambda row: similarity_words_ngrams_containment(row["lemmas_1"], row["lemmas_0"], 1, use_stopwords=True), axis=1)
    features['2_gram_word_Containment_without_SW_b'] = data.apply(lambda row: similarity_words_ngrams_containment(row["lemmas_1"], row["lemmas_0"], 2, use_stopwords=True), axis=1)

    # Average similarity between the two lemma sequences (semantics defined by the helper)
    features['average_similarity'] = data.apply(lambda row: average_similarity(row["lemmas_0"], row["lemmas_1"]), axis=1)

    # Lexical Substitution System Feature
    # The intermediate synset-name columns are stored on `data` itself (in place).
    data['lemmas_with_disambiguation_0'] = data.apply(lambda row: tokens_to_synsets_name(row["lemmas_0"]), axis=1)
    data['lemmas_with_disambiguation_1'] = data.apply(lambda row: tokens_to_synsets_name(row["lemmas_1"]), axis=1)
    features['lexical_substitution_system'] = data.apply(lambda row: similarity_lemmas(row['lemmas_with_disambiguation_0'], row['lemmas_with_disambiguation_1']), axis=1)

    # WordNet-Augmented Word Overlap (TakeLab)
    def _wordnet_augmented_overlap(row):
        # Filter stop words once per sentence and reuse the result in both directions
        content_0 = [w for w in row["lemmas_0"] if w not in stopwords]
        content_1 = [w for w in row["lemmas_1"] if w not in stopwords]
        return harmonic_mean(P_WN(content_0, content_1), P_WN(content_1, content_0))

    features['wordnet_augmented_overlap'] = data.apply(_wordnet_augmented_overlap, axis=1)

    # Without this return the function yields None and the callers cannot save the features
    return features

In [None]:
# Build the feature matrices: training split first, then the test split
features_train, features_test = map(compute_features, (train_data, test_data))

In [6]:
# Save Training Features to a CSV File
# index=False leaves the row index out of the file, so reloading adds no extra column
features_train.to_csv('features/features_train.csv', index=False)

In [6]:
# Save Test Features to a CSV File
# index=False leaves the row index out of the file, so reloading adds no extra column
features_test.to_csv('features/features_test.csv', index=False)

In [4]:
# Load Training and Test Features
# NOTE(review): the head() output shown further down contains 'Unnamed: 0.1' and
# 'Unnamed: 0' columns, which suggests the CSVs currently on disk were written with
# an index at some point - regenerate them or drop those columns; confirm.
features_train = pd.read_csv('features/features_train.csv')
features_test  = pd.read_csv('features/features_test.csv')

In [None]:
from scripts.features import harmonic_mean, P_WN

# Add the WordNet-augmented overlap feature to the test features: the harmonic mean
# of P_WN computed in both directions on the stop-word-filtered lemmas.
# NOTE(review): only features_test is updated here, not features_train - confirm
# this is intended.
# NOTE(review): assumes test_data['lemmas_0'/'lemmas_1'] hold lists of lemma strings;
# if test_data was reloaded from CSV they may be plain strings - verify.
features_test['wordnet_augmented_overlap'] = test_data.apply(lambda row: harmonic_mean(
                P_WN([w for w in row["lemmas_0"] if w not in stopwords], [w for w in row["lemmas_1"] if w not in stopwords]),
                P_WN([w for w in row["lemmas_1"] if w not in stopwords], [w for w in row["lemmas_0"] if w not in stopwords])
            ), axis=1
    )

In [68]:
from nltk.corpus import wordnet as wn

def P_WN(S1, S2):
    """
    WordNet-augmented coverage of S1 by S2 (P_WN metric from the TakeLab paper).

    Every word of S1 contributes 1.0 when it also occurs in S2; otherwise it
    contributes its best WordNet path similarity to any word of S2 (0.0 when S2
    is empty). The contributions are averaged over the length of S1.

    Parameters:
        S1 (list): List of tokenized words from the first sentence.
        S2 (list): List of tokenized words from the second sentence.

    Returns:
        float: The computed P_WN(S1, S2) value; 0.0 when S1 is empty.
    """
    word_count = len(S1)
    if word_count == 0:
        return 0.0

    # Set lookup makes the exact-match test cheap
    candidates = set(S2)

    def coverage(word):
        # Exact matches count fully; anything else falls back to WordNet similarity
        if word in candidates:
            return 1.0
        return max((wordnet_path_similarity(word, other) for other in S2), default=0.0)

    return sum((coverage(word) for word in S1), 0.0) / word_count

def wordnet_path_similarity(word1, word2):
    """
    Best WordNet path similarity over all synset pairs of the two words.

    Only synset pairs with the same part-of-speech (POS) are compared, and pairs
    for which WordNet returns no similarity are ignored.

    Parameters:
        word1 (str): First word.
        word2 (str): Second word.

    Returns:
        float: Maximum path similarity between word1 and word2, or 0.0 when no
        comparable synset pair yields a positive similarity.
    """
    senses_a = wn.synsets(word1)
    senses_b = wn.synsets(word2)

    # Similarities of every same-POS synset pair, in nested-loop order
    pair_scores = [
        sense_a.path_similarity(sense_b)
        for sense_a in senses_a
        for sense_b in senses_b
        if sense_a.pos() == sense_b.pos()
    ]

    # The leading 0.0 is the floor used when nothing scores higher
    return max([0.0] + [score for score in pair_scores if score is not None])

def harmonic_mean(x, y):
    """
    Harmonic mean of two numbers.

    Parameters:
        x (float): First number.
        y (float): Second number.

    Returns:
        float: 2 * x * y / (x + y) when the sum is strictly positive, otherwise 0.
    """
    total = x + y
    # Guard clause: anything that is not a strictly positive sum (including NaN) yields 0
    if not total > 0:
        return 0
    return 2 * x * y / total

In [36]:
# Preview the first rows of the training feature matrix
features_train.head()

Unnamed: 0.1,Unnamed: 0,longest_common_substring,longest_common_subsequence,greedy_string_tiling,2_gram_char,3_gram_char,4_gram_char,1_gram_word_Jaccard,3_gram_word_Jaccard,4_gram_word_Jaccard,2_gram_word_Jaccard_without_SW,4_gram_word_Jaccard_without_SW,1_gram_word_Containment_without_SW_a,2_gram_word_Containment_without_SW_a,1_gram_word_Containment_without_SW_b,2_gram_word_Containment_without_SW_b,average_similarity,lexical_substitution_system,wordnet_augmented_overlap
0,0,69,97,69,0.79731,0.72229,0.63253,0.533333,0.342857,0.323529,0.444444,0.375,0.5625,0.533333,0.75,0.727273,0.633658,0.772088,0.773645
1,1,24,49,24,0.666997,0.515727,0.46188,0.388889,0.047619,0.0,0.307692,0.181818,0.857143,0.666667,0.5,0.363636,0.780363,0.521026,0.659982
2,2,25,52,25,0.700097,0.547782,0.434828,0.333333,0.074074,0.038462,0.058824,0.0,0.555556,0.125,0.454545,0.1,0.574525,0.799408,0.619091
3,3,124,127,124,0.92647,0.852009,0.827556,0.607143,0.576923,0.56,0.555556,0.5,0.916667,0.909091,0.611111,0.588235,0.82716,0.805629,0.808532
4,4,20,57,20,0.597739,0.367355,0.282144,0.192308,0.0,0.0,0.05,0.0,0.25,0.090909,0.272727,0.1,0.547487,0.46291,0.492864


# **6. Feature Visualization**