#Project : Semantic Textual Similarity
**Authors:**

* Ramón Mateo Navarro
* Benet Manzanares Salor

##Installation ans imports

In [None]:
import os
import numpy as np
import pandas as pd
import re
from argparse import Namespace
from functools import partial
from itertools import chain

from google.colab import drive

import spacy
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.corpus import wordnet
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')

from sklearn import linear_model
from sklearn.neural_network import MLPRegressor
from scipy.stats import pearsonr

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


## Settings 



In [None]:
settings = Namespace()

# Paths
settings.mount_path = "/content/drive/"
drive.mount(settings.mount_path, force_remount=True)
settings.drive_path = os.path.join(settings.mount_path, "MyDrive")
settings.project_folder = "TO DO: Teacher PATH" #@param ["Benet_MAI/S1/IHLT/IHLT_Labs/Projecte", "MAI/IHLT/IHTL_Labs/Projecte", "TO DO: Teacher PATH"]
settings.project_path = os.path.join(settings.drive_path, settings.project_folder)
settings.data_path = os.path.join(settings.project_path, "Data")
settings.train_data_path = os.path.join(settings.data_path, "train")
settings.test_data_path = os.path.join(settings.data_path, "test-gold")

Mounted at /content/drive/


## Data

In [None]:
############# Functions #############
def load_dataframe(input_filepath):
  """
  Load dataframe for a given input file path
  @param input_filepath: path to the input file
  @return a Pandas' dataframe with the sentence pairs and Gold Standard as columns
  """
  current_file_path = input_filepath
  try:
    # Read inputs
    data = []
    with open(input_filepath, 'r') as f:
      lines = f.read().splitlines()
      for line in lines:
        data.append(line.split("\t"))
    df = pd.DataFrame(data, columns = [0, 1])
    
    # Read Gold Standard
    current_file_path = re.sub("input", "gs", input_filepath)
    df["gs"] = pd.read_csv(current_file_path, sep='\t', header=None)

    # Normalize Gold Standard for more intuitive comparisions
    df["gs"] = df["gs"] / 5

  except Exception as e:
    raise Exception(f"ERROR while reading {current_file_path}:\n\t{e}")

  return df


def load_dataset(data_path):
  """
  Load dataset for a given path
  @param data_path: path to the data folder
  @return a dataframe with the content of the corresponding files
  """
  dataset = None
  filenames = os.listdir(data_path)
  for filename in filenames:
    if filename.startswith("STS.input"):
      file_path = os.path.join(data_path, filename)
      try:
        df = load_dataframe(file_path)
        if dataset is None:
          dataset = df
        else:
          dataset = pd.concat([dataset, df], ignore_index=True)
      except Exception as e:
        print(e)
  
  return dataset

In [None]:
############# Read datasets #############
train_dataset = load_dataset(settings.train_data_path)
test_dataset = load_dataset(settings.test_data_path)

## Common

### Preprocessing

In [None]:
############# Auxiliar objects #############
nlp = spacy.load('en_core_web_sm')
special_characters_pattern = re.compile(r"[^ \nA-Za-z0-9À-ÖØ-öø-ÿЀ-ӿ/]+")
stop_words = stopwords.words('english')

In [None]:
############# Functions #############
def remove_special_chars(sentence):
  """
  Remove special chars
  @param sentence: string to remove special characters from
  @return a string with special characters removed
  """
  # Forming contractions  (Reference webs: https://www.thefreedictionary.com/Forming-Contractions.htm and https://www.gymglish.com/en/gymglish/english-grammar/forming-contractions-arent-cant-id-youre-etc)
  sentence = sentence.replace("n't", " not")
  sentence = sentence.replace("'m", " am")  
  sentence = sentence.replace("'re", " are")
  sentence = sentence.replace("'ll", " will")  
  sentence = sentence.replace("'ve", " have")

  # Special characters
  sentence = re.sub(special_characters_pattern, " ", sentence)  

  return sentence


def tokenize(sentence):
  """
  Tokenize sentence
  @param sentence: sentence to be tokenized
  @return a list with all the SpaCy tokens
  """
  
  return nlp.tokenizer(sentence)


def remove_stopwords(tokens):
  """
  Remove stop words in a list of tokens.
  @param tokens: SpaCy tokens to delete stop words
  @return tokens without stopwords
  """
  return list(filter(lambda x: x.text not in stop_words, tokens))

### Lemmatization and synsets

In [None]:
############# Auxiliar objects #############
wnl = WordNetLemmatizer()
PoS_to_WN_dict = {
    # NLTK to WordNet
    "NN": "n",
    "NNS": "n",
    "NNP": "n",
    "NNPS": "n",
    "VB": "v",
    "VBD": "v",
    "VBG": "v",
    "VBN": "v",
    "VBP": "v",
    "VBZ": "v",
    "RB": "r",
    "RBR": "r",
    "RBS": "r",
    "JJ": "a",
    "JJR": "a",
    "JJS": "a",
    # SpaCy to WordNet
    "ADJ": "a",
    "ADV": "r",
    "AUX": "v",
    "NOUN": "n",
    "VERB": "v",
}

In [None]:
############# Functions #############
def lemmatize(p):
  """
  @param p: a tuple of the form (word, tag)
  @return: the lemma of the word
  """
  if p[1][0] in PoS_to_WN_dict.keys():
    return wnl.lemmatize(p[0].lower(), pos=p[1][0].lower())
  return p[0]


def get_lemmas(sentence):
  """ 
  @param sentece: a sentence
  @return: a list of lemmas and tags
  """
  words = nltk.word_tokenize(sentence)
  tags = nltk.pos_tag(words)
  lemmas = [lemmatize(pair) for pair in tags]
  return lemmas, tags


def get_synsets(lemmas, tags):
  """
  Get synsets for a given list of lemmas 
  @param lemmas: list with lemmas
  @param tags: list with tags
  @return: list with synsets
  """
  synsets = []
  for i in range(len(lemmas)):
    lemma = lemmas[i]
    if(tags[i][1] in PoS_to_WN_dict.keys()):      
      synset = nltk.wsd.lesk(lemmas, lemma, PoS_to_WN_dict[tags[i][1]])
      if synset is not None:
        synsets.append(synset)

  return synsets

### Lemmas information content

In [None]:
############# Compute lemmas dictionary #############
sentences = list(train_dataset[0])
sentences += list(train_dataset[1])
lemmas_dict = {}
num_lemmas = 0
for sent in sentences:
  for token in tokenize(sent):
    lemma = token.lemma_.lower()
    lemmas_dict[lemma] = lemmas_dict.get(lemma, 0) + 1
    num_lemmas += 1

############# Compute lemmas information content dictionary #############
lemmas_info_dict = {}
maximum_lemma_info = 0
for lemma, count in lemmas_dict.items():
  prob = count / num_lemmas
  lemmas_info_dict[lemma] = -np.log(prob)
  if lemmas_info_dict[lemma] > maximum_lemma_info:
    maximum_lemma_info = lemmas_info_dict[lemma]

### Distances

In [None]:
######### Custom set longitude function #########
def custom_length(set1:set, use_info_content=False):
  """
  Computes length in set using lemmas information content if required.
  @param set1: set of elements
  @param use_info_content: If use the lemmas information content dictionary
  @return If set1 contains strings and use_info_content==True, the info content of the set. Otherwise, length of set1
  """
  res = len(set1)

  if res > 0:
    if use_info_content:
      res = 0
      for item in set1:
        if isinstance(item, str):
          res += lemmas_info_dict.get(item, maximum_lemma_info)
        else:
          res += maximum_lemma_info
  
  return res  

In [None]:
######### Custom distance functions #########
def jaccard_similarity(set1:set, set2:set, use_info_content=False):
  """
  Calculates the Jaccard similarity between two sets.
  @param set1: First set with sentence 1
  @param set2: Second set with sentence 2
  @param use_info_content: if True, information content is used to compute similarity
  @return: Jaccard similarity between the two sets
  """
  sim = 0
  len_func = partial(custom_length, use_info_content=use_info_content)

  denominator = (abs(len_func(set1.union(set2))))
  if denominator > 0:
    sim = abs(len_func(set1.intersection(set2))) / denominator

  return sim


def dice_similarity(set1:set, set2:set, use_info_content=False):
  """
  Computes Dice similarity between two sentences
  @param set1: set with first sentence
  @param set2: second sentence
  @param use_info_content: if True, information content is used to compute similarity
  @return: Dice similarity
  """
  sim = 0
  len_func = partial(custom_length, use_info_content=use_info_content)

  denominator =  abs(len_func(set1)) + abs(len_func(set2))
  if denominator > 0:
    sim = (2*abs(len_func(set1.intersection(set2)))) / denominator

  return sim


def overlap_similarity(set1:set, set2:set, use_info_content=False):
  """
  Computes Overlap similarity between two sentences
  @param set1: set with first sentence
  @param set2: set with second sentence
  @param use_info_content: if True, information content is used to compute similarity
  @return: Overlap similarity
  """
  sim = 0
  len_func = partial(custom_length, use_info_content=use_info_content)

  denominator = min(len_func(set1), len_func(set2))
  if denominator > 0:
    sim = (abs(len_func(set1.intersection(set2)))) / denominator
  return sim


def cosine_similarity(set1:set, set2:set, use_info_content=False):
  """
  Computes Cosine similarity between two sentences
  @param set1: set with first sentence
  @param set2: set with second sentence
  @param use_info_content: if True, information content is used to compute similarity
  @return: Cosine similarity
  """
  sim = 0
  len_func = partial(custom_length, use_info_content=use_info_content)

  denominator = np.sqrt(abs(len_func(set1)*abs(len_func(set2))))
  if denominator > 0:
    sim = abs(len_func(set1.intersection(set2))) / denominator  
  return sim


set_sim_functions = {'jaccard': jaccard_similarity , 'overlap': overlap_similarity,
                      'dice': dice_similarity, 'cosine': cosine_similarity}

### Auxiliar

In [None]:
# For n-grams similarity
def get_n_grams(sentence, n):
  """
  Get the n-grams of a sentence using a sliding window of size n and stride 1
  @param sentence: sentence
  @param n: size of the sliding window
  @return: list of n-grams
  """
  sentences = sentence.lower()
  n_grams_list = []
  ini = 0
  end = n
  while end < len(sentence):
    n_grams_list.append(sentence[ini:end])
    ini = end
    end += n

  return n_grams_list


def replace_synonyms(sentence1, sentence2):
  """
  @param sentence1: first sentence
  @param sentence2: second sentence
  @return two sentences with synonyms replaced
  """
  for word in sentence1:
    # obtain the synonims of the word
    synonyms = wordnet.synsets(word)
    lemmas = set(chain.from_iterable([word.lemma_names() for word in synonyms]))
    for lemma in lemmas:
      if lemma in sentence2:
        # replace lemma in sentece2
        sentence2[sentence2.index(lemma)] = word
  return sentence2


# For lemmas synonyms similarity
def match_synonyms(sentence1, sentence2):
  """
  Find synonyms in the two given sentences and replaces them  
  @param sentence1: first sentence
  @param sentence2: second sentence
  @return two sentences with synonyms matched and replaced
  """
  a  = replace_synonyms(sentence1, sentence2)
  if a == sentence2:
    b = replace_synonyms(sentence2, sentence1)
  if sentence1 != sentence2:
    sentence1 = b

  return sentence1, sentence2


# For regression methods
def get_sim_numpy_data(dataset, methods_dict):
  """
  Gets the similarity of a given dataset
  @param dataset: Dataframe for obtaining all base similarity measures 
  @return Numpy array with only base similarities results
  """
  columns_names = list(methods_dict.keys())
  return dataset[columns_names].to_numpy()

## Base similarities

In [None]:
######### Choose the set similarity function to use by default #########
default_sim_function = set_sim_functions["jaccard"]

#### Lexical

In [None]:
def nltk_words_sim(sentences):
  """
  Obtains the similarity between two sentences using nltk words technique
  @param senteces: tuple with two sentences
  @return: similarity between the two sentences using NLTK Words technique
  """
  sentences = [remove_special_chars(s) for s in sentences]
  words = [nltk.word_tokenize(s) for s in sentences]

  return default_sim_function(set(words[0]), set(words[1]))


def spacy_words_sim(sentences, use_preprocessing):
  """
  Obtains the similarity between two sentences using spacy words technique
  @param senteces: tuple with two sentences
  @param use_preprocessing: if True, preprocessing is used
  @return: similarity between the two sentences using NLTK Words technique
  """
  if use_preprocessing:
    sentences = [remove_special_chars(s) for s in sentences]
  tokens = [tokenize(s) for s in sentences]
  if use_preprocessing:
    tokens = [remove_stopwords(t) for t in tokens]
  
  words = [list(map(lambda x: x.text.lower(), t)) for t in tokens]

  return default_sim_function(set(words[0]), set(words[1]))


def word_synonyms_sim(sentences):
  """
  Computes the similarity between two sentences using word and synonyms technique
  @param senteces: tuple with two sentences
  @return: similarity between the two sentences using word and synonyms techniques
  """
  sentences = [remove_special_chars(s) for s in sentences]
  tokens = [tokenize(s) for s in sentences]
  words = [list(map(lambda x: x.text.lower(), t)) for t in tokens]

  words[0], words[1] = match_synonyms(words[0], words[1])

  return default_sim_function(set(words[0]), set(words[1]))


def nltk_lemmas_sim(sentences):
  """
  Computes the similarity between two sentences using nltk lemmas technique
  @param senteces: tuple with two sentences
  @return similarity between the two sentences using nltk lemmas technique
  """
  sent1, sent2 = sentences
  lemmas1, tags1 = get_lemmas(sent1)
  lemmas2, tags2 = get_lemmas(sent2)

  return default_sim_function(set(lemmas1), set(lemmas2), use_info_content=True)


def spacy_lemmas_sim(sentences):
  """
  Obtains the similarity between two sentences using Spacy lemmas technique
  @param senteces: tuple with two sentences
  @return: similarity between the two sentences using Spacy lemmas technique
  """
  sentences = [remove_special_chars(s) for s in sentences]
  tokens = [tokenize(s) for s in sentences]
  # Not stopwords removing
  
  lemmas = [list(map(lambda x: x.lemma_.lower(), t)) for t in tokens]

  return default_sim_function(set(lemmas[0]), set(lemmas[1]), use_info_content=True)


def lemmas_synonyms_sim(sentences):
  """
  Obtains the similarity between two sentences using lemmas and synonyms techniques
  @param senteces: tuple with two sentences
  @return: similarity between the two sentences using lemmas and synonyms techniques
  """
  sentences = [remove_special_chars(s) for s in sentences]
  tokens = [tokenize(s) for s in sentences]
  lemmas = [list(map(lambda x: x.lemma_.lower(), t)) for t in tokens]

  lemmas[0], lemmas[1] = match_synonyms(lemmas[0], lemmas[1])

  return default_sim_function(set(lemmas[0]), set(lemmas[1]))


def synset_sim(sentences):
  """
  Obtains the similarity between two sentences using synset technique
  @param senteces: tuple with two sentences
  @return similarity between the two sentences using synset technique
  """
  sent1, sent2 = sentences
  lemmas1, tags1 = get_lemmas(sent1)
  lemmas2, tags2 = get_lemmas(sent2)
  synsets1 = get_synsets(lemmas1, tags1)
  synsets2 = get_synsets(lemmas2, tags2)

  distance = 1
  if synsets1 or synsets2:
    distance = default_sim_function(set(synsets1), set(synsets2))
  
  return distance


def spacy_synset_sim(sentences):
  """
  Obtains the similarity between two sentences using Spacy synset technique
  @param senteces: tuple with two sentences
  @return similarity between the two sentences using Spacy synset technique
  """
  # No special chars removing
  docs = [nlp(s) for s in sentences]
  # Not stopwords removing
  
  synsets = []
  for sent_idx, sent_doc in enumerate(docs):
    # Get lemmas and tags
    lemmas = []
    tags = []
    for token in sent_doc:
      lemmas.append((token.lemma_.lower()))
      tags.append((token.lemma_.lower(), token.pos_))
    # Get synsets
    synsets.append(get_synsets(lemmas, tags))
  
  return default_sim_function(set(synsets[0]), set(synsets[1]))

#### Lexical + positional

In [None]:
def n_grams_sim(sentences, n):
  """
  Obtains the similarity between two sentences using n-grams technique
  @param senteces: tuple with two sentences
  @param n: n-grams size
  @return similarity between the two sentences using n-grams technique
  """
  sents_n_grams = [ [], [] ]
  for i, s in enumerate(sentences):
    sents_n_grams[i] += get_n_grams(s, n)
  
  return default_sim_function(set(sents_n_grams[0]), set(sents_n_grams[1]))


def n_lemmas_sim(sentences, n):
  """
  Obtains the similarity between two sentences using n-lemmas technique
  @param senteces: tuple with two sentences
  @param n: n-lemmas size
  @return similarity between the two sentences using n-lemmas technique
  """
  sentences = [remove_special_chars(s) for s in sentences]
  tokens = [tokenize(s) for s in sentences]
  lemmas = [list(map(lambda x: x.lemma_.lower(), t)) for t in tokens]

  n_lemmas_lists = [[], []]
  for sent_idx, sent_lemmas in enumerate(lemmas):
    ini = 0
    end = n 
    while end < len(sent_lemmas)-1:     
      n_lemmas = ""
      for i in range(ini, end):
        n_lemmas += sent_lemmas[i]+"_"
      n_lemmas_lists[sent_idx].append(n_lemmas)
      ini += 1
      end += 1
  
  return default_sim_function(set(n_lemmas_lists[0]), set(n_lemmas_lists[1]))

#### Compute base similarities

In [None]:
######### Define similarity methods with desired configurations #########
sim_methods_dict = {
    "NLTK words": nltk_words_sim,
    "Spacy words (without preprocessing)": partial(spacy_words_sim, use_preprocessing=False),
    "Word synonyms": word_synonyms_sim,
    "NLTK lemmas": nltk_lemmas_sim,
    "Spacy lemmas": spacy_lemmas_sim,
    "Lemmas synonyms": lemmas_synonyms_sim,
    "Synsets": synset_sim,
    "Ngrams2": partial(n_grams_sim, n=2),
    "Ngrams3": partial(n_grams_sim, n=3),
    "Ngrams4": partial(n_grams_sim, n=4),
    "Ngrams5": partial(n_grams_sim, n=5),
    "Ngrams6": partial(n_grams_sim, n=6),
    "Ngrams7": partial(n_grams_sim, n=7),
    "Nlemmas2": partial(n_lemmas_sim, n=2)
    }

In [None]:
######### Functions #########
def compute_all_similarities(dataset, sim_methods_dict):
  """
  Compute all similarities between pairs of sentences in the dataset
  @param dataset: dataframe caontaining sentence pairs
  @param sim_methods_dict: dictionary with similarity methods
  @action: compute all similarities between pairs of sentences in the dataset and add them to the dataset in the 'similarity' field
  """
  for method_name, method in sim_methods_dict.items():
    compute_similarities(dataset, method, method_name)


def compute_similarities(dataset, method, method_name):
  """
  Compute similarity between pairs of sentences in the dataset
  @param dataset: dataset to compute the similarity
  @param method: similarity method
  @param method_name: name of the similarity method
  @action: save the similarity matrix in a new column in dataset. 
  """
  dataset[method_name] = list(map(method, zip(dataset[0], dataset[1])))

In [None]:
######### Compute #########
compute_all_similarities(train_dataset, sim_methods_dict)
compute_all_similarities(test_dataset, sim_methods_dict)

## Regression

In [None]:
######### Define regression methods with desired configurations #########
regr_methods_dict = {"Linear": linear_model.LinearRegression,
                      "MLP": partial(MLPRegressor, hidden_layer_sizes=(200, 100, 50), max_iter=1000, random_state=42),
                    }

In [None]:
######### Functions #########
def train_regression(train_dataset, regr_model, sim_methods_dict):
  """
  Train the regression model
  @param train_dataset: dataset to train the regression model
  @param regr_model: regression model
  @param sim_methods_dict: dictionary with similarity methods
  @action train the regression model
  """
  # Get data
  train_data = get_sim_numpy_data(train_dataset, sim_methods_dict)
  train_labels = train_dataset["gs"].to_numpy()

  # Train
  regr_model.fit(train_data, train_labels)


def regression_predict(dataset, regr_model, sim_methods_dict, regression_method_name):
  """
  Predict the regression model
  @param dataset: dataset to predict the regression model
  @param regr_model: regression model
  @param sim_methods_dict: dictionary with similarity methods
  @param regression_method_name: name of the regression method
  @action predict the regression model and add it to the dataset
  """
  data = get_sim_numpy_data(dataset, sim_methods_dict)

  predictions = regr_model.predict(data)
  predictions = np.tanh(predictions)  # Apply hyperbolic tangent function
  predictions = (predictions - predictions.min()) / (predictions.max() - predictions.min()) # Min-Max normalization for more intuitive results

  dataset[regression_method_name+" regression"] = predictions

In [None]:
######### Perform regression #########
regr_models_dict = {}
for regr_method_name, method in regr_methods_dict.items():
  print(f"Performing {regr_method_name} regression...")

  regr_models_dict[regr_method_name] = regr_model = method() # Create model
  train_regression(train_dataset, regr_model, sim_methods_dict)
  
  # If Linear, show relevances and weights
  if regr_method_name == "Linear":
    print("Base similarities relevances and weights in linear regression:")
    norm_weights = abs(regr_model.coef_) / np.sum(abs(regr_model.coef_)) * len(regr_model.coef_)
    for i, sim_method_name in enumerate(sim_methods_dict.keys()):
      print(f"[{sim_method_name}] = {norm_weights[i]} ({regr_model.coef_[i]})")

  # Predict
  regression_predict(train_dataset, regr_model, sim_methods_dict, regr_method_name)
  regression_predict(test_dataset, regr_model, sim_methods_dict, regr_method_name)

Performing Linear regression...
Base similarities relevances and weights in linear regression:
[NLTK words] = 1.4959906842709358 (0.5891151401430089)
[Spacy words (without preprocessing)] = 2.9200640861527227 (-1.1499095425041275)
[Word synonyms] = 2.105794755059739 (0.8292535410031717)
[NLTK lemmas] = 1.576660510393672 (-0.6208825945939491)
[Spacy lemmas] = 2.8179972547127026 (1.1097160330525337)
[Lemmas synonyms] = 0.3108249955700553 (0.12240163842627261)
[Synsets] = 0.1059550020730842 (-0.041724655475082556)
[Ngrams2] = 0.5800577323739827 (0.2284244119241096)
[Ngrams3] = 0.470425449954287 (0.1852516581757447)
[Ngrams4] = 0.1488616205535068 (-0.05862110999510209)
[Ngrams5] = 0.011718479635620527 (0.004614690348934886)
[Ngrams6] = 0.6016542154652748 (-0.23692902047327513)
[Ngrams7] = 0.08536966113411375 (0.0336182306559579)
[Nlemmas2] = 0.7686255526503017 (-0.30268166434990507)
Performing MLP regression...


## Evaluation

In [None]:
######### Functions #########
def evaluate(dataset):
  """
  Obtain and show the correlation between similarity measures and gold standard for a given dataset and print the results.
  @param dataset: Dataframe with similarity measures and gold standard.
  @return correlations dictionary for each similarity measure
  """
  correlations = compute_correlations(dataset)
  
  for method_name, correlation in correlations.items():
    print(f"[ Gold Standard <-> {method_name} ] correlation = {round(correlation,4)}")
  
  return correlations


def compute_correlations(dataset):
  """
  Compute correlations for a given dataset
  @param dataset: Dataframe with similarity measures and gold standard.
  @return correlations dictionary for each similarity measure
  """
  correlations = {}
  methods_names = list(filter(lambda x: x not in [0, 1, "gs"], dataset.columns))
  for method_name in methods_names:
    correlations[method_name] = pearsonr(dataset["gs"], dataset[method_name])[0]
  
  return correlations

In [None]:
######### Evaluate #########
print("Train results:")
correlations = evaluate(train_dataset)
print("Test results:")
correlations = evaluate(test_dataset)

Train results:
[ Gold Standard <-> NLTK words ] correlation = 0.4396
[ Gold Standard <-> Spacy words (without preprocessing) ] correlation = 0.4167
[ Gold Standard <-> Word synonyms ] correlation = 0.5526
[ Gold Standard <-> NLTK lemmas ] correlation = 0.351
[ Gold Standard <-> Spacy lemmas ] correlation = 0.6356
[ Gold Standard <-> Lemmas synonyms ] correlation = 0.5714
[ Gold Standard <-> Synsets ] correlation = 0.3929
[ Gold Standard <-> Ngrams2 ] correlation = 0.3078
[ Gold Standard <-> Ngrams3 ] correlation = 0.1496
[ Gold Standard <-> Ngrams4 ] correlation = 0.1161
[ Gold Standard <-> Ngrams5 ] correlation = 0.0745
[ Gold Standard <-> Ngrams6 ] correlation = 0.0569
[ Gold Standard <-> Ngrams7 ] correlation = 0.0402
[ Gold Standard <-> Nlemmas2 ] correlation = 0.1519
[ Gold Standard <-> Linear regression ] correlation = 0.7756
[ Gold Standard <-> MLP regression ] correlation = 0.8495
Test results:
[ Gold Standard <-> NLTK words ] correlation = 0.4203
[ Gold Standard <-> Spacy word

## Interactive test

In [None]:
# Set sentences
sentence1 =   "cats are jolly"#@param {type:"string"}
sentence2 = "cats are pretty" #@param {type:"string"}
sentences = (sentence1, sentence2)

# Get fundamental similarities
similarities = {}
for sim_method_name, sim_method in sim_methods_dict.items():
  similarities[sim_method_name] = sim_method(sentences)
fund_sims = np.array([list(similarities.values())])

# Perform regression
for regr_model_name, regr_model in regr_models_dict.items():
  pred = regr_model.predict(fund_sims)[0]
  pred = max(min(pred, 1), 0) # Limit in range [0, 1]
  similarities[regr_model_name] = pred

# Print results
print("Similarities:")
for name, value in similarities.items():
  print(f"[{name}] = {round(value,4)}")

Similarities:
[NLTK words] = 0.5
[Spacy words (without preprocessing)] = 0.5
[Word synonyms] = 1.0
[NLTK lemmas] = 0.5
[Spacy lemmas] = 0.3209
[Lemmas synonyms] = 1.0
[Synsets] = 1.0
[Ngrams2] = 0.5
[Ngrams3] = 0.6
[Ngrams4] = 0.5
[Ngrams5] = 0.3333
[Ngrams6] = 0.3333
[Ngrams7] = 0.5
[Nlemmas2] = 0
[Linear] = 1
[MLP] = 1
