
#Lab3 : Morphology

Authors:<br>
* Ramón Mateo Navarro
* Benet Manzanares Salor

##Installation ans imports

In [None]:
import os
import pandas as pd
import nltk

from argparse import Namespace
from google.colab import drive
from scipy.stats import pearsonr
from nltk.metrics import jaccard_distance
from nltk.stem import WordNetLemmatizer

nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('punkt')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

## Settings


In [None]:
settings = Namespace()

settings.mount_path = "/content/drive/"
drive.mount(settings.mount_path, force_remount=True)

settings.project_folder = "Benet_MAI/S1/IHLT/IHLT_Labs/Lab3" #@param {type:"string"}
settings.project_path = os.path.join(os.path.join(settings.mount_path, "MyDrive"), settings.project_folder)
settings.input_filename = "STS.input.SMTeuroparl.txt" #@param {type:"string"}
settings.input_filepath = os.path.join(settings.project_path, settings.input_filename)
settings.gs_filename = "STS.gs.SMTeuroparl.txt" #@param {type:"string"}
settings.gs_filepath = os.path.join(settings.project_path, settings.gs_filename)

Mounted at /content/drive/


## Data

In [None]:
dt = pd.read_csv(settings.input_filepath,sep='\t', header=None)
dt['gs'] = pd.read_csv(settings.gs_filepath, sep='\t', header=None)
dt.head()

Unnamed: 0,0,1,gs
0,The leaders have now been given a new chance a...,The leaders benefit aujourd' hui of a new luck...,4.5
1,Amendment No 7 proposes certain changes in the...,Amendment No 7 is proposing certain changes in...,5.0
2,Let me remind you that our allies include ferv...,I would like to remind you that among our alli...,4.25
3,The vote will take place today at 5.30 p.m.,The vote will take place at 5.30pm,4.5
4,"The fishermen are inactive, tired and disappoi...","The fishermen are inactive, tired and disappoi...",5.0


## Experiment

In [None]:
wnl = WordNetLemmatizer()

* Jaccard distance 

In [None]:
def words_jaccard_similarity(sentences):
  sent1, sent2 = sentences
  words1 = set(nltk.word_tokenize(sent1))
  words2 = set(nltk.word_tokenize(sent2))
  return 1-jaccard_distance(words1, words2)


def lemmatize(p):
    if p[1][0] in {'N', 'V', 'JJ', 'VB'}:
        return wnl.lemmatize(p[0].lower(), pos=p[1][0].lower())
    return p[0]


def get_lemmas(sentence):
  words = nltk.word_tokenize(sentence)
  tags = nltk.pos_tag(words)
  lemmas = [lemmatize(pair) for pair in tags]
  return set(lemmas), tags


def lemmas_jaccard_similarity(sentences):
  sent1, sent2 = sentences
  lemmas1, tag1 = get_lemmas(sent1)
  lemmas2, tag2 = get_lemmas(sent2)

  return 1 - jaccard_distance(lemmas1, lemmas2)
  

dt["words_jaccard"] = list(map(words_jaccard_similarity, zip(dt[0], dt[1])))
dt["lemmas_jaccard"] = list(map(lemmas_jaccard_similarity, zip(dt[0], dt[1])))

# Evaluation of the results
In this section we evaluate if, using jaccard similarity, lemmatization offers a better performance than only word tokenizing. To this end, we will show the correlation with the gold standard and the sentences where one approach overcome the other.

The following questions will be answered:
* **Which is better: words or lemmas?** <br>
* **Do you think that could perform better for any pair of texts?** <br>

## Which is better: words or lemmas?

In [None]:
gs_words_correlation = pearsonr(dt['gs'], dt['words_jaccard'])[0]
gs_lemmas_correlation = pearsonr(dt['gs'], dt['lemmas_jaccard'])[0]

print(f"[ Gold Standard <-> Words Jaccard ] correlation = {gs_words_correlation}")
print(f"[ Gold Standard <-> Lemmas Jaccard ] correlation = {gs_lemmas_correlation}")
best_approach = "Lemmatization" if abs(gs_lemmas_correlation) > abs(gs_words_correlation) else "Words"
print(f"{best_approach} offers a better correlation with the Gold Standard")

[ Gold Standard <-> Words Jaccard ] correlation = 0.4504977169318684
[ Gold Standard <-> Lemmas Jaccard ] correlation = 0.4569107458417673
Lemmatization offers a better correlation with the Gold Standard


* Get sentence pairs where one approach is better than the other and viceversa

In [None]:
words_better_than_lemmas_df = dt[abs(dt.gs - dt.words_jaccard * 5) < abs(dt.gs - dt.lemmas_jaccard * 5)]
lemmas_better_than_words_df = dt[abs(dt.gs - dt.lemmas_jaccard * 5) < abs(dt.gs - dt.words_jaccard * 5)]
  
print(f"#Sentences where the words approach is better = {len(words_better_than_lemmas_df)}")
print(f"#Sentences where the lemmatization approach is better = {len(lemmas_better_than_words_df)}")
print(f"#Sentences where both approaches are equal = {len(dt)-(len(words_better_than_lemmas_df)+len(lemmas_better_than_words_df))}")

#Sentences where the words approach is better = 15
#Sentences where the lemmatization approach is better = 125
#Sentences where both approaches are equal = 319


* **Which is better: words or lemmas?** <br>
As it has a greater correlation with the Gold Standard and there are more sentences pairs where it produces a better similarity, we conclude that the lemmatization approach is better.

## Do you think that could perform better for any pair of texts?

* Showing the sentences where the words approach is better than the lemmatization approach

In [None]:
def print_sentence_pair(row):
  print(f"[ GS = {row['gs']} | Words jaccard = {round(row['words_jaccard'], 3)} | Lemmas jaccard = {round(row['lemmas_jaccard'], 3)} ]")
  print(f"S1 = {row[0]}")
  print(f"S2 = {row[1]}")
  lemmas1, tags1 = get_lemmas(row[0])
  lemmas2, tags2 = get_lemmas(row[1])
  print(f"Lemmas S1 = {lemmas1}")
  print(f"Lemmas S2 = {lemmas2}")
  print(f"Tags S1 = {tags1}")
  print(f"Tags S2 = {tags2}\n")


for index, row in words_better_than_lemmas_df.iterrows():
  print_sentence_pair(row)

[ GS = 5.0 | Words jaccard = 0.833 | Lemmas jaccard = 0.571 ]
S1 = Amendment No 7 proposes certain changes in the references to paragraphs.
S2 = Amendment No 7 proposes changes to certain paragraphs references.
Lemmas S1 = {'paragraph', 'change', 'to', 'propose', 'in', 'the', '.', 'certain', '7', 'amendment', 'reference', 'no'}
Lemmas S2 = {'change', 'to', '.', 'certain', '7', 'amendment', 'proposes', 'paragraphs', 'reference', 'no'}
Tags S1 = [('Amendment', 'NNP'), ('No', 'NNP'), ('7', 'CD'), ('proposes', 'VBZ'), ('certain', 'JJ'), ('changes', 'NNS'), ('in', 'IN'), ('the', 'DT'), ('references', 'NNS'), ('to', 'TO'), ('paragraphs', 'VB'), ('.', '.')]
Tags S2 = [('Amendment', 'NNP'), ('No', 'NNP'), ('7', 'CD'), ('proposes', 'NNS'), ('changes', 'NNS'), ('to', 'TO'), ('certain', 'JJ'), ('paragraphs', 'JJ'), ('references', 'NNS'), ('.', '.')]

[ GS = 4.25 | Words jaccard = 0.667 | Lemmas jaccard = 0.429 ]
S1 = Maij-Weggen report (A5-0323/2000)
S2 = Relation Maij-Weggen (A5-0323/2000)
Lemma

* Examples of lemmatization-based similairty fails

In [None]:
print_sentence_pair(words_better_than_lemmas_df.iloc[0])

[ GS = 5.0 | Words jaccard = 0.833 | Lemmas jaccard = 0.571 ]
S1 = Amendment No 7 proposes certain changes in the references to paragraphs.
S2 = Amendment No 7 proposes changes to certain paragraphs references.
Lemmas S1 = {'paragraph', 'change', 'to', 'propose', 'in', 'the', '.', 'certain', '7', 'amendment', 'reference', 'no'}
Lemmas S2 = {'change', 'to', '.', 'certain', '7', 'amendment', 'proposes', 'paragraphs', 'reference', 'no'}
Tags S1 = [('Amendment', 'NNP'), ('No', 'NNP'), ('7', 'CD'), ('proposes', 'VBZ'), ('certain', 'JJ'), ('changes', 'NNS'), ('in', 'IN'), ('the', 'DT'), ('references', 'NNS'), ('to', 'TO'), ('paragraphs', 'VB'), ('.', '.')]
Tags S2 = [('Amendment', 'NNP'), ('No', 'NNP'), ('7', 'CD'), ('proposes', 'NNS'), ('changes', 'NNS'), ('to', 'TO'), ('certain', 'JJ'), ('paragraphs', 'JJ'), ('references', 'NNS'), ('.', '.')]



At the previous sentences pair, the lemmatization problem is caused by the word "paragraphs". <br>
At the first sentence the word is classified as VB(verb), so it is lemmatizated as "paragraph". <br>
On the other hand, at the second sentence it is classified as JJ(adjective), so is lemmatizated as "paragraphs". <br>
Subsequently, the jaccard distance is lower than the for the words approach, where both "paragraphs" instances are considered equals.


In [None]:
print_sentence_pair(words_better_than_lemmas_df.iloc[1])

[ GS = 4.25 | Words jaccard = 0.667 | Lemmas jaccard = 0.429 ]
S1 = Maij-Weggen report (A5-0323/2000)
S2 = Relation Maij-Weggen (A5-0323/2000)
Lemmas S1 = {'report', 'a5-0323/2000', ')', 'Maij-Weggen', '('}
Lemmas S2 = {'maij-weggen', 'a5-0323/2000', ')', 'relation', '('}
Tags S1 = [('Maij-Weggen', 'JJ'), ('report', 'NN'), ('(', '('), ('A5-0323/2000', 'NNP'), (')', ')')]
Tags S2 = [('Relation', 'NN'), ('Maij-Weggen', 'NNP'), ('(', '('), ('A5-0323/2000', 'NNP'), (')', ')')]



Equivalently to the first example, the lemmatization problem is caused by the word "Maij-Weggen". <br>
At the first sentence the word is classified as JJ(adjective), so is lemmatizated as "Maij-Weggen". <br>
On the other hand, at the second sentence it is classified as NNP(proper noun), so it is incorrectly lemmatizated as "maij-weggen" (lower case). <br>
Subsequently, the jaccard distance is lower than the for the words approach, where both "Maij-Weggen" instances are considered equals.

* **Do you think that could perform better for any pair of texts?** <br>
As has been seen, the lemmatization problem for this dataset/corpus is caused by incorrect Part-Of-Speech classifications and consequent lemmatization errors. Nevertheless, this is only a noticeable disadavantadge compared to only tokenizing words if sentence similarity is commonly related to high vocabulary sharing (many identical words). Consequently, for other datasets/corpora with equal or less vocabulary sharing (less identical words between sentence pairs), lemmatization should obtain a better performance.