## Students:
1.   João Valério
2.   Eirik Grytøyr

In [1]:
# Mount Google Drive, then load the tab-separated sentence pairs of
# STS.input.SMTeuroparl.txt into a DataFrame. The file has no header row,
# so the two sentence columns are labelled 0 and 1.
from google.colab import drive
import pandas as pd

drive.mount('/content/drive')
dt = pd.read_csv(
  '/content/drive/MyDrive/data/ihlt/test-gold/STS.input.SMTeuroparl.txt',
  sep='\t',
  header=None,
)

Mounted at /content/drive


In [2]:
# Read the gold-standard scores of STS.gs.SMTeuroparl.txt (one score per line,
# no header) and attach them to the sentence pairs as the column 'gs'
goldStandard = pd.read_csv(
  '/content/drive/MyDrive/data/ihlt/test-gold/STS.gs.SMTeuroparl.txt',
  sep='\t',
  header=None,
)
dt['gs'] = goldStandard

In [3]:
import nltk
import re

# Download the stop-word corpus, then keep the English entries in a set
# so that membership tests are fast
nltk.download('stopwords')
englishStopWords = nltk.corpus.stopwords.words('english')
stopWordSet = set(englishStopWords)

# Matches a single blank or ASCII punctuation character. Written as a raw
# string with '-', '[' and ']' escaped so that it matches exactly the same
# characters as before, without invalid-escape or nested-set warnings.
# Compiled once instead of once per token.
_PUNCTUATION_RE = re.compile(r'''[ !"#$%&'()*+,\-./:;<=>?@\[\]^_`{|}~]''')

def cleaner(sentenceList):
  """Lower-case the tokens and drop stop words and punctuation.

  A token is discarded when it contains at least one punctuation character
  (so '5.30' or 'p.m.' are removed as a whole, not only pure punctuation
  tokens) or when its lower-cased form is in stopWordSet.

  Args:
    sentenceList: iterable of string tokens.

  Returns:
    A new list with the surviving tokens, lower-cased, in their original order.
  """
  loweredTokens = (word.lower() for word in sentenceList)
  return [
    word for word in loweredTokens
    if _PUNCTUATION_RE.search(word) is None and word not in stopWordSet
  ]

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [4]:
# Fetch the NLTK resources needed for WordNet, then build the lemmatizer
for resource in ('omw-1.4', 'wordnet'):
  nltk.download(resource)
wnl = nltk.stem.WordNetLemmatizer()

# Mapping the tags between Treebank and WordNet.
# The value is the WordNet part of speech handed to the lemmatizer
# ('n', 'v', 'a' or 'r'); "none" marks tags whose words are left untouched.
tag_map = {
  'CC':"none", # coordin. conjunction (and, but, or)
  'CD':"n", # cardinal number (one, two)
  'DT':"none", # determiner (a, the)
  'EX':"r", # existential 'there' (there)
  'FW':"none", # foreign word (mea culpa)
  'IN':"r", # preposition/sub-conj (of, in, by)
  'JJ':"a", # adjective (yellow)
  'JJR':"a", # adj., comparative (bigger)
  'JJS':"a", # adj., superlative (wildest)
  'LS':"none", # list item marker (1, 2, One)
  'MD':"none", # modal (can, should)
  'NN':"n", # noun, sing. or mass (llama)
  'NNS':"n", # noun, plural (llamas)
  'NNP':"n", # proper noun, sing. (IBM)
  'NNPS':"n", # proper noun, plural (Carolinas)
  'PDT':"a", # predeterminer (all, both)
  'POS':"none", # possessive ending ('s)
  'PRP':"none", # personal pronoun (I, you, he)
  'PRP$':"none", # possessive pronoun (your, one's)
  'RB':"r", # adverb (quickly, never)
  'RBR':"r", # adverb, comparative (faster)
  'RBS':"r", # adverb, superlative (fastest)
  'RP':"a", # particle (up, off)
  'SYM':"none", # symbol (+, %, &)
  'TO':"none", # "to" (to)
  'UH':"none", # interjection (ah, oops)
  'VB':"v", # verb base form (eat)
  'VBD':"v", # verb past tense (ate)
  'VBG':"v", # verb gerund (eating)
  'VBN':"v", # verb past participle (eaten)
  'VBP':"v", # verb non-3sg pres (eat)
  'VBZ':"v", # verb 3sg pres (eats)
  'WDT':"none", # wh-determiner (which, that)
  'WP':"none", # wh-pronoun (what, who)
  'WP$':"none", # possessive (wh- whose)
  'WRB':"none", # wh-adverb (how, where)
}

# Lemmatizing the words according to the tag_map
def lemmatize(p):
  """Return the lemma of a (word, Treebank tag) pair.

  Args:
    p: pair whose first item is the word and whose second item is its
      Penn Treebank tag, as produced by nltk.pos_tag.

  Returns:
    The WordNet lemma of the word when the tag maps to a WordNet part of
    speech; otherwise the word itself, unchanged.
  """
  word, treebank_tag = p[0], p[1]
  # Tags missing from tag_map (e.g. the punctuation tags '.', ',' or '$')
  # are treated like "none" instead of raising KeyError.
  wn_tag = tag_map.get(treebank_tag, "none")
  if wn_tag == "none":
    return word
  return wnl.lemmatize(word, pos=wn_tag)

[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data] Downloading package wordnet to /root/nltk_data...


In [5]:
from nltk.metrics import jaccard_distance
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

# Lemmatized words of every sentence: one inner list per row of dt
text1 = []
text2 = []

# Jaccard distance of every sentence pair, in row order
jaccardDistances = []

limit = len(dt)

for sentence1, sentence2 in zip(dt[0], dt[1]):

  # Tokenization of the 2 texts
  tokensText1 = cleaner(nltk.word_tokenize(sentence1))
  tokensText2 = cleaner(nltk.word_tokenize(sentence2))

  # List of lemmatized words according to the tags associated
  lemmas1 = [lemmatize(pair) for pair in nltk.pos_tag(tokensText1)]
  lemmas2 = [lemmatize(pair) for pair in nltk.pos_tag(tokensText2)]
  text1.append(lemmas1)
  text2.append(lemmas2)

  lemmaSet1, lemmaSet2 = set(lemmas1), set(lemmas2)
  if lemmaSet1 or lemmaSet2:
    jaccardDistances.append(jaccard_distance(lemmaSet1, lemmaSet2))
  else:
    # Both sentences lost all their tokens in the cleaning step: the union is
    # empty and jaccard_distance would divide by zero. Two identical (empty)
    # sets are given the distance 0.0.
    jaccardDistances.append(0.0)

# Store all distances at once: this yields a numeric (float) column instead of
# an object column filled cell by cell
dt['jaccard'] = jaccardDistances

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


In [6]:
# Additional code to compare gs and jaccard in the same scale.
# Disabled diagnostic: the code is wrapped in a bare string literal, so it is
# not executed. When enabled it needs dt, limit, text1 and text2 from the
# previous cell. It converts gs to abs(gs / 5 - 1) (presumably gs is on a 0-5
# scale - confirm against the data set) and prints every row whose Jaccard
# value exceeds that converted score by more than minDiff.
'''
for id in range(limit):

  # Difference between jaccard and gs on the same scale
  diff = float(dt.loc[id,'jaccard']) - abs((float(dt.loc[id,'gs']) / 5 - 1))
  
  # The minDiff variable defines the minimum difference we are looking for
  minDiff = 0.8
  if (diff > minDiff):
    print('id:', id)
    print('jaccard:', float(dt.loc[id,'jaccard']))
    print('gs:', dt.loc[id,'gs'])
    print('Difference in the same scale:', diff)
    print('1. Initial phrase:', dt.loc[id,0])
    print('1. Tokenized phrase:', text1[id])
    print('2. Initial phrase:', dt.loc[id,1])
    print('2. Tokenized phrase:', text2[id], '\n\n')
'''

# Additional code to compare jaccard measurement with or without lemmatizer.
# Disabled diagnostic: the code is wrapped in a bare string literal, so it is
# not executed. Because the literal is the last expression of the cell, its
# text is echoed as the cell output.
# When enabled it recomputes the Jaccard distance on the cleaned but
# non-lemmatized tokens, stores it in the column 'jaccard_No_Lemmatizer' and
# prints the rows where the lemmatized distance is larger.
# NOTE(review): enabling it rebinds text1, text2 and limit, so text1/text2
# would then hold plain tokens, not lemmas. The lines labelled
# 'Lemmatized phrase' print the nltk.pos_tag output of those tokens, not lemmas.
'''
# Lists to save all the lemmatized words
text1 = []
text2 = []

# Adding an empty column to the DataFrame
dt['jaccard_No_Lemmatizer'] = ''

# Counter of times where lemmatizer gives worse results
counter = 0

limit = len(dt[0][:])

for id in range(limit):

  # Tokenization of the 2 texts
  tokensText1 = cleaner(nltk.word_tokenize(dt.loc[id,0]))
  tokensText2 = cleaner(nltk.word_tokenize(dt.loc[id,1]))

  # List of lematized words according to the tags associated
  text1.append(tokensText1)
  text2.append(tokensText2)

  # Updating the DataFrame with the similarities according to the method jaccard 
  dt.loc[id,'jaccard_No_Lemmatizer'] = jaccard_distance(set(text1[id]), set(text2[id]))

  if (dt.loc[id,'jaccard'] > dt.loc[id,'jaccard_No_Lemmatizer']):
    print('id:', id)
    print('jaccard with lemmatizer:', float(dt.loc[id,'jaccard']))
    print('jaccard without lemmatizer:', float(dt.loc[id,'jaccard_No_Lemmatizer']))
    print('gs:', float(dt.loc[id,'gs']))
    print('Difference:', abs(float(dt.loc[id,'jaccard']) - float(dt.loc[id,'jaccard_No_Lemmatizer'])))
    print('1. Initial phrase:', dt.loc[id,0])
    print('1. Tokenized phrase:', text1[id])
    print('1. Lemmatized phrase:', nltk.pos_tag(text1[id]))
    print('2. Initial phrase:', dt.loc[id,1])
    print('2. Tokenized phrase:', text2[id])
    print('2. Lemmatized phrase:', nltk.pos_tag(text2[id]), '\n\n')
    counter = counter + 1
print('The lemmatizer gives worse results', counter, 'times -', int((counter / limit) * 100), '% of the data.')
'''

"\n# Lists to save all the lemmatized words\ntext1 = []\ntext2 = []\n\n# Adding an empty column to the DataFrame\ndt['jaccard_No_Lemmatizer'] = ''\n\n# Counter of times where lemmatizer gives worse results\ncounter = 0\n\nlimit = len(dt[0][:])\n\nfor id in range(limit):\n\n  # Tokenization of the 2 texts\n  tokensText1 = cleaner(nltk.word_tokenize(dt.loc[id,0]))\n  tokensText2 = cleaner(nltk.word_tokenize(dt.loc[id,1]))\n\n  # List of lematized words according to the tags associated\n  text1.append(tokensText1)\n  text2.append(tokensText2)\n\n  # Updating the DataFrame with the similarities according to the method jaccard \n  dt.loc[id,'jaccard_No_Lemmatizer'] = jaccard_distance(set(text1[id]), set(text2[id]))\n\n  if (dt.loc[id,'jaccard'] > dt.loc[id,'jaccard_No_Lemmatizer']):\n    print('id:', id)\n    print('jaccard with lemmatizer:', float(dt.loc[id,'jaccard']))\n    print('jaccard without lemmatizer:', float(dt.loc[id,'jaccard_No_Lemmatizer']))\n    print('gs:', float(dt.loc[id,'g

In [7]:
# Render the DataFrame: both sentences plus the 'gs' and 'jaccard' columns
display(dt)

Unnamed: 0,0,1,gs,jaccard
0,The leaders have now been given a new chance a...,The leaders benefit aujourd' hui of a new luck...,4.500,0.692308
1,Amendment No 7 proposes certain changes in the...,Amendment No 7 is proposing certain changes in...,5.000,0.0
2,Let me remind you that our allies include ferv...,I would like to remind you that among our alli...,4.250,0.727273
3,The vote will take place today at 5.30 p.m.,The vote will take place at 5.30pm,4.500,0.25
4,"The fishermen are inactive, tired and disappoi...","The fishermen are inactive, tired and disappoi...",5.000,0.0
...,...,...,...,...
454,It is our job to continue to support Latvia wi...,It is of our duty of continue to support the c...,5.000,0.636364
455,The vote will take place today at 5.30 p.m.,Vote will take place at 17 h 30.,4.750,0.571429
456,Neither was there a qualified majority within ...,There was no qualified majority in this Parlia...,5.000,0.636364
457,Let me remind you that our allies include ferv...,"I hold you recall that our allies, there are e...",4.000,0.8


In [8]:
from scipy.stats import pearsonr

# Get the Pearson correlation and the p-value between gs and jaccard.
# Both columns are cast to float explicitly so that the call does not depend
# on the dtype they happen to have (e.g. an object column filled cell by cell).
corr, p = pearsonr(dt['gs'].astype(float), dt['jaccard'].astype(float))
print("Correlation coefficient:", corr)
print("p-value:", p)

Correlation coefficient: -0.49079034875530486
p-value: 3.384400190747036e-29


In this updated version of the code, which uses the Penn Treebank tags and WordNet's lemmatizer, the Pearson correlation coefficient between the gold standard and the Jaccard measure is -0.49, with a p-value of 3.38e-29. The coefficient is negative because the notebook computes the Jaccard *distance* (0 for identical token sets), whereas the gold standard is a similarity score, so the two move in opposite directions; its magnitude indicates a moderate linear relationship. Once again, it's important to note that even though the p-value is very small, which would lead to rejecting the null hypothesis that the variables are uncorrelated, we consider the amount of data insufficient to draw such a conclusion firmly.

Compared with the value of -0.48 obtained by the implementation without the lemmatizer, the magnitude of the correlation improved by 0.01, which reflects only a small improvement of the model.

WordNet's lemmatizer produces this effect by transforming the words into their base form (lemmas), according to the tag assigned by the Penn Treebank tagger. Thus, the similarity method compares phrases made of lemmas instead of words, which is a more reliable criterion, since different words may share the same lemma and, consequently, an equivalent meaning. Therefore, using lemmas rather than words gives a better basis for the similarity measure.

It's essential to point out that, since WordNet's lemmatizer and the Penn Treebank tagger use different tag sets, a dictionary (tag_map) was created to establish the proper correspondences. In particular, all the types of adjectives provided by the Penn Treebank tagger were mapped to adjectives (the tag 'a' in WordNet's lemmatizer), since selecting 'a' or 's' made no difference. Based on that and a few more tests, it's reasonable to conclude that even though WordNet's lemmatizer distinguishes between adjectives (tag 'a') and adjective satellites (tag 's'), it lemmatizes the word identically in both cases.

To finish, even though tagging the words before lemmatizing them produces a general improvement (0.01 in the magnitude of the correlation), in 5% of the data (24 samples) the measured similarity decreased. In all of these cases, the gold standard score is at least 4.50, indicating a very high similarity between the sentences. However, in these examples the order of the words in the second phrase is usually different, which leads the tagger to assign a different tag to the same word and, in turn, to non-matching lemmas. As Jaccard compares strings literally, it gives a better score to identical words than to dissimilar lemmas.