In [None]:
import pandas as pd
import io
import numpy as np
import os
from zipfile import ZipFile
import warnings

!pip install fuzzywuzzy
import fuzzywuzzy

from fuzzywuzzy import fuzz
import re

# Install a global "ignore" filter: no warnings are shown from here on.
warnings.filterwarnings("ignore")
import spacy
# Load the `en_core_web_sm` spaCy model. NOTE(review): `nlp` is not referenced
# by any other cell visible in this notebook — confirm it is still needed.
nlp = spacy.load('en_core_web_sm')

In [2]:
import csv

In [None]:
# Fetch the cl-meaningincontext archive; the next cell expects to find it at
# /content/cl-meaningincontext.tgz.
!wget http://www.dianamccarthy.co.uk/downloads/WordMeaningAnno2012/cl-meaningincontext.tgz

In [None]:
# Unpack the archive into /content/; later cells read /content/Data/ and
# /content/Markup/ from it.
!tar -xzvf "/content/cl-meaningincontext.tgz" -C "/content/"


In [5]:
from html.parser import HTMLParser

In [6]:
class MyHTMLParser(HTMLParser):
    """HTMLParser subclass that gathers context data for every instance.

    Results accumulate in ``lemma2id2data[lemma][instance_id]`` under the keys
    ``'sentence'``, ``'preceding'``, ``'following'`` and ``'target_id'``.
    ``lemma`` and ``instance_id`` are the value of the first attribute of the
    enclosing ``<lexelt>`` / ``<instance>`` tag.

    NOTE(review): the handlers presume markup nested as
    ``<lexelt><instance><wcontext>..<context>..<head>..</head>..</context>..</wcontext>``
    -- confirm against the data file.
    """

    # Values of ``self.endtag`` meaning that nothing belonging to the current
    # instance has been closed yet (None = no end tag seen at all).
    _OPENING_ENDTAGS = ('instance', 'lexelt', None)

    def __init__(self):
        # Imported locally so the class works even when the cell that imports
        # defaultdict at module level has not been executed yet.
        from collections import defaultdict

        HTMLParser.__init__(self)
        # lemma -> instance id -> field name -> value (missing fields read as None)
        self.lemma2id2data = defaultdict(lambda: defaultdict(lambda: defaultdict(lambda: None)))
        self.lemma = None
        self.id_ = None
        self.tag = None        # most recently opened tag
        self.endtag = None     # most recently closed tag
        self.sentence = None
        self.preceding = None
        self.following = None
        self.target_id = None

    @staticmethod
    def _squash(text):
        """Return `text` without newlines, tabs and four-space runs."""
        return text.replace('\n','').replace('\t','').replace('    ','')

    def handle_starttag(self, tag, attrs):
        """Track the currently open tag and start per-element state."""
        if tag == 'lexelt':
            self.lemma = attrs[0][1]
        if tag == 'instance':
            self.id_ = attrs[0][1]
            # Start every instance with no surrounding text. Without this an
            # instance lacking preceding/following text would silently reuse
            # the text captured for the previous instance.
            self.preceding = None
            self.following = None
        if tag == 'context':
            self.sentence = ''
        self.tag = tag

    def handle_endtag(self, endtag):
        """Store the collected fields when an ``<instance>`` closes."""
        if endtag == 'instance':
            # Missing surrounding text is stored as a single space.
            preceding = self.preceding if self.preceding is not None else ' '
            following = self.following if self.following is not None else ' '
            record = self.lemma2id2data[self.lemma][self.id_]
            record['sentence'] = self._squash(self.sentence)
            record['preceding'] = self._squash(preceding)
            record['following'] = self._squash(following)
            record['target_id'] = self.target_id
        self.endtag = endtag

    def handle_data(self, data):
        """Route character data to a field based on the last opened/closed tags."""
        opening = self.endtag in self._OPENING_ENDTAGS

        # Sentence text: either right after <context> opened, or right after
        # </head> closed.
        if (self.tag == 'context' and opening) or (self.tag == 'head' and self.endtag == 'head'):
            self.sentence += data
        # Text seen after </context> closed is kept as the following text.
        if self.tag == 'head' and self.endtag == 'context':
            self.following = data
        # Text seen while <head> is the last opened tag and nothing of the
        # instance has closed: append it and record the index of the last
        # whitespace-separated token of the sentence so far as the target.
        if self.tag == 'head' and opening:
            self.sentence += data
            self.sentence = self._squash(self.sentence)
            self.target_id = len(self.sentence.split()) - 1
        # Text directly after <wcontext> opened is kept as the preceding text.
        if self.tag == 'wcontext' and opening:
            self.preceding = data

In [7]:
from collections import defaultdict
import xml.etree.ElementTree as ET
# Slurp the corpus file, then parse it once the file handle has been released.
with open('/content/Data/lexsub_wcdata.xml', encoding='iso-8859-1') as xmlfile:
    data = xmlfile.read()

parser = MyHTMLParser()
parser.feed(data)
# lemma -> instance id -> collected fields
lemma2id2data = parser.lemma2id2data
    

In [8]:
# Create the root for the per-lemma folders. The export loop writes to the
# relative path 'wssim/', so both only coincide when the working directory is /content.
os.makedirs( '/content/wssim/', exist_ok=True )

In [9]:
# lemma -> use identifier -> one row of the future uses.csv
lemma2id2context = defaultdict(lambda: defaultdict(lambda: None))
for lexelt_name, id2data in lemma2id2data.items():
    # Keys look like "<lemma>.<pos>".
    lemma, pos = lexelt_name.split('.')
    for id_, data in id2data.items():
        identifier = lemma + '-' + id_
        preceding = data['preceding'].strip(' ')
        raw_sentence = data['sentence']
        following = data['following'].strip(' ')

        # NOTE(review): `leading_spaces` is a character count while
        # `target_id` was derived from a whitespace split (a token index) --
        # TODO confirm that subtracting one from the other is intended.
        leading_spaces = len(raw_sentence) - len(raw_sentence.lstrip(' '))
        sentence_index = int(data['target_id']) - leading_spaces
        sentence = raw_sentence.strip(' ')

        full_text = ' '.join((preceding, sentence, following))
        n_preceding_tokens = len(preceding.split())
        index = n_preceding_tokens + sentence_index
        index_sentence = str(n_preceding_tokens) + ':' + str(n_preceding_tokens + len(sentence.split()))
        # NOTE(review): in `full_text` the sentence begins one character after
        # `preceding`, so this span starts on the separating space -- TODO confirm.
        indd = str(len(preceding)) + ':' + str(len(preceding) + len(sentence) + 1)

        lemma2id2context[lemma][identifier] = {
            'lemma': lemma,
            'pos': pos,
            'date': ' ',
            'grouping': '1',
            'identifier': identifier,
            'description': ' ',
            'context': full_text,
            'indexes_target_token': ' ',
            'indexes_target_sentence': indd,
            'context_tokenized': full_text,
            'indexes_target_token_tokenized': index,
            'indexes_target_sentence_tokenized': index_sentence,
        }
        
# Load every row of the WSsim ratings file into memory, one dict per row.
with open('/content/Markup/WordSenseSimilarity/wssim2ratings.csv', encoding='utf-8') as csvfile:
    reader = csv.DictReader(csvfile, delimiter=',', quoting=csv.QUOTE_NONE, strict=True)
    table = list(reader)

# lemma -> list of judgment rows, skipping the 'avg' pseudo-annotator rows.
lemma2data = defaultdict(list)
for row in table:
    lemma, pos = row['lemma'].split('.')
    id1, id2 = row['lexsub_id'], row['sense_id']
    judgment, annotator = row['judgment'], row['user_id']
    if annotator != 'avg':
        lemma2data[lemma].append({
            'identifier': lemma + '-' + id1,
            'sense_id': id2,
            'annotator': annotator,
            'judgment': float(judgment),
            'comment': ' ',
            'lemma': lemma,
        })
    
# Write one folder per lemma containing judgments.csv and uses.csv, both as
# unquoted tab-separated files.
for lemma in lemma2data:
    output_folder = 'wssim'+'/' +lemma+'/'
    os.makedirs(output_folder, exist_ok=True)

    # Export judgments.
    # newline='' lets the csv module control line endings; the encoding is
    # pinned because these files are read back as UTF-8 later on.
    # quotechar=None (rather than '') because Python >= 3.11 rejects an empty
    # quotechar; with QUOTE_NONE no quote character is needed.
    with open(output_folder +'judgments.csv', 'w', newline='', encoding='utf-8') as f:
        w = csv.DictWriter(f, lemma2data[lemma][0].keys(), delimiter='\t', quoting=csv.QUOTE_NONE, quotechar=None)
        w.writeheader()
        w.writerows(lemma2data[lemma])

    contexts = list(lemma2id2context[lemma].values())

    # Export uses (same dialect as above).
    with open(output_folder +'uses.csv', 'w', newline='', encoding='utf-8') as f:
        w = csv.DictWriter(f, contexts[0].keys(), delimiter='\t', quoting=csv.QUOTE_NONE, quotechar=None)
        w.writeheader()
        w.writerows(contexts)
        
        

In [10]:
# Names of the per-lemma folders under 'wssim'. os.listdir returns entries in
# arbitrary order and may include plain files, so keep directories only and
# sort them to make the row order of everything built from `dirs` reproducible.
dirs = sorted(
    entry for entry in os.listdir('wssim')
    if os.path.isdir(os.path.join('wssim', entry))
)

In [11]:
# Path of the uses.csv inside every per-lemma folder.
path_u = ["wssim/" + lemma_dir + "/uses.csv" for lemma_dir in dirs]

In [12]:
# Path of the judgments.csv inside every per-lemma folder.
path_j = ["wssim/" + lemma_dir + "/judgments.csv" for lemma_dir in dirs]

In [13]:
# Read every per-lemma uses.csv and stack them with a single concat instead of
# re-concatenating onto a growing frame on each iteration.
uses_frames = []
for uses_path in path_u:
    # quoting=3 is csv.QUOTE_NONE, matching how the files were written.
    frame = pd.read_csv(uses_path, delimiter='\t', quoting=3)
    # First path component, i.e. the name of the root folder.
    frame['dataset'] = uses_path.split('/')[0]
    uses_frames.append(frame)

# pd.concat rejects an empty list; keep the empty-frame result in that case.
wssim_uses_df = pd.concat(uses_frames) if uses_frames else pd.DataFrame()

In [14]:
# Replace the per-file row labels left over from concatenation with one fresh
# 0..n-1 index (the old labels are dropped, not kept as a column).
wssim_uses_df = wssim_uses_df.reset_index(drop = True)

In [16]:
def get_indice(sent, indice):
  """Return the character span "start:end" of one space-separated token.

  Args:
    sent: text that is tokenised with ``sent.split(" ")``.
    indice: position of the wanted token in that token list; anything
      accepted by ``int()``.

  Returns:
    ``"start:end"`` such that ``sent[start:end]`` is exactly the token at
    position `indice`. ``"0:0"`` when `indice` is outside the token list.
  """
  indice = int(indice)
  tokens = sent.split(" ")
  if not 0 <= indice < len(tokens):
    return "0:0"
  # Splitting on a single space is lossless, so the token's offset is the
  # length of all earlier tokens plus one separator each. Searching for the
  # token text instead would hit the first occurrence of that substring, which
  # is the wrong place whenever the word (or a word containing it) appears earlier.
  start = sum(len(token) + 1 for token in tokens[:indice])
  end = start + len(tokens[indice])
  return str(start) + ":" + str(end)

In [17]:
# Fill `indexes_target_token` row by row: get_indice turns the
# `context_tokenized` text plus the `indexes_target_token_tokenized` index into
# a "start:end" string.
wssim_uses_df["indexes_target_token"] = wssim_uses_df.apply(lambda x: get_indice(x.context_tokenized, x.indexes_target_token_tokenized), axis=1)

In [27]:
# Alias, not a copy: `uses` and `wssim_uses_df` refer to the same DataFrame.
uses = wssim_uses_df

In [None]:
# Clone the WUGs repository; a later cell runs its scripts/misc/use2normalize.py
# from /content/WUGs.
!git clone https://github.com/Garrafao/WUGs.git

In [24]:
# os.makedirs does no wildcard expansion, so this creates a directory whose
# name is literally '*'. The combined uses.csv is written into it below.
os.makedirs('/content/WUGs/scripts/misc/data/dwug_en/data/*/', exist_ok=True )

In [29]:
import csv
# Dump the combined uses table as an unquoted, tab-separated UTF-8 file inside
# the directory literally named '*'. `index` is left at its default, so the
# row index is written as the first column.
uses.to_csv('/content/WUGs/scripts/misc/data/dwug_en/data/*/uses.csv', sep='\t', encoding='utf-8', quoting = csv.QUOTE_NONE, quotechar='')

In [35]:
# NOTE(review): this variable does not appear to be used -- the %run cell
# passes the bare word `output_file` and the cell after it reads
# '/content/output_file'. TODO confirm and remove or wire it in.
output_file = '/content/WUGs/scripts/misc/data/dwug_en/data/'

In [None]:
# Run the WUGs normalisation script on the exported uses file.
# NOTE(review): the last argument is the literal text `output_file`, not the
# Python variable of the same name; presumably the script writes its result to
# that path relative to the working directory (the next cell reads
# /content/output_file) -- confirm against use2normalize.py.
%run /content/WUGs/scripts/misc/use2normalize.py /content/WUGs/scripts/misc/data/dwug_en/data/*/uses.csv dwug_en output_file

In [37]:
# Load the file produced by the %run step as a tab-separated table
# (quoting=3 is csv.QUOTE_NONE).
final_uses = pd.read_csv('/content/output_file', delimiter = '\t',quoting =3)

In [47]:
# Carry over the columns taken verbatim from the pre-normalisation frame.
# Assignment aligns on the row index, so both frames must list the uses in the
# same order.
for column in (
    'context_tokenized',
    'pos',
    'dataset',
    'grouping',
    'lemma',
    'indexes_target_token_tokenized',
    'indexes_target_sentence_tokenized',
):
    final_uses[column] = wssim_uses_df[column]

# Placeholder values for fields this dataset does not provide.
final_uses['date'] = " "
final_uses['description'] = " "

In [48]:
# Keep only the export columns, in the order they should appear in uses.csv.
column_order = [
    'lemma',
    'pos',
    'date',
    'grouping',
    'identifier',
    'description',
    'context',
    'indexes_target_token',
    'indexes_target_sentence',
    'context_tokenized',
    'indexes_target_token_tokenized',
    'indexes_target_sentence_tokenized',
    'dataset',
]
final_uses = final_uses[column_order]

In [60]:
# Root folder for the final per-lemma export; note the capitalisation differs
# from the intermediate 'wssim' folder used earlier.
os.makedirs( '/content/WSSim', exist_ok=True )

In [61]:
# Write one tab-separated uses.csv per lemma under /content/WSSim/<lemma>/.
for lemma_name in final_uses["lemma"].value_counts().index:
  lemma_rows = final_uses[final_uses["lemma"] == lemma_name]
  # Put the column names on top so the header is written as the first row.
  rows_with_header = np.vstack([list(lemma_rows.columns), lemma_rows.to_numpy()])
  lemma_dir = '/content/WSSim' + "/" + lemma_name
  os.makedirs(lemma_dir, exist_ok=True)
  # np.savetxt encodes as latin1 unless told otherwise; the rest of the
  # pipeline reads and writes UTF-8, and latin1 fails on characters outside it.
  np.savetxt(lemma_dir + "/uses.csv", rows_with_header, fmt='%s', delimiter='\t', encoding='utf-8')

In [62]:
# Read every per-lemma judgments.csv and stack them with a single concat
# instead of re-concatenating onto a growing frame on each iteration.
judgment_frames = []
for judgments_path in path_j:
    # quoting=3 is csv.QUOTE_NONE, matching how the files were written.
    frame = pd.read_csv(judgments_path, delimiter='\t', quoting=3)
    # First path component, i.e. the name of the root folder.
    frame['dataset'] = judgments_path.split('/')[0]
    judgment_frames.append(frame)

# pd.concat rejects an empty list; keep the empty-frame result in that case.
wssim_judgemnt_df = pd.concat(judgment_frames) if judgment_frames else pd.DataFrame()


In [None]:
# reset_index returns a new frame; assign it so the renumbering actually takes
# effect (the bare call only displayed a renumbered copy).
wssim_judgemnt_df = wssim_judgemnt_df.reset_index(drop=True)
# Show a preview rather than the whole table.
wssim_judgemnt_df.head()

In [64]:
# Write one tab-separated judgments.csv per lemma under /content/WSSim/<lemma>/.
for lemma_name in wssim_judgemnt_df["lemma"].value_counts().index:
  lemma_rows = wssim_judgemnt_df[wssim_judgemnt_df["lemma"] == lemma_name]
  # Put the column names on top so the header is written as the first row.
  rows_with_header = np.vstack([list(lemma_rows.columns), lemma_rows.to_numpy()])
  lemma_dir = '/content/WSSim' + "/" + lemma_name
  os.makedirs(lemma_dir, exist_ok=True)
  # np.savetxt encodes as latin1 unless told otherwise; use UTF-8 like the
  # rest of the pipeline.
  np.savetxt(lemma_dir + "/judgments.csv", rows_with_header, fmt='%s', delimiter='\t', encoding='utf-8')