In [80]:
path_dir = "../"

In [81]:
import random 
import numpy as np
import nltk
import codecs
import tqdm

In [82]:
"""importing the files"""

# default sentences
sentences = [x.strip().split() for x in tqdm.tqdm(codecs.open(F'{path_dir}/Data/test_sentences.txt','rU','utf-8').readlines())]

# housein sentences
# sentences = [x.strip().split() for x in tqdm.tqdm(codecs.open(F'{path_dir}/Data/housein_tests.txt','rU','utf-8').readlines())]

events = [x.strip().split() for x in tqdm.tqdm(codecs.open(F'{path_dir}/Data/events.txt','rU','utf-8').readlines())]

100%|██████████| 9/9 [00:00<00:00, 82420.82it/s]
100%|██████████| 10/10 [00:00<00:00, 95325.09it/s]


Preprocessing functions

In [85]:
from __future__ import unicode_literals
from hazm import *
from nltk.tokenize import RegexpTokenizer

def normalize_inputs(sentences_, events_):
    normalizer = Normalizer()
    sentences_normalized_by_sentence = [normalizer.normalize(' '.join(x)) for x in tqdm.tqdm(sentences_)]
    sentences_normalized_by_word = [[normalizer.normalize(y) for y in x.strip().split()] for x in tqdm.tqdm(sentences_normalized_by_sentence)]
    events_normalized = [[normalizer.normalize(y) for y in x] for x in tqdm.tqdm(events_)]

    return sentences_normalized_by_word, events_normalized


def tokenize_inputs(sentences_, events_):
    tokenizer = RegexpTokenizer(r'[\w|\u200c]+')
    sentences_no_punctuation = [tokenizer.tokenize(' '.join(sent)) for sent in sentences_ ]
    temp_sentences_tokens = [[word_tokenize(sent) for sent in sents] for sents in tqdm.tqdm(sentences_no_punctuation)]
    sentences_tokens = [[word[0] for word in sent if len(word)] for sent in tqdm.tqdm(temp_sentences_tokens)]

    temp_events_tokens = [[word_tokenize(sent) for sent in sents] for sents in tqdm.tqdm(events_)]
    events_tokens = [[(word[0] if len(word) else '') for word in sent] for sent in tqdm.tqdm(temp_events_tokens)]

    return sentences_tokens, events_tokens


def lemmatize_input(sentences_, events_):
    lemmatizer = Lemmatizer()

    sentences_lemmatied = [[lemmatizer.lemmatize(word) for word in sent if len(word) != 0] for sent in tqdm.tqdm(sentences_)]
    events_lemmatied = [[lemmatizer.lemmatize(word) for word in sent if len(word) != 0] for sent in tqdm.tqdm(events_)]

    return sentences_lemmatied, events_lemmatied    


Connecting to farsnet API functions

In [83]:
from zeep import Client
from requests.auth import HTTPBasicAuth
from requests import Session
from zeep.transports import Transport

def connect_to_fars_net(username_, token_):
    # address of FarsNet's web services
    wsdl_sense_service = 'http://nlp.sbu.ac.ir:8180/WebAPI/services/SenseService?WSDL'
    wsdl_synset_service = 'http://nlp.sbu.ac.ir:8180/WebAPI/services/SynsetService?WSDL'


    # username and token needed for authentication. You can get this token by signing up on http://farsnet.nlp.sbu.ac.ir
    username = username_
    token = token_
    # token = 'd428eab3-3b91-11eb-8a1e-080027d731c1'

    # connecting client
    session = Session()
    session.auth = HTTPBasicAuth(username, token)
    client_sense_service = Client(wsdl_sense_service, transport=Transport(session=session))
    client_synset_service = Client(wsdl_synset_service, transport=Transport(session=session))

    return client_sense_service, client_synset_service

def get_synset_id(sentences_, client_synset_service_, token_):
    return [[client_synset_service_.service.getSynsetsByWord(token_, 'EXACT', word) for word in sent] for sent in tqdm.tqdm(sentences_)]

Finding similar event to each sentence functions

In [84]:
def count_similar_events(synset_id_input, events_synset_id_list_input):
    
    """this function compares the id of words in sentences with id of words in events"""

    count_list = []
    for event_synset_id_list in events_synset_id_list_input:
        count = 0
        for word_synset_id_list in event_synset_id_list:
            if synset_id_input in word_synset_id_list:
                count += 1
        count_list.append(count)

    return count_list


def count_similar_sentences(sentences_synset_id_list_input, events_synset_id_list_input):

    """this function creates a list of similarity scores which contains similarity between each event and each word"""

    count_list = []
    for sentence_synset_id_list in sentences_synset_id_list_input:
        count_list_sentence = []
        for word_synset_id_list in sentence_synset_id_list:
            count_list_word = []
            for word_synset_id in word_synset_id_list:
                count_list_word.append(count_similar_events(word_synset_id, events_synset_id_list_input))
            count_list_sentence.append(count_list_word)
        count_list.append(count_list_sentence)

    return count_list


def calculate_word_event_similarity(similarity_list_input):

    """this function calculates the similarity score of each word with each event"""

    similarity_list_output = []
    for sentence in similarity_list_input:
        sentence_similarity_list = []
        for word in sentence:
            word_similarity_list = np.array([0,0,0,0,0,0,0,0,0,0])
            for id in word:
                word_similarity_list += id
            sentence_similarity_list.append(word_similarity_list)
        similarity_list_output.append(sentence_similarity_list)
    
    return similarity_list_output
    

def calculate_sentence_event_similarity(similarity_list_input):

    """this function calculates the similarity score of each sentence with each event"""
    
    similarity_list_output = []
    for sentence in similarity_list_input:
        sentence_similarity_list = np.array([0,0,0,0,0,0,0,0,0,0])
        for word in sentence:
            for id in word:
                sentence_similarity_list += id
        similarity_list_output.append(sentence_similarity_list)
    
    return similarity_list_output


def find_main_span(word_event_similarity_list_input):

    """this function finds the top score word of the sentence to find the main span and related desired event"""

    main_span_list = []
    for sentence in word_event_similarity_list_input:
        main_span_list.append(np.argmax([word[np.argmax(word)] for word in sentence]))
    return main_span_list


def convert_word_pos_to_span(sentences_main_word_index_input, sentences_input, sentences_tokens_input):

    """this function converts the word index to its span in the sentence"""

    span_list = []
    for i in range(len(sentences_input)):
        initial_index = ' '.join(sentences_input[i]).find(sentences_tokens_input[i][sentences_main_word_index_input[i]])
        span_list.append([initial_index, initial_index + len(sentences_input[i][sentences_main_word_index_input[i]])])
    return span_list


Find longest sequence of words functions

In [103]:
tagger = POSTagger(model='../Resources/postagger.model')

def longest_noun_sequence(sent_,index_):
    tags_=[]
    string_=tagger.tag(sent_[index_:])
    for x in string_:
        tags_.append( x[1][:1])
    listToStr = ''.join([str(elem) for elem in tags_])   
    tmp=0
    for i in range(0,len(listToStr)+1):
      if listToStr[i:i+1]=='N':
        tmp+=1
      else:
        break

    listToStr2 = ' '.join([str(elem) for elem in sent_[:index_]])  
    listToStr3 = ' '.join([str(elem) for elem in sent_[:index_+tmp]])  
    return [sent_[index_:index_+tmp],[len(listToStr2),len(listToStr3)]]

In [86]:
sentences_normalized, events_normalized = normalize_inputs(sentences, events)
sentences_tokenized, events_tokenized = tokenize_inputs(sentences_normalized, events_normalized)
sentences_lemmatized, events_lemmatized = lemmatize_input(sentences_tokenized, events_tokenized)

100%|██████████| 9/9 [00:00<00:00, 12927.65it/s]
100%|██████████| 9/9 [00:00<00:00, 2595.31it/s]
100%|██████████| 10/10 [00:00<00:00, 4293.48it/s]
100%|██████████| 9/9 [00:00<00:00, 10894.30it/s]
100%|██████████| 9/9 [00:00<00:00, 52795.43it/s]
100%|██████████| 10/10 [00:00<00:00, 14217.98it/s]
100%|██████████| 10/10 [00:00<00:00, 85250.08it/s]
100%|██████████| 9/9 [00:00<00:00, 69518.85it/s]
100%|██████████| 10/10 [00:00<00:00, 54755.93it/s]


In [88]:
token = 'd428eb00-3b91-11eb-8a1e-080027d731c1'
username = '987654'
client_sense_service, client_synset_service = connect_to_fars_net(username, token)

In [89]:
"""getting the synset ids of the lemmatized sentences"""

sentences_synset_id = get_synset_id(sentences_lemmatized, client_synset_service, token)

100%|██████████| 9/9 [02:17<00:00, 15.23s/it]


In [54]:
"""getting the synset ids of the lemmatized events"""

events_synset_id = get_synset_id(events_lemmatized, client_synset_service, token)

100%|██████████| 10/10 [02:19<00:00, 13.98s/it]


In [90]:
"""extracting the synset ids of the lemmatized texts"""

sentences_synset_id_list = [[[synset.id for synset in word] for word in sent] for sent in tqdm.tqdm(sentences_synset_id)]
events_synset_id_list = [[[synset.id for synset in word] for word in sent] for sent in tqdm.tqdm(events_synset_id)]

100%|██████████| 9/9 [00:00<00:00, 16950.49it/s]
100%|██████████| 10/10 [00:00<00:00, 29351.32it/s]


In [91]:
similarity_list = count_similar_sentences(sentences_synset_id_list, events_synset_id_list)

In [92]:
word_event_similarity = calculate_word_event_similarity(similarity_list)
sentence_event_similarity = calculate_sentence_event_similarity(similarity_list)

sentences_main_word_index = find_main_span(word_event_similarity)
related_event = [np.argmax(sentence) for sentence in sentence_event_similarity]
main_word_span = convert_word_pos_to_span(sentences_main_word_index, sentences, sentences_tokenized)

In [93]:
print("sentences_main_word_index: ", sentences_main_word_index)
print("related_event: ", related_event)
print("main_word_span", main_word_span)

sentences_main_word_index:  [0, 3, 4, 2, 6, 8, 0, 6, 10]
related_event:  [4, 3, 2, 8, 1, 9, 0, 5, 7]
main_word_span [[0, 8], [11, 18], [21, 26], [11, 17], [27, 33], [40, 46], [0, 5], [-1, 4], [51, 54]]


In [110]:
span_list = [longest_noun_sequence(sentences_tokenized[sent], sentences_main_word_index[sent]) for sent in range(len(sentences_tokenized))]
span_list

[[['گفتگوهای', 'صلح', 'اوکراین'], [0, 20]],
 [['استعفای', 'نخست', 'وزیر', 'بریتانیا'], [10, 37]],
 [['حمله'], [19, 24]],
 [['افزایش', 'بهای', 'طلا'], [10, 26]],
 [['درگذشت', 'امیرکبیر'], [26, 42]],
 [['واردات', 'میوه'], [39, 51]],
 [['دیدار', 'پوتین'], [0, 11]],
 [['تحریم\u200cهای'], [32, 42]],
 [['کسب', 'مدال'], [50, 59]]]