In [None]:
!pip install hazm
!pip install zeep

In [1]:
from hazm import *
import random 
import numpy as np
import nltk
import pandas as pd
import codecs
import tqdm
from __future__ import unicode_literals
from zeep import Client
from requests.auth import HTTPBasicAuth
from requests import Session
from zeep.transports import Transport
from __future__ import unicode_literals
from parstdex import Parstdex
from nltk.tokenize import RegexpTokenizer

In [2]:
# Path of the POS-tagger model file, relative to the notebook.
model_loc = '../Resources/postagger.model'
# Base directory; get_event reads the event keywords from `{path_dir}/Data/events.txt`.
path_dir = "../"

# Shared POS tagger, loaded once here and reused by longest_noun_sequence,
# final_finds and get_event_place.
tagger = POSTagger(model=model_loc)

# Human-readable event labels; run() indexes this list with the event number
# returned by get_event. Presumably the order matches the lines of events.txt — TODO confirm.
# NOTE(review): 'اتخاب' in the fourth label looks like a typo for 'انتخاب' — confirm before changing,
# since the label is emitted verbatim in the result dict.
all_events_lists = ['قرار ملاقات', 'مرگ','جنگ','عزل و نصب و استعفا و اتخاب','گفت و گو و مذاکرات و توافق','تحریم و رفع تحریم','بر و باخت و تساوی', 'کسب مدال','تغییر قیمت','واردات و صادرات']
# Semantic categories that make final_finds reject a candidate place phrase
# (unless the candidate also has a LOCATION sense).
ilegal = ['TIME', "EVENT", "COMMUNICATION", "PERSON"]

# Persian/Arabic letters; check_the_time treats any text containing one of them
# as an acceptable date/time expression.
alphabet = ["آ","ا","ب","پ","ت","ث","ج","چ",
            "ح","خ","د","ذ","ر","ز","ژ","س",
            "ش","ص","ض","ط","ظ","ع","غ",
            "ف","ق","ک","گ","ل","م","ن"
            ,"و","ه","ی","ھ","ئ","أ","ء","ؤ"]

In [3]:
def normalize_inputs(sentences_, events_ = None):
    """Normalize sentences (and optionally events) word by word.

    Each sentence is first joined with spaces and normalized as a whole, then
    split on whitespace and every resulting word is normalized again.

    Args:
        sentences_: iterable of sentences, each an iterable of word strings.
        events_: optional iterable of events, each an iterable of word strings.

    Returns:
        The list of normalized sentences (each a list of words) when ``events_``
        is None; otherwise a ``(sentences, events)`` tuple. An empty events list
        still yields a tuple, so callers that unpack two values keep working.
    """
    normalizer = Normalizer()

    normalized_texts = [normalizer.normalize(' '.join(words)) for words in tqdm.tqdm(sentences_)]
    sentences_normalized_by_word = [
        [normalizer.normalize(word) for word in text.strip().split()]
        for text in tqdm.tqdm(normalized_texts)
    ]

    # Test against None, not truthiness: an empty events list must not change
    # the shape of the return value.
    if events_ is None:
        return sentences_normalized_by_word

    events_normalized = [[normalizer.normalize(word) for word in event] for event in tqdm.tqdm(events_)]
    return sentences_normalized_by_word, events_normalized


def tokenize_inputs(sentences_, events_=None):
    """Strip punctuation from sentences and reduce every word to its first token.

    Sentences are joined with spaces and split into runs of word characters and
    zero-width non-joiners (U+200C), which drops punctuation. Every resulting
    piece is passed through ``word_tokenize`` and only its first token is kept;
    pieces that tokenize to nothing are dropped. Event words go through
    ``word_tokenize`` directly, and a word that tokenizes to nothing is kept as
    an empty string.

    Args:
        sentences_: iterable of sentences, each an iterable of word strings.
        events_: optional iterable of events, each an iterable of word strings.

    Returns:
        The list of token lists when ``events_`` is None; otherwise a
        ``(sentences_tokens, events_tokens)`` tuple (also for an empty events list).
    """
    # Inside a character class '|' is a literal pipe, not alternation, so the
    # class lists only the characters that may be part of a token.
    tokenizer = RegexpTokenizer(r'[\w\u200c]+')

    sentences_no_punctuation = [tokenizer.tokenize(' '.join(sent)) for sent in sentences_]
    sentence_subtokens = [
        [word_tokenize(piece) for piece in pieces]
        for pieces in tqdm.tqdm(sentences_no_punctuation)
    ]
    sentences_tokens = [
        [subtokens[0] for subtokens in sent if len(subtokens)]
        for sent in tqdm.tqdm(sentence_subtokens)
    ]

    # Test against None, not truthiness, so an empty events list still returns a pair.
    if events_ is None:
        return sentences_tokens

    event_subtokens = [[word_tokenize(word) for word in event] for event in tqdm.tqdm(events_)]
    events_tokens = [
        [(subtokens[0] if len(subtokens) else '') for subtokens in event]
        for event in tqdm.tqdm(event_subtokens)
    ]
    return sentences_tokens, events_tokens


def lemmatize_input(sentences_, events_):
    """Lemmatize every non-empty token of the given sentences and events.

    Args:
        sentences_: iterable of token lists, one per sentence.
        events_: iterable of token lists, one per event.

    Returns:
        ``(sentences_lemmatized, events_lemmatized)``, each a list of lists of
        lemmas; zero-length tokens are skipped.
    """
    lemmatizer = Lemmatizer()

    def lemmatize_rows(rows):
        # One progress bar per call, one output row per input row.
        lemmatized_rows = []
        for row in tqdm.tqdm(rows):
            lemmatized_rows.append([lemmatizer.lemmatize(token) for token in row if len(token) != 0])
        return lemmatized_rows

    sentences_lemmatized = lemmatize_rows(sentences_)
    events_lemmatized = lemmatize_rows(events_)
    return sentences_lemmatized, events_lemmatized

In [4]:
def connect_to_fars_net(username_, token_):
    """Create authenticated SOAP clients for FarsNet's sense and synset services.

    Args:
        username_: FarsNet account name, used as the HTTP basic-auth user.
        token_: FarsNet API token, used as the HTTP basic-auth password. A token
            can be obtained by signing up on http://farsnet.nlp.sbu.ac.ir.

    Returns:
        ``(client_sense_service, client_synset_service)`` — two zeep clients
        that share one authenticated HTTP session.
    """
    # Addresses of FarsNet's web services.
    wsdl_sense_service = 'http://nlp.sbu.ac.ir:8180/WebAPI/services/SenseService?WSDL'
    wsdl_synset_service = 'http://nlp.sbu.ac.ir:8180/WebAPI/services/SynsetService?WSDL'

    # Credentials are supplied by the caller only; no token is kept in the source.
    session = Session()
    session.auth = HTTPBasicAuth(username_, token_)

    client_sense_service = Client(wsdl_sense_service, transport=Transport(session=session))
    client_synset_service = Client(wsdl_synset_service, transport=Transport(session=session))

    return client_sense_service, client_synset_service

In [5]:
def get_synset_id(sentences_, client_synset_service_, token_):
    """Look up the FarsNet synsets of every word with an exact-match query.

    Args:
        sentences_: iterable of word lists.
        client_synset_service_: client exposing ``service.getSynsetsByWord``.
        token_: API token passed as the first argument of each query.

    Returns:
        A list (per sentence) of lists (per word) holding whatever the service
        returned for that word.
    """
    synsets_per_sentence = []
    for sent in tqdm.tqdm(sentences_):
        synsets_per_word = []
        for word in sent:
            reply = client_synset_service_.service.getSynsetsByWord(token_, 'EXACT', word)
            synsets_per_word.append(reply)
        synsets_per_sentence.append(synsets_per_word)
    return synsets_per_sentence

In [6]:

import os
from getpass import getpass

# FarsNet credentials come from the environment so that no secret is stored in
# the notebook. If FARSNET_TOKEN is not set, the token is asked for interactively.
# NOTE(review): a token was previously committed in this cell; it should be rotated.
username = os.environ.get('FARSNET_USERNAME', 'NimaSalem')
token = os.environ.get('FARSNET_TOKEN') or getpass('FarsNet API token: ')

client_sense_service, client_synset_service = connect_to_fars_net(username, token)

In [None]:
# event type text

In [7]:
def count_similar_events(synset_id_input, events_synset_id_list_input):
    """Count, for every event, how many of its words carry the given synset id.

    Args:
        synset_id_input: a single synset id taken from a sentence word.
        events_synset_id_list_input: one entry per event; each entry is a list
            (per event word) of synset-id collections.

    Returns:
        A list with one integer per event: the number of words of that event
        whose synset ids contain ``synset_id_input``.
    """
    return [
        sum(1 for word_synset_ids in event_words if synset_id_input in word_synset_ids)
        for event_words in events_synset_id_list_input
    ]


def count_similar_sentences(sentences_synset_id_list_input, events_synset_id_list_input):
    """Build the per-synset event match counts for every word of every sentence.

    Args:
        sentences_synset_id_list_input: sentences -> words -> synset ids.
        events_synset_id_list_input: events -> words -> synset ids.

    Returns:
        A nested list shaped sentences -> words -> synset ids, where every leaf
        is the per-event count list produced by ``count_similar_events``.
    """
    return [
        [
            [
                count_similar_events(synset_id, events_synset_id_list_input)
                for synset_id in word_synset_ids
            ]
            for word_synset_ids in sentence_words
        ]
        for sentence_words in sentences_synset_id_list_input
    ]


def calculate_word_event_similarity(similarity_list_input, num_events=10):
    """Sum the per-synset event counts of each word into one score vector per word.

    Args:
        similarity_list_input: sentences -> words -> per-synset count lists,
            each count list holding ``num_events`` integers.
        num_events: number of event types, i.e. the length of every count list
            and of every returned score vector. Defaults to 10.

    Returns:
        A list (per sentence) of lists (per word) of integer numpy arrays of
        length ``num_events``. A word without any synset gets an all-zero vector.
    """
    similarity_list_output = []
    for sentence in similarity_list_input:
        sentence_scores = []
        for word in sentence:
            # Fresh accumulator for each word; it is updated in place below.
            word_scores = np.zeros(num_events, dtype=int)
            for synset_counts in word:
                word_scores += synset_counts
            sentence_scores.append(word_scores)
        similarity_list_output.append(sentence_scores)

    return similarity_list_output
    

def calculate_sentence_event_similarity(similarity_list_input, num_events=10):
    """Sum the per-synset event counts of all words into one score vector per sentence.

    Args:
        similarity_list_input: sentences -> words -> per-synset count lists,
            each count list holding ``num_events`` integers.
        num_events: number of event types, i.e. the length of every count list
            and of every returned score vector. Defaults to 10.

    Returns:
        A list with one integer numpy array of length ``num_events`` per
        sentence. A sentence without any synset gets an all-zero vector.
    """
    similarity_list_output = []
    for sentence in similarity_list_input:
        # Fresh accumulator for each sentence; it is updated in place below.
        sentence_scores = np.zeros(num_events, dtype=int)
        for word in sentence:
            for synset_counts in word:
                sentence_scores += synset_counts
        similarity_list_output.append(sentence_scores)

    return similarity_list_output


def find_main_span(word_event_similarity_list_input):
    """Pick, for each sentence, the index of the word with the highest event score.

    Args:
        word_event_similarity_list_input: sentences -> words -> score vector.

    Returns:
        A list with one index per sentence: the position of the word whose
        largest score is the highest in that sentence (first one on ties).
    """
    return [
        np.argmax([np.max(word_scores) for word_scores in sentence])
        for sentence in word_event_similarity_list_input
    ]


def convert_word_pos_to_span(sentences_main_word_index_input, sentences_input, sentences_tokens_input):
    """Convert each sentence's main-token index into a character span.

    The main token is searched for in the sentence text (the raw words joined
    with single spaces); the span covers the first occurrence of that token.

    Args:
        sentences_main_word_index_input: per sentence, the index of the main
            token within ``sentences_tokens_input``.
        sentences_input: per sentence, the list of raw words.
        sentences_tokens_input: per sentence, the list of tokens.

    Returns:
        A list of ``[start, end]`` pairs (end exclusive), one per sentence.
        ``[-1, -1]`` is returned for a sentence whose main token does not occur
        in the joined text.
    """
    span_list = []
    for i, words in enumerate(sentences_input):
        main_token = sentences_tokens_input[i][sentences_main_word_index_input[i]]
        start = ' '.join(words).find(main_token)
        if start == -1:
            span_list.append([-1, -1])
            continue
        # The length comes from the located token itself: raw words and tokens
        # are not index-aligned, so the raw word at the same index may be a
        # different word or may not exist at all.
        span_list.append([start, start + len(main_token)])
    return span_list

In [8]:
def longest_noun_sequence(sent_, index_):
    """Extend the word at ``index_`` over the run of noun-tagged words that follows.

    The tokens from ``index_`` onward are POS-tagged with the module-level
    ``tagger``; the run consists of the leading tokens whose tag starts with 'N'.

    Args:
        sent_: list of tokens of one sentence.
        index_: position of the first token of the candidate span.

    Returns:
        ``[tokens, [start, end]]`` where ``tokens`` is the noun run (possibly
        empty) and the offsets are lengths of the space-joined token prefixes
        before the run and up to the end of the run.
    """
    tagged_tail = tagger.tag(sent_[index_:])

    # First character of every tag, concatenated; the leading 'N's are the noun run.
    tag_initials = ''.join(str(pair[1][:1]) for pair in tagged_tail)
    noun_run_length = len(tag_initials) - len(tag_initials.lstrip('N'))
    run_end = index_ + noun_run_length

    text_before = ' '.join(str(token) for token in sent_[:index_])
    text_through = ' '.join(str(token) for token in sent_[:run_end])

    # NOTE(review): for index_ > 0 the start offset equals the length of the
    # preceding text, so it points at the separating space rather than at the
    # first character of the run — confirm whether that is intended.
    return [sent_[index_:run_end], [len(text_before), len(text_through)]]

In [9]:
def get_event(input_string_):
    """Find the event trigger span and the event type of one input sentence.

    The sentence and the event keyword lines read from
    ``{path_dir}/Data/events.txt`` are normalized, tokenized and lemmatized,
    their FarsNet synset ids are fetched, and shared synset ids are counted to
    score every sentence word against every event.

    Args:
        input_string_: the raw sentence text.

    Returns:
        ``(span_list, related_event)`` where ``span_list`` holds, per sentence,
        the ``[tokens, [start, end]]`` result of ``longest_noun_sequence`` for
        the best-scoring word, and ``related_event`` holds, per sentence, the
        index of the best-scoring event.
    """
    # Getting the sentences and events.
    sentences = [input_string_.strip().split()]
    # Plain 'r' mode: the 'U' flag was removed in Python 3.11. The context
    # manager makes sure the file is closed.
    with codecs.open(F'{path_dir}/Data/events.txt', 'r', 'utf-8') as events_file:
        events = [x.strip().split() for x in tqdm.tqdm(events_file.readlines())]

    sentences_normalized, events_normalized = normalize_inputs(sentences, events)
    sentences_tokenized, events_tokenized = tokenize_inputs(sentences_normalized, events_normalized)
    sentences_lemmatized, events_lemmatized = lemmatize_input(sentences_tokenized, events_tokenized)

    # Getting the synset ids of the lemmatized sentences.
    sentences_synset_id = get_synset_id(sentences_lemmatized, client_synset_service, token)

    # Getting the synset ids of the lemmatized events.
    events_synset_id = get_synset_id(events_lemmatized, client_synset_service, token)

    # Extracting the synset ids of the lemmatized texts.
    sentences_synset_id_list = [[[synset.id for synset in word] for word in sent] for sent in tqdm.tqdm(sentences_synset_id)]
    events_synset_id_list = [[[synset.id for synset in word] for word in sent] for sent in tqdm.tqdm(events_synset_id)]

    similarity_list = count_similar_sentences(sentences_synset_id_list, events_synset_id_list)

    word_event_similarity = calculate_word_event_similarity(similarity_list)
    sentence_event_similarity = calculate_sentence_event_similarity(similarity_list)

    sentences_main_word_index = find_main_span(word_event_similarity)
    related_event = [np.argmax(sentence) for sentence in sentence_event_similarity]

    span_list = [
        longest_noun_sequence(sentences_tokenized[sent], sentences_main_word_index[sent])
        for sent in range(len(sentences_tokenized))
    ]

    return span_list, related_event

In [None]:
## PLACE

In [10]:
def regex_finder(sentence):
    """Chunk a POS-tagged sentence into 'P' phrases with an nltk regexp grammar.

    A 'P' chunk is a P/Pe tag followed by an Ne/N tag, an optional RES, any
    number of N/AJ/PRO tags, and optionally further CONJ + Ne/N groups.

    Args:
        sentence: list of ``(word, tag)`` pairs.

    Returns:
        The parse tree produced by ``nltk.RegexpParser.parse``.
    """
    chunk_grammar = r"""
      P: {<P|Pe><Ne|N><RES>?<N|AJ|PRO>*(<CONJ><Ne|N><RES>?<N|AJ|PRO>?)*}
    """
    return nltk.RegexpParser(chunk_grammar).parse(sentence)

In [11]:
def initial_finds(tagger):
    """Collect the words of every 'P' chunk, without the chunk's first word.

    Args:
        tagger: the POS-tagged sentence, a list of ``(word, tag)`` pairs (the
            name shadows the module-level tagger inside this function).

    Returns:
        A list of word lists, one per 'P' chunk found by ``regex_finder``; the
        leading word of each chunk is dropped.
    """
    tree = regex_finder(tagger)
    chunk_leaves = [subtree.leaves() for subtree in tree.subtrees() if subtree.label() == 'P']
    return [[leaf[0] for leaf in leaves[1:]] for leaves in chunk_leaves]

In [12]:
def _semantic_categories(word):
    """Return the semantic category of every FarsNet synset that matches ``word`` exactly."""
    synsets = client_synset_service.service.getSynsetsByWord(token, "EXACT", word)
    # `or []` guards against a None reply; assumes the service may return None
    # when nothing matches — TODO confirm.
    return [synset['semanticCategory'] for synset in synsets or []]


def final_finds(initial_finds):
    """Select the first plausible place phrase and drop its non-place nouns.

    A candidate is accepted when its first word has at least one synset and
    either one of them is a LOCATION or none of them belongs to ``ilegal``.
    Only the first accepted candidate is used, so the search stops there and no
    further remote lookups are made.

    Within that candidate the first word is always kept; a later word is
    dropped only if it is tagged 'N', has synsets, and none of them is a
    LOCATION or an ARTIFACT.

    Args:
        initial_finds: list of candidate phrases, each a non-empty list of words.

    Returns:
        The filtered word list of the selected candidate, or an empty list when
        no candidate is accepted.
    """
    chosen = None
    for candidate in initial_finds:
        categories = _semantic_categories(candidate[0])
        if not categories:
            continue
        if 'LOCATION' not in categories and any(category in ilegal for category in categories):
            continue
        chosen = candidate
        break

    if chosen is None:
        return []

    kept_words = [chosen[0]]
    for word in chosen[1:]:
        if tagger.tag([word])[0][1] == 'N':
            categories = _semantic_categories(word)
            if categories and not any(place in categories for place in ('LOCATION', 'ARTIFACT')):
                continue
        kept_words.append(word)

    return kept_words

In [13]:
def get_event_place(sent, topic_span):
    """Extract the place phrase of a sentence as a space-joined string.

    Args:
        sent: the raw sentence text.
        topic_span: the event span; when it is empty or None no place is looked for.

    Returns:
        The words selected by ``final_finds`` joined with spaces, or an empty
        string when there is no topic span or no place phrase.
    """
    if not topic_span:
        return ""

    normalized = normalize_inputs([[sent]])
    tokens = tokenize_inputs(normalized)[0]
    tagged_tokens = tagger.tag(tokens)

    candidates = initial_finds(tagged_tokens)
    place_words = final_finds(candidates)

    return ' '.join(place_words) if place_words else ''

In [None]:
## TIME

In [14]:
def find_leftmost(topic_span,spans):
    """Return the widest span that encloses the last span of the list.

    Starting from the last span, the list is walked backwards; as long as the
    previous span fully contains the current result it replaces it. The walk
    stops at the first span that does not.

    Args:
        topic_span: unused; kept for the callers' signature.
        spans: non-empty sequence of ``[start, end]`` pairs.

    Returns:
        The last span itself when nothing encloses it, otherwise a new
        ``[start, end]`` list for the outermost enclosing span found.
    """
    widest = spans[-1]
    for candidate in reversed(spans[:-1]):
        start, end = candidate[0], candidate[1]
        if start > widest[0] or end < widest[1]:
            break
        widest = [start, end]
    return widest

In [15]:
def check_the_time(string):
    """Tell whether a detected date/time text is acceptable.

    Args:
        string: the text of a detected date/time span.

    Returns:
        True when the text contains at least one letter from ``alphabet`` or a
        ':' or '/' character; False otherwise.
    """
    has_letter = any(char in string for char in alphabet)
    return has_letter or ":" in string or "/" in string

In [16]:
def find_timedate(sentence, topic_span):
    """Pick the date/time expression of a sentence that belongs to the event.

    Date/time spans are detected with Parstdex and filtered with
    ``check_the_time``. With a single detection, its text is returned if it
    passes the filter. With several, spans ending before the topic span are
    preferred over spans starting after it, and ``find_leftmost`` selects the
    span within the preferred group.

    Args:
        sentence: the raw sentence text.
        topic_span: ``[start, end]`` of the event span; when it is empty or
            None no time is looked for.

    Returns:
        The selected date/time text, or an empty string when nothing qualifies.
    """
    if not topic_span:
        return ""

    datetimes = Parstdex().extract_span(sentence)['datetime']

    def text_of(span):
        return sentence[span[0]:span[1]]

    if len(datetimes) == 0:
        return ""

    if len(datetimes) == 1:
        only_text = text_of(datetimes[0])
        return only_text if check_the_time(only_text) else ""

    # Several detections: split the acceptable ones by their side of the topic span.
    before_topic, after_topic = [], []
    for span in datetimes:
        if not check_the_time(text_of(span)):
            continue
        if span[1] < topic_span[0]:
            before_topic.append(span)
        elif span[0] > topic_span[1]:
            after_topic.append(span)

    for group in (before_topic, after_topic):
        if len(group) != 0:
            return text_of(find_leftmost(topic_span, group))
    return ""

In [17]:
def run(sent):
    """Run the whole extraction pipeline on one sentence.

    Args:
        sent: the raw sentence text.

    Returns:
        A dict with the keys ``type`` (event label from ``all_events_lists``),
        ``text`` (event tokens joined with spaces), ``place``, ``time`` and
        ``span`` (the event's ``[start, end]`` offsets).
    """
    spans_per_sentence, event_type_indices = get_event(sent)

    first_result = spans_per_sentence[0]
    event_text = ' '.join(first_result[0])
    event_span = first_result[1]
    event_type = all_events_lists[event_type_indices[0]]

    # Time is resolved before place, both relative to the event span.
    event_time = find_timedate(sent, event_span)
    event_place = get_event_place(sent, event_span)

    return {
        "type": event_type,
        "text": event_text,
        "place": event_place,
        "time": event_time,
        "span": event_span,
    }

In [18]:
# Example input: the sentences to run the extraction pipeline on.
sents = ['دو هفته از استعفای نخست وزیر بریتانیا در انگلیس بزرگ می گذرد.']

In [19]:
# Run the pipeline on every example sentence and print the resulting event dict.
for sent in sents:
    print(run(sent))

100%|███████████████████████████████████████| 10/10 [00:00<00:00, 119837.26it/s]
100%|███████████████████████████████████████████| 1/1 [00:00<00:00, 1855.89it/s]
100%|███████████████████████████████████████████| 1/1 [00:00<00:00, 1281.09it/s]
100%|█████████████████████████████████████████| 10/10 [00:00<00:00, 1780.57it/s]
100%|█████████████████████████████████████████████| 1/1 [00:00<00:00,  5.66it/s]
100%|██████████████████████████████████████████| 1/1 [00:00<00:00, 18396.07it/s]
100%|████████████████████████████████████████| 10/10 [00:00<00:00, 22203.83it/s]
100%|███████████████████████████████████████| 10/10 [00:00<00:00, 107271.20it/s]
100%|██████████████████████████████████████████| 1/1 [00:00<00:00, 14364.05it/s]
100%|████████████████████████████████████████| 10/10 [00:00<00:00, 94466.31it/s]
100%|█████████████████████████████████████████████| 1/1 [00:14<00:00, 14.19s/it]
100%|███████████████████████████████████████████| 10/10 [01:53<00:00, 11.35s/it]
100%|███████████████████████

{'type': 'عزل و نصب و استعفا و اتخاب', 'text': 'استعفای نخست وزیر بریتانیا', 'place': 'انگلیس بزرگ', 'time': 'دو هفته', 'span': [10, 37]}
