In [22]:
from __future__ import print_function
from __future__ import division
from gensim.models import Doc2Vec, Word2Vec
import lib.text_extraction as te
from functools import partial
from lib.parallelize import parallelize
from collections import OrderedDict
import lib.embedding as em
import lib.words as wd
import pandas as pd
import numpy as np
import operator
import codecs
import math
import json
import csv
import os
import re

## Sentencing

In [2]:
test_dir = '../extraction/esempi_descrizioni/'

def list_dir(d):
    """Return the entries of directory ``d`` as paths prefixed with ``d``.

    The entries are joined onto ``d`` itself rather than onto a module-level
    directory, so the function gives correct paths for any directory.
    """
    return [os.path.join(d, f) for f in os.listdir(d)]

# List the directory once and derive the full paths from that same listing,
# so `filenames`, `full_filenames` and `txts` stay aligned when they are
# zipped together in later cells.
filenames = os.listdir(test_dir)
full_filenames = [os.path.join(test_dir, name) for name in filenames]

# Extract the text of every document serially (do_ocr=True, pages=-1).
txts = [te.extract_text(path, do_ocr=True, pages=-1) for path in full_filenames]

# The parallel variant below does not work with do_ocr=True, so it stays disabled:
#txts = parallelize(te.extract_text)(full_filenames, do_ocr=True, pages=-1)

../extraction/esempi_descrizioni/5115873790001.pdf Estrazione testo con Vision API.
../extraction/esempi_descrizioni/02628680346_2081942240001.pdf Estrazione testo con Vision API.


In [3]:
# Preview the first 1000 characters of every extracted document.
for doc_name, doc_text in zip(filenames, txts):
    print('Documento ', doc_name)
    preview = doc_text[:1000]
    print(preview)

Documento  5120119780001.pdf
^ƚƵĚŝŽEŽƚĂƌŝůĞDŽƌŽŶĞ͘
ϭϬϭϮϭdŽƌŝŶŽͲǀŝĂDĞƌĐĂŶƚŝŶŝϱ
dĞůнϯϵϬϭϭϱϲϮϮϱϮϮͲ&ĂǆнϯϵϬϭϭϱϲϭϮϮϳϭ
ƐŽĐŝĞƚĂƌŝŽΛƐƚƵĚŝŽŵŽƌŽŶĞ͘ŝƚͲǁǁǁ͘ƐƚƵĚŝŽŵŽƌŽŶĞ͘ŝƚ

ĚůĐ

Repertorio n. 72.928

Raccolta n. 10.320

Costituzione della "NEXITY TRENTADUE S.r.l.".

Repubblica Italiana
Il ventuno giugno duemiladiciassette,

in Torino, nel mio Studio in via Mercantini n. 5.
Avanti me, avv. Francesco PENE VIDARI,
Notaio iscritto al Collegio Notarile dei Distretti
Riuniti di Torino e Pinerolo, con residenza in Torino, è personalmente comparso il signor
Christian BRAGAGNOLO, nato a San Remo (IM) il 6 luglio 1977, domiciliato a Torino, corso Galileo Ferraris n. 110,
nella sua qualità di procuratore della
"NEXITY HOLDING ITALIA S.r.l.", con sede in Tori-

no, corso Galileo Ferraris n. 110, col capitale sociale
delle

di

euro

30.010.000,

imprese

-

ufficio

iscritta
di

nel

Torino

regist
Documento  5114243810002-2.pdf
allegato "A" rep. n.28012 e racc. n.13815
STATUTO
Articolo 1 - 

In [3]:
extracted_txts_dir = '../extraction/esempi_extracted/'

# Save each extracted text as a UTF-8 file named after its source document,
# with the original extension swapped for '.txt'.
for fn, txt in zip(filenames, txts):
    utxt = wd.to_utf8(txt)
    # splitext copes with extensions of any length; slicing off the last
    # three characters only worked for three-letter extensions such as 'pdf'.
    out_path = os.path.join(extracted_txts_dir, os.path.splitext(fn)[0] + '.txt')
    with codecs.open(out_path, 'w', encoding='utf-8') as o:
        o.write(utxt)


In [4]:
sentences_dir = '../extraction/esempi_sentences/'

sentences = [wd.sentences_doc(txt) for txt in txts]

# Write one sentence per line: newlines inside a sentence become spaces and
# sentences that are empty after stripping are skipped.
for fn, doc_sentences in zip(filenames, sentences):
    # splitext copes with extensions of any length; slicing off the last
    # three characters only worked for three-letter extensions such as 'pdf'.
    out_path = os.path.join(sentences_dir, os.path.splitext(fn)[0] + '.txt')
    with codecs.open(out_path, 'w', encoding='utf-8') as o:
        o.write('\n'.join(s.replace('\n', ' ').strip() for s in doc_sentences if s.strip()))

## Saving sentences

In [5]:
# Apply wd.sentences_doc to every extracted text (one result per document).
sentenced_txts = list(map(wd.sentences_doc, txts))

In [6]:
def sentence_label_csv_empty(filenames, sentenced_txts, csv_out, sep=u'\t', min_len=10):
    """Write a labelling template with one row per sentence to ``csv_out``.

    The output starts with the header ``filename, sent_index, sentence, label``
    and every row carries the placeholder label ``non_rilevante``.

    Args:
        filenames: document names, aligned with ``sentenced_txts``.
        sentenced_txts: for each document, its sequence of sentence strings.
        csv_out: writable text stream; only ``write`` is called on it.
        sep: field separator; occurrences inside a sentence are removed.
        min_len: minimum length of the cleaned sentence for a row to be written.
    """
    header = [u'filename', u'sent_index', u'sentence', u'label']
    csv_out.write(sep.join(header) + u'\n')
    for f, sents in zip(filenames, sentenced_txts):
        for i, sent in enumerate(sents):
            # Keep every sentence on one physical line: drop the separator and
            # turn any line break (\r\n, \r or \n) into a single space, so a
            # stray carriage return cannot split the row.
            clean_sent = (sent.replace(sep, u'')
                              .replace(u'\r\n', u' ')
                              .replace(u'\r', u' ')
                              .replace(u'\n', u' ')
                              .strip())
            if len(clean_sent) >= min_len:
                # Format each field on its own and join afterwards; formatting
                # the joined template would misread a separator containing
                # '{' or '}' as a replacement field.
                fields = [u'{}'.format(f), u'{}'.format(i),
                          u'{}'.format(clean_sent), u'non_rilevante']
                csv_out.write(sep.join(fields) + u'\n')
    

In [7]:
csv_filename = '../extraction/sentence_labels.csv'
with codecs.open(csv_filename, 'w', encoding='utf-8') as csv_out:
    # Reuse the `filenames` listing already used by the earlier export cells
    # rather than listing the directory again: its contents may have changed
    # since the texts were extracted, which would silently misalign the rows.
    sentence_label_csv_empty(filenames, sentenced_txts, csv_out)

In [8]:
# Read the generated template back (tab-separated, UTF-8, quoting disabled).
df = pd.read_csv(
    csv_filename,
    encoding='utf-8',
    quoting=csv.QUOTE_NONE,
    sep='\t',
)

In [9]:
# Show a bounded preview instead of dumping the whole frame.
df.head(10)

Unnamed: 0,filename,sent_index,sentence,label
0,5120119780001.pdf,0,^ E D d D d & Repertorio n 72928 Raccolta n 10...,non_rilevante
1,5120119780001.pdf,1,Repubblica Italiana Il ventuno giugno duemilad...,non_rilevante
2,5120119780001.pdf,2,"Detto signor comparente, della identità person...",non_rilevante
3,5120119780001.pdf,3,- 2 Sede La società ha sede nel Comune di Torino,non_rilevante
4,5120119780001.pdf,4,L'indirizzo della stessa è in corso Galileo Fe...,non_rilevante
5,5120119780001.pdf,5,"- l'acquisto, la vendita, la permuta e la gest...",non_rilevante
6,5120119780001.pdf,6,"- la lottizzazione di aree residenziali, indus...",non_rilevante
7,5120119780001.pdf,7,- la formazione di comparti secondo le normati...,non_rilevante
8,5120119780001.pdf,8,la partecipazione alla costituzione di consorz...,non_rilevante
9,5120119780001.pdf,9,la stipula di convenzioni ed atti d'obbligo pe...,non_rilevante


## Experiments on labelled sentences

In [45]:
def cosine_similarity(a, b):
    """Cosine of the angle between the vectors ``a`` and ``b``.

    Returns 0.0 when either vector has zero norm, instead of dividing by
    zero (which produced ``nan`` and a RuntimeWarning).
    """
    norm_product = np.linalg.norm(a) * np.linalg.norm(b)
    if norm_product == 0:
        return 0.0
    return np.dot(a, b) / norm_product


In [4]:
labeled_csv_filename = '../extraction/sentence_manual_labels.csv'

# Load the manually labelled sentences (tab-separated, UTF-8, quoting disabled).
ldf = pd.read_csv(
    labeled_csv_filename,
    encoding='utf-8',
    quoting=csv.QUOTE_NONE,
    sep='\t',
)

In [5]:
# Pick the sentences of each label with a single .loc[rows, column] lookup,
# then lower-case every token returned by wd.splitted_words_utf8.
poteri = ldf.loc[ldf['label'] == 'poteri', 'sentence']
gestione = ldf.loc[ldf['label'] == 'gestione', 'sentence']
split_poteri = [
    [token.lower() for token in wd.splitted_words_utf8(sent)]
    for sent in poteri
]
split_gestione = [
    [token.lower() for token in wd.splitted_words_utf8(sent)]
    for sent in gestione
]

In [19]:
def word_counts(sentences):
    """Count how often each word occurs across ``sentences``.

    ``sentences`` is an iterable of iterables of hashable words; the result
    is a dict mapping each word to its total number of occurrences.
    """
    counts = dict()
    for sentence in sentences:
        for word in sentence:
            counts[word] = counts.get(word, 0) + 1
    return counts

def first_n_words(sentences, n):
    """Return the ``n`` most frequent words as ``(word, count)`` pairs.

    Pairs are ordered from most to least frequent. A non-positive ``n``
    yields an empty list.
    """
    if n <= 0:
        # Without this guard n == 0 would slice with [-0:], i.e. the whole
        # list, and return every word instead of none.
        return []
    wc = word_counts(sentences)
    sorted_wc = sorted(wc.items(), key=operator.itemgetter(1))
    # Take the n largest counts from the ascending list and flip them.
    return sorted_wc[-n:][::-1]


In [35]:
# Display the 50 most frequent words of the 'poteri' sentences with their counts.
first_n_words(split_poteri, 50)

[(u'di', 44),
 (u'e', 31),
 (u'poteri', 27),
 (u'per', 24),
 (u'in', 23),
 (u'dei', 23),
 (u'della', 22),
 (u'o', 19),
 (u'i', 19),
 (u'la', 18),
 (u'il', 17),
 (u'amministrazione', 15),
 (u'del', 14),
 (u'che', 14),
 (u'straordinaria', 13),
 (u'con', 13),
 (u'societ\xe0', 13),
 (u'pi\xf9', 12),
 (u'tutti', 12),
 (u'ordinaria', 12),
 (u'legge', 11),
 (u'soci', 10),
 (u'organo', 10),
 (u'amministrativo', 10),
 (u'presente', 10),
 (u'ad', 9),
 (u'suoi', 9),
 (u'ed', 9),
 (u'atto', 8),
 (u'l', 8),
 (u'dall', 7),
 (u'al', 7),
 (u'ai', 7),
 (u'ampi', 7),
 (u'delle', 7),
 (u'\xe8', 7),
 (u'dell', 7),
 (u'Il', 6),
 (u'componenti', 6),
 (u'dalla', 6),
 (u'articolo', 6),
 (u'statuto', 6),
 (u'pu\xf2', 6),
 (u'L', 6),
 (u'le', 6),
 (u'sociale', 6),
 (u'gestione', 6),
 (u'atti', 5),
 (u'quanto', 5),
 (u'facolt\xe0', 5)]

## Tf Idf


In [8]:
def tf(word, sentence):
    '''Term frequency in the sentence.

    Fraction of the items of ``sentence`` that equal ``word``. An empty
    sentence gives 0.0 instead of raising ZeroDivisionError.
    '''
    length = len(sentence)
    if length == 0:
        return 0.0
    # float() keeps this a true division even without the __future__ import.
    return sum(1 for w in sentence if w == word) / float(length)

def idf(word, documents):
    '''Inverse document frequency.

    Computed as ``log(D / (1 + n))`` where ``D`` is the number of documents
    and ``n`` the number of documents containing ``word``. The value is
    negative when the word occurs in every document, because of the ``1 +``
    in the denominator. An empty ``documents`` gives 0.0 instead of a
    ``log(0)`` math domain error.
    '''
    D = len(documents)
    if D == 0:
        return 0.0
    den = 1 + sum(1 for sentence in documents if word in sentence)
    # float() keeps this a true division even without the __future__ import.
    return math.log(float(D) / den)

def idf_memo(documents):
    '''Return a one-argument idf function bound to ``documents`` that caches its result per word'''
    word_idf = {}
    def memoized_idf(word):
        try:
            return word_idf[word]
        except KeyError:
            # First request for this word: compute it once and remember it.
            result = word_idf[word] = idf(word, documents)
            return result
    return memoized_idf

def tf_idf(word, sentence, sentences):
    '''Uncached tf-idf of ``word`` in ``sentence`` relative to the corpus ``sentences``'''
    term_frequency = tf(word, sentence)
    inverse_document_frequency = idf(word, sentences)
    return term_frequency * inverse_document_frequency

def tf_idf_memo(documents):
    '''Return a (word, sentence) tf-idf function whose idf part is cached for ``documents``'''
    cached_idf = idf_memo(documents)
    def tf_idf_memoized(word, sentence):
        term_frequency = tf(word, sentence)
        return term_frequency * cached_idf(word)
    return tf_idf_memoized
    

In [15]:
# Lower-cased tokens for every labelled sentence, then a tf-idf scorer bound
# to that whole corpus.
split_sentences = [
    [token.lower() for token in wd.splitted_words_utf8(sentence_text)]
    for sentence_text in ldf['sentence']
]

tf_idf_memoized = tf_idf_memo(split_sentences)

In [16]:
# Vocabulary of the 'poteri' sentences.
poteri_words = {word for sent in split_poteri for word in sent}

# For each vocabulary word, the mean of its tf-idf over all 'poteri' sentences.
poteri_words_mean_tfidf = [
    (word, np.mean([tf_idf_memoized(word, sent) for sent in split_poteri]))
    for word in poteri_words
]

In [17]:
# Rank the (word, mean tf-idf) pairs by score, highest first.
sorted_poteri_words = sorted(
    poteri_words_mean_tfidf,
    key=lambda pair: pair[1],
    reverse=True,
)

In [19]:
def print_parole_tf_idf(sorted_words):
    '''Print a two-column table: each word and its score with four decimals.'''
    header = u'{:20}\t{:20}\n'.format(u'parola', u'mean tf-idf')
    print(header)
    row_template = u'{:20}\t{:1.4f}'
    for entry in sorted_words:
        parola, punteggio = entry
        print(row_template.format(parola, punteggio))

In [20]:
# Print the 'poteri' vocabulary ranked by mean tf-idf.
print_parole_tf_idf(sorted_poteri_words)

parola              	mean tf-idf         

poteri              	0.1284
straordinaria       	0.0673
ordinaria           	0.0660
amministrativo      	0.0476
organo              	0.0426
amministrazione     	0.0425
ampi                	0.0395
tutti               	0.0390
atti                	0.0370
i                   	0.0327
per                 	0.0312
dei                 	0.0281
più                 	0.0279
o                   	0.0273
eccezione           	0.0261
società             	0.0246
suoi                	0.0240
della               	0.0236
articolo            	0.0228
amministratori      	0.0221
gestione            	0.0220
determinandone      	0.0218
investito           	0.0213
facoltà             	0.0210
l                   	0.0210
sede                	0.0208
institori           	0.0207
legge               	0.0206
e                   	0.0206
compimento          	0.0202
direttori           	0.0198
limiti              	0.0196
consiglio           	0.0195
procuratori         	0.0191
nomin

In [106]:
# Map each 'poteri' word to its mean tf-idf, keeping the descending ranking order.
# The trailing '#[:8]' is a disabled slice, presumably to keep only the top 8 words.
most_relevant_poteri_words = OrderedDict(sorted_poteri_words) #[:8]

In [107]:
def score_sentence(sentence, most_relevant_words):
    '''Cosine similarity between word frequencies in ``sentence`` and reference weights.

    Args:
        sentence: sequence of words (must support ``len``).
        most_relevant_words: mapping word -> weight (e.g. mean tf-idf).

    Returns:
        The cosine similarity between the per-word frequencies in
        ``sentence`` and the corresponding weights, or 0.0 when the sentence
        is empty, contains none of the reference words, or all weights are
        zero.
    '''
    words = list(most_relevant_words.keys())
    length = len(sentence)
    if length == 0 or not words:
        # Nothing to compare; also avoids dividing the counts by zero.
        return 0.0

    # Materialise the weights in key order: under Python 3 ``.values()`` is a
    # view that numpy cannot turn into a numeric vector directly.
    weights = np.array([most_relevant_words[w] for w in words], dtype=float)

    counts = dict.fromkeys(words, 0)
    for w in sentence:
        if w in counts:
            counts[w] += 1
    frequencies = np.array([counts[w] for w in words], dtype=float) / length

    freq_norm = np.linalg.norm(frequencies)
    weight_norm = np.linalg.norm(weights)
    if freq_norm == 0 or weight_norm == 0:
        return 0.0
    # Same formula as cosine_similarity, written out so that each norm is
    # computed only once.
    # Alternatives previously tried here: the Euclidean distance between the
    # two vectors, and their plain dot product.
    return np.dot(frequencies, weights) / (freq_norm * weight_norm)

In [108]:
# Score every 'poteri' sentence against the 'poteri' reference weights.
[score_sentence(split_pot, most_relevant_poteri_words) for split_pot in split_poteri]

[0.38570616300558647,
 0.58534772175788674,
 0.32071403807653059,
 0.61738434987566604,
 0.26306916314207046,
 0.40527077620141305,
 0.63526136341885586,
 0.34283687678639624,
 0.36189914226463082,
 0.40431332832667749,
 0.52791809914751731,
 0.38540274558748921,
 0.41595042269581189,
 0.4596390978412494,
 0.37239106999863142,
 0.26618153527642058,
 0.48379300646242757,
 0.36963601392163203,
 0.50993107328187937,
 0.47856048128159984,
 0.49657094962750886,
 0.58162380948226522,
 0.49796853213121023,
 0.47420431140530711,
 0.57498582412756161,
 0.63042180388787272]

In [109]:
# Score one sentence of the full labelled corpus against the 'poteri' weights.
# NOTE(review): 1432 is a hard-coded row index; which label it carries is not visible here.
score_sentence(split_sentences[1432], most_relevant_poteri_words)

0.27689511679952528

In [110]:
# Vocabulary of the 'gestione' sentences.
gestione_words = {word for sent in split_gestione for word in sent}

# Mean tf-idf of every vocabulary word over the 'gestione' sentences,
# then ranked from the highest score down.
gestione_words_mean_tfidf = [
    (word, np.mean([tf_idf_memoized(word, sent) for sent in split_gestione]))
    for word in gestione_words
]
sorted_gestione_words = sorted(
    gestione_words_mean_tfidf,
    key=lambda pair: pair[1],
    reverse=True,
)

In [111]:
# Print the 'gestione' vocabulary ranked by mean tf-idf.
print_parole_tf_idf(sorted_gestione_words)

parola              	mean tf-idf         

assemblea           	0.0485
controllo           	0.0317
organo              	0.0293
rappresentanza      	0.0274
amministratore      	0.0269
l                   	0.0266
31                  	0.0266
dicembre            	0.0254
presidente          	0.0243
amministrazione     	0.0238
articolo            	0.0230
unico               	0.0224
da                  	0.0221
un                  	0.0221
o                   	0.0215
dei                 	0.0208
dal                 	0.0204
esercizi            	0.0198
chiuderà            	0.0193
si                  	0.0190
primo               	0.0189
società             	0.0187
soci                	0.0182
convocazione        	0.0181
dell                	0.0175
dall                	0.0171
convocata           	0.0168
sociale             	0.0167
legale              	0.0165
sociali             	0.0162
conti               	0.0162
revisione           	0.0162
è                   	0.0161
di                  	0.0154
a    

In [112]:
# Map each 'gestione' word to its mean tf-idf, keeping the descending ranking order.
most_relevant_gestione_words = OrderedDict(sorted_gestione_words)

In [113]:
# Score every 'gestione' sentence against the 'gestione' reference weights.
[score_sentence(split_gest, most_relevant_gestione_words) for split_gest in split_gestione]

[0.3038863673825753,
 0.46664604496921935,
 0.34662686090044803,
 0.35372790859717074,
 0.33727837488428181,
 0.42013215983112162,
 0.32470634647879376,
 0.39172311802364629,
 0.33335720480572439,
 0.28217153710533943,
 0.34309685870847967,
 0.22946878987717789,
 0.35827531808308721,
 0.49472970036370173,
 0.32126578821166674,
 0.27792788251220135,
 0.42171938608835141,
 0.29654566194437559,
 0.31419504714081525,
 0.37140630135121849,
 0.38070540661225749,
 0.25634679051701076,
 0.44532684494529456,
 0.45747018367767989,
 0.4007879623716798,
 0.32222686150334662,
 0.3461823247663886,
 0.34233000951055237,
 0.36444746779115178,
 0.46598085360769281,
 0.27242318468606741,
 0.26339354399984671,
 0.30206505250885779,
 0.43249634647882323,
 0.31732354566431809,
 0.43729233654102256,
 0.39937243200321387,
 0.4716359382652886,
 0.38942921324879815,
 0.42029230240927307,
 0.40259701800356101,
 0.42540959415916474,
 0.2550619309941442,
 0.4301874013978888,
 0.40259701800356101,
 0.3505835998147

## Embeddings test

In [None]:
# Apply wd.tokenize_doc to every extracted text (one result per document).
txts_tokenized = list(map(wd.tokenize_doc, txts))

In [None]:
reduced_dictionary_filename = 'first_5000_words.json'
gensim_model_filename = 'models/gensim_model_5000.d2v'

# The JSON file is loaded into a set for fast membership tests.
with open(reduced_dictionary_filename) as dictionary_file:
    reduced_dictionary = set(json.load(dictionary_file))

# Load the previously saved Doc2Vec model.
gensim_model = Doc2Vec.load(gensim_model_filename)

In [None]:
# One em.embed_document result per tokenized document.
embeddings = [
    em.embed_document(gensim_model, tokenized_doc, reduced_dictionary)
    for tokenized_doc in txts_tokenized
]

In [None]:
# Shorthand for em.sentence_vector with the model and the permitted words
# pre-bound; callers only pass `sentence=`.
sv = partial(
    em.sentence_vector,
    permitted_words=reduced_dictionary,
    model=gensim_model,
)

In [None]:
p1 = u'All\'organo amministrativo sono conferiti i più ampi poteri, sia per la gestione ordinaria che straordinaria della Società'
p2 = u'ARTICOLO 19 - POTERI DELL\'ORGANO AMMINISTRATIVO 191 L\'organo amministrativo ha tutti i poteri di ordinaria e straordinaria amministrazione'
# Embed the two lower-cased example sentences, then compare them.
vec_p1 = sv(sentence=p1.lower())
vec_p2 = sv(sentence=p2.lower())
cosine_similarity(vec_p1, vec_p2)

In [None]:
# Embed each 'poteri' sentence once and reuse the vectors; the pairwise loop
# below would otherwise re-embed every sentence for every pair.
# NOTE(review): sv receives a token list here but a plain string in the
# p1/p2 cell — confirm which form em.sentence_vector expects.
poteri_vectors = [sv(sentence=sent) for sent in split_poteri]

# For every sentence j, the mean cosine similarity with all the other sentences.
mean_cosines_poteri = [
    np.mean([cosine_similarity(poteri_vectors[i], poteri_vectors[j])
             for i in range(len(poteri_vectors)) if i != j])
    for j in range(len(poteri_vectors))
]

In [None]:
# Display the per-sentence mean cosine similarities computed above.
mean_cosines_poteri

## Word2Vec


In [None]:
# Flatten the per-document lists into one list of items
# (assumes each tokenized document is a sequence of tokenized sentences — TODO confirm).
# TODO: also restrict the tokens to the reduced dictionary.
sentences_tokenized = [sent for doc in txts_tokenized for sent in doc]

In [None]:
# Train a Word2Vec model on the flattened sentences.
# NOTE(review): `size` is the pre-4.0 gensim name of this parameter (renamed to
# `vector_size` in gensim 4) — check the installed version before upgrading.
w2v = Word2Vec(sentences_tokenized, size=100, window=5, min_count=5, workers=4)

In [None]:
# Look up the learned vector for the word 'costitutivo'.
w2v.wv['costitutivo']

In [None]:
# Cosine similarity between the learned vectors of 'atto' and 'costitutivo'.
cosine_similarity(w2v.wv['atto'], w2v.wv['costitutivo'])

In [None]:
# Query the word vectors through `w2v.wv`, as the neighbouring cells do;
# the model-level `most_similar` is a deprecated alias.
w2v.wv.most_similar('poteri')

In [None]:
# Query the word vectors through `w2v.wv`, as the neighbouring cells do;
# the model-level `most_similar` is a deprecated alias.
w2v.wv.most_similar(positive=['atto','costitutivo'])