In [1]:
from __future__ import print_function
from __future__ import division
from gensim.models import Doc2Vec, Word2Vec
import lib.text_extraction as te
from functools import partial
from lib.parallelize import parallelize
from collections import OrderedDict
import lib.embedding as em
import lib.words as wd
import pandas as pd
import numpy as np
import operator
import codecs
import math
import json
import csv
import os
import re

Using TensorFlow backend.


## Sentence splitting

In [None]:
# Directory whose files are listed and passed to the text extractor below.
test_dir = '../extraction/esempi_descrizioni/'

def list_dir(d):
    '''Return the entries of directory `d`, each joined with `d`.

    The paths are built from the directory that was actually listed, so the
    result is valid for any `d` and not only for the global `test_dir`.
    Entries come in the order returned by `os.listdir`.
    '''
    return [os.path.join(d, f) for f in os.listdir(d)]

# List the directory once, so that `filenames` and `full_filenames` are
# guaranteed to describe the same files in the same order (they are zipped
# with the extracted texts further down).
filenames = os.listdir(test_dir)
full_filenames = [os.path.join(test_dir, f) for f in filenames]

# Extraction runs sequentially: the parallel variant kept below for reference
# does not work when OCR is enabled.
txts = [te.extract_text(f, do_ocr=True, pages=-1) for f in full_filenames]

#txts = parallelize(te.extract_text)(full_filenames, do_ocr=True, pages=-1) #Doesn't work with do_ocr=True

In [None]:
# Preview the first 1000 characters extracted from every document.
for doc_name, doc_text in zip(filenames, txts):
    print('Documento ', doc_name)
    print(doc_text[:1000])

In [None]:
extracted_txts_dir = '../extraction/esempi_extracted/'

# Create the output directory if needed instead of failing inside open().
if not os.path.isdir(extracted_txts_dir):
    os.makedirs(extracted_txts_dir)

for fn, txt in zip(filenames, txts):
    utxt = wd.to_utf8(txt)
    # splitext() copes with extensions of any length; cutting the last three
    # characters is only right for three-letter extensions such as ".pdf".
    out_path = os.path.join(extracted_txts_dir, os.path.splitext(fn)[0] + '.txt')
    with codecs.open(out_path, 'w', encoding='utf-8') as o:
        o.write(utxt)


In [None]:
sentences_dir = '../extraction/esempi_sentences/'

# Create the output directory if needed instead of failing inside open().
if not os.path.isdir(sentences_dir):
    os.makedirs(sentences_dir)

sentences = [wd.sentences_doc(txt) for txt in txts]

# One output file per document, one non-empty sentence per line.
for fn, doc_sentences in zip(filenames, sentences):
    # splitext() copes with extensions of any length; cutting the last three
    # characters is only right for three-letter extensions such as ".pdf".
    out_path = os.path.join(sentences_dir, os.path.splitext(fn)[0] + '.txt')
    with codecs.open(out_path, 'w', encoding='utf-8') as o:
        o.write('\n'.join(s.replace('\n',' ').strip() for s in doc_sentences if s.strip()))

## Saving sentences

In [None]:
# Split every extracted text into its list of sentences.
sentenced_txts = list(map(wd.sentences_doc, txts))

In [None]:
def sentence_label_csv_empty(filenames, sentenced_txts, csv_out, sep=u'\t', min_len=10):
    '''Write a labelling template to `csv_out`, one row per sentence.

    A header row (filename, sent_index, sentence, label) is written first.
    Each sentence has the separator removed, newlines turned into spaces and
    surrounding whitespace stripped; sentences shorter than `min_len`
    characters after cleaning are skipped.  `sent_index` is the position of
    the sentence in its document before that filtering.  Every row gets the
    placeholder label 'non_rilevante'.

    filenames      -- one name per document, aligned with `sentenced_txts`
    sentenced_txts -- one list of sentences per document
    csv_out        -- object with a `write` method accepting unicode text
    sep            -- field separator
    min_len        -- minimum length of a cleaned sentence to be written
    '''
    default_label = u'non_rilevante'
    csv_out.write(sep.join([u'filename',u'sent_index',u'sentence',u'label']) + u'\n')
    for f, sents in zip(filenames, sentenced_txts):
        for i, sent in enumerate(sents):
            clean_sent = sent.replace(sep,u'').replace(u'\n',u' ').strip()
            if len(clean_sent)>=min_len:
                # Format each field on its own and join afterwards: formatting
                # the joined template breaks when `sep` itself contains braces.
                fields = [u'{}'.format(f), u'{}'.format(i), clean_sent, default_label]
                csv_out.write(sep.join(fields) + u'\n')
    

In [None]:
csv_filename = '../extraction/sentence_labels.csv'
with codecs.open(csv_filename, 'w', encoding='utf-8') as csv_out:
    # Reuse the listing the texts were extracted from rather than listing the
    # directory again, so every row is attributed to the right document.
    sentence_label_csv_empty(filenames, sentenced_txts, csv_out)

In [None]:
# Read the template back; QUOTE_NONE makes the parser treat quote characters
# inside sentences as ordinary text.
df = pd.read_csv(csv_filename, sep='\t', encoding='utf-8', quoting=csv.QUOTE_NONE)

In [None]:
# Show only the first rows instead of dumping the whole frame.
df.head()

In [None]:
# First manually labelled version of the sentence file.
# NOTE(review): `dfm` is not referenced by any later cell visible here; the
# experiments below load '../extraction/sentence_manual_labels3.csv' instead.
labelled_filename = '../extraction/sentence_manual_labels.csv'
dfm = pd.read_csv(labelled_filename, sep='\t', encoding='utf-8', quoting=csv.QUOTE_NONE)

## Experiments on labelled sentences

In [4]:
def cosine_similarity(a,b):
    '''Cosine of the angle between vectors `a` and `b`.

    No guard is applied for zero-length vectors; callers check for them.
    '''
    norm_product = np.linalg.norm(a) * np.linalg.norm(b)
    return np.dot(a, b) / norm_product

def splitted_words_lower(sentences):
    '''Split every sentence into words and lower-case each word.

    Returns one list of words per input sentence, in the same order.
    '''
    lowered = []
    for sentence in sentences:
        lowered.append([word.lower() for word in wd.splitted_words_utf8(sentence)])
    return lowered


In [3]:
labeled_csv_filename = '../extraction/sentence_manual_labels3.csv'

# Labelled sentences used by the experiments below ('sentence' and 'label' columns).
ldf = pd.read_csv(
    labeled_csv_filename,
    sep='\t',
    encoding='utf-8',
    quoting=csv.QUOTE_NONE
)

In [14]:
# Sentences of each manually assigned class.
poteri = ldf.loc[ldf['label'] == 'poteri', 'sentence']
gestione = ldf.loc[ldf['label'] == 'gestione', 'sentence']
finanziario = ldf.loc[ldf['label'] == 'finanziario', 'sentence']
clausole = ldf.loc[ldf['label'] == 'clausola', 'sentence']

# The same sentences as lists of lower-cased words, keyed by label.
splits = {
    'poteri': splitted_words_lower(poteri),
    'gestione': splitted_words_lower(gestione),
    'finanziario': splitted_words_lower(finanziario),
    'clausola': splitted_words_lower(clausole),
}

In [6]:
def word_counts(sentences):
    '''Count the occurrences of each word across all `sentences`.

    `sentences` is an iterable of word sequences; the result is a plain dict
    mapping each word to its total count.
    '''
    counts = {}
    for sentence in sentences:
        for word in sentence:
            counts[word] = counts.get(word, 0) + 1
    return counts

def first_n_words(sentences, n):
    '''Return the `n` most frequent words as (word, count) pairs.

    Pairs are ordered from the most to the least frequent.  A non-positive
    `n` yields an empty list.
    '''
    if n <= 0:
        # Without this guard the slice [-0:] would select the whole list.
        return []
    wc = word_counts(sentences)
    sorted_wc = sorted(wc.items(), key=operator.itemgetter(1))
    return list(reversed(sorted_wc[-n:]))


In [None]:
# `split_poteri` is not defined anywhere in the notebook; the tokenised
# 'poteri' sentences live in `splits`.
first_n_words(splits['poteri'], 50)

## TF-IDF


In [7]:
def tf(word, sentence):
    '''Term frequency in the sentence

    Number of occurrences of `word` in `sentence` divided by the sentence
    length.  An empty sentence gives 0.0 instead of a division by zero.
    '''
    length = len(sentence)
    if length == 0:
        return 0.0
    # float() keeps this a true division independently of the file-level
    # `from __future__ import division`.
    return sum(1 for w in sentence if w == word) / float(length)

def idf(word, documents):
    '''Inverse document frequency

    Computed as log(D / (1 + n)), where D is the number of documents and n
    the number of documents containing `word`.  The +1 keeps the denominator
    non-zero for unseen words.  An empty collection gives 0.0 instead of
    raising a math domain error.
    '''
    D = len(documents)
    if D == 0:
        return 0.0
    den = 1+sum(1 for sentence in documents if word in sentence)
    # float() keeps this a true division independently of the file-level
    # `from __future__ import division`.
    return math.log(float(D)/den)

def idf_memo(documents):
    '''Build an idf function bound to `documents` that caches its results.

    The returned callable takes a word and returns its idf, computing it
    only the first time that word is requested.
    '''
    cache = {}
    def memoized_idf(word):
        try:
            return cache[word]
        except KeyError:
            value = cache[word] = idf(word, documents)
            return value
    return memoized_idf

def tf_idf(word, sentence, sentences):
    '''Plain (non-memoized) tf-idf of `word` in `sentence` over `sentences`.'''
    term_frequency = tf(word, sentence)
    inverse_document_frequency = idf(word, sentences)
    return term_frequency * inverse_document_frequency

def tf_idf_memo(documents):
    '''Build a tf-idf function whose idf part is memoized over `documents`.

    The returned callable takes (word, sentence); the term frequency is
    recomputed on every call, the idf is cached per word.
    '''
    cached_idf = idf_memo(documents)
    def tf_idf_memoized(word, sentence):
        frequency = tf(word, sentence)
        return frequency * cached_idf(word)
    return tf_idf_memoized
    

In [8]:
# On all sentences: tokenise with the same helper used for the per-label
# splits, so both go through identical preprocessing.
split_sentences = splitted_words_lower(ldf['sentence'])
tf_idf_memoized = tf_idf_memo(split_sentences)

In [9]:
def sorted_tfidf_words(splitted_sentences , tf_idf_memoized):
    '''Rank the words of `splitted_sentences` by their mean tf-idf.

    For every distinct word, `tf_idf_memoized(word, sentence)` is averaged
    over all the given sentences.  Returns (word, mean tf-idf) pairs sorted
    by decreasing score; words with the same score are ordered
    alphabetically so the ranking does not depend on set iteration order.
    '''
    words = set(word for sent in splitted_sentences for word in sent)
    words_mean_tfidf = [(word, np.mean([tf_idf_memoized(word, s) for s in splitted_sentences])) for word in words]
    return sorted(words_mean_tfidf, key=lambda pair: (-pair[1], pair[0]))

In [15]:
# Ranked (word, mean tf-idf) list for every label.
sorted_words = {}
for label, label_sentences in splits.items():
    sorted_words[label] = sorted_tfidf_words(label_sentences, tf_idf_memoized)


In [17]:
def print_parole_tf_idf(sorted_words, n = -1):
    '''Print a two-column table of words and their mean tf-idf.

    `sorted_words` is a list of (word, score) pairs; only the first `n` are
    printed, or all of them when `n` is negative.
    '''
    header = u'{:20}\t{:20}\n'.format(u'parola',u'mean tf-idf')
    print(header)
    limit = len(sorted_words) if n < 0 else n
    for parola, valore in sorted_words[:limit]:
        riga = u'{:20}\t{:1.4f}'.format(parola, valore)
        print(riga)

In [19]:
# Top ten words of every label.
for label, ranked_words in sorted_words.items():
    print('{}:'.format(label))
    print_parole_tf_idf(ranked_words, 10)
    print()

clausola:
parola              	mean tf-idf         

recesso             	0.0541
diritto             	0.0317
socio               	0.0300
il                  	0.0274
esclusione          	0.0249
trasferimento       	0.0249
esercitato          	0.0247
prelazione          	0.0240
dalla               	0.0227
che                 	0.0211

poteri:
parola              	mean tf-idf         

poteri              	0.1247
ordinaria           	0.0654
straordinaria       	0.0654
amministrativo      	0.0481
organo              	0.0431
amministrazione     	0.0422
ampi                	0.0393
tutti               	0.0375
atti                	0.0365
i                   	0.0310

gestione:
parola              	mean tf-idf         

assemblea           	0.0484
controllo           	0.0311
organo              	0.0292
rappresentanza      	0.0288
l                   	0.0268
amministratore      	0.0267
31                  	0.0264
dicembre            	0.0254
esercizi            	0.0242
presidente          	0.0241



In [20]:
# Word -> mean tf-idf mapping for the 'poteri' label, kept in ranked order.
# The commented-out slice would restrict it to the first eight words.
most_relevant_poteri_words = OrderedDict(sorted_words['poteri']) #[:8]

In [27]:
def score_sentence_frequency(sentence, most_relevant_words):
    '''Cosine similarity between the frequency of the relevant words in
    `sentence` and the tf-idf of those words.

    sentence            -- sequence of words
    most_relevant_words -- ordered mapping word -> tf-idf; its order defines
                           the components of both vectors

    Returns 0 when the sentence is empty, when it contains none of the
    relevant words, or when every reference tf-idf is zero.
    '''
    length = len(sentence)
    if length == 0:
        # Avoid dividing the counts by a zero sentence length.
        return 0
    counts = OrderedDict((w, 0) for w in most_relevant_words.keys())
    for w in sentence:
        if w in counts:
            counts[w] += 1
    frequencies = np.array([v / float(length) for v in counts.values()], dtype=float)
    # list() is required on Python 3, where values() is a view that numpy
    # would not turn into a numeric vector.
    reference = np.array(list(most_relevant_words.values()), dtype=float)
    freq_norm = np.linalg.norm(frequencies)
    ref_norm = np.linalg.norm(reference)
    if freq_norm == 0 or ref_norm == 0:
        return 0
    # Same formula as cosine_similarity(), reusing the norms computed above.
    return np.dot(frequencies, reference) / (freq_norm * ref_norm)
    
def score_sentence_tf_idf(sentence, most_relevant_words, tf_idf_memoized):
    '''Cosine similarity between the tf-idf of the relevant words in
    `sentence` and the reference tf-idf of those words.

    sentence            -- sequence of words
    most_relevant_words -- ordered mapping word -> tf-idf; its order defines
                           the components of both vectors
    tf_idf_memoized     -- callable (word, sentence) -> tf-idf

    Words of the sentence that are not in `most_relevant_words` are ignored.
    Returns 0 when either vector has zero norm.
    '''
    tfidfs = OrderedDict((w, 0) for w in most_relevant_words.keys())
    for w in set(sentence):
        # Only vocabulary words are scored, so the vector that is checked for
        # a zero norm is exactly the vector that gets compared.
        if w in tfidfs:
            tfidfs[w] = tf_idf_memoized(w, sentence)
    # list() is required on Python 3, where values() is a view that numpy
    # would not turn into a numeric vector.
    sentence_vector = np.array(list(tfidfs.values()), dtype=float)
    reference = np.array(list(most_relevant_words.values()), dtype=float)
    sentence_norm = np.linalg.norm(sentence_vector)
    ref_norm = np.linalg.norm(reference)
    if sentence_norm == 0 or ref_norm == 0:
        return 0
    # Same formula as cosine_similarity(), reusing the norms computed above.
    return np.dot(sentence_vector, reference) / (sentence_norm * ref_norm)

In [28]:
# Reuse the mapping built once above instead of rebuilding an OrderedDict
# for every sentence.
[score_sentence_tf_idf(split, most_relevant_poteri_words, tf_idf_memoized) for split in splits['poteri']]

[0.32379071659198394,
 0.59740762729606212,
 0.30764712135823419,
 0.60669772109575748,
 0.21387284004817195,
 0.35332138094439314,
 0.6415739240343038,
 0.32936917946886574,
 0.32770262920227322,
 0.37422635942514959,
 0.52949653034409405,
 0.32299077118895225,
 0.35812915982090548,
 0.40768696066110732,
 0.30846815664977711,
 0.19185333870688018,
 0.45297720514993728,
 0.3143652635965582,
 0.4881908718502061,
 0.42818052651950456,
 0.50531713756295205,
 0.56962872622413585,
 0.51276472884112589,
 0.39540870535544703,
 0.5846872280416926,
 0.61897238299021196]

In [33]:
# Score one sentence, picked by position from the whole corpus, against the
# 'poteri' word profile.
sample_sentence = split_sentences[1432]
score_sentence_tf_idf(sample_sentence, OrderedDict(sorted_words['poteri']), tf_idf_memoized)

0.15789826365293538

## Embeddings test

In [None]:
# Tokenise every extracted document.
txts_tokenized = list(map(wd.tokenize_doc, txts))

In [None]:
reduced_dictionary_filename = 'first_5000_words.json'
# Read the word list as UTF-8 explicitly, like the other text files handled
# in this notebook, instead of relying on the platform default encoding.
with codecs.open(reduced_dictionary_filename, 'r', encoding='utf-8') as f:
    reduced_dictionary = set(json.load(f))

gensim_model_filename = 'models/gensim_model_5000.d2v'
gensim_model = Doc2Vec.load(gensim_model_filename)

In [None]:
# One embedding per tokenised document.
embeddings = []
for tokenized_doc in txts_tokenized:
    embeddings.append(em.embed_document(gensim_model, tokenized_doc, reduced_dictionary))

In [None]:
# Shorthand for em.sentence_vector with the model and the permitted words
# already bound; call it as sv(sentence=...).
sv = partial(em.sentence_vector, model=gensim_model, permitted_words=reduced_dictionary)

In [None]:
# Two differently worded sentences about the powers of the administrative body.
p1 = u'All\'organo amministrativo sono conferiti i più ampi poteri, sia per la gestione ordinaria che straordinaria della Società'
p2 = u'ARTICOLO 19 - POTERI DELL\'ORGANO AMMINISTRATIVO 191 L\'organo amministrativo ha tutti i poteri di ordinaria e straordinaria amministrazione'
vec_p1 = sv(sentence=p1.lower())
vec_p2 = sv(sentence=p2.lower())
cosine_similarity(vec_p1, vec_p2)

In [None]:
# `split_poteri` is not defined anywhere in the notebook; the tokenised
# 'poteri' sentences live in `splits`.
# NOTE(review): these are lists of words, while the cell above passes plain
# strings to sv() -- confirm which form em.sentence_vector expects.
poteri_sentences = splits['poteri']

# For each sentence, the mean cosine similarity with every other sentence.
mean_cosines_poteri = [np.mean([cosine_similarity(sv(sentence=poteri_sentences[i]), sv(sentence=poteri_sentences[j]))
                               for i in range(len(poteri_sentences)) if i != j])
                                   for j in range(len(poteri_sentences))]

In [None]:
# Display the mean similarity computed for each 'poteri' sentence.
mean_cosines_poteri

## Word2Vec


In [None]:
# Flatten the per-document token lists into one list of sentences.
# TODO: also restrict the tokens to the reduced dictionary.
sentences_tokenized = []
for doc in txts_tokenized:
    sentences_tokenized.extend(doc)

In [None]:
# Train a Word2Vec model on the tokenised sentences.
# NOTE(review): `size` is the keyword used by older gensim releases (later
# renamed `vector_size`) -- confirm against the installed version.
w2v = Word2Vec(sentences_tokenized, size=100, window=5, min_count=5, workers=4)

In [None]:
# Look up the learned vector of a single word.
w2v.wv['costitutivo']

In [None]:
# Cosine similarity between the vectors of two words.
cosine_similarity(w2v.wv['atto'], w2v.wv['costitutivo'])

In [None]:
# Query through the keyed vectors (`wv`), as the lookups above already do;
# calling most_similar on the model itself is deprecated.
w2v.wv.most_similar('poteri')

In [None]:
# Query through the keyed vectors (`wv`), as the lookups above already do;
# calling most_similar on the model itself is deprecated.
w2v.wv.most_similar(positive=['atto','costitutivo'])