# Embedding delle Frasi

In [1]:
# This Python file uses the following encoding: utf-8
from gensim import utils
from gensim.models.doc2vec import LabeledSentence
from gensim.models import Doc2Vec

import numpy as np
import re
import string
import os
import json
import operator
import pickle

import random
import pandas as pd

Using TensorFlow backend.


###### Creazione del dataset come sottoinsieme bilanciato dei documenti

In [2]:
csv_filename = '../atti.csv'
model_filename = 'models/gensim_model.d2v'

In [3]:
df = pd.read_csv(csv_filename, encoding='utf-8')

In [4]:
df

Unnamed: 0,filename,sentence,label
0,5049618730001,repertorio n NUM raccolta n NUM atto costituti...,costitutivo
1,5049618730001,codice fiscale bnd dnl ALPHANUM ALPHANUM detto...,costitutivo
2,5049618730001,i e costituita società responsabilità limitata...,costitutivo
3,5049618730001,ii la sede società fissata comune modena,costitutivo
4,5049618730001,ai soli fini iscrizione registro imprese compa...,costitutivo
5,5049618730001,b progettare erogare corsi formazione qualsivo...,costitutivo
6,5049618730001,erogare servizi tipo doposcuola rivolti studen...,costitutivo
7,5049618730001,d erogare prestazioni individuali collettive m...,costitutivo
8,5049618730001,progettare erogare prestazioni consulenze trai...,costitutivo
9,5049618730001,tali prestazioni potranno essere erogate forma...,costitutivo


In [5]:
size_nc = len(df.loc[df['label'] == 'non_costitutivo'].groupby('filename'))
size_nc

16718

In [6]:
grouped = df.loc[df['label'] == 'costitutivo'].groupby(df["filename"])
dfs = [g[1] for g in list(grouped)[:size_nc]]

In [7]:
grouped_nc = df.loc[df['label'] == 'non_costitutivo'].groupby(df["filename"])
dfs_nc = [g[1] for g in list(grouped_nc)]

In [8]:
df_balanced = pd.concat(dfs + dfs_nc)

In [9]:
df_balanced.loc[df['filename'] == 5122462300001]

Unnamed: 0,filename,sentence,label


In [10]:
pd_sentences = df_balanced['sentence']

###### Creazione degli embedding

In [11]:
def build_dictionary(sentences):
    d = dict()
    index = 0
    for sentence in sentences:
        for word in sentence:
            if not word in d:
                d[word] = index
                index += 1
    return d

def word_counts(sentences):
    d = dict()
    for sentence in sentences:
        for word in sentence:
            if not word in d:
                d[word] = 1
            else:
                d[word] += 1
    return d

def rev_dict(d):
    rd = dict()
    for w,i in d.items():
        rd[i] = w
    return rd


In [33]:
#Sentence iterator for building the gensim model

def iter_sentences(sents):
    i = 0
    for line in sents:
        yield LabeledSentence(line, ['SENT_%s' % i])
        i += 1

# Modello dell'embedding

def build_embedding(sentences, epochs = 10):
    if os.path.exists(model_filename):
        model = Doc2Vec.load(model_filename)
    else:
        model = Doc2Vec(min_count=1, window=10, size=100, sample=1e-5, negative=5, workers=2)
        model.build_vocab(sentences)
        print 'Vocabulary built'
        #for epoch in range(5):
        #    print 'Epoch', epoch
        model.train(sentences, model.corpus_count, epochs = epochs)
        model.save(model_filename)
        print 'Model saved'
    return model

def first_n_words(dictionary, n):
    wc = word_counts(s.split() for s in pd_sentences)
    sorted_wc = sorted(wc.items(), key=operator.itemgetter(1))
    return set(reversed([x[0] for x in sorted_wc[-n:]]))

def substitute_word(word, permitted_words, unknown = 'UNK'):
    return word if word in permitted_words else unknown

def reduced_sentence(sentence, permitted_words):
    return [substitute_word(word, permitted_words) for word in sentence]

def reduce_dictionary(sentences, permitted_words, min_words=2):
    for sentence in sentences:
        new_sentence = reduced_sentence(sentence, permitted_words)
        if len(new_sentence) >= min_words:
            yield new_sentence
            
def sentence_vector(model, sentence, permitted_words):
    return model.infer_vector(reduced_sentence(sentence.split(' '), permitted_words))

In [13]:
d = build_dictionary(s.split() for s in pd_sentences)

In [14]:
len(d)

234631

In [15]:
#first_10000_words = first_n_words(d, 10000)
first_5000_words = first_n_words(d, 5000)

In [21]:
wc = word_counts(s.split() for s in pd_sentences)
sorted_wc = sorted(wc.items(), key=operator.itemgetter(1))

In [28]:
sorted_wc[-8000:-7500]

[(u'incapacit\xe0', 254),
 (u'spoleto', 254),
 (u'posseduto', 254),
 (u'txrwd', 254),
 (u'ferrario', 255),
 (u'special', 255),
 (u'park', 255),
 (u'temi', 255),
 (u'glc', 255),
 (u'insindacabile', 255),
 (u'didattico', 255),
 (u'edificio', 255),
 (u'consimili', 255),
 (u'cernusco', 255),
 (u'rho', 255),
 (u'prefabbricate', 255),
 (u'las', 255),
 (u'sigilli', 255),
 (u'adolfo', 255),
 (u'causare', 255),
 (u'abbattimento', 255),
 (u'deputato', 255),
 (u'pinto', 256),
 (u'indu', 256),
 (u'santi', 256),
 (u'tp', 256),
 (u'valorizzare', 256),
 (u'vino', 256),
 (u'rivendica', 256),
 (u'dt', 256),
 (u'uscita', 256),
 (u'nistrativo', 256),
 (u'sardegna', 256),
 (u'angelis', 256),
 (u'sussidiarie', 256),
 (u'assuma', 256),
 (u'vv', 256),
 (u'negozia', 257),
 (u'frqwudwwl', 257),
 (u'gpl', 257),
 (u'nc', 257),
 (u'dietetici', 257),
 (u'balsamo', 257),
 (u'multipli', 257),
 (u'contenimento', 257),
 (u'adele', 257),
 (u'seguendo', 257),
 (u'gene', 257),
 (u'ufficiali', 257),
 (u'residenti', 257),


In [17]:
with open('reduced_dictionary.json','w') as f:
    json.dump(list(first_5000_words), f)
    

In [29]:
with open('first_5000_words.json','w') as f:
    json.dump(list(first_5000_words), f)

In [18]:
first_5000_words

{u'motivazioni',
 u'disp',
 u'unica',
 u'utilizzate',
 u'tesorerie',
 u'formalmente',
 u'comunque',
 u'motivazione',
 u'unico',
 u'direttive',
 u'convegni',
 u'gdood',
 u'francesco',
 u'assemblea',
 u'francesca',
 u'inps',
 u'favorevoli',
 u'assemblee',
 u'udito',
 u'parcheggi',
 u'presenta',
 u'presente',
 u'egli',
 u'determinando',
 u'plastiche',
 u'presenti',
 u'riccardo',
 u'seguire',
 u'considera',
 u'firmato',
 u'siggri',
 u'fronte',
 u'aut',
 u'firmati',
 u'innovativi',
 u'stipulati',
 u'concernente',
 u'consortili',
 u'commerce',
 u'riunita',
 u'docu',
 u'consortile',
 u'bancari',
 u'partecipanti',
 u'dell',
 u'implementazione',
 u'concernenti',
 u'cinquecentomila',
 u'rappresentano',
 u'approvazione',
 u'consiste',
 u'scelto',
 u'chiusura',
 u'chiedono',
 u'emanate',
 u'formata',
 u'udita',
 u'decimo',
 u'soddisfazione',
 u'clientela',
 u'attilio',
 u'liquide',
 u'delegata',
 u'piva',
 u'osservazioni',
 u'amministrazione',
 u'duemiladiciassette',
 u'delegati',
 u'vincenzo',
 u

In [32]:
filtered_sentences = reduce_dictionary((s.split() for s in pd_sentences), first_5000_words)
filtered_sentences_list = list(filtered_sentences)

In [23]:
# Definizione del modello

model = build_embedding(list(iter_sentences(filtered_sentences_list)))

Vocabulary built
Model saved


In [24]:
model.most_similar('atto')

[('presente', 0.8062084913253784),
 ('apportare', 0.7802978754043579),
 ('soppressioni', 0.7477972507476807),
 ('allega', 0.7333101034164429),
 ('aggiunte', 0.7322391271591187),
 ('statuto', 0.731306791305542),
 ('integrante', 0.7226213216781616),
 ('regolata', 0.7120342254638672),
 ('forza', 0.7065362930297852),
 ('allegano', 0.7060737609863281)]

In [48]:
# Esempio di sentence vector

sv = sentence_vector(model, pd_sentences[551068], first_10000_words)
sv

array([-0.00877847,  0.08421552, -0.00934239, -0.06026119, -0.0486962 ,
       -0.10909535,  0.01427933,  0.0382507 ,  0.06857342, -0.05685881,
       -0.01122126,  0.05149804, -0.07070133,  0.01299185,  0.08797149,
       -0.02599948,  0.00463746, -0.02856188,  0.00580665,  0.01292427,
        0.0679419 ,  0.0067199 ,  0.00769502,  0.07473454,  0.02272988,
       -0.02177911, -0.05640026,  0.08370669,  0.01138843, -0.07007347,
       -0.08425567, -0.00059223, -0.0390888 ,  0.0140768 , -0.11788081,
        0.01243659, -0.06523187,  0.02117974, -0.00639699, -0.00492997,
        0.01472021, -0.0201715 ,  0.00113072,  0.01332173, -0.02123491,
       -0.01164006, -0.00858516,  0.06819962, -0.02509951,  0.02500732,
        0.0713005 , -0.09450735, -0.04908381,  0.03631534, -0.08748867,
        0.06206969,  0.00515663,  0.02944721,  0.0006274 ,  0.0327379 ,
        0.0844673 , -0.06255493, -0.03983356, -0.07354014, -0.02518529,
        0.05504263,  0.06384774, -0.05127484, -0.00868062,  0.00

###### Costruzione del dataset

In [49]:
def build_dataset(model, df, permitted_words):
    filename = ""
    docs = []
    labels = []
    curdoc = []                  # lista delle frasi del documento corrente
    for i in xrange(len(df)):
        row = df.iloc[i] 
        if filename == "":
            filename = row["filename"]
            labels.append(row["label"])
            
        embedding = sentence_vector(model, row['sentence'], permitted_words)
        if filename == row["filename"]:
            curdoc.append(embedding)
        else:
            print "%s with len: %d" % (filename, len(curdoc))
            docs.append(curdoc)
            curdoc = [embedding]
            labels.append(row["label"])
            filename = row['filename']
    if len(curdoc)>0:
        docs.append(curdoc)
    return docs, labels

In [2]:
docs, labels = build_dataset(model, df_balanced, first_10000_words)

In [53]:
label_map = {'costitutivo':1, 'non_costitutivo':0}
labels_n = [label_map[l] for l in labels]

In [58]:
with open("embedded_docs.p", "w") as fout:
    pickle.dump([docs, labels_n], fout)