# Embedding delle Frasi

#### Solo per sperimentazione, usa embedding.py!!!

In [1]:
# This Python file uses the following encoding: utf-8
from gensim import utils
from gensim.models.doc2vec import LabeledSentence
from gensim.models import Doc2Vec
from lib import embedding as em
from lib.parallelize import parallelize

import numpy as np
import re
import string
import os
import json
import operator
import pickle

import random
import pandas as pd

Using TensorFlow backend.


###### Creazione del dataset come sottoinsieme bilanciato dei documenti

In [2]:
# Input CSV that the next cell loads into `df`.
csv_filename = '../atti2.csv'
# File from which build_embedding() loads the Doc2Vec model, or to which it
# saves a freshly trained one.
model_filename = '../models/gensim_model.d2v'

In [3]:
# Load the whole CSV; the cells below read its 'filename', 'label' and
# 'sentence' columns.
df = pd.read_csv(csv_filename, encoding='utf-8')

  interactivity=interactivity, compiler=compiler, result=result)


In [4]:
# Number of distinct files having rows labelled 'non_costitutivo'; used below
# to cap how many 'costitutivo' files are kept.
size_nc = len(df.loc[df['label'] == 'non_costitutivo'].groupby('filename'))
size_nc

19033

#### Permitted words computed only from the 'costitutivo' sentences

In [5]:
# Keep the rows of at most `size_nc` 'costitutivo' files (the first ones in
# the iteration order of gb.groups) and stack them into a single frame.
gb = df.loc[df['label'] == 'costitutivo'].groupby('filename')
cost_df = pd.concat([ gb.get_group(group) for i,group in enumerate( gb.groups) if i < size_nc ])
del gb

In [None]:
# NOTE(review): the "Balancing" cells further down still read `df`
# (df.loc[...] and groupby(df["filename"])); running the notebook top to
# bottom raises NameError there unless `df` is reloaded first.
del df

In [None]:
# Ask lib.embedding (imported as `em`, not shown here) for the first 5000
# words of the whitespace-tokenised 'costitutivo' sentences and keep element 0
# of each returned item -- presumably (word, count) pairs ranked by
# frequency; TODO confirm against lib/embedding.py.
permitted_words = [e[0] for e in em.first_n_words([s.split() for s in cost_df["sentence"]], 5000)]

In [None]:
# Cache the permitted-word list on disk as JSON.
with open("../dictionaries/first_5000_words_with_verb_cost.json", 'w') as o:
    json.dump(permitted_words, o)

In [None]:
# Reload the cached word list and turn it into a set; this rebinds
# `permitted_words`.
with open("../dictionaries/first_5000_words_with_verb_cost.json") as o:
    permitted_words = set(json.load(o))

#### Balancing

In [None]:
# Per-file frames of the 'costitutivo' rows, truncated to the first `size_nc`
# groups so that no more 'costitutivo' files than 'non_costitutivo' files are
# kept.
# NOTE(review): an earlier cell runs `del df`; reload `df` before this cell.
grouped = df.loc[df['label'] == 'costitutivo'].groupby(df["filename"])
dfs = [g[1] for g in list(grouped)[:size_nc]]

In [None]:
# Per-file frames of every 'non_costitutivo' file (no truncation).
grouped_nc = df.loc[df['label'] == 'non_costitutivo'].groupby(df["filename"])
dfs_nc = [g[1] for g in list(grouped_nc)]

In [None]:
# Stack the selected per-file frames of both classes into one frame.
df_balanced = pd.concat(dfs + dfs_nc)

In [None]:
# Release the full frame and the intermediate group lists now that
# df_balanced has been built.
del df
del grouped
del dfs
del grouped_nc
del dfs_nc

###### Creazione degli embedding

In [None]:
def build_dictionary(sentences):
    """Map each distinct word to a consecutive integer id.

    Ids start at 0 and are handed out in order of first appearance while
    scanning ``sentences`` (an iterable of word iterables).
    """
    word_ids = {}
    for tokens in sentences:
        for token in tokens:
            # len(word_ids) is the next free id; setdefault leaves words
            # that already have an id untouched.
            word_ids.setdefault(token, len(word_ids))
    return word_ids

def word_counts(sentences):
    """Count the occurrences of every word across ``sentences``.

    ``sentences`` is an iterable of word iterables; the result is a plain
    dict mapping word -> number of occurrences.
    """
    counts = {}
    for tokens in sentences:
        for token in tokens:
            counts[token] = counts.get(token, 0) + 1
    return counts

def rev_dict(d):
    """Return the inverse mapping of ``d`` (value -> key).

    When several keys share a value, the key visited last wins.
    """
    return {index: word for word, index in d.items()}


In [None]:
#Sentence iterator for building the gensim model

def iter_sentences(sents):
    """Yield one LabeledSentence per item of ``sents``, tagged by position.

    The item at position n (counting from 0) gets the single tag
    'SENT_<n>'.
    """
    for position, words in enumerate(sents):
        yield LabeledSentence(words, ['SENT_%s' % position])

# Modello dell'embedding

def build_embedding(sentences, epochs=10):
    """Return the Doc2Vec model stored at ``model_filename``, training it if absent.

    If the file named by the module-level ``model_filename`` exists, the model
    is loaded from it and both ``sentences`` and ``epochs`` are ignored.
    Otherwise a new model is built on ``sentences``, trained and saved to
    that file.

    sentences -- corpus handed to build_vocab() and then to train(); it is
                 consumed twice, so pass a re-iterable object such as a list,
                 not a one-shot generator
    epochs    -- number of training epochs forwarded to train()
    """
    if os.path.exists(model_filename):
        return Doc2Vec.load(model_filename)

    model = Doc2Vec(min_count=1, window=10, size=100, sample=1e-5, negative=5, workers=2)
    model.build_vocab(sentences)
    # print(...) with a single argument behaves the same on Python 2 and 3.
    print('Vocabulary built')
    model.train(sentences, model.corpus_count, epochs=epochs)
    model.save(model_filename)
    print('Model saved')
    return model

def first_n_words(dictionary, n):
    """Return the set of the ``n`` most frequent words of ``pd_sentences``.

    NOTE(review): ``dictionary`` is kept for backward compatibility but is
    never read; the counts are recomputed from the global ``pd_sentences``,
    which is not assigned in the visible part of this notebook -- confirm
    where it comes from before relying on this function.

    dictionary -- unused
    n          -- number of words to keep; values <= 0 give an empty set
    """
    if n <= 0:
        # sorted_wc[-0:] would select the *whole* list, not an empty one.
        return set()
    wc = word_counts(s.split() for s in pd_sentences)
    # Ascending by count, so the most frequent words are the last n items.
    sorted_wc = sorted(wc.items(), key=operator.itemgetter(1))
    return set(word for word, _ in sorted_wc[-n:])

def substitute_word(word, permitted_words, unknown='UNK'):
    """Return ``word`` if it is in ``permitted_words``, else ``unknown``."""
    if word in permitted_words:
        return word
    return unknown

def reduced_sentence(sentence, permitted_words):
    """Return ``sentence`` as a list with every non-permitted word replaced.

    Each word is passed through substitute_word() with its default
    placeholder, so the result has one entry per input word.
    """
    reduced = []
    for token in sentence:
        reduced.append(substitute_word(token, permitted_words))
    return reduced

def reduce_dictionary(sentences, permitted_words, min_words=2):
    """Lazily yield the reduced form of every sufficiently long sentence.

    Each sentence goes through reduced_sentence(); results with fewer than
    ``min_words`` entries are skipped.
    """
    for tokens in sentences:
        reduced = reduced_sentence(tokens, permitted_words)
        if len(reduced) < min_words:
            continue
        yield reduced
            
def sentence_vector(model, sentence, permitted_words):
    """Infer the embedding of a raw sentence string with ``model``.

    The sentence is tokenised with ``str.split()``, the same whitespace
    tokenisation this notebook applies to the sentences the model is trained
    on. ``split(' ')`` would emit empty-string tokens for consecutive spaces,
    and each of them would be turned into the unknown-word placeholder.

    model           -- object exposing infer_vector(list_of_words)
    sentence        -- sentence as a single string
    permitted_words -- words kept as they are; all others are replaced
    """
    tokens = reduced_sentence(sentence.split(), permitted_words)
    return model.infer_vector(tokens)

In [None]:
# Word -> id mapping over the whitespace-tokenised sentences.
# NOTE(review): `pd_sentences` is not assigned in any visible cell; confirm
# where it is defined.
d = build_dictionary(s.split() for s in pd_sentences)

In [None]:
len(d)

In [None]:
# Vocabulary restricted to 5000 words (the 10000-word variant is disabled).
#first_10000_words = first_n_words(d, 10000)
first_5000_words = first_n_words(d, 5000)

In [None]:
# (word, count) pairs sorted by ascending count, for manual inspection.
wc = word_counts(s.split() for s in pd_sentences)
sorted_wc = sorted(wc.items(), key=operator.itemgetter(1))

In [None]:
sorted_wc[-8000:-7500]

In [None]:
# Persist the 5000-word vocabulary as a JSON list (a set is not
# JSON-serialisable, hence the list() conversion).
with open('reduced_dictionary.json','w') as f:
    json.dump(list(first_5000_words), f)
    

In [None]:
# Write the same 5000-word vocabulary under a second file name.
with open('first_5000_words.json','w') as f:
    json.dump(list(first_5000_words), f)

In [None]:
first_5000_words

In [None]:
# Replace out-of-vocabulary words in every tokenised sentence, then
# materialise the result (reduce_dictionary is a generator).
filtered_sentences = reduce_dictionary((s.split() for s in pd_sentences), first_5000_words)
filtered_sentences_list = list(filtered_sentences)

In [None]:
# Model definition: build_embedding() loads the saved model when the model
# file exists, otherwise it trains one on the tagged sentences.

model = build_embedding(list(iter_sentences(filtered_sentences_list)))

In [None]:
model.most_similar('atto')

In [None]:
# Example of a sentence vector.
# The vocabulary must be first_5000_words: it is the one used to reduce the
# training sentences, and no `first_10000_words` is ever assigned (its
# definition is commented out), so the old name raised NameError.

sv = sentence_vector(model, pd_sentences[551068], first_5000_words)
sv

###### Costruzione del dataset

In [None]:
def build_dataset(model, df, permitted_words):
    """Embed every sentence of ``df`` and group the vectors per document.

    Rows are scanned in order and a new document is started every time the
    'filename' value changes, so the rows of one file must be contiguous.
    A progress line is printed each time a document is completed by the
    arrival of the next one (the last document is not reported).

    model           -- passed to sentence_vector() to infer each embedding
    df              -- frame with 'filename', 'label' and 'sentence' columns
    permitted_words -- vocabulary passed to sentence_vector()

    Returns (docs, labels): docs[k] is the list of sentence vectors of the
    k-th document and labels[k] is the 'label' value of its first row.
    """
    docs = []
    labels = []
    curdoc = []                  # sentence vectors of the current document
    filename = None
    # Iterate the three needed columns directly instead of building a Series
    # per row with df.iloc[i].
    rows = df[["filename", "label", "sentence"]].itertuples(index=False)
    for row_filename, row_label, sentence in rows:
        embedding = sentence_vector(model, sentence, permitted_words)
        # curdoc is empty only before the very first row, which therefore
        # always opens a new document (no sentinel filename needed).
        if curdoc and row_filename == filename:
            curdoc.append(embedding)
            continue
        if curdoc:
            # A different file begins: flush the document just completed.
            print("%s with len: %d" % (filename, len(curdoc)))
            docs.append(curdoc)
        curdoc = [embedding]
        labels.append(row_label)
        filename = row_filename
    if curdoc:
        docs.append(curdoc)
    return docs, labels

In [None]:
# Use the vocabulary the model was trained with; `first_10000_words` is never
# assigned in this notebook (its definition is commented out).
docs, labels = build_dataset(model, df_balanced, first_5000_words)

In [None]:
# Numeric class ids: 1 for 'costitutivo', 0 for 'non_costitutivo'.
label_map = {'costitutivo':1, 'non_costitutivo':0}
labels_n = [label_map[l] for l in labels]

In [None]:
# Pickle output is binary: the file must be opened with "wb". Text mode fails
# on Python 3 and can corrupt the data on platforms that translate newlines.
with open("../datasets/embedded_docs.p", "wb") as fout:
    pickle.dump([docs, labels_n], fout)

#### New embedding

In [None]:
# Load a previously saved model from disk; this rebinds `model`, replacing
# the one built above.
model =  Doc2Vec.load('../models/gensim_5000_model_with_verb.d2v')

In [None]:
def group_to_list_label(g):
    """Split one group into ``(sentences, label)``.

    ``label`` is the first value of the group's 'label' column and
    ``sentences`` is the 'sentence' column as a list.
    """
    label_values = iter(g["label"])
    first_label = next(label_values)
    return list(g["sentence"]), first_label

def embed_document_p(doc, model, permitted_words):
    """Return the list of sentence vectors for the sentences of ``doc``.

    Every sentence is embedded with sentence_vector() using ``model`` and
    ``permitted_words``; the output keeps the order of ``doc``.
    """
    vectors = []
    for sentence in doc:
        vectors.append(sentence_vector(model, sentence, permitted_words))
    return vectors

# Wrap embed_document_p with lib.parallelize.parallelize (not shown here).
# build_dataset2 calls the wrapper with the whole list of documents, so it
# presumably maps the function over that list across workers -- TODO confirm.
parallel_embed_document = parallelize(embed_document_p)

In [None]:
def build_dataset2(model, df, permitted_words):
    """Embed ``df`` one file at a time through parallel_embed_document.

    The frame is grouped by 'filename'; group_to_list_label() turns each
    group into (sentences, label). The sentence lists of all files are then
    handed to parallel_embed_document together with ``model`` and
    ``permitted_words``.

    Returns (embedded_docs, labels), where labels holds one entry per file.
    """
    per_file = list(df.groupby("filename").apply(group_to_list_label))
    docs = [sentences for sentences, _ in per_file]
    labels = [label for _, label in per_file]
    print("Starting to embed")
    embedded_docs = parallel_embed_document(docs, model, permitted_words)
    return embedded_docs, labels

In [None]:
# Rebuild `docs` and `labels` with the reloaded model and the
# `permitted_words` set read back from JSON.
docs, labels = build_dataset2(model, df_balanced, permitted_words)