# Dataset building pipeline
Riprodurre esattamente gli step utilizzati in predizione

In [1]:
from lib import text_extraction as te
from lib import words as wd
from lib import embedding as em
from gensim.models import Doc2Vec
import glob
import codecs
import json
import pickle
import os
from lib.parallelize import parallelize
import time
import numpy as np

Using TensorFlow backend.


In [2]:
# Folders with the source documents, one folder per document family.
folders = [
    "../atti_costitutivi/",
    "../atti_non_costitutivi/",
    "../verbali_a05/",
]
# Folders read as plain text when building the datasets, in the same order
# as `folders`.
txt_folders = [
    "../atti_costitutivi_txt",
    "../atti_non_costitutivi_txt",
    "../verbali_a05_txt",
]
# Class names; presumably indexed by the numeric label (0, 1) - TODO confirm,
# the list is not used elsewhere in this notebook.
label_names = ["non_costitutivo", "costitutivo"]
# Numeric label assigned to the files of each folder (aligned with the
# folder lists above): only the first folder is labelled 1.
folder_labels = [1, 0, 0]

# Input artefacts: trained Doc2Vec model and JSON list of permitted words.
gensim_file = "../models/gensim_5000_model_with_verb.d2v"
permitted_words_file = "../dictionaries/first_5000_words_with_verb_cost.json"

# Output pickles: document-embedding dataset and word-embedding dataset.
dataset_filename = "../datasets/embedded_docs_test_v1.p"
dataset_filename_word_embedding = "../datasets/word_embedded_docs.p"

# When true, the text-extraction cell writes the extracted texts to disk.
do_txt_extraction = True

In [None]:
# Load the Doc2Vec model stored at `gensim_file`.
gensim_model = Doc2Vec.load(gensim_file)
# Load the JSON word list and keep it as a set.
with open(permitted_words_file) as f:
    permitted_words = set(json.load(f))

In [None]:
def extract_txts(filenames):
    """Yield the extracted text of every file that produced some text.

    Each path in ``filenames`` is passed to ``te.extract_text``; results that
    are ``None`` or have zero length are skipped. Extraction is lazy: a file
    is processed only when the next item is requested.
    """
    for filename in filenames:
        txt = te.extract_text(filename)
        # Identity check for None instead of `!= None`.
        if txt is not None and len(txt) > 0:
            yield txt

def extract_txts_filenames(filenames):
    """Yield ``(filename, text)`` for every file that produced some text.

    Each path in ``filenames`` is passed to ``te.extract_text``; results that
    are ``None`` or have zero length are skipped.

    The name and its text are produced in the same loop iteration rather than
    by zipping ``filenames`` with a generator over the same ``filenames``:
    that pairing is eager on Python 2 (every file is extracted before the
    first pair is yielded) and mis-aligns names and texts when ``filenames``
    is a one-shot iterator.
    """
    for filename in filenames:
        txt = te.extract_text(filename)
        if txt is not None and len(txt) > 0:
            yield filename, txt

def dataset_generator_from_folders(folders, folder_labels, gensim_model, permitted_words, extract_text=False):
    """Yield ``(embedded_document, label)`` for every file found in ``folders``.

    Args:
        folders: folder paths; every entry matching ``folder + '/*'`` is used.
        folder_labels: label given to the files of each folder, aligned with
            ``folders``.
        gensim_model: model forwarded to ``em.embed_document``.
        permitted_words: word collection forwarded to ``em.embed_document``.
        extract_text: when true the text comes from ``extract_txts`` (which
            skips files yielding no text); otherwise each file is read
            directly with ``open``.

    Yields:
        ``(embedding, label)`` tuples, where ``embedding`` is whatever
        ``em.embed_document`` returns for the tokenized document.
    """
    def read_file(path):
        # Close each handle as soon as its content has been read instead of
        # leaving that to the garbage collector.
        with open(path) as handle:
            return handle.read()

    for folder, label in zip(folders, folder_labels):
        filenames = glob.glob(folder+'/*')
        if extract_text:
            txts = extract_txts(filenames)
        else:
            txts = (read_file(f) for f in filenames)
        # The whole chain is lazy: one document is read, tokenized and
        # embedded per item requested.
        splitted_txts = (wd.tokenize_doc(txt) for txt in txts)
        embedded_txts = (em.embed_document(gensim_model, doc, permitted_words) for doc in splitted_txts)
        for i, e in enumerate(embedded_txts):
            # Progress counter; it restarts from 0 for every folder.
            print(i)
            yield (e, label)

def dataset_generator_files_word_embedding(files_lists, file_list_labels, reduced_dictionary):
    """Yield ``(mapped_document, label)`` for every file in ``files_lists``.

    Args:
        files_lists: one list of file paths per class.
        file_list_labels: label given to the files of each list, aligned with
            ``files_lists``.
        reduced_dictionary: dictionary used to build ``em.DictionaryMapper``.

    Yields:
        ``(doc, label)`` tuples, where ``doc`` is an item produced by
        ``DictionaryMapper.map_to_ints`` over the tokenized texts (presumably
        the document encoded as integer word ids - TODO confirm).
    """
    def read_file(path):
        # Close each handle as soon as its content has been read instead of
        # leaving that to the garbage collector.
        with open(path) as handle:
            return handle.read()

    d = em.DictionaryMapper(reduced_dictionary)
    for filenames, label in zip(files_lists, file_list_labels):
        txts = (read_file(f) for f in filenames)
        tokenized_txts = (wd.word_tokenize_replace(txt) for txt in txts)
        for doc in d.map_to_ints(tokenized_txts):
            yield (doc, label)
                   
def extract_txts_to_folders(folders, out_folders):
    """Extract the text of every file in ``folders`` and save it to disk.

    Args:
        folders: source folders; every entry matching ``folder + '/*'`` is
            processed.
        out_folders: destination folders, aligned with ``folders``. A missing
            destination folder is created.

    Each extracted text is written to a file with the same base name as its
    source file. Files for which ``extract_txts_filenames`` yields nothing
    are skipped.
    """
    for folder, out_folder in zip(folders, out_folders):
        if not os.path.isdir(out_folder):
            os.makedirs(out_folder)
        filenames = glob.glob(folder+'/*')
        for filename, txt in extract_txts_filenames(filenames):
            print(filename)
            # The file must be opened for writing: the default mode is
            # read-only and makes the write below fail.
            with open(os.path.join(out_folder, os.path.basename(filename)), "w") as o:
                o.write(txt)

In [None]:
# Materialize the document-embedding dataset from the text folders; the files
# are read directly (extract_text=False), without running text extraction.
full_dataset = list(dataset_generator_from_folders(txt_folders, folder_labels, gensim_model, 
                                                   permitted_words, extract_text= False))

In [None]:
# Balancing: split the samples by label, then truncate both classes to the
# size of the smaller one.
costitutivi = []
non_costitutivi = []
for doc, label in full_dataset:
    if label == 1:
        costitutivi.append((doc, label))
    elif label == 0:
        non_costitutivi.append((doc, label))
lc = len(costitutivi)
lnc = len(non_costitutivi)
minlen = min(lc, lnc)
# Samples are taken from the head of each class, in dataset order.
balanced_dataset = costitutivi[:minlen] + non_costitutivi[:minlen]

In [None]:
# Split the balanced samples into two parallel lists and persist them as a
# single pickled [docs, labels] pair.
docs = [doc for doc, _ in balanced_dataset]
labels = [label for _, label in balanced_dataset]
# Pickle data is binary: the file has to be opened with "wb" (text mode fails
# on Python 3 and can corrupt the data on platforms that translate newlines).
with open(dataset_filename, "wb") as fout:
    pickle.dump([docs, labels], fout)

In [None]:
# Optionally (re)generate the plain-text folders from the source documents.
# The destination list is `txt_folders`, defined in the configuration cell;
# the previously used name `txt_out_folders` is not defined in this notebook.
if do_txt_extraction:
    extract_txts_to_folders(folders, txt_folders)

In [None]:
# Notebook display: number of documents in the dataset that was just saved.
len(docs)

## Build dataset for word embedding

In [7]:
# Maximum number of words handed to DictionaryMapper.fit_texts.
# NOTE(review): 9998 presumably leaves room for reserved indices in a
# 10000-entry vocabulary - confirm against em.DictionaryMapper.
maxwords = 9998
start = time.time()
# Tokenize every "costitutivo" text, closing each file right after reading it.
tokenized_costs = []
for filename in glob.glob("../atti_costitutivi_txt/*"):
    with open(filename) as cost_file:
        tokenized_costs.append(wd.word_tokenize_replace(cost_file.read()))
# Elapsed seconds for reading and tokenizing.
print(time.time()-start)
# Fit the word dictionary on the tokenized texts.
dm = em.DictionaryMapper()
dm.fit_texts(tokenized_costs, maxwords)

464.042852163


In [None]:
# Save the fitted reduced dictionary as JSON in the current working directory.
with open("reduced_dictionary_cost.json", 'w') as o:
    json.dump(dm.reduced_dictionary, o)

In [None]:
# Entry point when the dictionary file already exists: load the reduced
# dictionary from JSON instead of fitting it again.
with open("reduced_dictionary_cost.json") as f:
    reduced_dictionary = json.load(f)
    

In [None]:
# Notebook display: size of `full_dataset`.
# NOTE(review): at this point the name still refers to the document-embedding
# dataset built earlier (it is only rebuilt further below), and it is
# undefined when the notebook is resumed from the dictionary-loading cell.
len(full_dataset)

In [None]:
# Files of the negative class: both non-"costitutivi" text folders, in order.
non_cost_filenames = [
    path
    for pattern in ("../atti_non_costitutivi_txt/*", "../verbali_a05_txt/*")
    for path in glob.glob(pattern)
]

In [None]:
# Files of the positive class, capped at the number of negative files.
# NOTE(review): this only balances the classes when there are at least as
# many "costitutivi" files as negative ones - confirm.
cost_filenames = glob.glob("../atti_costitutivi_txt/*")[:len(non_cost_filenames)]

In [None]:
# Materialize the word-embedding dataset: label 1 for the "costitutivi" files,
# label 0 for the others. This overwrites the earlier `full_dataset`.
full_dataset = list(dataset_generator_files_word_embedding([cost_filenames, non_cost_filenames],
                                                           [1,0], reduced_dictionary))

In [None]:
# No extra balancing step here: the dataset is used as built (the file lists
# were already capped before generating it).
balanced_dataset = full_dataset

In [None]:
# Split the samples into two parallel lists and persist them as a single
# pickled [docs, labels] pair.
docs = [doc for doc, _ in balanced_dataset]
labels = [label for _, label in balanced_dataset]
# Pickle data is binary: the file has to be opened with "wb" (text mode fails
# on Python 3 and can corrupt the data on platforms that translate newlines).
with open(dataset_filename_word_embedding, "wb") as fout:
    pickle.dump([docs, labels], fout)