In [1]:
import json
import pickle

## Remember our Topic Clustering?

In [2]:
# Load the topic -> story-graph mapping. Later cells index it with a topic ID
# string and treat each value as a collection of file names located under
# ../story_graphs/.
with open('../story_graphs/topic2storyID.json', 'r') as f:
    topics = json.load(f)
print('loaded topics2storyID mapping into memory\n')

# Load the top words per topic. Keys arrive as strings (JSON object keys) and
# are converted to int in a later cell; each value is a list whose entries are
# indexed with [0] to obtain the word itself.
# NOTE(review): entries are presumably (word, score) pairs - confirm against
# the clustering output.
with open('../story_cluster/clustering_results_min_15_top_words.json', 'r') as f:
    top_words = json.load(f)
print('loaded top 15 topic words into memory')

loaded topics2storyID mapping into memory

loaded top 15 topic words into memory


![CLUSTER](../story_cluster/story_cluster.png)

In [3]:
# Re-key the mapping with integer topic IDs so topics are listed in numeric
# order (-1, 0, 1, ...) rather than in string order.
top_words = {int(key): val for key, val in top_words.items()}

# Show the five leading words of every topic, one topic per block.
for topic in sorted(top_words):
    words = top_words[topic]
    print('ID: ', topic, '\n')
    for word in words[:5]:
        # Only the first element of each entry (the word itself) is shown.
        print(word[0])
    print('\n' + '-' * 20 + '\n')

ID:  -1 

went
day
got
decided
friends

--------------------

ID:  0 

zoo
animals
monkey
lion
kids

--------------------

ID:  1 

dog
puppy
dogs
pet
ran

--------------------

ID:  2 

coffee
cup
drink
lid
spilled

--------------------

ID:  3 

dentist
shave
tooth
beard
teeth

--------------------

ID:  4 

snow
cold
outside
winter
ice

--------------------

ID:  5 

tree
wood
squirrel
build
yard

--------------------

ID:  6 

cat
kitten
cats
kittens
sarah

--------------------

ID:  7 

cd
song
music
headphones
listen

--------------------

ID:  8 

flu
medicine
better
doctor
feeling

--------------------

ID:  9 

sleep
asleep
night
fell
moon

--------------------

ID:  10 

bike
car
tire
driving
riding

--------------------

ID:  11 

clean
clothes
trash
laundry
wash

--------------------

ID:  12 

toy
birthday
wagon
party
billy

--------------------

ID:  13 

apple
banana
apples
tree
fruit

--------------------

ID:  14 

team
basketball
game
play
day

--------------------

I

In [4]:
# Hand-picked groups of topic IDs. The IDs are strings because they are used
# as keys into `topics`, which is loaded from JSON.
# Per the top-word listing printed above: '0' = zoo/animals, '1' = dog/puppy,
# '6' = cat/kitten.
animal_related_cluster = ['0','1','6']
# NOTE(review): topics 22, 29 and 30 are not part of the listing shown above.
fish_cluster = ['22','29','30']
# '13' = apple/banana/fruit; the remaining IDs are not part of the listing
# shown above.
food_cluster = ['13','17','19','25','27','28']
# '5' = tree/wood, '10' = bike/car, '4' = snow/cold.
other_cluster = ['5','10','4']

In [5]:
def get_graphs_per_cluster(cluster):
    """Load every pickled story graph that belongs to one topic cluster.

    Parameters
    ----------
    cluster : str
        Topic ID used as a key into the module-level ``topics`` mapping.

    Returns
    -------
    list
        The unpickled objects, in the order the file names appear in
        ``topics[cluster]``.

    Notes
    -----
    ``pickle.load`` can execute arbitrary code while deserialising, so the
    files under ``../story_graphs/`` must come from a trusted source.
    """
    loaded = []
    for file_name in topics[cluster]:
        graph_path = '../story_graphs/' + file_name
        with open(graph_path, 'rb') as handle:
            loaded.append(pickle.load(handle))
    return loaded
            
#zoo_graphs = get_graphs_per_cluster('0')

In [9]:
#graph = zoo_graphs[0]

In [10]:
def print_graph_info(graph):
    """Print every node and edge of ``graph`` together with its attributes.

    Parameters
    ----------
    graph
        Object exposing ``nodes`` and ``edges`` containers that support
        ``len()``, iteration, and lookup by the iterated key.

    Returns
    -------
    None
        Everything is written to standard output.
    """
    node_view = graph.nodes
    print('The graph has the following ', len(node_view), ' nodes:\n')
    for node_key in node_view:
        # Key, attributes and a spacer, each on its own line.
        print(node_key, node_view[node_key], '\n', sep='\n')

    edge_view = graph.edges
    print('\nThe graph has the following ', len(edge_view), ' edges:\n')
    for edge_key in edge_view:
        print(edge_key, edge_view[edge_key], '\n', sep='\n')
#print_graph_info(graph)

### What to parse?

1. Story Sentences
2. Specific Rule
3. Replaced General Rule without brackets



In [6]:
def collect_2parse(graph):
    """Collect the unique sentences of one story graph that need parsing.

    Sentences come from two places:

    * every node whose name contains ``'_S'`` contributes all entries of its
      ``'2parse'`` attribute to the *specific* list;
    * every edge whose source node name does NOT contain ``'_S'`` contributes,
      for each annotation whose ``'2parse'`` value contains no ``None``,
      elements 0 and 2 of ``annotation['2parse'][0]`` to the *specific* list
      and elements 0 and 2 of ``annotation['2parse'][1]`` to the *general*
      list.

    NOTE(review): the two ``'2parse'`` entries of an annotation look like
    (head, relation, tail)-style triples for the specific and the general
    rule - confirm against the graph builder.

    Parameters
    ----------
    graph
        Object exposing ``nodes`` and ``edges`` containers that support
        iteration and lookup by the iterated key.

    Returns
    -------
    tuple of (list, list)
        ``(specific, general)``, each without duplicates and in first-seen
        order. The original ``list(set(...))`` returned them in an order that
        changes between interpreter runs (string hash randomisation), which
        made every downstream file non-reproducible.
    """
    specific = []
    general = []

    for node in graph.nodes:
        if '_S' in node:
            specific.extend(graph.nodes[node]['2parse'])

    for edge in graph.edges:
        # Edges leaving a '_S' node are not harvested.
        if '_S' in edge[0]:
            continue
        for annotation in graph.edges[edge]['annotations']:
            to_parse = annotation['2parse']
            # Skip annotations where either rule is missing.
            if None in to_parse:
                continue
            specific_rule, general_rule = to_parse[0], to_parse[1]
            specific.extend([specific_rule[0], specific_rule[2]])
            general.extend([general_rule[0], general_rule[2]])

    # dict.fromkeys drops duplicates while keeping insertion order.
    specific = list(dict.fromkeys(specific))
    general = list(dict.fromkeys(general))

    return (specific, general)

In [7]:
def mk_2parse_file(graphs, cluster):
    """Write the unique sentences of one cluster to ``../amr/2parse/c_<cluster>.txt``.

    Parameters
    ----------
    graphs : iterable
        Story graphs of the cluster; each is passed to ``collect_2parse``.
    cluster : str
        Topic ID; it is concatenated into the output file name, so it must be
        a string.

    Returns
    -------
    list
        The deduplicated sentences (specific and general together) in
        first-seen order, which is also the line order of the written file.
        The original ``list(set(...))`` produced a different order on every
        interpreter run, so the file was not reproducible.
    """
    parse = []
    for graph in graphs:
        specific, general = collect_2parse(graph)
        parse += specific + general

    len_before = len(parse)
    # Order-preserving dedupe across all graphs of the cluster.
    parse = list(dict.fromkeys(parse))
    print(len_before - len(parse) ,' reductions in cluster ', cluster)

    with open('../amr/2parse/c_'+cluster+'.txt', 'w') as f:
        for sent in parse:
            # write() emits the line in one call; writelines() on a str would
            # iterate it character by character.
            f.write(sent+'\n')
    return parse
            
#mk_2parse_file(zoo_graphs, '0')    

In [8]:
def get_parses(clusters, label=None):
    """Gather the unique sentences to parse across several topic clusters.

    For every cluster the story graphs are loaded with
    ``get_graphs_per_cluster`` and handed to ``mk_2parse_file`` (which also
    writes the per-cluster file); the per-cluster results are then merged and
    deduplicated.

    Parameters
    ----------
    clusters : iterable of str
        Topic IDs to process.
    label : str, optional
        Text used in the printed summary to describe the cluster group.
        Defaults to the comma-separated cluster IDs. The summary used to say
        "animal related clusters" regardless of which clusters were passed.

    Returns
    -------
    list
        Deduplicated sentences in first-seen order (deterministic, unlike the
        previous ``list(set(...))``).
    """
    clusters = list(clusters)
    if label is None:
        label = ', '.join(str(cluster) for cluster in clusters)

    parses = []
    for cluster in clusters:
        graphs = get_graphs_per_cluster(cluster)
        parses += mk_2parse_file(graphs, cluster)

    len_before = len(parses)
    # The same sentence can occur in several clusters; keep the first one.
    parses = list(dict.fromkeys(parses))
    print(len_before - len(parses), ' reductions across clusters ', label)
    print(len(parses), ' sentences to parse across clusters ', label)
    return parses
#parses = get_parses(fish_cluster)

## Chunking for parallel parsing

In [9]:
def chunk(parses, chunk_size, name):
    """Split ``parses`` into consecutive slices and write one file per slice.

    Files are written to ``../amr/2parse/<name>_chunk_<i>.txt`` with one
    sentence per line, where ``i`` counts the slices from 0.

    Parameters
    ----------
    parses : list of str
        Sentences to distribute.
    chunk_size : int
        Despite its name this is the divisor that determines the slice
        length: every slice holds ``len(parses) // chunk_size + 1`` sentences
        (the last one may be shorter), so it acts as the approximate number
        of files to produce. Must be at least 1.
    name : str
        Prefix of the output file names.

    Raises
    ------
    ValueError
        If ``chunk_size`` is smaller than 1. A value of 0 used to fail with
        ``ZeroDivisionError`` and a negative value could loop forever.
    """
    if chunk_size < 1:
        raise ValueError('chunk_size must be >= 1, got %r' % (chunk_size,))

    file_len = len(parses)//chunk_size + 1
    print(file_len)
    print(file_len*chunk_size)

    chunks = [parses[start:start+file_len]
              for start in range(0, len(parses), file_len)]
    print(len(chunks))

    # enumerate() gives every slice its own position. The previous
    # chunks.index(chunk) lookup was quadratic and returned the FIRST equal
    # slice, so two identical slices were written to the same file.
    for position, sentences in enumerate(chunks):
        new_name = name + '_chunk_'+str(position)+'.txt'
        with open('../amr/2parse/'+new_name, 'w') as f:
            for sent in sentences:
                f.write(sent+'\n')
        
#chunk(parses, 5, 'animal')

In [11]:
# Gather the unique sentences of all food-related topics (this also writes one
# c_<id>.txt file per topic), then spread them over files named
# food_chunk_<i>.txt so they can be parsed in parallel.
parses = get_parses(food_cluster)
chunk(parses, 5, 'food')

300  reductions in cluster  13
602  reductions in cluster  17
314  reductions in cluster  19
487  reductions in cluster  25
454  reductions in cluster  27
1207  reductions in cluster  28
1701  reductions in animal related clusters
25427  sentences to parse in animal related clusters
5086
25430
5
