# Summarization

In [1]:
import math
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer

## Building a dictionary of NASARI vectors for fast lookup

In [2]:
# Lookup table: lowercased second field of each record -> list of its terms.
nasari = {}
with open('res/dd-small-nasari-15.txt', 'r', encoding='utf-8') as file:
    lines = file.readlines()
for record in lines:
    # Records are ';'-separated. Field 0 is ignored here, field 1 is used as
    # the key, and every remaining field is one vector entry.
    fields = record.split(';')
    key = fields[1].lower()
    # Keep only the part of each entry before its first '_'
    # (presumably the term, with a weight after the '_' -- TODO confirm).
    nasari[key] = [entry.split('_')[0] for entry in fields[2:]]

## Extracting paragraphs

In [3]:
# Document to summarise. Other available inputs:
#   "Andy-Warhol", "Ebola-virus-disease", "Life-indoors"
filename = "Napoleon-wiki"
with open('res/text-documents/'+ filename + '.txt', 'r', encoding='utf-8') as file:
    lines = file.readlines()
# The first line of the file is skipped (presumably a source/header line --
# TODO confirm). Every remaining line that is not just a newline becomes one
# paragraph, title included, cut at its first '\n'.
paragraphs = [line.split('\n')[0] for line in lines[1:] if line != '\n']

## Preprocessing the paragraphs (removing punctuation and stop words)

In [4]:
# English stop words as a set for O(1) membership tests.
sw = set(stopwords.words('english'))
# r'\w+' keeps runs of word characters only, which drops punctuation.
tokenizer = RegexpTokenizer(r'\w+')

# pp[i] is the list of lowercased, non-stop-word tokens of paragraphs[i].
pp = []
for par in paragraphs:
    # Lowercase *before* the stop-word test: NLTK's English list is lowercase,
    # so testing the original-case token lets capitalised stop words
    # ("The", "In", ...) slip through.
    tokens = [w.lower() for w in tokenizer.tokenize(par)]
    pp.append([w for w in tokens if w not in sw])

## Extracting vectors

In [5]:
def _add_unseen_terms(context, terms):
    """Insert every term not yet in ``context``, giving it the next rank.

    Ranks start at 1 and follow insertion order, so the next free rank is
    always ``len(context) + 1``.
    """
    for term in terms:
        if term not in context:
            context[term] = len(context) + 1


# p_contexts[i] holds one context dict (term -> rank) for every word of
# paragraph i that has an entry in `nasari`.
p_contexts = []
for words in pp:
    contexts = []
    for word in words:
        if word not in nasari:
            continue  # no vector for this word: it contributes no context
        # A dict is used instead of a list so membership/rank lookup is O(1).
        context = {}
        _add_unseen_terms(context, nasari[word])
        # One-hop expansion: append the vectors of the word's own terms.
        # list(context) snapshots the first-level terms, so terms added
        # during the expansion are not expanded in turn.
        for key in list(context):
            if key in nasari:
                _add_unseen_terms(context, nasari[key])
        contexts.append(context)
    p_contexts.append(contexts)
p_contexts

[[],
 [{'rose': 1,
   'rise': 2,
   'flower': 3,
   'garden': 4,
   'rosa': 5,
   'hybrid': 6,
   'petal': 7,
   'cultivar': 8,
   'shrub': 9,
   'plant': 10,
   'tea': 11,
   'bloom': 12,
   'meilland': 13,
   'pink': 14,
   'pollen': 15,
   'specie': 16,
   'pollinator': 17,
   'pollination': 18,
   'stamen': 19,
   'streptocarpus': 20,
   'stem': 21,
   'carpel': 22,
   'cyclamen': 23,
   'genus': 24,
   'seed': 25,
   'gardens': 26,
   'ġnien': 27,
   'gardening': 28,
   'botanical garden': 29,
   'gardener': 30,
   'plantation': 31,
   'kew': 32,
   'orchard': 33,
   'design': 34,
   'tree': 35,
   'grow': 36,
   'sepal': 37,
   'nectar': 38,
   'orchid': 39,
   'pollinate': 40,
   'corolla': 41,
   'centimeter': 42,
   'cultigen': 43,
   'cultivated plant': 44,
   'bromeliad': 45,
   'botanical': 46,
   'neoregelia': 47,
   'nomenclature': 48,
   'code': 49,
   'name': 50,
   'taxonomy': 51,
   'wild': 52,
   'fruit': 53,
   'hylmö': 54,
   'cassava': 55,
   'coca': 56,
   'coton

## Computing similarity

In [6]:
def wo(v1, v2): #Word Overlap
    """Weighted overlap between two ranked term vectors.

    Parameters
    ----------
    v1, v2 : dict
        Map each term to its rank (1 = most relevant).

    Returns
    -------
    int or float
        0 when the vectors share no term; otherwise
        sum over shared terms of 1 / (rank_in_v1 + rank_in_v2), divided by
        sum for i = 1..|overlap| of 1 / (2 * i).
        The normaliser is the best achievable numerator, so two identical
        vectors score exactly 1.
    """
    overlap = v1.keys() & v2.keys()
    if not overlap:
        return 0
    numerator = sum(1 / (v1[term] + v2[term]) for term in overlap)
    # Parenthesised on purpose: `1/2*i` evaluates as (1/2)*i, which is i/2.
    denominator = sum(1 / (2 * i) for i in range(1, len(overlap) + 1))
    return numerator / denominator

In [7]:
def sim(p1, p2): #Similarity between paragraphs
    """Return the best pairwise context similarity between two paragraphs.

    p1 and p2 are iterables of context dicts. Every context of p1 is compared
    with every context of p2 via sqrt(wo(c1, c2)) and the largest value is
    returned. The result is 0 when either paragraph has no contexts.
    """
    pair_scores = [math.sqrt(wo(left, right)) for left in p1 for right in p2]
    # The leading 0 plays the role of the running maximum's start value.
    return max([0] + pair_scores)

In [8]:
# Score every paragraph after the first against the first paragraph's
# contexts, as (similarity, paragraph index) pairs, highest similarity first.
scores = sorted(
    (
        (sim(p_contexts[0], par_contexts), position)
        for position, par_contexts in enumerate(p_contexts)
        if position > 0
    ),
    reverse=True,
)

In [9]:
# Show the ranked (similarity, paragraph index) pairs, highest similarity first.
scores

[(0, 17),
 (0, 16),
 (0, 15),
 (0, 14),
 (0, 13),
 (0, 12),
 (0, 11),
 (0, 10),
 (0, 9),
 (0, 8),
 (0, 7),
 (0, 6),
 (0, 5),
 (0, 4),
 (0, 3),
 (0, 2),
 (0, 1)]

In [10]:
# Percentage of the scored paragraphs to drop for each generated summary.
compression_rates = [10, 20, 30]
for r in compression_rates:
    # Number of lowest-ranked paragraphs to discard at this rate.
    n = int(len(scores) * r / 100)
    # Use an explicit end index: scores[:-n] is *empty* when n == 0, which
    # would drop every paragraph instead of none. max() guards rates > 100.
    keep = max(len(scores) - n, 0)
    idx = [p[1] for p in scores[:keep]]
    print(idx)
    new_filename = filename + '_sum_' + str(r)
    with open('res/text-documents/' + new_filename + '.txt', 'w', encoding='utf-8') as file:
        # paragraphs[0] is always written first, ahead of the kept paragraphs.
        file.write(paragraphs[0] + '\n')
        # Write the kept paragraphs in document order, not ranking order,
        # so the summary reads in the same sequence as the source text.
        for i in sorted(idx):
            file.write(paragraphs[i] + '\n')

[17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2]
[17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4]
[17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6]
