# Text Segmentation

In [59]:
from nltk.tokenize import sent_tokenize
from nltk.tokenize import RegexpTokenizer
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
import numpy as np

## Loading and Sentence Splitting

In [60]:
with open('res/TextToTile.txt') as file:
    sents = sent_tokenize(file.read())

## Preprocessing

In [62]:
stopwords = stopwords.words('english')
tokenizer = RegexpTokenizer(r'\w+')
stemmer = PorterStemmer()
psents = []
for sent in sents:
    psents.append([stemmer.stem(w.lower()) for w in tokenizer.tokenize(sent) if not w in stopwords])

## Sentence Transformation

In [63]:
# Dictionary creation
words = set()
dictionary = {}
for sent in psents:
    words.update(sent)
words = list(words)
for i in range(len(words)):
    dictionary[words[i]] = i

In [64]:
# Sentence transformtion
for i in range(len(psents)):
    frequencies = [0] * len(dictionary)
    for word in psents[i]:
        frequencies[dictionary[word]] += 1
    psents[i] = frequencies

## Splitting

In [65]:
def cos_sim(a, b):
    return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))

In [99]:
# separators and split frequencies
separators = [i * 10 for i in range(1, round(len(sents) / 10))]
splits = []
start = 0
for s in separators:
    splits.append(np.sum(psents[start:s], 0))
    start = s
splits.append(np.sum(psents[start:], 0))

In [100]:
improvement = True
while(improvement):
    improvement = False
    for i in range(len(separators)):
        siml = cos_sim(psents[separators[i] - 1], splits[i + 1])
        simr = cos_sim(psents[separators[i]], splits[i])
        if siml <= cos_sim(psents[separators[i] - 1], np.subtract(splits[i], psents[separators[i] - 1])):
            siml = 0
        if simr <= cos_sim(psents[separators[i]], np.subtract(splits[i + 1], psents[separators[i]])):
            simr = 0
        print(siml, simr)
        if siml != 0 or simr != 0:
            if(siml >= simr):
                splits[i] = np.subtract(splits[i], psents[separators[i] - 1])
                splits[i + 1] = np.add(splits[i + 1], psents[separators[i] - 1])
                separators[i] -= 1
            else:
                splits[i] = np.add(splits[i], psents[separators[i]])
                splits[i + 1] = np.subtract(splits[i + 1], psents[separators[i]])
                separators[i] += 1
            improvement = True

0 0.09534625892455922
0 0.3089571903266623
0.23371317622140134 0.11605177063713189
0.0236227795630767 0.18663083698528476
0 0.13825031590233275
0.05001563232803554 0.21266436150250076
0.10826639239215337 0.026785981207297003
0.10369516947304255 0
0 0
0 0.18857036045412787
0 0.2631174057921088
0.08362420100070908 0
0 0
0 0.13006649542861798
0 0.1739313106957345
0.17937941173235003 0
0 0
0 0
0 0
0.0882230841550732 0
0 0.10724550081047436
0 0
0 0.054312544659356844
0 0
0.06604581866838263 0
0 0
0 0.173421993904824
0 0.11952286093343938
0 0
0 0
0 0
0 0.15167191839776337
0.05559369874958259 0
0 0
0 0
0 0
0.21160368475757949 0
0 0
0 0
0 0.14616085094950185
0.09166984970282112 0
0.18764665626020038 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0.07733602811121826 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0.0576629931704892 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0


In [106]:
start = 0
for i in range(len(separators)):
    print('Split', i + 1, ':')
    print(sents[start:separators[i]])
    start = separators[i]
print('Split', len(separators) + 1, ':')
print(sents[start:])

Split 1 :
['ï»¿As I write these words, I am sitting in my quiet study in Sussex, England, looking out over the rose-garden towards the belt of trees which shields us from the sea.', 'There is the gentle breeze so familiar in Selsey, but nothing more.', 'Yet it has been claimed that if the Earth were spinning round, as conventional scientists claim, there would be a howling gale all the time.', 'To see just how this theory works, we must go back almost two thousand years - in fact to the second century A.D., when the most famous scientist in the world was Claudius Ptolemaus, better known as Ptolemy.', 'We know very little about his life, except that he flourished from around A.D. 120 to 180; that he lived in Alexandria, and that he belonged to the Greek school of thought.', 'I-le was an expert astronomer and mathematician, and also a geographer; his map of the known world was remarkably good, even though he did join Scotland on to England in a sort of back-to-front position.', 'Also, he