# Punctuation in novels
Inspired by https://medium.com/@neuroecology/punctuation-in-novels-8f316d542ec4#.qwj8e1n8m

In [1]:
import string
import collections
from PIL import Image, ImageDraw
from math import ceil

The `string` module has some nice subsets of characters. Does it know about punctuation?

In [2]:
# Display the characters that the `string` module classes as punctuation.
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

## Getting the punctuation
First, let's just open a text file and pull out the punctuation. We can also count how many times each punctuation character occurs in it.

In [20]:
# Read the whole text into memory. The `with` block closes the file handle as
# soon as the read finishes instead of leaving it open until garbage collection.
with open('sherlock-holmes.txt') as f:
    sherlock = f.read()

In [21]:
# Keep only the punctuation characters of the text, in their original order,
# then print them as one unbroken line.
sherlock_punct = list(filter(lambda ch: ch in string.punctuation, sherlock))
print(*sherlock_punct, sep='')

..-.......'.........,,,.,,,.,.-'..,-,.,,...,-,,,,,,,,.,,,,.:,,.,,,.-,-(,.-,,,,.,,,,.,,.,,..-...;,,.,,,,.."".",,""".",.,,.,."","",.,""",".,.,'.,,,,,",."";",,..,,,-.,,,-,,,"."",",.""",,.",..,"""""",""""?'""!...,,.--,,,",--.""."",."-,'","...,"""."""..,..",."",'.".""-".".",""""""""""."".",;,"".'''''''''''',''''".",-,.--,.',--',,,",.""."..-''..,,.,,"',..","."",."..',,"","",....""-"".,..,,",,.."".,.,,.-,-.,,.-,,,,,.,,,,.""."",."".",.,."",.,,,.,",.",".""."",";."""."""".",""".",.,,""",.,.."","".,,."";"."",".",-,""",,".."",",.":,,-,."",".,.--.""."!.-!,!-!-!-!,,,,"".-"""""""""".,"""",!""-"""""""""""""",!!""-""""..""""""."""",""....""""".""."".",.""""""""""-,...""""...,.,.,-""""""""."",".",.,,"".""""."","""""""""."".-."'".",,.'"."".""""",-,,.-,",."-'".',.'.,,,.,,,,.,,,,,,,..,-,--,,.',.,-,.,.",",,,."""'.,""'.,,"";.,.'..,..,,,..-,,,.,-.,.",,.,,,,,,""".",'..-,.,,,.,.,.,,,,..,..-,.,,."...,?,,?,.,.,'.,.,,""".",.,,,-.,,.",-,,,..,.,,'','&',..'",,-,,.'.,,.'.,',''",.,.,.'.',''-,.".',...,...,,,.''.''.!'''.

In [22]:
# Tally how many times each punctuation character occurs.
sherlock_counts = collections.Counter(sherlock_punct)
# Bare expression: shows the counter as the cell's output.
sherlock_counts

Counter({'!': 171,
         '"': 4834,
         '&': 5,
         "'": 1490,
         '(': 5,
         ',': 7053,
         '-': 965,
         '.': 4843,
         '/': 1,
         ':': 56,
         ';': 202,
         '?': 138})

In [25]:
def punct_summarise(fname):
    """Extract and tally the punctuation in a text file.

    Args:
        fname: Path of the text file to read.

    Returns:
        A dict with two entries: 'punctuation' is a string holding every
        character of the file that appears in ``string.punctuation``, in file
        order; 'counts' is a ``collections.Counter`` of those characters.
    """
    # The context manager closes the file as soon as it has been read; a bare
    # open(...).read() leaves the handle open until garbage collection.
    with open(fname) as f:
        content = f.read()
    punct = ''.join(c for c in content if c in string.punctuation)
    counts = collections.Counter(punct)
    return {'punctuation': punct, 'counts': counts}

In [26]:
# Summarise the Sherlock Holmes text. This rebinds `sherlock` to the summary
# dict returned by punct_summarise.
sherlock = punct_summarise('sherlock-holmes.txt')
# Show the per-character counts as the cell's output.
sherlock['counts']

Counter({'!': 171,
         '"': 4834,
         '&': 5,
         "'": 1490,
         '(': 5,
         ',': 7053,
         '-': 965,
         '.': 4843,
         '/': 1,
         ':': 56,
         ';': 202,
         '?': 138})

In [28]:
# Summarise the punctuation of War and Peace and show the per-character counts.
wap = punct_summarise('war-and-peace.txt')
wap['counts']

Counter({'!': 3923,
         '"': 17970,
         '#': 1,
         '$': 2,
         '%': 1,
         "'": 7529,
         '(': 670,
         ')': 670,
         '*': 300,
         ',': 39891,
         '-': 6308,
         '.': 30805,
         '/': 29,
         ':': 1014,
         ';': 1145,
         '=': 2,
         '?': 3137,
         '@': 2,
         '[': 1,
         ']': 1})

In [64]:
# Summarise the punctuation of the Shakespeare text and show the per-character counts.
shakespeare = punct_summarise('shakespeare.txt')
shakespeare['counts']

Counter({'!': 10815,
         '"': 6,
         '&': 10,
         "'": 27942,
         ',': 82750,
         '-': 4590,
         '.': 36881,
         ':': 10649,
         ';': 17400,
         '?': 10327,
         '[': 19,
         ']': 18})

In [90]:
# Summarise the punctuation of Ulysses and show the per-character counts.
ulysses = punct_summarise('ulysses.txt')
ulysses['counts']

Counter({'!': 1576,
         '"': 8,
         '%': 3,
         '&': 3,
         "'": 4485,
         '(': 1777,
         ')': 1788,
         '*': 90,
         '+': 2,
         ',': 16349,
         '-': 5037,
         '.': 21361,
         '/': 58,
         ':': 2564,
         ';': 34,
         '?': 2235,
         '_': 4566})

In [63]:
# Print 50-character windows 5 to 24 of the two punctuation streams side by
# side: Sherlock Holmes on the left, War and Peace on the right.
line_len = 50
for i in range(5, 25):
    window = slice(line_len * i, line_len * (i + 1))
    print(sherlock['punctuation'][window], wap['punctuation'][window])

.",."",'.".""-".".",""""""""""."".",;,"".''''''''' ,.",","'..?"".,",,",,,.,..?.?"",".",",,"?.",..",",
''',''''".",-,.--,.',--',,,",.""."..-''..,,.,,"',. .,',.',..,,(),:".?.".",",--".?',.',".".,'."."',"."
.","."",."..',,"","",....""-"".,..,,",,.."".,.,,.- .""';.?"(),"'....",,.."?".",.,..",."?,",."...'!",.
,-.,,.-,,,,,.,,,,.""."",."".",.,."",.,,,.,",.","." ."?".",',.,.",,,.",",,"?,",",?":"'....?""..-,'.',.
"."",";."""."""".",""".",.,,""",.,.."","".,,."";". .;,.--'."",,",'."-,.'.",',,,.",",,"',',,.''."'.:.'
"",".",-,""",,".."",",.":,,-,."",".,.--.""."!.-!,! ,,';.,,*.,,.',,,..*.,",""?",,;,'.,,;.,,,",,.",,.-.
-!-!-!,,,,"".-"""""""""".,"""",!""-"""""""""""""", ,,,,.,-----.,,,.,,,,.,,.,,,,,.",",.",,,",.",."-,-,
!!""-""""..""""""."""",""....""""".""."".",."""""" ,.",,,".",",,"?.?",,,."!".,-,,-,,.,-'.,,..-,,,.,,,
""""-,...""""...,.,.,-""""""""."",".",.,,"".""""." .",,,",.,.,.','.:"?."",,.""?"..,.,,'.",".,,.,,,,,,
","""""""""."".-."'".",,.'"."".""""",-,,.-,",."-'" -,,,..,.,'.,,,.-..,.'..,,,.,,.,

In [87]:
def compare(text1, text2, offset=0, line_len=50):
    """Print two strings side by side, one chunk of each per output line.

    Args:
        text1: String shown in the left-hand column.
        text2: String shown in the right-hand column.
        offset: Index in both strings at which to start.
        line_len: Number of characters taken from each string per line.

    Each line shows the chunk of ``text1`` padded with spaces to ``line_len``
    characters, a space, then the matching chunk of ``text2``. Printing
    continues until the end of the longer string.
    """
    longest = max(len(text1), len(text2))
    for start in range(offset, longest, line_len):
        stop = start + line_len
        # Pad the left chunk so the right-hand column stays aligned.
        left = text1[start:stop].ljust(line_len)
        print(left, text2[start:stop])

In [88]:
# Side-by-side view: Sherlock Holmes (left) against War and Peace (right).
compare(sherlock['punctuation'], wap['punctuation'])

..-.......'.........,,,.,,,.,.-'..,-,.,,...,-,,,,, ,.,-..::::,[#]:,:******,/:::::-:-:-:-:::::::-:-:",
,,,.,,,,.:,,.,,,.-,-(,.-,,,,.,,,,.,,.,,..-...;,,., ,.,',----,','!?--.",,-,.,,..,,;.,.,,-,:",(),,--.""
,,,.."".",,""".",.,,.,."","",.,""",".,.,'.,,,,,",. !!",.,,,,.,,.,,,,,.",,.',",."??".",?""'?.,".".""'.
"";",,..,,,-.,,,-,,,"."",",.""",,.",..,"""""","""" ."",,",,-,."'!,'?.""?",."?,.",.,,.,,.,,,,,,,,.:",'
?'""!...,,.--,,,",--.""."",."-,'","...,"""."""..,. .',,,.!..!,.,!....,,?...'..,,.?.-,.?!!,....',...!"
.",."",'.".""-".".",""""""""""."".",;,"".''''''''' ,.",","'..?"".,",,",,,.,..?.?"",".",",,"?.",..",",
''',''''".",-,.--,.',--',,,",.""."..-''..,,.,,"',. .,',.',..,,(),:".?.".",",--".?',.',".".,'."."',"."
.","."",."..',,"","",....""-"".,..,,",,.."".,.,,.- .""';.?"(),"'....",,.."?".",.,..",."?,",."...'!",.
,-.,,.-,,,,,.,,,,.""."",."".",.,."",.,,,.,",.","." ."?".",',.,.",,,.",",,"?,",",?":"'....?""..-,'.',.
"."",";."""."""".",""".",.,,""",.,.."","".,,."";". .;,.--'."",,",'."-,.'.",',,,.",

In [89]:
# Side-by-side view: Shakespeare (left) against Sherlock Holmes (right).
compare(shakespeare['punctuation'], sherlock['punctuation'])

-',,:;!;,,'.;;,-,.,,;;;.,',;,,,.,!,:'?,,.,.,.,:,,' ..-.......'.........,,,.,,,.,.-'..,-,.,,...,-,,,,,
:,,,,'-;,,;',,,,,,,,';'';',,.,,,,,;,,.,?',.,;',,,. ,,,.,,,,.:,,.,,,.-,-(,.-,,,,.,,,,.,,.,,..-...;,,.,
..;,,',.'...,;,,.,.,,;,,',',,',,.,;',,,,.,,,,,.;,, ,,,.."".",,""".",.,,.,."","",.,""",".,.,'.,,,,,",.
-,',,;'.,;,,.',;':.!,,;,.,,',';;',';,,'.?,',',,;,, "";",,..,,,-.,,,-,,,"."",",.""",,.",..,"""""",""""
,,,.,;,--,.,,;,;,.,,',,,,.,:,?,:,..,!??,.!,,;,,!'. ?'""!...,,.--,,,",--.""."",."-,'","...,"""."""..,.
,!'.,!'.,,,,,,,,,,,,,'!':.',:,,,,'.:,,.,,:;.,,,.', .",."",'.".""-".".",""""""""""."".",;,"".'''''''''
'-,,,,,.!',,',,',,,,,-.,.,.!??.:!-!'',,.:!,,,;,,'. ''',''''".",-,.--,.',--',,,",.""."..-''..,,.,,"',.
,,'.!,'.,.!.,.!.,.,.,,.,:!:;.,':!,,'.,.-,',,',''., .","."",."..',,"","",....""-"".,..,,",,.."".,.,,.-
-,,;,.,:;!,:'.,.,:,!'!;?;;,',,.,,.,,'.';:,'.,';'', ,-.,,.-,,,,,.,,,,.""."",."".",.,."",.,,,.,",.","."
';,',.':-;,:,.?,,.',,,-.,,;,.,,,.,,.,,.,..,..,.,,. "."",";."""."""".",""".",.,,"""

In [91]:
# Side-by-side view: Sherlock Holmes (left) against Ulysses (right).
compare(sherlock['punctuation'], ulysses['punctuation'])

..-.......'.........,,,.,,,.,.-'..,-,.,,...,-,,,,, ----,,.,,.:--__.,:--,!,!.,.,,,.,,,,,..--!.':--,,:.
,,,.,,,,.:,,.,,,.-,-(,.-,,,,.,,,,.,,.,,..-...;,,., ,.,...,.,,...--,,..,?,.,..--!.,!,.,,,.'.--:,.,'?..
,,,.."".",,""".",.,,.,."","",.,""",".,.,'.,,,,,",. ?,,:--?!,.--,,.--,?--?.--,'?..'.,!..,,.'.,:,-..--,
"";",,..,,,-.,,,-,,,"."",",.""",,.",..,"""""","""" .?--!.?--,.'..',....--!.,',:--...,,:--'!:.,'?,.--!
?'""!...,,.--,,,",--.""."",."-,'","...,"""."""..,. .':?..__.,,!.._!_!....--!.'.--,.''.--,.--,,,,.'...
.",."",'.".""-".".",""""""""""."".",;,"".''''''''' .....--!.,!,,.,,-.,,.,,,,,,,.....--,!..?--,..--,..
''',''''".",-,.--,.',--',,,",.""."..-''..,,.,,"',. .,.'.',.'.--,.'.--',..'...--,,...'.!...--,,!,...?.
.","."",."..',,"","",....""-"".,..,,",,.."".,.,,.- .--',.....,'.--,.!,:--.-.',.--',,?.....--!.''..,,.
,-.,,.-,,,,,.,,,,.""."",."".",.,."",.,,,.,",.","." .'..--.'.'???''.'.:,.,!,!!,,'.'.'!'!.,,',........-
"."",";."""."""".",""".",.,,""",.,.."","".,,."";". -,.'.--?..'.?,..--?.--,?.'.'.,.

In [52]:
from PIL import Image, ImageDraw
from math import ceil

In [38]:
# Number of punctuation characters extracted from the Sherlock Holmes text.
len(sherlock['punctuation'])

19763

In [46]:
# Draw the Sherlock Holmes punctuation as a grid of coloured blocks and save
# it to test.png.
# Periods, question marks and exclamation marks are red.
# Commas and quotation marks are green.
# Semicolons and colons are blue.
# Any other character falls back to the grey 'unknown' colour.
colours = {'.': (255, 0, 0), '?': (255, 0, 0), '!': (255, 0, 0),
           ',': (0, 255, 0), '"': (0, 255, 0), "'": (0, 255, 0),
           ':': (0, 0, 255),  ';': (0, 0, 255),
           'unknown': (128, 128, 128)}
max_x = 1000
max_y = 1000
block_size = 4
text = sherlock['punctuation']
img = Image.new('RGBA', (max_x, max_y))
draw = ImageDraw.Draw(img)
x = 0
y = 0
for p in text:
    this_colour = colours.get(p, colours['unknown'])
    # Pillow rectangles include both corner points, so the far corner sits
    # block_size - 1 pixels away to paint exactly block_size x block_size.
    draw.rectangle((x, y, x + block_size - 1, y + block_size - 1),
                   fill=this_colour)
    x += block_size
    if x >= max_x:
        # Row is full: wrap to the start of the next row of blocks.
        x = 0
        y += block_size
img.save('test.png')

In [79]:
# Periods, question marks and exclamation marks are red.
# Commas and quotation marks are blue.
# Semicolons and colons are green.
def make_image(text, block_size=4, width=1000, colours=None):
    """Render a string as a grid of coloured square blocks.

    Characters are laid out left to right, wrapping to a new row of blocks
    once the current row reaches ``width`` pixels.

    Args:
        text: The characters to draw, one block per character.
        block_size: Side length of each block, in pixels.
        width: Width of the image, in pixels.
        colours: Optional dict mapping a character (or the key 'unknown') to
            an RGB tuple. Entries override the default colours; characters
            with no entry are drawn in the 'unknown' colour.

    Returns:
        An RGBA ``PIL.Image`` tall enough to hold every block.

    Raises:
        ValueError: If ``block_size`` or ``width`` is less than 1.
    """
    if block_size < 1 or width < 1:
        raise ValueError('block_size and width must be positive')
    use_colours = {'.': (255, 0, 0), '?': (255, 0, 0), '!': (255, 0, 0),
                   ',': (0, 0, 255), '"': (0, 0, 255), "'": (0, 0, 255),
                   ':': (0, 255, 0), ';': (0, 255, 0),
                   'unknown': (128, 128, 128)}
    if colours:
        use_colours.update(colours)
    # A row wraps once x reaches `width`, so it holds ceil(width / block_size)
    # blocks (the last one may be clipped by the right edge).
    blocks_per_row = ceil(width / block_size)
    rows = ceil(len(text) / blocks_per_row)
    # Height in pixels is rows * block_size. Using the row count alone as the
    # height made the image block_size times too short and clipped the text.
    height = rows * block_size
    img = Image.new('RGBA', (width, height))
    draw = ImageDraw.Draw(img)
    x = 0
    y = 0
    for p in text:
        this_colour = use_colours.get(p, use_colours['unknown'])
        # Pillow rectangles include both corner points, so the far corner sits
        # block_size - 1 pixels away to paint exactly block_size x block_size.
        draw.rectangle((x, y, x + block_size - 1, y + block_size - 1),
                       fill=this_colour)
        x += block_size
        if x >= width:
            x = 0
            y += block_size
    return img

In [80]:
# Render the Sherlock Holmes punctuation with the default settings and save it.
i = make_image(sherlock['punctuation'])
i.save('sherlock.png')

In [81]:
# Render the War and Peace punctuation with the default settings and save it.
i = make_image(wap['punctuation'])
i.save('wap.png')

In [82]:
# Render the Shakespeare punctuation with the default settings and save it.
i = make_image(shakespeare['punctuation'])
i.save('shakespeare.png')

In [92]:
# Render the Ulysses punctuation with the default settings and save it.
i = make_image(ulysses['punctuation'])
i.save('ulysses.png')

In [57]:
# Re-render War and Peace with larger 6-pixel blocks; saving under the same
# name replaces any existing wap.png.
i = make_image(wap['punctuation'], block_size=6)
i.save('wap.png')

In [84]:
# Re-render War and Peace with 2-pixel blocks on a 300-pixel-wide image,
# drawing hyphens in white rather than the default 'unknown' grey; saving under
# the same name replaces any existing wap.png.
i = make_image(wap['punctuation'], block_size=2, width=300, colours={'-': (255,255,255)})
i.save('wap.png')