In [2]:
import json
import os
import sqlite3
from collections import Counter
from glob import glob

import cufflinks as cf
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import seaborn as sns
import statsmodels
from matplotlib import pyplot as plt

In [3]:
# Read the raw text of every file under results-json/. Each file is opened in
# a context manager so its handle is closed as soon as it has been read, and
# the encoding is pinned to UTF-8 rather than the platform default.
jsonFiles = []
for resultsPath in glob('results-json/*'):
    with open(resultsPath, encoding='utf-8') as resultsFile:
        jsonFiles.append(resultsFile.read())

In [4]:
# Decode each file's JSON text into Python objects, one entry per file.
parsedFiles = list(map(json.loads, jsonFiles))

In [5]:
# Sanity check: number of top-level entries in the first parsed results file.
len(parsedFiles[0])

37

In [6]:
# Build {textName: {colorWord: nMatches}} from every parsed results file.
# Each book entry is indexed at [0] to reach the record holding 'textName'
# and 'statsList'; a textName seen more than once keeps its last value.
allData = {}
for fileData in parsedFiles:
    for bookData in fileData:
        bookStats = bookData[0]
        allData[bookStats['textName']] = {
            stat['colorWord']: stat['nMatches']
            for stat in bookStats['statsList']
        }

In [7]:
# Outer keys (text names) become columns and inner keys (color words) become
# the row index; missing text/color combinations are filled with NaN.
df = pd.DataFrame(data=allData)

In [8]:
# Flip to one row per text, ordered by text name.
# NOTE: this rebinds `df` to its own transpose, so re-running the cell flips
# the frame back again.
df = df.transpose().sort_index()

In [9]:
# Display the frame: one row per text, one column per color word.
df

Unnamed: 0,black,midnight,dark,blood,brown,crimson,dusk,violet,red,earth,...,terracotta,light lavender,hot green,warm pink,pale magenta,violet pink,brownish purple,bluey green,algae,macaroni and cheese
1880-DoraThorne-2374,0.000012,,0.000182,0.000005,,0.000023,,0.000007,0.000004,0.000032,...,,,,,,,,,,
1880-GreeneFerneFarm-37046,0.000102,,0.000128,0.000053,0.000071,0.000031,0.000009,0.000009,0.000084,0.000093,...,,,,,,,,,,
1880-JezebelsDaughter-3633,0.000021,0.000002,0.000030,0.000012,0.000005,,,0.000002,0.000018,0.000004,...,,,,,,,,,,
1880-RoundAboutaGreatEstate-20528,0.000114,0.000005,0.000041,0.000010,0.000077,0.000005,0.000005,0.000005,0.000067,0.000026,...,,,,,,,,,,
1880-ShakespearesInsomniaandtheCausesThereof-11990,,,0.000020,0.000020,,,,0.000020,,0.000039,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1925-TheVerseBookofaHomelyWoman-3477,0.000017,,0.000052,,0.000069,0.000017,0.000017,0.000034,0.000052,0.000103,...,,,,,,,,,,
192511-ModernBritishPoetry-26785,0.000082,0.000023,0.000085,0.000056,0.000036,0.000023,0.000013,0.000007,0.000088,0.000170,...,,,0.000003,,,,,,,
192711-OliverCromwellAPlay-17091,,0.000012,0.000012,0.000058,,,,,0.000012,0.000023,...,,,,,,,,,,
1928-BrowningsShorterPoems-16376,0.000067,0.000010,0.000030,0.000070,0.000020,,0.000007,,0.000077,0.000157,...,,,,,,,,,,


In [10]:
# Load the color-name -> hex-string map. The context manager closes the file
# handle once the JSON has been parsed.
with open('../data/maps/xkcd/rgb.json', encoding='utf-8') as xkcdMapFile:
    xkcdMap = json.load(xkcdMapFile)

In [11]:
# Spot-check one entry of the color-name -> hex map.
xkcdMap['black']

'#000000'

In [12]:
def combineHexes(colorDict):
    """
    Proportionally mix hex colors from dict like {"ffffff": 1.0, "0000ff": 0.5 ... }

    Each key is a six-digit RRGGBB hex string (a leading "#" is tolerated) and
    each value is that color's weight. Every channel of the result is the
    weighted mean of the inputs' channels, truncated to an integer.

    Returns the blended color as a lowercase six-digit hex string without "#",
    or None when the weights sum to zero (which includes an empty dict).
    """
    # Sort so the floating-point sums accumulate in a reproducible order
    # regardless of the dict's insertion order.
    colors = sorted(
        (k[1:] if k.startswith('#') else k, v) for k, v in colorDict.items()
    )
    weights = sum(colorDict.values())
    if weights == 0:
        return None

    def channel(start):
        # Weighted mean of the two hex digits starting at `start`.
        total = sum([int(k[start:start + 2], 16) * v for k, v in colors])
        return int(total / weights)

    # '{:02x}' zero-pads each channel to exactly two lowercase hex digits.
    return ''.join('{:02x}'.format(channel(start)) for start in (0, 2, 4))

In [13]:
# Blend each text's color-word weights into a single average hex color.
# Columns whose name is not a key of xkcdMap are ignored.
bookAverages = []
for textName, row in df.iterrows():
    hexWeights = {}
    for color, prop in row[row.notna()].items():
        colorHex = xkcdMap.get(color)
        if colorHex is None:
            continue
        hexKey = colorHex[1:]  # drop the leading '#'
        # Accumulate rather than assign: if two color words share a hex
        # value, plain assignment would discard the earlier word's weight.
        hexWeights[hexKey] = hexWeights.get(hexKey, 0) + prop
    bookAverages.append(combineHexes(hexWeights))
# Assign the column once (values are in row order) instead of per row.
df['avg'] = bookAverages

In [14]:
# Inspect the per-text average hex colors.
df['avg']

1880-DoraThorne-2374                                  99987d
1880-GreeneFerneFarm-37046                            8b855f
1880-JezebelsDaughter-3633                            9d6c65
1880-RoundAboutaGreatEstate-20528                     9d9a60
1880-ShakespearesInsomniaandtheCausesThereof-11990    a48a42
                                                       ...  
1925-TheVerseBookofaHomelyWoman-3477                  8f7e68
192511-ModernBritishPoetry-26785                      8b7f69
192711-OliverCromwellAPlay-17091                      7f6c4f
1928-BrowningsShorterPoems-16376                      8c7259
1928-TheBrotherofDaphne-748                           8a5560
Name: avg, Length: 796, dtype: object

In [15]:
# Persist the per-text averages as JSON next to the notebook.
df['avg'].to_json(path_or_buf='averages.json')

In [22]:
# The id is whatever follows the last '-' in each text name.
df['pgId'] = [textName.rsplit('-', 1)[-1] for textName in df.index]

# Metadata

In [17]:
# Location of the SQLite metadata database. Set the PG_TEXT_DB environment
# variable to point at a copy elsewhere; the default is the original path.
PG_DB_PATH = os.environ.get('PG_TEXT_DB', '/home/jon/Corpora/pg-text-7.db')
conn = sqlite3.connect(PG_DB_PATH)

In [18]:
# Module-level cursor; getAuthor runs its queries through this global `c`.
c = conn.cursor()

In [23]:
def getAuthor(textName, cursor=None):
    """
    Look up the author of a text in the `meta` table.

    textName: a text name whose last '-'-separated field is the numeric id
        (e.g. "1880-DoraThorne-2374"); a bare id such as "2374" also works.
    cursor: optional DB-API cursor to run the query on; when omitted the
        notebook-level cursor `c` is used.

    Returns the `author` value of the first matching row.
    Raises ValueError if the id field is not numeric, and LookupError if no
    row in `meta` matches the id.
    """
    textId = textName.split('-')[-1]
    # The id is matched in float-formatted string form (e.g. "2374.0").
    # NOTE(review): presumably that is how ids are stored in `meta` - confirm.
    dbId = str(float(textId))
    activeCursor = c if cursor is None else cursor
    activeCursor.execute('select author from meta where id=?', [dbId])
    match = activeCursor.fetchone()
    if match is None:
        # Fail with a clear message instead of "'NoneType' is not subscriptable".
        raise LookupError('no meta row for id %s (from %r)' % (dbId, textName))
    return match[0]

In [24]:
# Attach an author to every text: take the id after the last '-' in the
# index label and resolve it through getAuthor.
df['author'] = [getAuthor(textName.split('-')[-1]) for textName in df.index]

In [32]:
# Total each color-word column per author. `df` also carries the string
# columns 'avg' and 'pgId' by now; numeric_only=True keeps them out of the
# aggregation (pandas >= 2.0 would otherwise concatenate the strings).
dfAuthor = df.groupby('author').sum(numeric_only=True)

In [35]:
# Blend each author's summed color-word weights into one average hex color.
# Columns whose name is not a key of xkcdMap are ignored.
authorAverages = []
for authorName, row in dfAuthor.iterrows():
    hexWeights = {}
    for color, prop in row[row.notna()].items():
        colorHex = xkcdMap.get(color)
        if colorHex is None:
            continue
        hexKey = colorHex[1:]  # drop the leading '#'
        # Accumulate rather than assign: if two color words share a hex
        # value, plain assignment would discard the earlier word's weight.
        hexWeights[hexKey] = hexWeights.get(hexKey, 0) + prop
    authorAverages.append(combineHexes(hexWeights))
# Assign the column once (values are in row order) instead of per row.
dfAuthor['avg'] = authorAverages

In [37]:
# Inspect the per-author average hex colors.
dfAuthor['avg']

author
                        82765e
Alexander, Mrs.         8c7d6b
Allan, Luke             5f6675
Allen, Grant            967965
Anonymous               ba7575
                         ...  
Woolf, Virginia         888369
Wren, Jenny             7c8361
Yates, Dornford         8d4f5e
Young, Francis Brett    8f896f
Zangwill, Israel        8e7761
Name: avg, Length: 262, dtype: object

In [39]:
# Persist the per-author averages as JSON next to the notebook.
dfAuthor['avg'].to_json(path_or_buf='averages-author.json')