In [83]:
from glob import glob
import json
import pandas as pd
import cufflinks as cf
import seaborn as sns
import plotly.graph_objects as go
import plotly.express as px
import statsmodels
import sqlite3
from collections import Counter
from matplotlib import pyplot as plt

In [84]:
jsonFiles = [open(f).read() for f in glob('results-json/*')]

In [85]:
parsedFiles = [json.loads(jsonFile) for jsonFile in jsonFiles]

In [86]:
len(parsedFiles[0])

36

In [87]:
allData = {}
for fileData in parsedFiles:
    for bookData in fileData: 
        colorData = {itemDict['colorWord']: itemDict['nMatches'] for itemDict in bookData[0]['statsList']}
        allData[bookData[0]['textName']] = colorData                                        

In [88]:
df = pd.DataFrame(allData)

In [89]:
df = df.T.sort_index()

In [90]:
df

Unnamed: 0,black,midnight,dark,royal,chestnut,brown,crimson,forest,blue,russet,...,ultramarine blue,light salmon,sap green,muddy yellow,pastel blue,light lilac,very dark green,brown yellow,dusky rose,yellow tan
1880-DoraThorne-2374,0.000012,,0.000182,0.000002,0.000004,,0.000023,0.000005,0.000027,,...,,,,,,,,,,
1880-GreeneFerneFarm-37046,0.000102,,0.000128,0.000004,0.000031,0.000071,0.000031,0.000013,0.000057,,...,,,,,,,,,,
1880-JezebelsDaughter-3633,0.000021,0.000002,0.000030,0.000007,,0.000005,,,0.000039,,...,,,,,,,,,,
1880-RoundAboutaGreatEstate-20528,0.000114,0.000005,0.000041,0.000005,0.000015,0.000077,0.000005,0.000005,0.000052,,...,,,,,,,,,,
1880-ShakespearesInsomniaandtheCausesThereof-11990,,,0.000020,0.000020,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1925-TheVerseBookofaHomelyWoman-3477,0.000017,,0.000052,0.000034,,0.000069,0.000017,0.000034,0.000052,,...,,,,,,,,,,
192511-ModernBritishPoetry-26785,0.000082,0.000023,0.000085,0.000020,0.000003,0.000036,0.000023,0.000020,0.000095,,...,,,,,,,,,,
192711-OliverCromwellAPlay-17091,,0.000012,0.000012,0.000012,,,,,,,...,,,,,,,,,,
1928-BrowningsShorterPoems-16376,0.000067,0.000010,0.000030,,,0.000020,,,0.000083,,...,,,,,,,,,,


In [91]:
xkcdMap = json.load(open('../data/maps/xkcd/rgb.json'))

In [92]:
xkcdMap['black']

'#000000'

In [93]:
def combineHexes(colorDict):
    """
    Proportionally mix hex colors from dict like {"ffffff": 1.0, "0000ff": 0.5 ... }
    """
    colors = sorted(colorDict.items())
    weights = sum(colorDict.values())
    if weights == 0:
        return
    red = int(sum([int(k[:2], 16)*v for k, v in colors])/weights)
    green = int(sum([int(k[2:4], 16)*v for k, v in colors])/weights)
    blue = int(sum([int(k[4:6], 16)*v for k, v in colors])/weights)
    zpad = lambda x: x if len(x)==2 else '0' + x
    return zpad(hex(red)[2:]) + zpad(hex(green)[2:]) + zpad(hex(blue)[2:])

In [96]:
for i, row in df.iterrows(): 
    data = row[row.notna()].to_dict()
    hexes = {}
    for color, prop in data.items(): 
        colorHex = xkcdMap.get(color)
        if colorHex is None: 
            continue
        hexes[colorHex[1:]] = prop
    avg = combineHexes(hexes)
    df.loc[i, 'avg'] = avg

In [97]:
df['avg']

1880-DoraThorne-2374                                  99987d
1880-GreeneFerneFarm-37046                            8b855f
1880-JezebelsDaughter-3633                            9d6c65
1880-RoundAboutaGreatEstate-20528                     9d9a60
1880-ShakespearesInsomniaandtheCausesThereof-11990    a48a42
                                                       ...  
1925-TheVerseBookofaHomelyWoman-3477                  8f7e68
192511-ModernBritishPoetry-26785                      8b7f69
192711-OliverCromwellAPlay-17091                      7f6c4f
1928-BrowningsShorterPoems-16376                      8c7259
1928-TheBrotherofDaphne-748                           8a5560
Name: avg, Length: 796, dtype: object

In [98]:
df['avg'].to_json('averages.json')

In [99]:
df['pgId'] = df.index.str.split('-')[-1]

ValueError: Length of values does not match length of index

# Metadata

In [100]:
conn = sqlite3.connect('/home/jon/Corpora/pg-text-7.db')

In [101]:
c = conn.cursor()

In [102]:
def getAuthor(textName):
    textId = textName.split('-')[-1]
    c.execute('select author from meta where id=?', [str(float(textId))])
    return c.fetchone()[0]

In [104]:
for i, row in df.iterrows(): 
    bookId = i.split('-')[-1]
    author = getAuthor(bookId)
    df.loc[i, 'author'] = author

In [None]:
df