In [28]:
import spacy
from collections import Counter
import re

In [2]:
# Load the spaCy pipeline named 'en_core_web_md'. `nlp` is called below on
# dictionary definitions and on the novels to get tokens with lemmas and tags.
nlp = spacy.load('en_core_web_md')

In [3]:
# Raise the pipeline's maximum accepted text length (in characters) so that
# long inputs can be passed to `nlp` in one call.
nlp.max_length=6000000

In [82]:
# Read the raw encyclopedia text (EBook #12342) into memory. The context
# manager closes the file handle as soon as the read finishes instead of
# leaving it open for the lifetime of the kernel.
with open('../data/dict/pg12342.txt') as nutallFile:
    nutallRaw = nutallFile.read()

In [62]:
# Lower-case color lemmas that the analysis looks for, both in dictionary
# definitions and in the novels.
baseColors = [
    'black', 'grey', 'brown',
    'white', 'red', 'orange',
    'yellow', 'green', 'blue',
]

# Dictionaries

## The Nuttall Encyclopaedia

Title: The Nuttall Encyclopaedia
       Being a Concise and Comprehensive Dictionary of General Knowledge

Release Date: May 14, 2004 [EBook #12342]


In [92]:
def processNutall(text):
    """
    Split the raw encyclopedia text into a {headword: definition} dict.

    The text is cut into paragraphs on blank lines, and each paragraph is
    flattened onto a single line. A paragraph counts as an entry when it
    starts with at least two characters drawn from capitals, spaces,
    backticks, commas and periods, followed by ", " and the definition.
    Paragraphs that do not fit this shape are ignored. If the same headword
    occurs more than once, the last definition wins.
    """
    # Headword (greedy, so multi-part names stay together), then ", ", then the rest.
    entryPattern = '([A-Z `,.]{2,}), (.*)'
    entries = {}
    for block in text.split('\n\n'):
        flattened = block.replace('\n', ' ')
        found = re.match(entryPattern, flattened)
        if found is None:
            continue
        headword, body = found.group(1), found.group(2)
        entries[headword.strip()] = body
    return entries

In [93]:
# Parse the raw encyclopedia text into a {headword: definition} dict.
nutall = processNutall(nutallRaw)

In [99]:
def vectorizeDict(nutallDict):
    """
    Group dictionary headwords by the base colors their definitions mention.

    Given a dictionary of word: definition, run every definition through
    `nlp` and compare each token's lemma against `baseColors`. The result
    maps each color that was found to a list of headwords. A headword is
    appended once per matching token, so a definition that mentions a color
    several times contributes the headword several times.
    """
    headwordsByColor = {}
    for headword, definitionText in nutallDict.items():
        for token in nlp(definitionText):
            lemma = str(token.lemma_)
            if lemma not in baseColors:
                continue
            # One entry per occurrence; repeats act as that headword's score.
            headwordsByColor.setdefault(lemma, []).append(headword)
    return headwordsByColor

In [100]:
# Display, for each base color, the headwords whose definitions mention it
# (a headword repeats once per mention).
vectorizeDict(nutall)

{'green': ['AFFRE',
  'BYRON, GEORGE GORDON, SIXTH LORD',
  'CHLOROPHYLL',
  'CHLOROSIS',
  'DESMOULINS, CAMILLE',
  'DESMOULINS, CAMILLE',
  'DESMOULINS, CAMILLE',
  'DESMOULINS, CAMILLE',
  'EMERALD',
  'EMIR',
  'FERGUSSON, ROBERT',
  'GARNET',
  'JACKAROO',
  'MAKRIZI',
  'RIBBONISM',
  'SILAGE',
  'SILAGE',
  'STARS, THE',
  'THALLIUM'],
 'red': ['ALBINOS',
  'AMERICAN INDIANS',
  'BART',
  'BERRYER, PIERRE ANTOINE',
  'CHICA',
  'CHROMATICS',
  'ECZEMA',
  'ELLORA',
  'ERYTHEMA',
  'FARINATA',
  'GARDE NATIONALE',
  'GARNET',
  'GERYON',
  'HOGG, JAMES',
  'KRAKATAO',
  'LACHRYMA CHRISTI',
  'LEVELLERS',
  'MAURY',
  'RED SEA',
  'ROBESPIERRE, MAXIMILIEN',
  'ROSES, WARS OF THE',
  'RUBRICS',
  'RUBRICS',
  'SOUTHEY, ROBERT',
  'STARS, THE',
  'TALUS',
  'TEMPLARS',
  'TRICOLOUR',
  'WARS OF THE ROSES',
  'WOOLSACK'],
 'white': ['ALBION',
  'ALBUMEN',
  'ANDREW, ST.',
  'ANTIMONY',
  'A`PIS',
  'AURO`RA',
  'BERNARD, ST.',
  'CAGNOLA, LUIGI, MARQUIS OF',
  'CAMERA LUCIDA',
  'CHI

In [102]:
# Paths of the novels scanned for words that occur near color terms.
texts = [
    '../data/text/lighthouse.md',
    '../data/text/pride.txt',
    '../data/text/dalloway.txt',
]

In [134]:
%%timeit -n1 -r1
n = 10
colorNeighbors = {}
posList = ['NN', 'JJ', 'JJR', 'JJR', 'NNS']

for text in texts: 
    rawText = open(text).read()
    if len(rawText) > 4000000: 
        textParts = [rawText[i:i + n] for i in range(0, len(rawText), n)]
    else: 
        textParts = [rawText]
    for textPart in textParts:
        textDoc = nlp(textPart)
        for w in textDoc: 
            if str(w.lemma_) in baseColors: 
                neighbors = []
                for wordIndex in range(1, n+1):
                    score = 1 / wordIndex 
                    try: 
                        backWord = textDoc[w.i - wordIndex]
                        backWordStr = str(backWord.lemma_)
                        if backWord.tag_ in posList and backWord.is_alpha:
                            neighbors.append((backWordStr, score))
                        frontWord = textDoc[w.i + wordIndex]
                        frontWordStr = str(frontWord.lemma_)
                        if frontWord.tag_ in posList and frontWord.is_alpha:
                            neighbors.append((frontWordStr, score))
                    except IndexError:
                        continue
            else:
                continue
            if len(neighbors) > 0:
                wStr = str(w.lemma_)
                if wStr in colorNeighbors: 
                    colorNeighbors[wStr] += neighbors
                else: 
                    colorNeighbors[wStr] = neighbors

16.8 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


In [135]:
# Collapse each color's list of (neighbor, score) pairs into a
# neighbor -> total score table.
colorNeighborsSums = {}
for color, scoreList in colorNeighbors.items():
    totals = {}
    for neighborWord, weight in scoreList:
        totals[neighborWord] = totals.get(neighborWord, 0) + weight
    colorNeighborsSums[color] = totals

In [136]:
# Rebuild each color's score table ordered from highest to lowest total;
# dicts keep insertion order, so the ranking survives in the result.
colorNeighborsSumsSorted = {
    color: dict(sorted(stats.items(), key=lambda pair: pair[1], reverse=True))
    for color, stats in colorNeighborsSums.items()
}
colorNeighborsSumsSorted

{'blue': {'eye': 6.2,
  'hydrangea': 2.725,
  'light': 2.4166666666666665,
  'smoke': 2.333333333333333,
  'green': 2.2916666666666665,
  'bay': 1.4777777777777779,
  'black': 1.2833333333333332,
  'little': 1.225,
  'evening': 1.211111111111111,
  'high': 1.2,
  'bright': 1.1666666666666667,
  'dark': 1.1,
  'grey': 1.0833333333333333,
  'bar': 1.0,
  'censer': 1.0,
  'coat': 1.0,
  'envelope': 1.0,
  'eyed': 1.0,
  'fierce': 1.0,
  'frail': 1.0,
  'haze': 1.0,
  'morning': 1.0,
  'paint': 1.0,
  'pale': 1.0,
  'petal': 1.0,
  'ribbon': 1.0,
  'shape': 1.0,
  'sofa': 1.0,
  'water': 0.8666666666666667,
  'distance': 0.85,
  'red': 0.8333333333333333,
  'wave': 0.7261904761904762,
  'sky': 0.7083333333333333,
  'white': 0.6761904761904762,
  'cloud': 0.6666666666666666,
  'air': 0.6428571428571428,
  'clear': 0.6428571428571428,
  'leave': 0.6416666666666666,
  'window': 0.6166666666666667,
  'moment': 0.6,
  'plume': 0.6,
  'line': 0.5833333333333333,
  'side': 0.5333333333333333,
  '