# How color is related to sentiment and object in letters

In [1]:
import pandas as pd # library for data analysis
import re
import shutil
import webcolors
import spacy
from spacy import displacy
from collections import Counter
from spacy.matcher import Matcher
from spacy.util import filter_spans 

nlp = spacy.load('en_core_web_sm')

In [2]:
letters = pd.read_csv("letter_raw.csv")
letters.head()

Unnamed: 0,id,header,content,sketch,from,to,place,date,Unnamed: 8,Unnamed: 9,...,Unnamed: 63,Unnamed: 64,Unnamed: 65,Unnamed: 66,Unnamed: 67,Unnamed: 68,Unnamed: 69,Unnamed: 70,Unnamed: 71,Unnamed: 72
0,1,"To Theo van Gogh. The Hague, Sunday, 29 Septem...","The Hague, 29 September 1872. My dear Theo, Th...",False,Vincent van Gogh,Theo van Gogh,The Hague,29 September 1872.,,,...,,,,,,,,,,
1,2,"To Theo van Gogh. The Hague, Friday, 13 Decemb...","The Hague, 13 December 1872. My dear Theo, Tha...",False,Vincent van Gogh,Theo van Gogh,The Hague,13 December 1872.,,,...,,,,,,,,,,
2,3,"To Theo van Gogh. The Hague, mid-January 1873.","The Hague, January 1873 My dear Theo, I heard ...",False,Vincent van Gogh,Theo van Gogh,The Hague,mid-January 1873.,,,...,,,,,,,,,,
3,4,"To Theo van Gogh. The Hague, Tuesday, 28 Janua...","The Hague, 28 Jan. 1873 My dear Theo, It’s goo...",False,Vincent van Gogh,Theo van Gogh,The Hague,28 January 1873.,,,...,,,,,,,,,,
4,5,"To Theo van Gogh. The Hague, Monday, 17 March ...","The Hague, 17 March 1873 My dear Theo, It’s ti...",False,Vincent van Gogh,Theo van Gogh,The Hague,17 March 1873.,,,...,,,,,,,,,,


In [23]:
# group by year
year = []
for i, row in letters.iterrows():
    year.append(row['date'][:-1][-4:])
letters['year'] = year

TypeError: 'float' object is not subscriptable

In [20]:
letters.groupby(['year']).size()

year
 188      2
1872      2
1873     14
1874     12
1875     36
1876     36
1877     38
1878     11
1879      5
1880      6
1881     34
1882    101
1883    122
1884     59
1885     72
1886     20
1887      6
1888    151
1889    103
1890     69
iage      1
irth      1
ment      1
dtype: int64

### Construct color list

In [3]:
webcolors.css3_names_to_hex.keys()

dict_keys(['aliceblue', 'antiquewhite', 'aqua', 'aquamarine', 'azure', 'beige', 'bisque', 'black', 'blanchedalmond', 'blue', 'blueviolet', 'brown', 'burlywood', 'cadetblue', 'chartreuse', 'chocolate', 'coral', 'cornflowerblue', 'cornsilk', 'crimson', 'cyan', 'darkblue', 'darkcyan', 'darkgoldenrod', 'darkgray', 'darkgrey', 'darkgreen', 'darkkhaki', 'darkmagenta', 'darkolivegreen', 'darkorange', 'darkorchid', 'darkred', 'darksalmon', 'darkseagreen', 'darkslateblue', 'darkslategray', 'darkslategrey', 'darkturquoise', 'darkviolet', 'deeppink', 'deepskyblue', 'dimgray', 'dimgrey', 'dodgerblue', 'firebrick', 'floralwhite', 'forestgreen', 'fuchsia', 'gainsboro', 'ghostwhite', 'gold', 'goldenrod', 'gray', 'grey', 'green', 'greenyellow', 'honeydew', 'hotpink', 'indianred', 'indigo', 'ivory', 'khaki', 'lavender', 'lavenderblush', 'lawngreen', 'lemonchiffon', 'lightblue', 'lightcoral', 'lightcyan', 'lightgoldenrodyellow', 'lightgray', 'lightgrey', 'lightgreen', 'lightpink', 'lightsalmon', 'lights

In [4]:
# construct the list manually from the webcolors and search function on the letter website
# not include words like dark/light/pale, as they are description of color
color_words = ['colour', 'tint', 'purplish', 'yellowish', 'azure', 'black',
              'blue', 'brown', 'crimson', 'gold', 'silver', 'ivory', 'indigo', 'green',
              'grey', 'orange', 'pink', 'purple', 'red', 'violet', 'white', 'yellow',
              'ochre', 'sienna', 'carmine', 'sepia', 'vermilion', 'ultramarine', 'gamboge', 'reddish', 'brownish',
              'cobalt', 'greenish'] 

### Get sentences contain keywords

In [5]:
'''
for each letter, loop over the content sentence by sentence, 
get sentences contain key words
'''
sents_w_keyword = []

for i, row in letters.iterrows():
    if(row['from'] != "Vincent van Gogh"):
        continue
    doc = nlp(row.content)
    for sentence in doc.sents:
        sentence = sentence.text
        
        for word in color_words:
            if word in sentence.lower().split():
                #print(word, sentence)
                sents_w_keyword.append((row.id, sentence))
                break # find one keyword in sentence is enough

In [6]:
sents_w_keyword[1]

(10,
 'The bell, in the grey tower entwined with black-green ivy, Was now silent.')

In [7]:
len(sents_w_keyword)

1708

In [8]:
sents_w_keyword

[(10,
  'The evening hour Slowly the toll of the angelus-bell resounded o’er the fields, As they blissfully bathed in the gold of the evening sun.'),
 (10,
  'The bell, in the grey tower entwined with black-green ivy, Was now silent.'),
 (10,
  'Hanging motionless on high were the brown Sails of the windmill; the leaves stood still and above the huts Blue clouds of peat-smoke ascended so straight from the chimneys That they, too, seemed to hang motionless in the shimmering air. ’'),
 (10,
  'Already the purple and yellow had faded to grey in the west, And in the east there had risen close by the little church the full Copper-coloured disc of the moon, in mist enshrouded, When he entered The Swan, the inn where he boarded.'),
 (11,
  'Slowly the toll of the angelus-bell resounded o’er the fields, As they blissfully bathed in the gold of the evening sun.'),
 (11,
  'The bell, in the grey tower entwined with black-green ivy, Was now silent.'),
 (11,
  'Hanging motionless on high were the 

Frequency of the color related words along time. 1881: start to paint, 1886: move to paris.

In [15]:
before_paint = 0
after_paint = 0
after_paris = 0
for sent in sents_w_keyword:
    if(sent[0]<161):
        before_paint += 1
    elif(sent[0]<551):
        after_paint += 1
    else:
        after_paris += 1

In [16]:
before_paint/161

1.3167701863354038

In [17]:
after_paint/(551-161)

1.8897435897435897

In [18]:
after_paris/(902-551)

2.1623931623931623

### NLP

Color words and the objects.

In [8]:
def takeStart(ne):
    return ne[0]
    
# noun phrase: (DET)?(ADJ)*(NOUN)*Noun
#[{"POS":"DET","OP":"?"}, {"POS":"ADJ","OP":"*"}, {"POS":"NOUN","OP":"+"}]  
matcher = Matcher(nlp.vocab)
matcher.add("NP", [[{"POS":"DET","OP":"?"}, {"POS":"ADJ","OP":"*"}, {"POS":"NOUN","OP":"+"}]])


results = [] #[(letter id, colorword, NP)...] FINAL RESULT


# for each sentence
for comp in sents_w_keyword:
    #### first, match the pattern of noun phrase
    letter_id = comp[0]
    doc = nlp(comp[1])
    matches = matcher(doc)
    spans = [doc[start:end] for _, start, end in matches]
    filtered = filter_spans(spans)   

    savedNP = [] #[(start, end, NP)]
    for _, start, end in matches:
        if(doc[start:end] in filtered):
            savedNP.append((start, end, doc[start:end]))

    savedNP.sort(key=takeStart)  
    
    
    i = 0
    for ent in doc:
        if ent.text in color_words:
            np_before_dist = 0 # distance to the end of the NP before color word
            np_after_dist = 0 # distance to the start of the NP after color word

            closestNP = None
            beforeNP = None
            afterNP = None
            for np in savedNP:
                # dont consider if the other color words are recognized as noun
                if(np[2].text.split(" ")[-1] in color_words):
                    continue
                # need to consider the situation when the color word is part of the NP! Then it must be an adjective!
                if(np[0]<=i and np[1]>i and ent.pos_=="ADJ"):
                    closestNP = np[2] # find the NP! break!
                    break

                before_dist = i-np[1] # distance of color word and the NP before it
                after_dist = i-np[0] # distance of color word and the NP after it
                if(before_dist>=0):
                    np_before_dist = before_dist
                    beforeNP = np[2]
                if(after_dist<0):
                    np_after_dist = after_dist 
                    afterNP = np[2]
                    break # find closest NP after the color word, for loop can be ended

            if(closestNP == None):
                # compare the distance of NP from before and after the color word
                if(abs(np_before_dist)>abs(np_after_dist)):
                    closestNP = afterNP
                else:
                    closestNP = beforeNP

            results.append((letter_id, ent, closestNP)) 
        i += 1
    

In [9]:
results

[(10, gold, the evening sun),
 (10, grey, tower),
 (10, black, tower),
 (10, green, green ivy),
 (10, brown, the windmill),
 (10, purple, None),
 (10, yellow, None),
 (10, grey, None),
 (11, gold, the evening sun),
 (11, grey, tower),
 (11, black, tower),
 (11, green, green ivy),
 (11, brown, the windmill),
 (11, purple, None),
 (11, yellow, None),
 (11, grey, None),
 (12, green, vallies),
 (12, green, vallies),
 (12, silver, Azure saints),
 (13, blue, blue wallpaper),
 (13, green, a green border),
 (13, black, black pigs),
 (14, black, grave),
 (14, grey, silk),
 (14, grey, sea),
 (30, yellow, a dull yellow sandy road),
 (30, green, huts),
 (30, brown, a grey sky),
 (30, grey, a grey sky),
 (30, white, the horizon),
 (31, brownish, brownish red houses),
 (31, red, brownish red houses),
 (31, grey, grey roofs),
 (31, white, grey roofs),
 (31, yellow, yellow doors),
 (31, white, a large white drawbridge),
 (31, white, white horses),
 (31, black, women),
 (31, white, white caps),
 (31, b

In [10]:
len(results)

3634

Color words and the sentiment.

In [11]:
def takeStart(ne):
    return ne[0]
    
# adj phrase: (ADJ)+
matcher = Matcher(nlp.vocab)
matcher.add("ADJP", [[{"POS":"ADJ","OP":"+"}]])


results = [] #[(letter id, colorword, ADJP)...] FINAL RESULT


# for each sentence
for comp in sents_w_keyword:
    #### first, match the pattern of noun phrase
    letter_id = comp[0]
    doc = nlp(comp[1])
    matches = matcher(doc)
    spans = [doc[start:end] for _, start, end in matches]
    filtered = filter_spans(spans)   

    savedNP = [] #[(start, end, NP)]
    for _, start, end in matches:
        if(doc[start:end] in filtered):
            savedNP.append((start, end, doc[start:end]))

    savedNP.sort(key=takeStart)  
    
    
    i = 0
    for ent in doc:
        if ent.text in color_words:
            np_before_dist = 0 # distance to the end of the NP before color word
            np_after_dist = 0 # distance to the start of the NP after color word

            closestNP = None
            beforeNP = None
            afterNP = None
            for np in savedNP:
                # dont consider if the other color words are recognized as noun
                if(np[2].text.split(" ")[-1] in color_words):
                    continue
                # need to consider the situation when the color word is part of the NP! Then it must be an adjective!
                if(np[0]<=i and np[1]>i and ent.pos_=="ADJ"):
                    closestNP = np[2] # find the NP! break!
                    break

                before_dist = i-np[1] # distance of color word and the NP before it
                after_dist = i-np[0] # distance of color word and the NP after it
                if(before_dist>=0):
                    np_before_dist = before_dist
                    beforeNP = np[2]
                if(after_dist<0):
                    np_after_dist = after_dist 
                    afterNP = np[2]
                    break # find closest NP after the color word, for loop can be ended

            if(closestNP == None):
                # compare the distance of NP from before and after the color word
                if(abs(np_before_dist)>abs(np_after_dist)):
                    closestNP = afterNP
                else:
                    closestNP = beforeNP

            results.append((letter_id, ent, closestNP)) 
        i += 1
    

In [12]:
results

[(10, gold, None),
 (10, grey, None),
 (10, black, None),
 (10, green, None),
 (10, brown, high),
 (10, purple, None),
 (10, yellow, None),
 (10, grey, None),
 (11, gold, None),
 (11, grey, None),
 (11, black, None),
 (11, green, None),
 (11, brown, high),
 (11, purple, None),
 (11, yellow, None),
 (11, grey, None),
 (12, green, unmatured),
 (12, green, cold),
 (12, silver, Azure),
 (13, blue, None),
 (13, green, None),
 (13, black, None),
 (14, black, modest),
 (14, grey, modest),
 (14, grey, great),
 (30, yellow, dull yellow sandy),
 (30, green, dull yellow sandy),
 (30, brown, bunt),
 (30, grey, None),
 (30, white, None),
 (31, brownish, old Dutch),
 (31, red, old Dutch),
 (31, grey, None),
 (31, white, None),
 (31, yellow, None),
 (31, white, None),
 (31, white, None),
 (31, black, None),
 (31, white, None),
 (31, black, None),
 (31, white, None),
 (32, black, prow),
 (32, green, accustomed),
 (32, gold, None),
 (34, blue, dark),
 (42, black, stately old Dutch),
 (42, white, statel

### Testing

In [14]:
# testing
results = []
sent = '''
In the foreground a pond, next to which 3 cows – a white one, a black one and a red one – lie in the grass;
'''
doc = nlp(sent)
matches = matcher(doc)
spans = [doc[start:end] for _, start, end in matches]
filtered = filter_spans(spans)   

savedNP = [] #[(start, end, NP)]
for _, start, end in matches:
    if(doc[start:end] in filtered):
        savedNP.append((start, end, doc[start:end]))

savedNP.sort(key=takeStart)  


i = 0
for ent in doc:
    if ent.text in color_words:
        np_before_dist = 0 # distance to the end of the NP before color word
        np_after_dist = 0 # distance to the start of the NP after color word

        closestNP = None
        beforeNP = None
        afterNP = None
        for np in savedNP:
            # dont consider if the other color words are recognized as noun
            if(np[2].text.split(" ")[-1] in color_words):
                continue
            # need to consider the situation when the color word is part of the NP! Then it must be an adjective!
            if(np[0]<=i and np[1]>i and ent.pos_=="ADJ"):
                closestNP = np[2] # find the NP! break!
                break

            before_dist = i-np[1] # distance of color word and the NP before it
            after_dist = i-np[0] # distance of color word and the NP after it
            if(before_dist>=0):
                np_before_dist = before_dist
                beforeNP = np[2]
            if(after_dist<0):
                np_after_dist = after_dist 
                afterNP = np[2]
                break # find closest NP after the color word, for loop can be ended

        if(closestNP == None):
            print("hiii, keyword is ", ent.text)
            print("the index of the keyword is ", i)
            print(np_before_dist, beforeNP)
            print(np_after_dist, afterNP)
            # compare the distance of NP from before and after the color word
            if(abs(np_before_dist)>abs(np_after_dist)):
                print("the noun after the word is chosen!")
                closestNP = afterNP
            else:
                print("the noun before the word is chosen!")
                closestNP = beforeNP

        results.append((ent, closestNP)) 
    i += 1

hiii, keyword is  white
the index of the keyword is  14
2 cows
-3 a black one
the noun before the word is chosen!
hiii, keyword is  red
the index of the keyword is  22
2 a black one
-3 lie
the noun before the word is chosen!


In [15]:
sent

'\nIn the foreground a pond, next to which 3 cows – a white one, a black one and a red one – lie in the grass;\n'

In [21]:
for ent in doc:
    print(ent, ent.pos_)


 SPACE
In ADP
the DET
foreground NOUN
a DET
pond NOUN
, PUNCT
next ADV
to ADP
which PRON
3 NUM
cows NOUN
– PUNCT
a DET
white ADJ
one NUM
, PUNCT
a DET
black ADJ
one NOUN
and CCONJ
a DET
red ADJ
one NUM
– PUNCT
lie NOUN
in ADP
the DET
grass NOUN
; PUNCT

 SPACE


In [16]:
savedNP

[(2, 4, the foreground),
 (4, 6, a pond),
 (11, 12, cows),
 (17, 20, a black one),
 (25, 26, lie),
 (27, 29, the grass)]

In [17]:
results

[(white, cows), (black, a black one), (red, a black one)]