Import libraries


In [1]:
from wcs_helper_functions import *

In [2]:
import numpy as np
from scipy import stats, spatial
from random import random
import copy
%matplotlib inline

Read data

In [3]:
# WCS inputs. Later cells index coordIndex with a chip-coordinate string to get
# a chip index, iterate indexCoord as (chip index -> coordinate), and key
# cielabCoord by chip index. The exact structures are defined by the reader
# helpers (presumably star-imported from wcs_helper_functions -- confirm).
coordIndex, indexCoord = readChipData('./WCS_data_core/chip.txt')
cielabCoord = readClabData('./WCS_data_core/cnum-vhcm-lab-new.txt')
# Naming data is consumed below as lang -> speaker -> {chip: term}; foci data
# as lang -> speaker -> {term: chips}.
namingData = readNamingData('./WCS_data_core/term.txt')
fociData = readFociData('./WCS_data_core/foci-exp.txt')

# BK counterparts of the naming and foci data. Note that the BK foci file goes
# through its own reader (readFociDataBK) rather than readFociData.
namingDataBK = readNamingData('./bk_data/BK-term.txt')
fociDataBK = readFociDataBK('./bk_data/BK-foci.txt')

The getFoci function
Returns a dictionary mapping each (lang, term) pair to that term's focus, i.e. the chip most often chosen as its best example, expressed in CIELAB space.

In [4]:
def getFoci(fociData, chipIndex=None, labCoords=None):
    """Pick one focal chip per (language, term) and return it in CIELAB space.

    For every language, the best-example chips named by all speakers are
    tallied per term and the chip chosen most often becomes the focus. Ties
    are broken uniformly at random with ``np.random.choice``, so seed NumPy's
    global RNG beforehand if reproducible output is required.

    Parameters
    ----------
    fociData : dict
        Nested mapping ``lang -> speaker -> term -> iterable of chips``.
        The lookup key for a chip is ``chip[0] + chip[2:]``, i.e. the
        character at position 1 is dropped (presumably a separator, e.g.
        ``"A:12"`` -> ``"A12"`` -- confirm against the reader).
    chipIndex : dict, optional
        Maps the lookup key to a chip index. Defaults to the notebook-level
        ``coordIndex``.
    labCoords : dict, optional
        Maps a chip index to its CIELAB coordinates. Defaults to the
        notebook-level ``cielabCoord``.

    Returns
    -------
    dict
        ``(lang, term) -> np.ndarray`` of floats. Terms for which no chip
        was recorded at all are omitted.
    """
    if chipIndex is None:
        chipIndex = coordIndex
    if labCoords is None:
        labCoords = cielabCoord

    foci = {}
    for lang, speakers in fociData.items():
        # term -> {chip: number of times it was picked as a best example}
        bestExamples = {}
        for terms in speakers.values():
            for term, chips in terms.items():
                counts = bestExamples.setdefault(term, {})
                for chip in chips:
                    counts[chip] = counts.get(chip, 0) + 1

        for term, counts in bestExamples.items():
            if not counts:
                # Nothing was recorded for this term; np.random.choice would
                # raise ValueError on an empty candidate list.
                continue
            maxCount = max(counts.values())
            # Candidates keep their first-seen order, as in the tally above.
            bestChips = [chip for chip, count in counts.items()
                         if count == maxCount]
            chosenChip = np.random.choice(bestChips)
            chosenChipIndex = chipIndex[chosenChip[0] + chosenChip[2:]]
            # Builtin float replaces np.float, which NumPy 1.24 removed.
            foci[(lang, term)] = np.array(
                labCoords[chosenChipIndex]).astype(float)
    return foci

In [5]:
def getCentroids(namingData, labCoords=None):
    """Compute the CIELAB centroid of every (language, term) category.

    The centroid is a mean of means: for each speaker, the coordinates of all
    chips that speaker labelled with a term are averaged, and those
    per-speaker means are then averaged across the speakers of the language
    who used the term.

    Parameters
    ----------
    namingData : dict
        Nested mapping ``lang -> speaker -> {chip: term}``, where ``chip`` is
        a key of ``labCoords``.
    labCoords : dict, optional
        Maps a chip to its CIELAB coordinates (anything ``astype(float)`` can
        convert). Defaults to the notebook-level ``cielabCoord``.

    Returns
    -------
    dict
        ``(lang, term) -> np.ndarray`` holding the averaged coordinates.
    """
    if labCoords is None:
        labCoords = cielabCoord

    centroids = {}
    for lang, speakers in namingData.items():
        # term -> list of per-speaker mean coordinates
        speakerCentroids = {}
        for chips in speakers.values():
            # Collect this speaker's chip coordinates per term.
            termCoords = {}
            for chip, term in chips.items():
                termCoords.setdefault(term, []).append(labCoords[chip])

            # Builtin float replaces np.float, which NumPy 1.24 removed.
            for term, coords in termCoords.items():
                speakerCentroids.setdefault(term, []).append(
                    np.array(coords).astype(float).mean(axis=0))

        # Average the speaker-level means for each term of this language.
        for term, speakerMeans in speakerCentroids.items():
            centroids[(lang, term)] = np.array(speakerMeans).mean(axis=0)
    return centroids

In [6]:
def getLightnessRow(centroids, chipCoords=None, labCoords=None):
    """Attach to every centroid the chip row whose lightness is closest.

    Parameters
    ----------
    centroids : dict
        ``term key -> np.ndarray`` of three values unpacked as ``(L, a, b)``.
    chipCoords : dict, optional
        Maps a chip index to its coordinate; element ``[0]`` of the
        coordinate is taken as the row. Defaults to the notebook-level
        ``indexCoord``.
    labCoords : dict, optional
        Maps a chip index to its CIELAB coordinates; element ``[0]`` is
        parsed with ``float`` as the lightness. Defaults to the
        notebook-level ``cielabCoord``.

    Returns
    -------
    dict
        ``term key -> ((L, a, b), row)``. ``row`` is ``None`` when there are
        no chips to compare against. On equal lightness gaps the chip seen
        first in ``chipCoords`` wins.
    """
    if chipCoords is None:
        chipCoords = indexCoord
    if labCoords is None:
        labCoords = cielabCoord

    # Parse each chip's lightness once instead of once per term.
    chipLightness = [(float(labCoords[i][0]), munsell[0])
                     for i, munsell in chipCoords.items()]

    lightness = {}
    for term, colour in centroids.items():
        L, a, b = colour.tolist()
        closestLightness, munsellRow = float('inf'), None
        for chipL, row in chipLightness:
            gap = abs(L - chipL)
            # Strict comparison keeps the first chip among equal gaps.
            if gap < closestLightness:
                closestLightness, munsellRow = gap, row

        # Set the row of the term
        lightness[term] = ((L, a, b), munsellRow)
    return lightness

In [7]:
def getChips(lightness, chipCoords=None, labCoords=None):
    """Snap every term to the chip in its lightness row with the closest hue.

    Within the row assigned to a term, chips are compared by the cosine
    distance between their ``(a, b)`` components and the term's ``(a, b)``
    components; lightness and chroma magnitude are ignored. The chip with the
    smallest distance wins, and the first chip seen wins on ties.

    NOTE: chips are not given any special treatment for being neutral. An
    earlier experiment that excluded column "0" chips and fell back to the
    row's neutral chip was left disabled and has been dropped here. A chip
    whose cosine distance is not a number can never be selected, because the
    ``<`` comparison is then false.

    Parameters
    ----------
    lightness : dict
        ``term key -> ((L, a, b), row)`` as produced by ``getLightnessRow``.
    chipCoords : dict, optional
        Maps a chip index to its coordinate; element ``[0]`` is the row.
        Defaults to the notebook-level ``indexCoord``.
    labCoords : dict, optional
        Maps a chip index to its CIELAB coordinates. Defaults to the
        notebook-level ``cielabCoord``.

    Returns
    -------
    dict
        ``term key -> np.ndarray`` with the float CIELAB coordinates of the
        selected chip.

    Raises
    ------
    KeyError
        If no chip in the term's row yields a comparable distance (for
        example the row is ``None`` or has no chips).
    """
    if chipCoords is None:
        chipCoords = indexCoord
    if labCoords is None:
        labCoords = cielabCoord

    # Convert each chip once and group by row, preserving chip order, rather
    # than rescanning and reconverting every chip for every term.
    # Builtin float replaces np.float, which NumPy 1.24 removed.
    chipsByRow = {}
    for i, munsell in chipCoords.items():
        cielab = np.array(labCoords[i]).astype(float)
        chipsByRow.setdefault(munsell[0], []).append(cielab)

    # Get closest colour chip for each term in that row
    chips = {}
    for term, (colour, row) in lightness.items():
        closest, chosen = float('inf'), None
        for cielab in chipsByRow.get(row, ()):
            dist = spatial.distance.cosine(cielab[1:], colour[1:])
            if dist < closest:
                closest, chosen = dist, cielab
        if chosen is None:
            # Same exception type the original lookup produced, but with a
            # message that says what went wrong.
            raise KeyError(
                "no chip with a usable hue distance in row {!r} for term {!r}"
                .format(row, term))
        # Copy so that terms sharing a chip do not share one array object.
        chips[term] = chosen.copy()
    return chips

In [8]:
# Per data set ("bk" first, then "wcs"): category centroids snapped to the
# nearest chip, keyed by (language, term).
centroids = {
    dataset: getChips(getLightnessRow(getCentroids(naming)))
    for dataset, naming in (("bk", namingDataBK), ("wcs", namingData))
}

# Foci, restricted to the (language, term) keys that also have a centroid.
focus = {
    dataset: {
        key: lab
        for key, lab in getFoci(foci).items()
        if key in centroids[dataset]
    }
    for dataset, foci in (("bk", fociDataBK), ("wcs", fociData))
}

# Refilter: drop centroids whose key has no focus, so both dictionaries end
# up with exactly the same keys.
centroids = {
    dataset: {key: lab for key, lab in snapped.items() if key in focus[dataset]}
    for dataset, snapped in centroids.items()
}

In [9]:
# Foci Separation Calculation
# For each WCS focus, find the Euclidean distance to the nearest BK focus of
# every BK language (20 slots, indexed by language number - 1) and add the
# sum of those distances to the WCS language's total (110 slots).
fs = np.zeros(110)
for (wcsLang, wcsTerm), wcsPoint in focus["wcs"].items():
    nearest = np.full(20, np.inf)
    for (bkLang, bkTerm), bkPoint in focus["bk"].items():
        slot = bkLang - 1
        nearest[slot] = min(nearest[slot],
                            spatial.distance.euclidean(wcsPoint, bkPoint))
    fs[wcsLang - 1] += nearest.sum()
np.mean(fs)

5488.847837309833

In [10]:
# Centroid Separation Calculation
# Same measure as the foci separation, but between WCS and BK centroids: per
# WCS centroid, sum the distances to the closest centroid of each BK language
# and accumulate that sum on the WCS language's slot.
cs = np.zeros(110)
for wcsKey, wcsPoint in centroids["wcs"].items():
    wcsLang = wcsKey[0]
    closestPerBk = np.full(20, np.inf)
    for bkKey, bkPoint in centroids["bk"].items():
        bkSlot = bkKey[0] - 1
        gap = spatial.distance.euclidean(wcsPoint, bkPoint)
        if gap < closestPerBk[bkSlot]:
            closestPerBk[bkSlot] = gap
    cs[wcsLang - 1] += closestPerBk.sum()
np.mean(cs)

6836.337467670284

In [11]:
# Paired-samples t-test (scipy.stats.ttest_rel) comparing the per-language
# foci separation totals (fs) with the centroid separation totals (cs);
# only the p-value is displayed.
stats.ttest_rel(fs, cs).pvalue

1.135431450007425e-26