forked from e-mckinnie/WEAT
-
Notifications
You must be signed in to change notification settings - Fork 0
/
weat.py
72 lines (54 loc) · 2.15 KB
/
weat.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
import numpy as np
from scipy.stats import norm
from statsmodels.distributions.empirical_distribution import ECDF
from wordembeddingtest import WordEmbeddingTest
class WEAT(WordEmbeddingTest):
    """Association test between two target sets (X, Y) and two attribute sets (A, B).

    Elements of the four sets are passed unchanged to ``self._cos``, which is
    not defined in this class (presumably inherited from
    ``WordEmbeddingTest`` -- confirm there which element type it expects).
    """

    def __init__(self, X, Y, A, B):
        """Store the target sets ``X``, ``Y`` and the attribute sets ``A``, ``B``."""
        self.X = X
        self.Y = Y
        self.A = A
        self.B = B

    def effect_size(self):
        """Calculate the effect size.

        Returns the difference between the mean association score of X and
        the mean association score of Y, divided by the standard deviation
        of the scores of X and Y taken together (``np.std`` with its default
        ``ddof``).
        """
        x_s = self._association_scores(self.X)
        y_s = self._association_scores(self.Y)
        return (np.mean(x_s) - np.mean(y_s)) / np.std(np.concatenate((x_s, y_s)))

    def s(self, w):
        """Calculate s(w, A, B).

        Returns the mean of ``self._cos(w, a)`` over ``a`` in A minus the mean
        of ``self._cos(w, b)`` over ``b`` in B.
        """
        a_cos = np.array([self._cos(w, a) for a in self.A])
        b_cos = np.array([self._cos(w, b) for b in self.B])
        return np.mean(a_cos) - np.mean(b_cos)

    def tostr(self):
        """Get the name of the test."""
        return 'WEAT'

    def p_value(self, iterations, distribution_type):
        """Get the one-sided p-value of the test statistic.

        {(X_i, Y_i)}_i is all partitions of X union Y into two sets of equal
        size; p-value = Pr_i[s(X_i, Y_i, A, B) > s(X, Y, A, B)]. The
        probability is estimated from ``iterations`` random partitions.

        Args:
            iterations: number of random partitions to sample.
            distribution_type: ``'normal'`` fits a normal distribution to the
                sampled statistics and uses its CDF; ``'empirical'`` uses the
                empirical CDF of the sampled statistics.

        Returns:
            One minus the chosen CDF evaluated at the observed test statistic.

        Raises:
            ValueError: if ``distribution_type`` is neither ``'normal'`` nor
                ``'empirical'``.
        """
        # Validate before the (expensive) permutation loop; an unknown type
        # used to fall through both branches and silently return None.
        if distribution_type not in ('normal', 'empirical'):
            raise ValueError(
                "distribution_type must be 'normal' or 'empirical', got %r"
                % (distribution_type,)
            )

        test_statistic = self._get_test_statistic()
        null_distribution = self._null_distribution(iterations)

        if distribution_type == 'normal':
            mu, std = norm.fit(null_distribution)
            cdf = norm.cdf(test_statistic, mu, std)
            return 1 - cdf

        ecdf = ECDF(null_distribution)
        return 1 - ecdf(test_statistic)

    def _association_scores(self, words):
        """Return an array holding ``self.s(w)`` for every ``w`` in ``words``."""
        return np.array([self.s(w) for w in words])

    def _get_test_statistic(self):
        """Get the test statistic: mean score of X minus mean score of Y."""
        x_s = self._association_scores(self.X)
        y_s = self._association_scores(self.Y)
        return np.mean(x_s) - np.mean(y_s)

    def _null_distribution(self, iterations):
        """Get the null distribution of the test statistic.

        X and Y are pooled; on each iteration the pool is randomly split
        into a first half (``len(pool) // 2`` elements) and the remainder,
        and the difference of the two halves' mean scores is recorded.

        Args:
            iterations: number of random partitions to sample.

        Returns:
            A float array of length ``iterations``.
        """
        XY = np.concatenate((self.X, self.Y))
        set_size = len(XY) // 2

        # s(w) depends only on w, A and B -- never on the partition -- so it
        # is computed once per word here instead of once per word per
        # iteration. Shuffling the scores is then equivalent to shuffling
        # the words themselves. (Assumes self._cos is deterministic.)
        scores = self._association_scores(XY)

        distribution = np.zeros(iterations)
        for i in range(iterations):
            np.random.shuffle(scores)
            distribution[i] = np.mean(scores[:set_size]) - np.mean(scores[set_size:])
        return distribution