# wefat.py
# Forked from e-mckinnie/WEAT.
import numpy as np
from scipy.stats import norm
from statsmodels.distributions.empirical_distribution import ECDF
from wordembeddingtest import WordEmbeddingTest
class WEFAT(WordEmbeddingTest):
    """WEFAT association test over a set of target word vectors.

    (Presumably the Word-Embedding Factual Association Test of Caliskan
    et al. -- confirm against the project documentation.)

    For each target vector ``w`` the test compares its similarity to the
    items of attribute set ``A`` against its similarity to the items of
    attribute set ``B``.

    NOTE(review): similarity is computed with ``self._cos``, which is not
    defined in this class; it is presumably inherited from
    ``WordEmbeddingTest`` and presumably returns the cosine similarity of
    its two arguments -- confirm.
    """

    def __init__(self, W, A, B):
        """Store the targets and the two attribute sets.

        Args:
            W: Mapping from a key (e.g. a word) to its vector; it is iterated
                by key and indexed with ``W[key]``.
            A: First attribute set; a sequence accepted by ``np.concatenate``.
            B: Second attribute set; same kind of sequence as ``A``.
        """
        # NOTE(review): the base-class initializer is not invoked here; its
        # signature is not visible from this file, so that is left unchanged.
        self.W = W
        self.A = A
        self.B = B

    def _cosines(self, w, items):
        """Return an array holding ``self._cos(w, x)`` for every ``x`` in *items*."""
        return np.array([self._cos(w, x) for x in items])

    def all_effect_sizes(self):
        """Return ``{key: effect_size(W[key])}`` for every key in ``W``."""
        return {w: self.effect_size(self.W[w]) for w in self.W}

    def effect_size(self, w):
        """Calculate the effect size s(w, A, B) for one target vector.

        This is the difference between the mean similarity of ``w`` to ``A``
        and to ``B``, divided by the (population, ``ddof=0``) standard
        deviation of the similarities to all items of ``A`` and ``B``.
        """
        a_cos = self._cosines(w, self.A)
        b_cos = self._cosines(w, self.B)
        pooled_std = np.std(np.concatenate((a_cos, b_cos)))
        return (np.mean(a_cos) - np.mean(b_cos)) / pooled_std

    def all_p_values(self, iterations, distribution_type):
        """Return ``{key: p_value(W[key], ...)}`` for every key in ``W``.

        See ``p_value`` for the meaning of the arguments and for the
        ``ValueError`` raised on an unknown ``distribution_type``.
        """
        return {
            w: self.p_value(self.W[w], iterations, distribution_type)
            for w in self.W
        }

    def p_value(self, w, iterations, distribution_type):
        """Estimate the one-sided p-value for one target vector.

        {(A_i, B_i)}_i are random partitions of A union B into two halves;
        the p-value estimates Pr_i[s(w, A_i, B_i) > s(w, A, B)], where s is
        the unnormalized difference of mean similarities.

        Args:
            w: Target vector.
            iterations: Number of random partitions to sample.
            distribution_type: ``'normal'`` fits a normal distribution to the
                sampled null statistics and uses its upper tail;
                ``'empirical'`` uses the empirical distribution of the
                samples directly.

        Returns:
            The estimated upper-tail probability.

        Raises:
            ValueError: If ``distribution_type`` is not one of the two
                supported values.
        """
        # Fail fast: previously an unknown type fell through and returned
        # None, and only after the whole permutation loop had been paid for.
        if distribution_type not in ('normal', 'empirical'):
            raise ValueError(
                "distribution_type must be 'normal' or 'empirical', "
                f"got {distribution_type!r}"
            )

        test_statistic = self._get_test_statistic(w)
        null_distribution = self._null_distribution(w, iterations)

        if distribution_type == 'normal':
            mu, std = norm.fit(null_distribution)
            # The survival function is 1 - cdf, but evaluated directly so
            # that very small p-values do not cancel to exactly 0.0.
            return norm.sf(test_statistic, mu, std)

        ecdf = ECDF(null_distribution)
        return 1 - ecdf(test_statistic)

    def _get_test_statistic(self, w):
        """Return mean similarity of ``w`` to ``A`` minus mean similarity to ``B``."""
        a_cos = self._cosines(w, self.A)
        b_cos = self._cosines(w, self.B)
        return np.mean(a_cos) - np.mean(b_cos)

    def _null_distribution(self, w, iterations):
        """Sample the test statistic under random re-partitions of A union B.

        Each iteration shuffles the pooled attribute items, splits them into
        a first half of ``len(A union B) // 2`` items and the remainder, and
        records the difference of the two mean similarities.

        Randomness comes from NumPy's global random state
        (``np.random.shuffle``).

        Args:
            w: Target vector.
            iterations: Number of shuffles to draw.

        Returns:
            1-D float array of length ``iterations``.
        """
        pooled = np.concatenate((self.A, self.B))
        half = len(pooled) // 2

        # The similarity between w and each pooled item does not depend on
        # the partition, so compute it once and permute the similarity values
        # instead of re-evaluating every similarity on every iteration.
        # (Assumes self._cos is a pure function of its arguments.)
        cosines = self._cosines(w, pooled)

        distribution = np.zeros(iterations)
        for i in range(iterations):
            np.random.shuffle(cosines)
            distribution[i] = np.mean(cosines[:half]) - np.mean(cosines[half:])
        return distribution

    def tostr(self):
        """Return the name of this test."""
        return 'WEFAT'