In [1]:
import spacy
import csv
from spacy import displacy

In [2]:
# Load the spaCy pipeline named "en_core_web_lg" once; the cells below call
# `nlp` on sentence strings and read each token's `head` and `dep_`.
nlp = spacy.load("en_core_web_lg")

In [11]:
import statistics
import collections
import tqdm
# Parse every original/rewritten sentence pair with spaCy, score how many
# tokens keep the same head index, route each row to an output CSV by score,
# and tally per-dependency-label agreement.
#
# Results left in the namespace:
#   all_uuas - one score per comparable pair (fraction of tokens, excluding the
#              last token of each sentence, whose head index is identical).
#   matches  - dep label of the original token -> [agreeing, total], where a
#              token "agrees" when both its head index and its label match.
all_uuas = []
matches = collections.defaultdict(lambda: [0, 0])

OUTPUT_FIELDS = ['Sentence ID', 'Original Sentence', 'New Sentence']

# Open every file in one `with` so the outputs are flushed and closed even if
# parsing fails midway. `newline=""` is what the csv module expects for both
# reading and writing.
with open("devset_new/uuas_1.csv", "w", newline="") as file_100, \
        open("devset_new/uuas_95.csv", "w", newline="") as file_95, \
        open("devset_new/uuas_l90.csv", "w", newline="") as file_l90, \
        open("prodset_clean.csv", newline="") as csvfile:
    writer_100 = csv.DictWriter(file_100, fieldnames=OUTPUT_FIELDS)
    writer_95 = csv.DictWriter(file_95, fieldnames=OUTPUT_FIELDS)
    writer_l90 = csv.DictWriter(file_l90, fieldnames=OUTPUT_FIELDS)
    for writer in (writer_100, writer_95, writer_l90):
        writer.writeheader()

    reader = csv.DictReader(csvfile)
    # total=10000 only sizes the progress bar; it does not limit the rows read.
    for row in tqdm.tqdm(reader, total=10000):
        orig = row['Original Sentence']
        new = row['New Sentence']

        doc1 = nlp(orig)
        doc2 = nlp(new)

        # The last token of each doc is dropped (presumably sentence-final
        # punctuation -- TODO confirm).
        a = [tok.head.i for tok in doc1[:-1]]
        b = [tok.head.i for tok in doc2[:-1]]
        # Position-wise comparison needs equal lengths; an empty list (a
        # sentence with one token or none) would divide by zero below.
        if not a or len(a) != len(b):
            continue

        uuas = sum(1 for head1, head2 in zip(a, b) if head1 == head2) / len(a)

        for tok1, tok2, head1, head2 in zip(doc1[:-1], doc2[:-1], a, b):
            if head1 == head2 and tok1.dep_ == tok2.dep_:
                matches[tok1.dep_][0] += 1
            matches[tok1.dep_][1] += 1

        if uuas == 1:
            writer_100.writerow(row)
        # NOTE(review): the file is named uuas_95 and the summary below counts
        # [0.90, 1), yet this branch accepts [0.5, 1), so rows in [0.5, 0.9)
        # also land in uuas_l90.csv. Threshold left unchanged; confirm intent.
        elif 0.5 <= uuas < 1:
            writer_95.writerow(row)
        if uuas < 0.9:
            writer_l90.writerow(row)

        all_uuas.append(uuas)

# Summary: mean score, comparable pairs, perfect pairs, pairs in [0.90, 1).
if all_uuas:
    print(statistics.mean(all_uuas), len(all_uuas), sum(1 for s in all_uuas if s == 1), sum(1 for s in all_uuas if 0.90 <= s < 1))
else:
    # statistics.mean raises on an empty list.
    print("no comparable sentence pairs")

100%|████████████████████████████████████| 10000/10000 [00:48<00:00, 205.25it/s]

0.8478797361180124 4476 1381 942





In [5]:
# Per-label agreement report, busiest labels first (ranked by agreeing + total).
# Expects `matches` to be the label -> [agreeing, total] dict; a later cell
# rebinds `matches` to a list, so run this before that one.
for label, (hit, total) in sorted(matches.items(), key=lambda entry: -sum(entry[1])):
    pct = round(hit / total * 100, 2)
    print(f"{label.ljust(10)}: {pct}% ({hit}/{total})")

pobj      : 86.75% (72/83)
prep      : 72.41% (63/87)
det       : 91.03% (71/78)
nsubj     : 87.84% (65/74)
compound  : 86.44% (51/59)
punct     : 66.67% (38/57)
ROOT      : 84.0% (42/50)
aux       : 92.11% (35/38)
advmod    : 71.79% (28/39)
dobj      : 65.0% (26/40)
amod      : 64.52% (20/31)
cc        : 62.5% (15/24)
nummod    : 94.44% (17/18)
conj      : 61.9% (13/21)
ccomp     : 76.47% (13/17)
poss      : 87.5% (14/16)
npadvmod  : 50.0% (9/18)
mark      : 100.0% (11/11)
case      : 90.91% (10/11)
nmod      : 58.33% (7/12)
advcl     : 60.0% (6/10)
attr      : 20.0% (2/10)
auxpass   : 71.43% (5/7)
nsubjpass : 83.33% (5/6)
neg       : 66.67% (4/6)
xcomp     : 66.67% (4/6)
acl       : 33.33% (2/6)
relcl     : 60.0% (3/5)
quantmod  : 75.0% (3/4)
pcomp     : 66.67% (2/3)
acomp     : 66.67% (2/3)
agent     : 100.0% (2/2)
appos     : 100.0% (2/2)
dative    : 0.0% (0/2)
expl      : 0.0% (0/1)
intj      : 0.0% (0/1)
parataxis : 0.0% (0/1)
prt       : 0.0% (0/1)


In [24]:
from tabulate import tabulate
# Histogram of head-agreement scores in 5-point buckets, plus per-bucket,
# per-dependency-label agreement tallies, over the eight devset CSVs.
#
# Results left in the namespace:
#   buckets - 21 labels, highest agreement first.
#   uuass   - share of comparable pairs per bucket, as "<pct>%" strings.
#   matches - one dict per bucket: dep label -> [agreeing, total].

# A score s lands in the bucket "lo-hi%" with (roughly) lo < s*100 <= hi, so an
# exact multiple of 5% belongs to the bucket it closes. A score of exactly 0
# fits none of those ranges and gets its own trailing bucket, labelled "0%"
# (the old label for that slot was the meaningless "-5-0%").
buckets = [f"{upper - 5}-{upper}%" for upper in range(100, 0, -5)]
buckets.append("0%")

bucket_counts = [0 for _ in buckets]
matches = [collections.defaultdict(lambda: [0, 0]) for _ in buckets]

for file_idx in tqdm.trange(8):
    with open(f'devset_new/orig_{file_idx}_claude.csv', newline="") as csvfile:
        reader = csv.DictReader(csvfile)

        for row in reader:
            orig = row['Original Sentence']
            new = row['New Sentence']

            doc1 = nlp(orig)
            doc2 = nlp(new)

            # The last token of each doc is dropped (presumably sentence-final
            # punctuation -- TODO confirm).
            a = [tok.head.i for tok in doc1[:-1]]
            b = [tok.head.i for tok in doc2[:-1]]
            # Position-wise comparison needs equal lengths; an empty list (a
            # sentence with one token or none) would divide by zero below.
            if not a or len(a) != len(b):
                continue

            uuas = sum(1 for head1, head2 in zip(a, b) if head1 == head2) / len(a)

            # Index into `buckets`: 0 for scores in (0.95, 1], ..., 19 for
            # (0, 0.05], 20 for a score of 0. The 0.0001 offset pushes exact
            # multiples of 5% down into the bucket whose upper bound they are.
            bucket = int(20 - ((uuas - 0.0001) * 100 // 5) - 1)

            bucket_counts[bucket] += 1

            # Diagnostic: show the pairs on which no head agrees at all.
            if bucket == 20:
                print(orig, new)

            for tok1, tok2, head1, head2 in zip(doc1[:-1], doc2[:-1], a, b):
                if head1 == head2 and tok1.dep_ == tok2.dep_:
                    matches[bucket][tok1.dep_][0] += 1
                matches[bucket][tok1.dep_][1] += 1

# `or 1` keeps the table printable (all 0.0%) when no pair was comparable.
total_pairs = sum(bucket_counts) or 1
uuass = [f"{round(count / total_pairs * 100, 2)}%" for count in bucket_counts]
print(tabulate(zip(buckets, uuass), tablefmt="pretty", headers=["Bucket", "Proportion"]))


 12%|█████▋                                       | 1/8 [00:00<00:02,  2.98it/s]

That may soon change. That may fast back.


100%|█████████████████████████████████████████████| 8/8 [00:02<00:00,  3.18it/s]

+---------+------------+
| Bucket  | Proportion |
+---------+------------+
| 95-100% |   30.95%   |
| 90-95%  |   14.88%   |
| 85-90%  |   12.5%    |
| 80-85%  |   11.31%   |
| 75-80%  |   7.74%    |
| 70-75%  |   5.95%    |
| 65-70%  |   6.55%    |
| 60-65%  |   2.98%    |
| 55-60%  |    0.6%    |
| 50-55%  |   2.38%    |
| 45-50%  |    0.6%    |
| 40-45%  |    0.0%    |
| 35-40%  |    0.0%    |
| 30-35%  |   1.79%    |
| 25-30%  |    0.6%    |
| 20-25%  |    0.6%    |
| 15-20%  |    0.0%    |
| 10-15%  |    0.0%    |
|  5-10%  |    0.0%    |
|  0-5%   |    0.0%    |
|  -5-0%  |    0.6%    |
+---------+------------+





In [17]:
# Dependency labels, in the order of the pasted per-label accuracy report:
# everything before the first ":" on each line, with padding stripped.
x = [
    report_line.partition(":")[0].strip()
    for report_line in """pobj      : 86.75% (72/83)
prep      : 72.41% (63/87)
det       : 91.03% (71/78)
nsubj     : 87.84% (65/74)
compound  : 86.44% (51/59)
punct     : 66.67% (38/57)
ROOT      : 84.0% (42/50)
aux       : 92.11% (35/38)
advmod    : 71.79% (28/39)
dobj      : 65.0% (26/40)
amod      : 64.52% (20/31)
cc        : 62.5% (15/24)
nummod    : 94.44% (17/18)
conj      : 61.9% (13/21)
ccomp     : 76.47% (13/17)
poss      : 87.5% (14/16)
npadvmod  : 50.0% (9/18)
mark      : 100.0% (11/11)
case      : 90.91% (10/11)
nmod      : 58.33% (7/12)
advcl     : 60.0% (6/10)
attr      : 20.0% (2/10)
auxpass   : 71.43% (5/7)
nsubjpass : 83.33% (5/6)
neg       : 66.67% (4/6)
xcomp     : 66.67% (4/6)
acl       : 33.33% (2/6)
relcl     : 60.0% (3/5)
quantmod  : 75.0% (3/4)
pcomp     : 66.67% (2/3)
acomp     : 66.67% (2/3)
agent     : 100.0% (2/2)
appos     : 100.0% (2/2)
dative    : 0.0% (0/2)
expl      : 0.0% (0/1)
intj      : 0.0% (0/1)
parataxis : 0.0% (0/1)
prt       : 0.0% (0/1)""".splitlines()
]

In [18]:
# Pivot the per-bucket tallies into: dep label -> one cell per bucket.
# A cell is the formatted agreement percentage for that bucket; buckets in
# which the label never occurred keep the integer placeholder 0.
deps = collections.defaultdict(lambda: [0] * len(buckets))
for bucket_idx, per_label in enumerate(matches):
    for label, (hit, total) in per_label.items():
        deps[label][bucket_idx] = f"{round(hit / total * 100, 2)}%"

In [20]:
# Show bucket shares next to the per-bucket agreement of the first ten labels,
# taken in the order `deps` holds them. (Reordering the columns to follow `x`
# is not applied here.)
k = list(deps)[:10]
v = [deps[label] for label in k]

print(tabulate(zip(buckets, uuass, *v), headers=["Bucket", "Proportion", *k], tablefmt="pretty"))

+---------+------------+--------+--------+--------+--------+--------+--------+----------+--------+--------+--------+
| Bucket  | Proportion | nsubj  |  prep  |  det   |  pobj  |  aux   |  ROOT  | npadvmod | advmod |  poss  |  case  |
+---------+------------+--------+--------+--------+--------+--------+--------+----------+--------+--------+--------+
| 95-100% |   32.56%   | 98.04% | 96.36% | 98.21% | 98.08% | 97.3%  | 100.0% |  54.55%  | 94.74% | 100.0% | 100.0% |
| 90-95%  |   14.73%   | 94.59% | 83.78% | 100.0% | 97.06% | 95.0%  | 100.0% |  42.86%  | 75.0%  | 100.0% | 100.0% |
| 85-90%  |   11.63%   | 100.0% | 84.0%  | 100.0% | 96.3%  | 100.0% | 100.0% |  75.0%   | 57.14% | 80.0%  | 100.0% |
| 80-85%  |    9.3%    | 86.36% | 80.0%  | 100.0% | 86.21% | 100.0% | 83.33% |  27.27%  | 83.33% | 100.0% | 100.0% |
| 75-80%  |   7.75%    | 87.5%  | 81.48% | 87.5%  | 86.36% | 57.14% | 100.0% |  14.29%  | 66.67% | 100.0% | 100.0% |
| 70-75%  |    3.1%    | 50.0%  | 66.67% | 100.0% | 100.0% | 50.