The goal here is to look for patterns over claims and warrants in the `neg` set that explain the performance. This should hopefully inform how to resolve those patterns. It is likely, however, that there is a collection of smallish heuristics, and that the only way to eliminate them is to make every claim appear with each warrant it is compared to - i.e. once correctly with the warrant, and once with the alternative. I'm not sure how to effect that.

In [37]:
from nltk import tokenize
from scipy import stats

from arct import data

In [15]:
# Load the three training splits by name. Judging by the names these are the
# original data plus the adversarial "negated" and "swapped" variants
# (presumably DataFrames; they are iterated with .iterrows() further down).
train_orig = data.load('train-original')
train_neg = data.load('train-adv-negated')
train_swap = data.load('train-adv-swapped')

In [36]:
# Load stored predictions for two runs. Heuristic.corr reads the 'id' and
# 'correct' columns of these frames.
# NOTE(review): presumably one row per (item, run/seed) - confirm in arct.data.
preds_orig = data.preds('bert_base')
preds_neg = data.preds('bert_base_adv_neg')

In [54]:
class Heuristic:
    """Base class for a surface heuristic evaluated row by row over a dataset.

    Subclasses implement :meth:`assess`, which maps one row to a pair
    ``(applicable, predictive)``:

    * ``applicable`` - truthy when the heuristic fires on the row at all;
    * ``predictive`` - truthy when it fires *and* points at the correct label.

    NOTE(review): ``has_not`` is case-sensitive, so a capitalised "Not" token
    is not detected - confirm whether the text is lower-cased upstream.
    """

    def corr(self, df, preds):
        """Print the Pearson correlation between productivity and correctness.

        For every row of ``df`` the ``predictive`` flag returned by
        :meth:`assess` is paired with the mean of ``preds['correct']`` over
        the rows of ``preds`` whose ``'id'`` equals the row's ``'#id'``.

        Args:
            df: DataFrame of data rows; must have a ``'#id'`` column plus
                whatever columns :meth:`assess` reads.
            preds: DataFrame with at least ``'id'`` and ``'correct'`` columns.

        Raises:
            ValueError: propagated from ``scipy.stats.pearsonr`` when fewer
                than two rows are supplied.
        """
        # Average only the 'correct' column, grouped once by id.  Averaging the
        # whole frame (as before) also tries to average the string 'id' column,
        # which raises TypeError on pandas >= 2.0, and re-filtering ``preds``
        # for every row was needlessly quadratic.
        mean_correct = preds.groupby('id')['correct'].mean()
        missing = float('nan')  # same value the mean of an empty selection gives
        prod = []
        prob = []
        for _, x in df.iterrows():
            _, _prod = self.assess(x)
            prod.append(float(_prod))
            prob.append(mean_correct.get(x['#id'], missing))
        c, p = stats.pearsonr(prod, prob)
        print('Correlation with productive cases: %4.3f (%4.3f)' % (c, p))

    @staticmethod
    def has_not(sent):
        """Return True if ``sent`` contains the token ``not`` or ``n't``.

        The sentence is split with ``nltk.tokenize.word_tokenize``; the match
        is an exact, case-sensitive token comparison.
        """
        toks = tokenize.word_tokenize(sent)
        return 'not' in toks or "n't" in toks

    def prod_cov(self, df):
        """Print the productivity and coverage of the heuristic over ``df``.

        * productivity = (# predictive rows) / (# applicable rows)
        * coverage     = (# applicable rows) / (# rows)

        Both are rounded to three decimals.  When a denominator is zero (the
        heuristic never applies, or ``df`` is empty) the ratio is undefined
        and is reported as ``nan`` instead of raising ``ZeroDivisionError``.
        """
        n = len(df)
        applicable = 0
        predictive = 0
        for _, x in df.iterrows():
            _applicable, _predictive = self.assess(x)
            applicable += _applicable
            predictive += _predictive
        undefined = float('nan')
        productivity = (
            round(predictive / applicable, 3) if applicable else undefined)
        coverage = round(applicable / n, 3) if n else undefined
        print('-' * 8)
        print('Productivity: %s' % productivity)
        print('Coverage: %s' % coverage)

    def assess(self, x):
        """Return ``(applicable, predictive)`` for one row; subclasses override."""
        raise NotImplementedError

In [55]:
class AbsenceOfNot(Heuristic):
    """Heuristic for rows whose claim contains no "not"/"n't" token.

    It applies when the claim is un-negated and at least one warrant contains
    a negation token (warrant0 is checked first).  If warrant0 is the negated
    one the row counts as predictive when ``correctLabelW0orW1`` is truthy; if
    only warrant1 is negated, when it is falsy.
    NOTE(review): presumably label 0/1 names the correct warrant, i.e. the
    heuristic favours the warrant *without* the negation - confirm.
    """

    def assess(self, x):
        """Return ``(applicable, predictive)`` for row ``x``."""
        if not self.has_not(x.claim):
            if self.has_not(x.warrant0):
                return True, x.correctLabelW0orW1
            if self.has_not(x.warrant1):
                return True, not x.correctLabelW0orW1
        # Negated claim, or neither warrant carries a negation token.
        return False, False

In [56]:
# Report productivity/coverage on every training split, and the correlation
# with model correctness on the two splits for which predictions were loaded
# (no predictions for the swapped split are loaded in the cells above).
# The call order matches the order of the printed output below.
aon = AbsenceOfNot()
aon.prod_cov(train_orig)
aon.corr(train_orig, preds_orig)
aon.prod_cov(train_neg)
aon.corr(train_neg, preds_neg)
aon.prod_cov(train_swap)

--------
Productivity: 0.323
Coverage: 0.44
Correlation with productive cases: -0.555 (0.000)
--------
Productivity: 0.464
Coverage: 0.348
Correlation with productive cases: 0.110 (0.000)
--------
Productivity: 0.325
Coverage: 0.44


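This is a great one. The low productivity on the original set (0.323) says that the opposite of this heuristic is very productive, and the strong negative correlation (-0.555) suggests the model has indeed latched onto that opposite pattern there. On the adv_neg set the productivity is much closer to chance (0.464) and the correlation is weak (0.110), i.e. the model is far less sensitive to the heuristic, which is good for the adv_neg set.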

In [40]:
class PresenceOfNot(Heuristic):
    """Heuristic for rows whose claim contains a "not"/"n't" token.

    It applies when the claim is negated and at least one warrant contains a
    negation token (warrant0 is checked first).  If warrant0 is the negated
    one the row counts as predictive when ``correctLabelW0orW1`` is falsy; if
    only warrant1 is negated, when it is truthy.
    NOTE(review): presumably label 0/1 names the correct warrant, i.e. the
    heuristic favours the warrant that *also* carries a negation - confirm.
    """

    def assess(self, x):
        """Return ``(applicable, predictive)`` for row ``x``."""
        if self.has_not(x.claim):
            if self.has_not(x.warrant0):
                return True, not x.correctLabelW0orW1
            if self.has_not(x.warrant1):
                return True, x.correctLabelW0orW1
        # Un-negated claim, or neither warrant carries a negation token.
        return False, False

In [42]:
# Same evaluation protocol as for the previous heuristic: productivity and
# coverage on all three splits, correlation only where predictions exist.
pon = PresenceOfNot()
pon.prod_cov(train_orig)
pon.corr(train_orig, preds_orig)
pon.prod_cov(train_neg)
pon.corr(train_neg, preds_neg)
pon.prod_cov(train_swap)

--------
Productivity: 0.718
Coverage: 0.252
Correlation with productive cases: 0.349 (0.000)
--------
Productivity: 0.463
Coverage: 0.344
Correlation with productive cases: -0.121 (0.000)
--------
Productivity: 0.716
Coverage: 0.252


In [45]:
class OppositeNot(Heuristic):
    """Heuristic pairing the claim with the warrant of opposite negation.

    It applies whenever at least one warrant contains a "not"/"n't" token
    (warrant0 is checked first), regardless of the claim.  The row counts as
    predictive when

    * the claim is un-negated and the negated warrant is warrant0 with a
      falsy label, or warrant1 with a truthy label;
    * the claim is negated and the negated warrant is warrant0 with a truthy
      label, or warrant1 with a falsy label.

    NOTE(review): presumably label 0/1 names the correct warrant - confirm.
    """

    def assess(self, x):
        """Return ``(applicable, predictive)`` for row ``x``."""
        claim_negated = self.has_not(x.claim)

        # Find which warrant carries the negation; warrant0 takes precedence.
        if self.has_not(x.warrant0):
            negation_in_w0 = True
        elif self.has_not(x.warrant1):
            negation_in_w0 = False
        else:
            return False, False

        label = x.correctLabelW0orW1
        # The raw label is reported when (negated claim, negated warrant0) or
        # (plain claim, negated warrant1); its negation in the other two cases.
        if claim_negated == negation_in_w0:
            return True, label
        return True, not label

In [50]:
# Same evaluation protocol: productivity and coverage on all three splits,
# correlation only on the splits that have loaded predictions.
on = OppositeNot()
on.prod_cov(train_orig)
on.corr(train_orig, preds_orig)
on.prod_cov(train_neg)
on.corr(train_neg, preds_neg)
on.prod_cov(train_swap)

--------
Productivity: 0.533
Coverage: 0.693
Correlation with productive cases: 0.217 (0.000)
--------
Productivity: 0.536
Coverage: 0.693
Correlation with productive cases: 0.052 (0.010)
--------
Productivity: 0.533
Coverage: 0.693


In [48]:
class MatchingNot(Heuristic):
    """Heuristic pairing the claim with the warrant of matching negation.

    It applies whenever at least one warrant contains a "not"/"n't" token
    (warrant0 is checked first), regardless of the claim.  The row counts as
    predictive when

    * the claim is negated and the negated warrant is warrant0 with a falsy
      label, or warrant1 with a truthy label;
    * the claim is un-negated and the negated warrant is warrant0 with a
      truthy label, or warrant1 with a falsy label.

    NOTE(review): presumably label 0/1 names the correct warrant - confirm.
    """

    def assess(self, x):
        """Return ``(applicable, predictive)`` for row ``x``."""
        claim_negated = self.has_not(x.claim)

        # Find which warrant carries the negation; warrant0 takes precedence.
        if self.has_not(x.warrant0):
            negation_in_w0 = True
        elif self.has_not(x.warrant1):
            negation_in_w0 = False
        else:
            return False, False

        label = x.correctLabelW0orW1
        # The negated label is reported when (negated claim, negated warrant0)
        # or (plain claim, negated warrant1); the raw label otherwise.
        if claim_negated == negation_in_w0:
            return True, not label
        return True, label

In [49]:
# Same evaluation protocol: productivity and coverage on all three splits,
# correlation only on the splits that have loaded predictions.
mn = MatchingNot()
mn.prod_cov(train_orig)
mn.corr(train_orig, preds_orig)
mn.prod_cov(train_neg)
mn.corr(train_neg, preds_neg)
mn.prod_cov(train_swap)

--------
Productivity: 0.467
Coverage: 0.693
Correlation with productive cases: -0.128 (0.000)
--------
Productivity: 0.464
Coverage: 0.693
Correlation with productive cases: -0.008 (0.681)
--------
Productivity: 0.467
Coverage: 0.693
