In [12]:
from __future__ import division  # Python 2 users only

import pandas as pd
from pygate import *
from pygate.prs.ml import *
from pygate.prs.rule import *
from pygate.utils import TreeUtils
import json
from nltk.tree import ParentedTree
import os
from pygate.export.brat import BratServer

### Read in the claim label dataframe

In [2]:
# Load the claim spreadsheet. Its 'Article' and 'Claim' columns are read by the
# label annotators defined below.
claim_df = pd.read_excel('./data/CE-ACL-14/2014_7_18_ibm_CDCdata.xls')
# evid_df=pd.read_excel('data/CE-ACL-14/2014_7_18_ibm_CDEdata.xls')
# Drop every non-ASCII character from each claim so it can be substring-matched later.
# NOTE(review): on Python 3 .encode() returns bytes, and a later `claim in sent.text`
# against a str would raise TypeError -- confirm the target interpreter.
claim_df['Claim'] = claim_df['Claim'].apply(lambda x: x.encode("ascii", "ignore"))

### Class definitions

In [3]:
class SentLabelAnnotator(PR):
    """Gives every sentence of a document a binary 'class' label.

    A sentence ends up with class 1 when the text of at least one claim listed
    for the document's article occurs inside the sentence text, else class 0.
    """

    def __init__(self, labelDataFrame):
        # process() reads the 'Article' and 'Claim' columns of this frame.
        self.df = labelDataFrame

    def process(self, doc):
        """Set the 'class' label on each sentence returned by doc.getSents()."""
        # Underscores in the file id are turned into spaces before the id is
        # compared with the 'Article' column.
        article_name = doc.getDocFeature('file_id').replace('_', ' ')
        article_rows = self.df[self.df['Article'] == article_name]
        for sentence in doc.getSents():
            # Default to "not a claim"; any matching claim overrides it below.
            sentence.setLabel('class', 0)
            for claim_text in article_rows['Claim']:
                if claim_text in sentence.text:
                    sentence.setLabel('class', 1)

In [4]:
class ClauseLabelAnnotator(PR):
    """Gives every 'SubClause' annotation of a document a binary 'class' label.

    A clause ends up with class 1 when the text of at least one claim listed
    for the document's article occurs inside the clause text, else class 0.
    """

    def __init__(self, labelDataFrame):
        # process() reads the 'Article' and 'Claim' columns of this frame.
        self.df = labelDataFrame

    def process(self, doc):
        """Set the 'class' label on each annotation in doc['SubClause']."""
        # Underscores in the document id are turned into spaces before the id
        # is compared with the 'Article' column.
        article_name = doc.getId().replace('_', ' ')
        article_rows = self.df[self.df['Article'] == article_name]
        for sub_clause in doc['SubClause']:
            # Default to "not a claim"; any matching claim overrides it below.
            sub_clause.setLabel('class', 0)
            for claim_text in article_rows['Claim']:
                if claim_text in sub_clause.text:
                    sub_clause.setLabel('class', 1)

In [5]:
class SubClauseAnnotator(PR):
    """Builds the 'SubClause' annotation set from sentence constituency parses.

    For each sentence, every S node found below an SBAR node becomes one
    annotation. A sentence without such a node yields a single annotation
    spanning the whole sentence.
    """

    def __init__(self):
        pass

    def process(self, doc):
        """Annotate all sentences of ``doc`` and store the result as 'SubClause'."""
        self.sBar = False
        self.clauses = []
        for sentence in doc.getSents():
            parse_tree = ParentedTree.fromstring(sentence.getFeature('constituency-parse'))
            clause_nodes = []
            self.traverseTree(parse_tree, clause_nodes)
            self.clauses.extend(self.makeAnnotations(sentence, parse_tree, clause_nodes))

        doc.setAnnotationSet('SubClause', self.clauses)

    def traverseTree(self, tree, clauses, sBar=False):
        """Depth-first walk that appends to ``clauses`` each S node under an SBAR.

        ``sBar`` tells whether an SBAR ancestor is still waiting for its S node;
        the flag is cleared once that S node has been collected.
        """
        node_label = tree.label()
        pending_sbar = sBar or node_label == 'SBAR'
        if pending_sbar and node_label == 'S':
            clauses.append(tree)
            pending_sbar = False
        for child in tree:
            # Leaves are not ParentedTree instances and are skipped.
            if type(child) == ParentedTree:
                self.traverseTree(child, clauses, pending_sbar)

    def makeAnnotations(self, sent, tree, subclauses):
        """Create the 'SubClause' annotations for one sentence.

        The annotations are also attached to ``sent`` under the 'subClauses'
        relation, and the same list is returned.
        """
        doc = sent.getDoc()
        if len(subclauses) == 0:
            # No embedded clause found: the sentence itself is the only span.
            whole = Annotation(sent.text, sent.tStart, sent.tEnd, sent.cStart, sent.cEnd, 'SubClause')
            whole.setFeature('sent-index', sent.getFeature('index'))
            whole.setFeature('full-sent', True)
            whole.setRelation('tokens', sent.getRelation('tokens'))
            annots = [whole]
        else:
            annots = []
            for node in subclauses:
                span_tokens = TreeUtils.getTokensWithinSubtree(sent, tree, node)
                first_token = span_tokens[0]
                last_token = span_tokens[-1]
                # The token end index is bumped by one; the character end is used as is.
                token_start = first_token.tStart
                token_end = last_token.tEnd + 1
                char_start = first_token.cStart
                char_end = last_token.cEnd
                span_text = doc.getText()[char_start:char_end]
                clause_ann = Annotation(span_text, token_start, token_end, char_start, char_end, 'SubClause')
                clause_ann.setFeature('sent-index', sent.getFeature('index'))
                clause_ann.setRelation('tokens', span_tokens)
                annots.append(clause_ann)
            clause_count = len(annots)
            for clause_ann in annots:
                clause_ann.setFeature('full-sent', False)
                clause_ann.setFeature('num-clauses', clause_count)
        sent.setRelation('subClauses', annots)
        return annots

In [None]:
# Open the training document store and load its documents.
# ds2=DocumentStore('./docstore/train_docs')
docStore=DocumentStore('./docstore/train_docs')
# fileids is presumably a regular expression ('.*' = every file) -- confirm against pygate.
docStore.loadDocs(fileids='.*')

### Let's do the SubClause annotations in one pipeline.

In [None]:
# Labelling pipeline over the loaded corpus. ClauseLabelAnnotator reads
# doc['SubClause'], so it has to come after SubClauseAnnotator.
pipe_1=Pipeline(docStore)
prs_1=[
    SentLabelAnnotator(claim_df), # optional: we don't predict at sentence level
    SubClauseAnnotator(),
    ClauseLabelAnnotator(claim_df)
]
pipe_1.setPRs(prs_1)

In [None]:
# Run the labelling pipeline and show a stack trace if anything failed.
results=pipe_1.process()
if len(results[1])>0: #if there were exceptions
    # NOTE(review): the guard only guarantees one element, yet index 1 (the second
    # element) is read -- if results[1] is a plain list of exceptions this raises
    # IndexError when exactly one exception occurred. Confirm the shape of results.
    ex=results[1][1]
    ex.printStackTrace()

In [None]:
# Pick one processed document by id for manual inspection in the cells below.
sampleDoc=docStore.docMap()['Criticism_of_atheism']

In [None]:
# Count the sub-clauses labelled as claims (class == 1) in the sample document,
# then print each of them together with its features.
lbls=[a.getLabel('class') for a in sampleDoc['SubClause']]
# The built-in sum() is used because no earlier cell imports numpy as np
# (unless a star-import happens to provide it), so np.sum fails on a fresh kernel.
print(sum(lbls))
for sc in sampleDoc['SubClause']:
    if sc.getLabel('class')==1:
        # Single-argument print(...) behaves the same on Python 2 and Python 3.
        print('_______________________')
        print(sc)
        print(sc.getFeatures())

## Feature Extraction

In [6]:
import collections as coll

class FeatureExtractorPR(PR):
    """Derives count features for every 'SubClause' annotation from its tokens.

    Two groups of counts are merged into each span's features:
    POS bigram counts (keys such as ``pos2/<S>-NNP``) and lemma counts for
    tokens tagged VBZ or RB (keys such as ``lempos/VBZ-be``).
    """

    def process(self, doc):
        """Extract and attach features for each annotation in doc['SubClause']."""
        for span in doc['SubClause']:
            tokens = span.getRelation('tokens')
            self.extractTokenFeatures(tokens, span)

    def extractTokenFeatures(self, tokens, span):
        """Compute the n-gram and lemma/POS counters of ``tokens`` and merge them into ``span``."""
        # Both counters are computed before the span is touched.
        ngramFeatures = self.extractNgrams(['pos'], 2, tokens)
        posLemFeatures = self.lemmatizePos(['VBZ', 'RB'], tokens)
        span.updateFeatures(ngramFeatures)
        span.updateFeatures(posLemFeatures)

    def extractNgrams(self, fnames, n, tokens):
        """Count n-grams of the given token features.

        @param fnames = names of the token features to build n-grams from;
                        their values must be strings.
        @param n = n-gram length, at least 1.
        @param tokens = token annotations; each must provide getFeatures().
        @return a Counter keyed ``<fname><n>/<v1>-<v2>-...``. The token sequence
                is padded with n-1 ``<S>`` entries in front and n-1 ``<E>``
                entries at the end.
        @raise ValueError if n is smaller than 1.
        """
        if n < 1:
            raise ValueError("n must be >= 1")
        START = '<S>'
        END = '<E>'
        start_pad = {key: START for key in fnames}
        end_pad = {key: END for key in fnames}
        feats = [start_pad] * (n - 1) + [tkn.getFeatures() for tkn in tokens] + [end_pad] * (n - 1)

        ngrams = coll.Counter()
        for i in range(len(feats) - n + 1):
            window = feats[i:i + n]
            for name in fnames:
                ngrams[name + str(n) + '/' + '-'.join(f[name] for f in window)] += 1
        return ngrams

    def lemmatizePos(self, pos_types, tokens):
        """Count ``lempos/<pos>-<lemma>`` for every token whose POS tag is in ``pos_types``."""
        posFeats = coll.Counter()
        for t in tokens:
            feat = t.getFeatures()
            tkn_pos = feat['pos']
            if tkn_pos in pos_types:
                posFeats["lempos/" + tkn_pos + "-" + feat['lemma']] += 1
        return posFeats
                

In [None]:
# Smoke test: run the feature extractor on the sample document and display the
# features of its first sub-clause.
fe=FeatureExtractorPR()
fe.process(sampleDoc)
sampleDoc['SubClause'][0].getFeatures()

### Machine learning

In [None]:
#Sample test
from nltk import MaxentClassifier

# model=MaxentClassifier.train([({},1)] )
scpr=ClassifierPR(MaxentClassifier, 'SubClause')
# scpr.process(sampleDoc)
# scpr.split()
# scpr.balanceDataset([(1,1),(0,1.5)])

# scpr.train(modelParams={'max_iter':1})
# scpr.save('sample.pr')
# cpr.train(modelParams={'max_iter':10})


## Make Pipeline for training

In [None]:
# Training pipeline: extract features first, then hand the documents to the
# classifier PR. cpr is kept as a separate name because later cells call
# split / balanceDataset / train / save on it.
trainPipe=Pipeline(corpus=docStore)
cpr=ClassifierPR(MaxentClassifier, 'SubClause')
prs=[FeatureExtractorPR(),
     cpr]
trainPipe.setPRs(prs)

In [None]:
# Run the training pipeline over the corpus. With the ClassifierPR defined in this
# notebook (default mode 'train'), this only collects the data set; no model is fitted yet.
res=trainPipe.process()

In [None]:
# Default split (per the ClassifierPR in this notebook: 0.8 train / 0.2 validation).
cpr.split()
# Rebalance the 'train' set to one part class 1 for two parts class 0.
cpr.balanceDataset([(1,1),(0,2)])
# modelParams is forwarded as keyword arguments to the model class's train().
cpr.train(modelParams={'max_iter':100})


In [None]:
# Persist the trained classifier PR; it is loaded again from this path in the "Run Model" section.
cpr.save('./maxent.pr')


In [28]:
__author__ = 'sasinda'
from pygate import PR
from nltk import classify
import sklearn.metrics
import collections as coll
import numpy as np
from copy import copy
import pickle


class ClassifierPR(PR):
    '''
    Processing resource that collects (features, label) pairs from annotations,
    trains an NLTK-style classifier on them and applies it to new documents.

    @param modelClass = needs to be a NLTK ML model class (it must offer train(); the
                  trained model must offer classify_many()). Wrap scikit models as
                  NLTK's SklearnClassifier.
    @param level = name of the annotation set whose annotations are classified.
    @param featureFilter = reserved; feature filtering is not implemented, so a truthy
                  value makes collectTrainSet raise NotImplementedError.
    @param inputLabel = the label name holding the gold class of an annotation.
    @param outputLabel = the label name that the predicted annotation will have.
    @param mode = inference: for running model against test samples
                  train: for collecting the training dataset.
                  Need to explicitly run train() after the pipeline is complete to start training the model.
    '''

    def __init__(self, modelClass, level, featureFilter=None, inputLabel='class', outputLabel='pred', mode='train'):
        self.modelClass = modelClass
        self.model = None
        self.mode = mode
        self.level = level
        self.outputLabel = outputLabel
        self.inputLabel = inputLabel
        self.filtr = featureFilter
        # Named data sets; 'all' accumulates everything seen in train mode.
        self.data_set = {'all': []}

    def process(self, doc):
        '''Collects training data from doc (train mode) or labels it (inference mode).
        Any other mode leaves the document untouched.'''
        if self.mode == 'train':
            self.collectTrainSet(doc)
        elif self.mode == 'inference':
            self.predict(doc)

    def collectTrainSet(self, doc):
        '''Appends one (features, label) tuple per annotation of doc[level] to the 'all' data set.'''
        if self.filtr:
            feats = self.__filterFeatures(doc)
        else:
            feats = [(t.getFeatures(), t.getLabel(self.inputLabel)) for t in doc[self.level]]
        self.data_set['all'].extend(feats)

    def __filterFeatures(self, doc):
        # NotImplemented is a comparison sentinel, not an exception; raising it fails
        # with a TypeError instead of signalling the missing feature.
        raise NotImplementedError("feature filtering is not implemented")

    def getDataSet(self, name):
        ''' trainSet is [(features:{}, label:str)] list of tuples'''
        return self.data_set[name]

    def split(self, splits={'validation': 0.2, 'train': 0.8}):
        '''splits the dataset into train and validation

        @param splits = maps a data set name to the fraction of 'all' it receives.
                  Slices are handed out consecutively in the dict's iteration order and
                  each size is truncated to an int, so trailing items may stay unassigned.
        '''
        all_data = self.data_set['all']
        start = 0
        # items() works on both Python 2 and 3 (iteritems() is Python 2 only).
        for key, value in splits.items():
            split = int(len(all_data) * value)
            self.data_set[key] = all_data[start:start + split]
            start += split

    def train(self, modelParams={}, dataset_name='train', validate=True):
        '''
        Trains a new model on one of the named data sets.

        @param modelParams = keyword arguments forwarded to modelClass.train().
        @param dataset_name = name of the data set to train on. If it does not exist yet,
                  split() is run once with its default parameters.
        @param validate = when True, validate() is run on the 'validation' set afterwards.
        @raise ValueError if dataset_name still does not exist after the default split.
        '''

        if dataset_name not in self.data_set: self.split()
        if dataset_name not in self.data_set: raise ValueError(
            "Please split the dataset before training. Call to this method ran split with default parameters, so you may use dataset_name as train, or explicitly split")

        # Honour dataset_name; the set named 'train' used to be read unconditionally.
        train_set = self.data_set[dataset_name]
        self.model = self.modelClass.train(train_set, **modelParams)

        if validate: self.validate()

    def validate(self, dataset_name='validation'):
        '''
        Prints precision, recall and F-measure of the current model on a data set,
        first with class 1 and then with class 0 as the positive label.

        @raise ValueError if dataset_name does not exist.
        '''

        if dataset_name not in self.data_set:  raise ValueError("Please split the dataset before validation")

        val_set = self.data_set[dataset_name]
        predlist = self.model.classify_many([feat for feat, label in val_set])
        truelist = [label for feat, label in val_set]

        # Single-argument print(...) gives the same output on Python 2 and Python 3.
        for name, pos_label in (('pos', 1), ('neg', 0)):
            print('%s precision: %s' % (name, sklearn.metrics.precision_score(truelist, predlist, pos_label=pos_label)))
            print('%s recall: %s' % (name, sklearn.metrics.recall_score(truelist, predlist, pos_label=pos_label)))
            print('%s F-measure: %s' % (name, sklearn.metrics.f1_score(truelist, predlist, pos_label=pos_label)))

    def trainCrossValidate(self):
        raise NotImplementedError("cross validation is not implemented")

    def balanceDataset(self, classRatios, data_set_from='train', data_set_to='train', randomShuffle=True):
        '''
        Builds a data set in which the classes appear in the requested proportions.

        @param classRatios = list of (class label, ratio term) tuples, e.g. [(1, 1), (0, 2)].
                  Classes that are not listed are dropped.
        @param data_set_from = name of the data set to read.
        @param data_set_to = name under which the balanced set is stored.
        @param randomShuffle = shuffle the balanced set with numpy's global random state.

        For each class the FIRST items of the source set are kept (no random sampling).
        '''
        balancer = coll.defaultdict(list)
        for datum in self.data_set[data_set_from]:
            balancer[datum[1]].append(datum)

        # Largest unit size such that unit * ratio_term items exist for every class.
        common_denom = len(self.data_set[data_set_from])
        for cls, ratio_term in classRatios:
            # float() keeps this a true division even without `from __future__ import division`.
            common_denom = min(common_denom, len(balancer[cls]) / float(ratio_term))

        balancedSet = []
        for cls, ratio_term in classRatios:
            balancedSet.extend(balancer[cls][:int(common_denom * ratio_term)])

        if randomShuffle:
            np.random.shuffle(balancedSet)
        self.data_set[data_set_to] = balancedSet

    def predict(self, doc):
        '''Classifies every annotation of doc[level] and stores the result under outputLabel.'''
        feats = [t.getFeatures() for t in doc[self.level]]
        preds = self.model.classify_many(feats)
        for ann, pred in zip(doc[self.level], preds):
            ann.setLabel(self.outputLabel, pred)

    def save(self, prFile, setMode='inference'):
        '''
        Saves this PR to prFile through the parent class's save().

        @param setMode = mode stored in the saved PR. Note that it is also set on this
                  live instance. For 'inference' a shallow copy without the collected
                  data sets is saved, so this instance keeps its data.
        '''
        me = self
        me.mode = setMode
        if setMode == 'inference':
            me = copy(self)
            me.data_set = None
            me.balanced_data_set = None

        # Name the class explicitly: super(self.__class__, ...) recurses forever as soon
        # as a subclass instance reaches this method.
        super(ClassifierPR, me).save(prFile)

## Run Model

In [29]:
from pygate.ext.stanford import StanfordAnnotator

In [30]:
# Load the classifier PR saved by the training section.
# NOTE(review): loading presumably unpickles the file -- only load files you trust.
cprProd=ClassifierPR.load('./maxent.pr')
def printPreds(doc):
    """Print every 'SubClause' annotation of ``doc`` whose 'pred' label equals 1."""
    for sc in doc['SubClause']:
        if sc.getLabel('pred')==1:
            # Single-argument print(...) behaves the same on Python 2 and Python 3;
            # the bare print statement is a SyntaxError on Python 3.
            print(sc)

In [39]:
# Rule-based PRs; the rule syntax is defined by pygate's SPMRulePR (not shown here).
# Reading the rule text: a SubClause whose 'pred' label is 1 yields a Claim annotation -- confirm.
claimAnn=SPMRulePR('@SubClause.labels.pred==1 -->  @Claim')
# Reading the rule text: a run of one or more tokens whose 'ner' feature is not 'O' yields an
# Entity annotation whose subType is taken from the span's 'ner' feature -- confirm.
entityAnn= SPMRulePR("(@Token.features.ner!='O')+:span --> @Entity:e ->  e.subType=span.features.ner")

In [42]:
# Inference pipeline. Order matters: SubClauseAnnotator reads each sentence's
# 'constituency-parse' feature (presumably produced by StanfordAnnotator -- confirm),
# FeatureExtractorPR and the classifier read doc['SubClause'], and the claim rule
# reads the 'pred' label.
pipe=Pipeline()
prs=[StanfordAnnotator(cacheDir='./inf_corenlp'),
     SubClauseAnnotator(),
     FeatureExtractorPR(),     
     cprProd,
     claimAnn,
     entityAnn
    ]
pipe.setPRs(prs)

In [49]:
news1=Document(
u"""Jimmy Carter, Seeing Resurgence of Racism, Plans Baptist Conference for Unity.

Former President Jimmy Carter, who has long put religion and racial reconciliation at the center of his life, is on a mission to heal a racial divide among Baptists and help the country soothe rifts that he believes are getting worse.

In an interview on Monday, Mr. Carter spoke of a resurgence of open racism, saying, “I don’t feel good, except for one thing: I think the country has been reawakened the last two or three years to the fact that we haven’t resolved the race issue adequately.”

He said that Republican animosity toward President Obama had “a heavy racial overtone” and that Donald J. Trump’s surprisingly successful campaign for president had “tapped a waiting reservoir there of inherent racism.”

Mr. Carter conducted telephone interviews to call attention to a summit meeting he plans to hold in Atlanta this fall to bring together white, black, Hispanic and Asian Baptists to work on issues of race and social inequality. Mr. Carter began the effort, called the New Baptist Covenant, in 2007, but it has taken root in only a few cities. The initiative is expanding to enlist Baptist congregations across the country to unite across racial lines.

Mr. Carter, 91, began treatment last year for cancer that had started in his liver and spread to his brain. He announced in December that doctors had found him free of cancer but that he was still receiving treatments for metastatic melanoma. On Monday, he said he was feeling well.

Mr. Carter, a Democrat who was the 39th president, grew up on a farm in Plains, Ga., where many of his friends were the black children of neighboring farmhands. He was raised a Southern Baptist and was the first United States president to call himself a born-again Christian, bringing national attention to the evangelical movement.

Mr. Carter said the election of Mr. Obama was a hopeful sign, but he added, “I think there’s a heavy reaction among some of the racially conscious Republicans against an African-American being president.”

He said recent reports showing high unemployment and incarceration rates among black people, “combined with the white police attacks on innocent blacks,” had “reawakened” the country to the realization that racism was not resolved in the 1960s and ’70s.

He said Mr. Trump had violated “basic human rights” when he referred to Mexican immigrants as criminals and called for a ban on Muslims’ entering the country.

“When you single out any particular group of people for secondary citizenship status, that’s a violation of basic human rights,” said Mr. Carter, who won the Nobel Peace Prize in 2002 for his work with the Carter Center in promoting human rights and democracy in many countries.

Asked why polls showed high support among evangelical Christians for Mr. Trump’s candidacy, Mr. Carter said: “The use of the word evangelical is a misnomer. I consider myself an evangelical as well. And obviously, what most of the news reporters thought were evangelicals are conservative Republicans.”

“They have a heavy orientation to right-wing political philosophy, and he obviously is a proponent of that concept,” Mr. Carter said, referring to Mr. Trump.

He pointed out that the evangelicals in the Southern Baptist Convention had aligned themselves with the Republican Party and organized the Moral Majority, a conservative Christian political group, only in the late 1970s, while he was president. Mr. Carter announced that he was leaving the Southern Baptist Convention in 2000, after the denomination solidified its turn to the right and declared that it would not accept women as pastors.

Mr. Carter founded the New Baptist Covenant by reaching out to black and white Baptist associations, many of which had split many years ago over slavery. Nearly 15,000 people from 30 Baptist associations attended the founding meeting in 2008.

Hannah McMahan, the executive director of the New Baptist Covenant, said the group had been in a “pilot phase” for the last two years. She said black and white churches had formed partnerships, called covenants, in Dallas; Macon, Ga.; St. Louis; Birmingham, Ala.; and Atlanta.

But the process is painstaking, Ms. McMahan said, adding, “What this has given me an appreciation for is how deep the divides are, and that this kind of work will not happen overnight.”

The work is especially challenging in this climate, said the Rev. Raphael G. Warnock, the senior pastor of Ebenezer Baptist Church in Atlanta, the church where the Rev. Martin Luther King Jr. was once a pastor. Ebenezer Baptist is participating in the New Baptist Covenant.

“This is a dark moment in our national conversation,” Pastor Warnock said. “Those of us who understand that we are better together had better raise our voices, because there are others who are trafficking in theater, in paranoia, and they ply the trade of fear as part of their political craft.”

However, he said, “I’m much more fired up than discouraged, because the ugliness of the rhetoric we’re seeing in this election cycle really just brings into sharp focus the ugly underbelly of bigotry that has always been there.”
"""               )
# Give the ad-hoc document an id, then run it through the inference pipeline.
news1.setId('sample1')
pipe.processDoc(news1)

doc([Jimmy Carter, Seeing Resurgence of Racism, Plans Baptist Conference for Unity.//Sentence:0//tidx0:14]//sentf[{'dep-parse': 'not implemented!', 'index': 0, 'constituency-parse': u'(ROOT\n  (S\n    (NP\n      (//tknf[{'lemma': u'Jimmy', 'ner': u'PERSON', 'pos': u'NNP', 'index': 1}])

In [50]:
# Show the sub-clauses of the article that were predicted as claims (pred == 1).
printPreds(news1)



Mr. Carter conducted telephone interviews to call attention to a summit meeting he plans to hold in Atlanta this fall to bring together white, black, Hispanic and Asian Baptists to work on issues of race and social inequality.//SubClause:9//tidx153:194
there’s a heavy reaction among some of the racially conscious Republicans against an African-American being president//SubClause:20//tidx375:392
this has given me an appreciation for is how deep the divides are, and that this kind of work will not happen overnight//SubClause:40//tidx801:825
there are others who are trafficking in theater, in paranoia//SubClause:50//tidx911:922
are trafficking in theater, in paranoia//SubClause:51//tidx915:922


In [51]:
# Visualise the Entity and Claim annotations of the article through brat.
# NOTE(review): the data directory is machine-specific, and '~' is only expanded if
# BratServer calls os.path.expanduser itself -- confirm, or move the path to a config constant.
bs=BratServer(dataDir='~/CMProject/viz/brat-v1.3_Crunchy_Frog/data/povmap')
bs.draw(news1, ['Entity', 'Claim'])

[Jimmy//Token:0//tidx0:0,
 Carter//Token:1//tidx1:1,
 Plans//Token:8//tidx8:8,
 Baptist//Token:9//tidx9:9,
 Conference//Token:10//tidx10:10,
 for//Token:11//tidx11:11,
 Unity//Token:12//tidx12:12,
 Jimmy//Token:16//tidx16:16,
 Carter//Token:17//tidx17:17,
 Baptists//Token:44//tidx44:44,
 Monday//Token:62//tidx62:62,
 Carter//Token:65//tidx65:65,
 one//Token:85//tidx85:85,
 last//Token:96//tidx96:96,
 two//Token:97//tidx97:97,
 or//Token:98//tidx98:98,
 three//Token:99//tidx99:99,
 years//Token:100//tidx100:100,
 Republican//Token:118//tidx118:118,
 Obama//Token:122//tidx122:122,
 Donald//Token:132//tidx132:132,
 J.//Token:133//tidx133:133,
 Trump//Token:134//tidx134:134,
 Carter//Token:154//tidx154:154,
 Atlanta//Token:170//tidx170:170,
 this//Token:171//tidx171:171,
 fall//Token:172//tidx172:172,
 Asian//Token:182//tidx182:182,
 Baptists//Token:183//tidx183:183,
 Carter//Token:195//tidx195:195,
 2007//Token:207//tidx207:207,
 Baptist//Token:226//tidx226:226,
 Carter//Token:238//tidx23

{'pred': 1}