In [3]:
from pygate import *
from pygate.ext.google_nlp import SentimentAnalyserPR
from pygate.ext.spacy_io import SpacyDoc
from pygate.ext.textacy import KeyTermAnnotatorPR
from pygate.prs import SPMRulePR
from pygate.ext.relegence_nlp import RelEntityTagger

In [6]:
# from app.model import Clustering
# from app.tasks.clustering.clustering_method import ClusteringMethod
class ArticleCollection(DataSource):
    '''
    Abstract source of articles that a pipeline can iterate over.

    Subclasses are expected to override :meth:`get_articles` and
    :meth:`iter_docs`; the versions here only raise.
    '''

    def __init__(self, collection_id, query=None):
        '''
        :param collection_id: identifier of the collection
        :param query: set of keyword args for querying the docs; a fresh
            empty list is used when omitted
        '''
        # Keep the identifier: the StoryCollection subclass exposes the same
        # attribute, and previously this argument was silently discarded.
        self.collection_id = collection_id
        # A literal ``[]`` default would be one list shared by every instance.
        self.query = [] if query is None else query

    def get_articles(self):
        '''
        :raises NotImplementedError: always; subclasses must override
        '''
        # ``NotImplemented`` is a comparison sentinel, not an exception;
        # calling it raised TypeError instead of signalling "abstract".
        raise NotImplementedError()

    def iter_docs(self):
        '''
        :raises NotImplementedError: always; subclasses must override
        '''
        raise NotImplementedError()
        
class StoryCollection(ArticleCollection):
    '''
    Article collection made of the articles belonging to one story.
    '''

    def __init__(self, story_id):
        '''
        :param story_id: identifier of the story; also used as the
            collection id
        '''
        self.story_id = self.collection_id = story_id

    def get_articles(self):
        '''
        :return: whatever ``RelegenceService.get_articles_by_story`` returns
            for this story id
        '''
        return RelegenceService().get_articles_by_story(self.story_id)

    def process(self, doc):
        '''
        Intentionally does nothing.

        :param doc: ignored
        '''
        return None

    def iter_docs(self):
        '''
        Lazily wrap each article of the story in a ``SpacyDoc``.

        Every yielded document carries the source article under ``'mongo'``
        plus its id, title and link under ``'id'``, ``'title'`` and ``'url'``.
        '''
        for article in self.get_articles():
            wrapped = SpacyDoc(article.text)
            wrapped['mongo'] = article
            wrapped["id"] = article.article_id
            wrapped["title"] = article.title
            wrapped['url'] = article.link
            yield wrapped

In [5]:
class DuplicateClearingPR(PR):
    '''
    Rejects documents whose title has already been seen by this instance.
    '''

    def __init__(self):
        # Titles of every document accepted so far.
        self.examined_titles = set()

    def process(self, doc):
        '''
        :type SpacyDoc
        :param doc: document exposing a ``"title"`` entry
        :return: None
        :raises ValueError: when the title was already processed
        '''
        headline = doc["title"]
        # Bare attribute access kept from the original; the value is unused.
        # NOTE(review): presumably forces sentence segmentation - confirm.
        doc.sents
        if headline not in self.examined_titles:
            self.examined_titles.add(headline)
            return
        raise ValueError("Duplicate Article: title" + headline)



class CustomFeatureExtractor(PR):
    '''
    Links every key term to a sentence and mirrors the links as a feature.
    '''

    def process(self, doc):
        '''
        For each annotation in ``doc['KeyTerm']``, take the first result of
        ``doc.query_overlappedby_y(kt, 'Sentence')``, add a ``'key_term'``
        relation to it and store that sentence's current ``'key_term'``
        relation under its ``'key_terms'`` feature.

        Key terms for which the query returns nothing are skipped.

        :type doc SpacyDoc
        :param doc:
        :return:
        '''
        for kt in doc['KeyTerm']:
            sents = doc.query_overlappedby_y(kt, 'Sentence')
            try:
                s = sents[0]  # type: Annotation
            except IndexError:
                # No overlapping sentence: previously this aborted the whole
                # document; skip just this key term instead.
                continue
            s.add_relation('key_term', kt)
            # Refreshed on every addition so the feature always reflects the
            # relation as it stands after the last key term.
            s.set_feature('key_terms', s.get_relation('key_term'))


class SentimentHighlighter(PR):
    '''
    Collects strongly scored sentences into two annotation sets.
    '''

    def process(self, doc):
        '''
        Sentences whose ``'gs_score'`` feature is above 0.6 are copied into
        the ``'PosSentiment'`` set, those below -0.6 into ``'NegSentiment'``.
        Sentences without the feature, or with a score in between, are
        ignored. Both sets are always written, even when empty.

        :type doc SpacyDoc
        :param doc:
        :return:
        '''
        cutoff = 0.6
        buckets = {'PosSentiment': [], 'NegSentiment': []}

        for sentence in doc.sents:
            if 'gs_score' not in sentence.features:
                continue
            value = sentence.get_feature('gs_score')
            if value > cutoff:
                label = 'PosSentiment'
            elif value < -cutoff:
                label = 'NegSentiment'
            else:
                continue
            buckets[label].append(
                Annotation(sentence.text, sentence.tStart, sentence.tEnd,
                           sentence.cStart, sentence.cEnd, label, doc))

        doc.set_annotation_set('PosSentiment', buckets['PosSentiment'])
        doc.set_annotation_set('NegSentiment', buckets['NegSentiment'])

class BratEmbeddingToMongoPR(PR):
    '''
    Stores the character spans of selected annotation types on the article
    object attached to the document, then saves that article.
    '''

    def __init__(self, anno_types):
        '''
        :param anno_types: annotation type names to export, in output order
        '''
        self.anno_types = anno_types

    def process(self, doc):
        '''
        Replace ``doc['mongo'].entities`` with one entry per annotation of
        the form ``['T<n>', type, [[cStart, cEnd]]]``, where ``n`` counts up
        from 1 across all exported types, and call ``save()`` on the article.

        :param doc: document carrying the article under ``'mongo'``
        '''
        article = doc['mongo']
        article.entities = []
        counter = 0
        for kind in self.anno_types:
            # Types the document does not contain are skipped.
            if kind not in doc:
                continue
            for annotation in doc[kind]:
                counter += 1
                span = [[annotation.cStart, annotation.cEnd]]
                article.entities.append(['T' + str(counter), kind, span])
        article.save()

def run_fv_generation_method(articles_collection):
    '''
    Build the processing pipeline and run it over the given corpus.

    The stages run in order: duplicate rejection, sentence sentiment
    analysis, sentiment highlighting, key-term annotation, entity tagging,
    custom feature extraction, export to the article store, and finally an
    annotation store and a document store.

    :param articles_collection: corpus handed to ``Pipeline.setCorpus``
    :return: None
    '''
    sentence_store = AnnotationStore('Sentence')
    document_store = DocumentStore('~')

    exported_types = ['KeyTerm', 'PosSentiment', 'NegSentiment', 'Entity']
    stages = [
        DuplicateClearingPR(),
        SentimentAnalyserPR('Sentence'),
        SentimentHighlighter(),
        KeyTermAnnotatorPR(),
        RelEntityTagger(),
        CustomFeatureExtractor(),
        BratEmbeddingToMongoPR(exported_types),
        sentence_store,
        document_store,
    ]

    pipe = Pipeline()
    configured = pipe.setPRs(stages)
    configured.setCorpus(articles_collection)

    # NOTE(review): the meaning of the argument 5 is not visible here.
    pipe.process(5)

    # Walks the collected annotations without using them, as the original did.
    for _ in sentence_store.annots:
        pass

In [7]:
# Run the pipeline over the articles of a single story, chosen by its id.
# NOTE: the output recorded for this cell is an
# ApplicationDefaultCredentialsError asking for GOOGLE_APPLICATION_CREDENTIALS
# to be defined, so credentials must be configured before this call succeeds.
run_fv_generation_method(StoryCollection(story_id='773932258236952576'))

ApplicationDefaultCredentialsError: The Application Default Credentials are not available. They are available if running in Google Compute Engine. Otherwise, the environment variable GOOGLE_APPLICATION_CREDENTIALS must be defined pointing to a file defining the credentials. See https://developers.google.com/accounts/docs/application-default-credentials for more information.