In [1]:
from __future__ import unicode_literals, print_function

import spacy
import os
import pickle
import pandas as pd
import plac
import random

from spacy_sentiws import spaCySentiWS
from spacy import displacy
from spacy.tokens import Token
from spacy.util import minibatch, compounding
from pathlib import Path

In [2]:
class SpacyTrainer(object):
    """
    Helper class to train the spaCy NER and dependency parser components.

    Both training methods resolve the pipeline to train from their ``model``
    argument (see ``_resolve_pipeline``) and return the per-iteration losses.
    """
    
    def __init__(self, output_dir):
        """
        :param output_dir: directory associated with this trainer. It is only
            stored on the instance; whether a model is written to disk is decided
            by the ``output_dir`` argument of the individual train methods.
        """
        self.output_dir = output_dir

    @staticmethod
    def _resolve_pipeline(model):
        """Return the pipeline to train for the given ``model`` argument.

        * ``None``: fall back to a module-level ``nlp`` object (the only source
          earlier versions of this class ever used).
        * ``str`` / ``Path``: load the model with ``spacy.load``.
        * anything else: assumed to already be a loaded pipeline and returned as is.

        :raises ValueError: if ``model`` is None and no module-level ``nlp`` exists.
        """
        if model is None:
            pipeline = globals().get("nlp")
            if pipeline is None:
                raise ValueError(
                    "No pipeline to train: pass `model` (a model name/path or a "
                    "loaded pipeline) or define a module-level `nlp`."
                )
            return pipeline
        if isinstance(model, (str, Path)):
            return spacy.load(model)
        return model

    @staticmethod
    def _prepare_output_dir(output_dir):
        """Return ``output_dir`` as a Path, creating it (and missing parents) if needed."""
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)
        return output_dir
    
    def train_ner(self, train_data, model=None, new_model_name="german_modified", 
                  output_dir=None, n_iter=30, labels=None, test_model=False):
        """Set up the pipeline and entity recognizer, and train the new entity.

        :param train_data: sequence of ``(text, {"entities": [(start, end, label)]})``
            tuples, e.g.::

                [
                    ("Horses are too tall and they pretend to care about your feelings",
                     {"entities": [(0, 6, LABEL)]}),
                    ("Do they bite?", {"entities": []}),
                ]

            The sequence is copied before shuffling, so the caller's data keeps its order.
        :param model: model name/path, loaded pipeline, or None (see ``_resolve_pipeline``).
        :param new_model_name: value written to ``nlp.meta["name"]`` before saving.
        :param output_dir: if not None, the trained pipeline is saved there.
        :param n_iter: number of passes over the training data.
        :param labels: iterable of entity labels to add to the recognizer; None adds none.
        :param test_model: if True (and ``output_dir`` is given), reload the saved
            model and print the entities it finds in the test sentence.
        :return: list with one losses dict per iteration.
        :raises ValueError: if no pipeline can be resolved from ``model``.
        """
        nlp = self._resolve_pipeline(model)
        # Shuffle a private copy instead of reordering the caller's list.
        examples = list(train_data)
        
        # NOTE: this reseeds the global `random` module state.
        random.seed(0)
        # Add entity recognizer to model if it's not in the pipeline
        # nlp.create_pipe works for built-ins that are registered with spaCy
        if "ner" not in nlp.pipe_names:
            ner = nlp.create_pipe("ner")
            nlp.add_pipe(ner)
        # otherwise, get it, so we can add labels to it
        else:
            ner = nlp.get_pipe("ner")

        # add new entity labels to the entity recognizer (none when labels is None)
        for label in labels or []:
            ner.add_label(label)
        optimizer = nlp.resume_training()
        move_names = list(ner.move_names)
        # get names of other pipes to disable them during training
        other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "ner"]
        train_losses = []
        with nlp.disable_pipes(*other_pipes):  # only train NER
            # Created once on purpose: the same size generator is shared by all
            # iterations, so it is not restarted at the beginning of each pass.
            sizes = compounding(1.0, 4.0, 1.001)
            for _ in range(n_iter):
                random.shuffle(examples)
                # batch up the examples using spaCy's minibatch
                batches = minibatch(examples, size=sizes)
                losses = {}
                for batch in batches:
                    texts, annotations = zip(*batch)
                    nlp.update(texts, annotations, sgd=optimizer, drop=0.35, losses=losses)
                train_losses.append(losses)

        # test the trained model
        test_text = "Do you like horses?"
        doc = nlp(test_text)
        print("Entities in '%s'" % test_text)
        displacy.render(doc, style='ent', jupyter=True)

        # save model to output directory
        if output_dir is not None:
            output_dir = self._prepare_output_dir(output_dir)
            nlp.meta["name"] = new_model_name  # rename model
            nlp.to_disk(output_dir)
            print("Saved model to: ", output_dir)
            
            if test_model:
                # test the saved model
                print("Loading from", output_dir)
                nlp2 = spacy.load(output_dir)
                # Check the classes have loaded back consistently
                assert nlp2.get_pipe("ner").move_names == move_names
                doc2 = nlp2(test_text)
                for ent in doc2.ents:
                    print(ent.label_, ent.text)
                    
        return train_losses
    
    def train_dep(self, train_data, model=None, output_dir=None, n_iter=15, test_model=False):
        """Load the model, set up the pipeline and train the parser.

        :param train_data: sequence of ``(text, {"heads": [...], "deps": [...]})``
            tuples, e.g.::

                [
                    ("They trade mortgage-backed securities.",
                     {"heads": [1, 1, 4, 4, 5, 1, 1],
                      "deps": ["nsubj", "ROOT", "compound", "punct", "nmod", "dobj", "punct"]}),
                ]

            The sequence is copied before shuffling, so the caller's data keeps its order.
        :param model: model name/path, loaded pipeline, or None (see ``_resolve_pipeline``).
        :param output_dir: if not None, the trained pipeline is saved there.
        :param n_iter: number of passes over the training data.
        :param test_model: if True (and ``output_dir`` is given), reload the saved
            model and print the dependencies it predicts for the test sentence.
        :return: list with one losses dict per iteration.
        :raises ValueError: if no pipeline can be resolved from ``model``.
        """
        nlp = self._resolve_pipeline(model)
        # Shuffle a private copy instead of reordering the caller's list.
        examples = list(train_data)
        
        # add the parser to the pipeline if it doesn't exist
        # nlp.create_pipe works for built-ins that are registered with spaCy
        if "parser" not in nlp.pipe_names:
            parser = nlp.create_pipe("parser")
            nlp.add_pipe(parser, first=True)
        # otherwise, get it, so we can add labels to it
        else:
            parser = nlp.get_pipe("parser")

        # add every dependency label that occurs in the training data to the parser
        for _, annotations in examples:
            for dep in annotations.get("deps", []):
                parser.add_label(dep)

        # get names of other pipes to disable them during training
        other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "parser"]
        train_losses = []
        with nlp.disable_pipes(*other_pipes):  # only train parser
            optimizer = nlp.begin_training()
            for _ in range(n_iter):
                random.shuffle(examples)
                losses = {}
                # batch up the examples using spaCy's minibatch
                batches = minibatch(examples, size=compounding(4.0, 32.0, 1.001))
                for batch in batches:
                    texts, annotations = zip(*batch)
                    nlp.update(texts, annotations, sgd=optimizer, losses=losses)
                train_losses.append(losses)

        # test the trained model
        test_text = "I like securities."
        doc = nlp(test_text)
        print("Dependencies", [(t.text, t.dep_, t.head.text) for t in doc])

        # save model to output directory
        if output_dir is not None:
            output_dir = self._prepare_output_dir(output_dir)
            nlp.to_disk(output_dir)
            print("Saved model to", output_dir)
            
            if test_model:
                # test the saved model
                print("Loading from", output_dir)
                nlp2 = spacy.load(output_dir)
                doc = nlp2(test_text)
                print("Dependencies", [(t.text, t.dep_, t.head.text) for t in doc])
            
        return train_losses

In [3]:
class SentiMax(object):
    """
    Analyze German text to determine polarity values of sentiment-bearing tokens.

    The base polarity comes from the German SentiWS corpus (``token._.sentiws``).
    It is scaled by enhancing / reducing child tokens (* 1.5 / * 0.5) and
    flipped by negations (* -1.0).
    """
    
    def __init__(self, wordvecs=False, sentiws_path='data/sentiws/', 
                 polarity_modifiers_path='data/polarity_modifiers.pickle'):
        """
        :param wordvecs: load 'de_core_news_md' if True, otherwise 'de_core_news_sm'.
        :param sentiws_path: path handed to ``spaCySentiWS``.
        :param polarity_modifiers_path: pickle file providing the mapping read in
            ``modify_polarity`` (keys 'polarity_enhancer' and 'polarity_reducer').
        """
        # loading german spacy model
        model_name = 'de_core_news_md' if wordvecs else 'de_core_news_sm'
        self.nlp = spacy.load(model_name)
        # integrating SentiWS-Corpus as token attribute
        sentiws = spaCySentiWS(sentiws_path=sentiws_path)
        self.nlp.add_pipe(sentiws)
        self.doc = None
        # NOTE: unpickling can execute arbitrary code; only load trusted files.
        # The context manager closes the file handle once loading is done.
        with open(polarity_modifiers_path, 'rb') as modifiers_file:
            self.modifiers = pickle.load(modifiers_file)
        # NOTE(review): the extensions are registered on the Token class itself, so
        # once set, the getters stay bound to the instance that registered them,
        # even when further SentiMax objects are created.
        if not Token.has_extension("modified"):
            Token.set_extension("modified", getter=self.modify_polarity)
        if not Token.has_extension("negated"):
            Token.set_extension("negated", getter=self.negate)
    
    def modify_polarity(self, token):
        """Return the SentiWS polarity of ``token``, scaled by its first modifier child.

        A missing polarity counts as 0.0. The first child found in the enhancer
        list scales by 1.5, the first one in the reducer list by 0.5; later
        children are not considered.
        """
        polarity = token._.sentiws
        if not polarity:
            polarity = 0.0
        for child in token.children:
            if child.lower_ in self.modifiers['polarity_enhancer']:
                return polarity * 1.5
            elif child.lower_ in self.modifiers['polarity_reducer']:
                return polarity * 0.5
        return polarity
    
    def negate(self, token):
        """Return the modified polarity, sign-flipped if a child has dependency 'ng'."""
        polarity = token._.modified
        for child in token.children:
            if child.dep_ == 'ng':
                return -1.0 * polarity
        return polarity
        
    def polarize(self, text):
        """Run the pipeline on ``text`` and keep the resulting Doc in ``self.doc``."""
        self.doc = self.nlp(text)
    
    def sentimax(self):
        """Return a DataFrame with one row per token of the last polarized text.

        Columns: token, dep, sentiws, modified, negated. The frame is empty if
        nothing has been polarized yet.
        """
        polarity_dict = {"token": [], "dep": [], "sentiws": [], "modified": [], "negated": []}
        if self.doc:
            for token in self.doc:
                polarity_dict["token"].append(token.text)
                polarity_dict["dep"].append(token.dep_)
                polarity_dict["sentiws"].append(token._.sentiws)
                polarity_dict["modified"].append(token._.modified)
                polarity_dict["negated"].append(token._.negated)
                
        return pd.DataFrame(data=polarity_dict)           

    @staticmethod
    def _format_deps(doc):
        """Convert a parsed doc into a ``(text, {"heads": [...], "deps": [...]})`` tuple.

        Heads are stored as token indices (``token.head.i``), i.e. plain ints
        rather than Token objects.
        """
        deps = {"heads": [], "deps": []}
        for token in doc:
            deps["heads"].append(token.head.i)
            deps["deps"].append(token.dep_)
        return (doc.text, deps)
    
    def annotate_dependencies(self, texts_to_annotate, display=False):
        """Interactively review the parser output for each text.

        For every text the parse is shown (rendered with displacy if ``display``
        is True, printed as ``(token, dep, head)`` triples otherwise) and the
        user is prompted:

        * Enter     -> parse is correct, keep it as annotation
        * 'y'       -> parse is wrong, remember the text
        * 'b'       -> go back to the previous text and decide again
        * 'c' / 'q' -> stop annotating
        * other     -> move on without recording anything for this text

        When a text is revisited via 'b', the latest answer replaces the earlier one.

        :param texts_to_annotate: indexable sequence of texts.
        :param display: render with displacy instead of printing.
        :return: tuple ``(true_anns, false_anns)``: formatted annotations of the
            accepted parses and the raw texts of the rejected ones, in text order.
        """
        # position -> (is_correct, payload); keyed by position so that going
        # back and answering again overrides the earlier decision.
        decisions = {}
        position = 0
        # A while loop is required here: reassigning the index of a
        # `for ... in range(...)` loop has no effect on the next iteration.
        while position < len(texts_to_annotate):
            self.doc = self.nlp(texts_to_annotate[position])
            if display:
                displacy.render(self.doc, style='dep', jupyter=True)
            else:
                deps = []
                for token in self.doc:
                    deps.append("({}, {}, {})".format(token.text, token.dep_, token.head))
                print(" ".join(deps))
            answer = input("Is it wrong?: ").lower()
            if answer == '':
                decisions[position] = (True, self._format_deps(self.doc))
            elif answer == 'y':
                decisions[position] = (False, self.doc.text)
            elif answer == 'b':
                position = max(position - 1, 0)
                continue
            elif answer == 'c' or answer == 'q':
                break
            else:
                decisions.pop(position, None)
            position += 1
        
        ordered = [decisions[key] for key in sorted(decisions)]
        true_anns = [payload for is_correct, payload in ordered if is_correct]
        false_anns = [payload for is_correct, payload in ordered if not is_correct]
        return true_anns, false_anns

In [4]:
# Build the analyzer with its defaults (wordvecs=False, i.e. the 'de_core_news_sm' model).
sentimax = SentiMax()

In [5]:
# Parse a sample sentence; the resulting Doc is kept on `sentimax.doc`.
sentimax.polarize('Google verkündet kein neues Smartphone auf den Markt zu bringen.') 

In [6]:
# One line per token: text, raw SentiWS value, modified and negated polarity, entity type.
for tok in sentimax.doc:
    extensions = tok._
    print(tok.text, extensions.sentiws, extensions.modified, extensions.negated, tok.ent_type_)

Google None 0.0 0.0 MISC
verkündet None 0.0 0.0 
kein None 0.0 0.0 
neues 0.004 0.004 0.004 
Smartphone None 0.0 0.0 
auf None 0.0 0.0 
den None 0.0 0.0 
Markt None 0.0 0.0 
zu None 0.0 0.0 
bringen None 0.0 0.0 
. None 0.0 0.0 


In [7]:
# Display the per-token polarity table (a DataFrame) for the sentence polarized above.
sentimax.sentimax()

Unnamed: 0,token,dep,sentiws,modified,negated
0,Google,sb,,0.0,0.0
1,verkündet,ROOT,,0.0,0.0
2,kein,nk,,0.0,0.0
3,neues,nk,0.004,0.004,0.004
4,Smartphone,oa,,0.0,0.0
5,auf,mo,,0.0,0.0
6,den,nk,,0.0,0.0
7,Markt,nk,,0.0,0.0
8,zu,pm,,0.0,0.0
9,bringen,oc,,0.0,0.0


In [8]:
class Entitizer(object):
    """
    Find named entities in German texts,
    used as keyword generator for sentimax.
    """
    
    def __init__(self, wordvecs=False):
        """
        :param wordvecs: load 'de_core_news_md' if True, otherwise 'de_core_news_sm'.
        """
        # loading german spacy model
        model_name = 'de_core_news_md' if wordvecs else 'de_core_news_sm'
        self.nlp = spacy.load(model_name)
        self.doc = None
    
    def find_entities(self, text, visualize=False):
        """Parse ``text`` and return the tokens that carry an entity type.

        :param text: text to analyze; the parsed Doc is kept in ``self.doc``.
        :param visualize: render the entities with displacy if True.
        :return: list of tokens whose ``ent_type_`` is non-empty.
        """
        self.doc = self.nlp(text)
        if visualize:
            displacy.render(self.doc, style='ent', jupyter=True)
        return [token for token in self.doc if token.ent_type_]

    @staticmethod
    def _format_ents(doc):
        """Convert a doc into a ``(text, {"entities": [(start, end, label)]})`` tuple.

        One entry is produced per entity span in ``doc.ents`` (character offsets),
        so an entity spanning several tokens stays a single annotation instead of
        being split into one entry per token.
        """
        spans = [(ent.start_char, ent.end_char, ent.label_) for ent in doc.ents]
        return (doc.text, {"entities": spans})
    
    def annotate_entities(self, texts_to_annotate, visualize=True):
        """Interactively review the recognized entities for each text.

        For every text the entities are shown (rendered with displacy if
        ``visualize`` is True, printed as ``(text, label)`` pairs otherwise)
        and the user is prompted:

        * Enter     -> entities are correct, keep them as annotation
        * 'y'       -> entities are wrong, remember the text
        * 'b'       -> go back to the previous text and decide again
        * 'c' / 'q' -> stop annotating
        * other     -> move on without recording anything for this text

        When a text is revisited via 'b', the latest answer replaces the earlier one.

        :param texts_to_annotate: indexable sequence of texts.
        :param visualize: render with displacy (default) instead of printing.
        :return: tuple ``(true_anns, false_anns)``: formatted annotations of the
            accepted docs and the raw texts of the rejected ones, in text order.
        """
        # position -> (is_correct, payload); keyed by position so that going
        # back and answering again overrides the earlier decision.
        decisions = {}
        position = 0
        # A while loop is required here: reassigning the index of a
        # `for ... in range(...)` loop has no effect on the next iteration.
        while position < len(texts_to_annotate):
            self.doc = self.nlp(texts_to_annotate[position])
            if visualize:
                displacy.render(self.doc, style='ent', jupyter=True)
            else:
                print(" ".join("({}, {})".format(ent.text, ent.label_) for ent in self.doc.ents))
            answer = input("Is it wrong?: ").lower()
            if answer == '':
                decisions[position] = (True, self._format_ents(self.doc))
            elif answer == 'y':
                decisions[position] = (False, self.doc.text)
            elif answer == 'b':
                position = max(position - 1, 0)
                continue
            elif answer == 'c' or answer == 'q':
                break
            else:
                decisions.pop(position, None)
            position += 1
        
        ordered = [decisions[key] for key in sorted(decisions)]
        true_anns = [payload for is_correct, payload in ordered if is_correct]
        false_anns = [payload for is_correct, payload in ordered if not is_correct]
        return true_anns, false_anns

In [9]:
# Build the entity finder with its default (wordvecs=False, i.e. the 'de_core_news_sm' model).
entitizer = Entitizer()

In [10]:
# Collect the tokens that carry an entity type; visualization stays off (default).
ents = entitizer.find_entities('Google verkündet eine Kooperation mit Mark Zuckerberg einzugehen.')

In [11]:
# Load the pickled reviews. The context manager closes the file handle again.
# NOTE: unpickling can execute arbitrary code; only load files from a trusted source.
with open("Klinikbewertungen", 'rb') as reviews_file:
    reviews = pickle.load(reviews_file)

In [12]:
# Interactive step: prints the parse of each review and waits for input()
# (Enter = parse is correct, 'y' = parse is wrong, 'c'/'q' = stop).
true_annotations, false_annotations = sentimax.annotate_dependencies(reviews)

(Sauber, ROOT, Sauber) (,, punct, Sauber) (freundliche, nk, Mitarbeiter) (Mitarbeiter, cj, Sauber) (und, cd, Mitarbeiter) (gutes, nk, Essen) (Essen, cj, und) (., punct, Sauber) (Überschattet, ROOT, Überschattet) (jedoch, mo, Überschattet) (davon, op, Überschattet) (,, punct, davon) (dass, cp, sind) (mein, nk, Arzt) (behandelnder, nk, Arzt) (Arzt, sb, sind) (mir, da, meinen) (nun, mo, meinen) (seit, mo, meinen) (5, nk, Tagen) (Tagen, nk, seit) (meinen, nk, Befund) (Befund, nk, Arzt) (nicht, ng, kundtut) (kundtut, mo, sind) (obwohl, cp, sind) (meine, nk, MRT) (MRT, sb, sind) (Ergebnisse, nk, MRT) (schon, mo, lange) (lange, mo, bereit) (bereit, pd, sind) (sind, re, davon) (., punct, Überschattet) (Einen, nk, Arztbrief) (Arztbrief, oa, vorlegen) (konnte, ROOT, konnte) (ich, sb, konnte) (meinem, nk, Urologen) (Urologen, da, ebensowenig) (ebensowenig, mo, vorlegen) (vorlegen, oc, konnte) (um, mo, vorlegen) (weitere, nk, Behandlungen) (Behandlungen, nk, um) (in, mo, ziehen) (Betracht, nk, in)