In [2]:
from stanfordcorenlp import StanfordCoreNLP
import logging
import json
import pandas as pd
# from nltk.parse.stanford import StanfordDependencyParser
import os

from graphviz import Source
pd.set_option('display.max_colwidth', -1)

In [3]:
class StanfordNLP:
    def __init__(self, host='http://localhost', port=9000):
        self.nlp = StanfordCoreNLP(host, port=port,
                                   timeout=30000)  # , quiet=False, logging_level=logging.DEBUG)
        self.props = {
            'annotators': 'tokenize,ssplit,pos,lemma,ner,parse,depparse,dcoref,relation',
            'pipelineLanguage': 'en',
            'outputFormat': 'json'
        }

    def word_tokenize(self, sentence):
        return self.nlp.word_tokenize(sentence)

    def pos(self, sentence):
        return self.nlp.pos_tag(sentence)

    def ner(self, sentence):
        return self.nlp.ner(sentence)

    def parse(self, sentence):
        return self.nlp.parse(sentence)

    def dependency_parse(self, sentence):
        return self.nlp.dependency_parse(sentence)

    def annotate(self, sentence):
        return json.loads(self.nlp.annotate(sentence, properties=self.props))

    @staticmethod
    def tokens_to_dict(_tokens):
        tokens = defaultdict(dict)
        for token in _tokens:
            tokens[int(token['index'])] = {
                'word': token['word'],
                'lemma': token['lemma'],
                'pos': token['pos'],
                'ner': token['ner']
            }
        return tokens

In [4]:
df = pd.read_csv('newsdataset1.csv')
df.event_summary=df.event_summary.astype(str)

In [5]:
df['event_summary'].str.replace('(','').astype(str)
df['event_summary'].str.replace(')','').astype(str)
df['event_summary'].str.replace('[','').astype(str)
df['event_summary'].str.replace(']','').astype(str)
df['event_summary'].str.replace('"','').astype(str)
df['event_summary'].str.replace('’','').astype(str)


0        A shooting takes place at a pub in Tel Aviv                                                                                                                                                                         
1        About one thousand houses in Manila's Tondo district in the Philippines are set ablaze following New Year's Eve firecracker festivities that left one dead and 380 others injured. AP via CTV News                  
2        The EU-Ukraine Free Trade deal officially comes into force                                                                                                                                                          
3        The two-child policy takes effect in China                                                                                                                                                                          
4        /wiki/List_of_politicians_killed_in_the_Mexican_Drug_War                                               

In [6]:



indexes=df[df['event_summary'].map(len)  < 35].index
# print(indexes)
df=df.drop(indexes)
index2=df[df['event_summary']=='nan'].index
df=df.drop(index2)

In [7]:
df=df.reset_index()

In [11]:
from nltk.parse.corenlp import CoreNLPParser
from nltk.parse.stanford import StanfordDependencyParser
if __name__ == '__main__':
    sNLP = StanfordNLP()
    for i in range(0, 10):
        print(i)
        text = df['event_summary'][i]
#         print ("Annotate:", sNLP.annotate(text))
        print('\n\n')
        print(text)
#         print('\n\n')
#         print ("POS:", sNLP.pos(text))
#         print('\n\n')
#         print ("Tokens:", sNLP.word_tokenize(text))
        print('\n\n')
#         print ("NER:", sNLP.ner(text))
#         df['NER'][i]=sNLP.ner(text)

        print('\n\n')
        print ("Parse:", sNLP.parse(text))
        print ("Dep Parse:", sNLP.dependency_parse(text))
#         sentence=sNLP.parse(text)
#         parser = CoreNLPParser()
#         next(parser.raw_parse(text)).pretty_print()
        
#         sdp =  StanfordDependencyParser()
#         next(sdp.raw_parse(text)).pretty_print()
#         result = list(sdp.raw_parse(text))

#         dep_tree_dot_repr = [parse for parse in result][0].to_dot()
#         source = Source(dep_tree_dot_repr, filename="dep_tree", format="png")
#         source.view()

        
        print('\n\n .......................................................................... \n\n')
            

0



A shooting takes place at a pub in Tel Aviv






Parse: (ROOT
  (S
    (NP (DT A) (NN shooting))
    (VP (VBZ takes)
      (NP (NN place))
      (PP (IN at)
        (NP
          (NP (DT a) (NN pub))
          (PP (IN in)
            (NP (NNP Tel) (NNP Aviv))))))))
Dep Parse: [('ROOT', 0, 3), ('det', 2, 1), ('nsubj', 3, 2), ('dobj', 3, 4), ('case', 7, 5), ('det', 7, 6), ('nmod', 3, 7), ('case', 10, 8), ('compound', 10, 9), ('nmod', 7, 10)]


 .......................................................................... 


1



About one thousand houses in Manila's Tondo district in the Philippines are set ablaze following New Year's Eve firecracker festivities that left one dead and 380 others injured. AP via CTV News






Parse: (ROOT
  (S
    (PP (IN About)
      (NP (CD one)))
    (NP
      (NP (CD thousand) (NNS houses))
      (PP (IN in)
        (NP
          (NP
            (NP (NNP Manila) (POS 's))
            (NNP Tondo) (NN district))
          (PP (IN in)
            (NP

Dep Parse: [('ROOT', 0, 5), ('compound', 2, 1), ('nsubj', 5, 2), ('aux', 5, 3), ('aux', 5, 4), ('case', 11, 6), ('det', 11, 7), ('amod', 11, 8), ('amod', 11, 9), ('compound', 11, 10), ('nmod', 5, 11), ('case', 17, 12), ('nmod:poss', 17, 13), ('amod', 17, 14), ('amod', 17, 15), ('compound', 17, 16), ('nmod', 11, 17)]


 .......................................................................... 




In [69]:
df['NER']

0        [(A, O), (shooting, CAUSE_OF_DEATH), (takes, O), (place, O), (at, O), (a, O), (pub, O), (in, O), (Tel, CITY), (Aviv, CITY)]                                                                                                                                                                                                                                                                                                                                                                                 
1        [(About, O), (one, NUMBER), (thousand, NUMBER), (houses, O), (in, O), (Manila, CITY), ('s, O), (Tondo, LOCATION), (district, O), (in, O), (the, O), (Philippines, COUNTRY), (are, O), (set, O), (ablaze, O), (following, DATE), (New, DATE), (Year, DATE), ('s, DATE), (Eve, DATE), (firecracker, O), (festivities, O), (that, O), (left, O), (one, NUMBER), (dead, O), (and, O), (380, NUMBER), (others, O), (injured, O), (., O), (AP, ORGANIZATION), (via, O), (CTV, ORGANIZATION), (News, ORGANIZATIO

In [67]:
from nltk.parse import CoreNLPParser
sNLP = StanfordNLP()
# nlp = StanfordNLP()
# result = CoreNLPParser()
for i in range(0,10):
    text = df.loc[i,'event_summary']
    df.loc[i,"NER"] = sNLP.ner(text)
# sNLP.ner(text)
# pos = []
# for word in result["sentences"][2]["tokens"]:
#     pos.append('{} ({})'.format(word["word"], word["pos"]))
    
# " ".join(pos)

ValueError: Must have equal len keys and value when setting with an ndarray

In [24]:
print(result)

[('UK', 'COUNTRY'), ('is', 'O'), ('my', 'O'), ('favorite', 'O'), ('place', 'O')]
