In [1]:
%load_ext autoreload
%autoreload 2
#Reload all modules (except those excluded by %aimport) automatically now.

In [2]:
import pandas as pd
import spacy
from spacy.displacy.render import EntityRenderer
from IPython.core.display import display, HTML
import numpy as np 
import pandas as pd 
import seaborn as sns
import matplotlib.pyplot as plt
import random

In [3]:
def custom_render(doc, df, column, options={}, page=False, minify=False, idx=0):
    """Overload the spaCy built-in rendering to allow custom part-of-speech (POS) tags.
    
    Keyword arguments:
    doc -- a spaCy nlp doc object
    df -- a pandas dataframe object
    column -- the name of of a column of interest in the dataframe
    options -- various options to feed into the spaCy renderer, including colors
    page -- rendering markup as full HTML page (default False)
    minify -- for compact HTML (default False)
    idx -- index for specific query or doc in dataframe (default 0)
    
    """
    renderer, converter = EntityRenderer, parse_custom_ents
    renderer = renderer(options=options)
    parsed = [converter(doc, df=df, idx=idx, column=column)]
    html = renderer.render(parsed, page=page, minify=minify).strip()  
    return display(HTML(html))

def parse_custom_ents(doc, df, idx, column):
    """Parse custom entity types that aren't in the original spaCy module.
    
    Keyword arguments:
    doc -- a spaCy nlp doc object
    df -- a pandas dataframe object
    idx -- index for specific query or doc in dataframe
    column -- the name of of a column of interest in the dataframe
    
    """
    if column in df.columns:
        entities = df[column][idx]
        ents = [{'start': ent[1], 'end': ent[2], 'label': ent[3]} 
                for ent in entities]
    else:
        ents = [{'start': ent.start_char, 'end': ent.end_char, 'label': ent.label_}
            for ent in doc.ents]
    return {'text': doc.text, 'ents': ents, 'title': None}

def render_entities(idx, df, options={}, column='named_ents'):
    """A wrapper function to get text from a dataframe and render it visually in jupyter notebooks
    
    Keyword arguments:
    idx -- index for specific query or doc in dataframe (default 0)
    df -- a pandas dataframe object
    options -- various options to feed into the spaCy renderer, including colors
    column -- the name of of a column of interest in the dataframe (default 'named_ents')
    
    """
    text = df['event_summary'][idx]
    custom_render(nlp(text), df=df, column=column, options=options, idx=idx)

In [4]:
options = {
    'colors': {'COMPOUND': '#FE6BFE', 'PROPN': '#18CFE6', 'NOUN': '#18CFE6', 'NP': '#1EECA6', 'ENTITY': '#FF8800'}
}

In [5]:
pd.set_option('display.max_rows', 10) # edit how jupyter will render our pandas dataframes
pd.options.mode.chained_assignment = None # prevent warning about working on a copy of a dataframe

In [6]:
import csv
import sys, os, json, re, time
import subprocess

In [7]:
#file = './nips-2015-papers/Papers.csv'
#file = '/uci-news-aggregator.csv'
df = pd.read_csv('newsdataset1.csv')
df.event_summary=df.event_summary.astype(str)


In [8]:
df.describe()

Unnamed: 0,_id,date,category,event_title,event_summary,entities,person,external_link,Unnamed: 8,Unnamed: 9,...,Unnamed: 49,Unnamed: 50,Unnamed: 51,Unnamed: 52,Unnamed: 53,Unnamed: 54,Unnamed: 55,Unnamed: 56,Unnamed: 57,Unnamed: 58
count,12817,12701,12636,12601,12817,12299,12042,11786,11181,9783,...,2,2,2,1,1,1,1,1,1,1
unique,12814,1869,853,3259,11220,10017,9611,8658,7958,6994,...,2,2,2,1,1,1,1,1,1,1
top,According to the Syrian Observatory for Human ...,"2016-04-22T00:00:00.000Z""""",Armed conflicts and attacks,FALSE,"_2016""]""",FALSE,FALSE,FALSE,FALSE,FALSE,...,/wiki/Holland,/wiki/Belgium,http://www.msn.com/en-us/news/politics/media-s...,/wiki/State_of_Palestine,/wiki/Turkey,"/wiki/Greece]""",False,[https://www.independent.co.uk/news/world/euro...,http://www.euronews.com/2016/03/20/at-least-14...,https://www.bloomberg.com/news/articles/2016-0...
freq,2,32,2309,5216,311,123,572,1452,1879,1798,...,1,1,1,1,1,1,1,1,1,1


In [9]:
df.dropna(axis=0, how='any', thresh=None, subset=None, inplace=False)

Unnamed: 0,_id,date,category,event_title,event_summary,entities,person,external_link,Unnamed: 8,Unnamed: 9,...,Unnamed: 49,Unnamed: 50,Unnamed: 51,Unnamed: 52,Unnamed: 53,Unnamed: 54,Unnamed: 55,Unnamed: 56,Unnamed: 57,Unnamed: 58
6014,"ObjectId(""5d056c69b5f3a01b25e87549"")","2016-03-20T00:00:00.000Z""""",Disasters and accidents,"[/wiki/Erasmus_bus_crash""]""",Thirteen are killed and 34 others injured afte...,crashes near Freginals,Spain. The regional government of Catalonia s...,"""... according to the latest data",the ill-fated bus had students from Hungary,Germany,...,/wiki/Holland,/wiki/Belgium,/wiki/France,/wiki/State_of_Palestine,/wiki/Turkey,"/wiki/Greece]""",False,[https://www.independent.co.uk/news/world/euro...,http://www.euronews.com/2016/03/20/at-least-14...,https://www.bloomberg.com/news/articles/2016-0...


In [10]:
mini_df = df[:10]
mini_df.index = pd.RangeIndex(len(mini_df.index))

# comment this out to run on full dataset
df = mini_df

In [11]:
lower = lambda x: x.lower() # make everything lowercase

In [12]:
df = pd.DataFrame(df['event_summary'].apply(lower))
df.columns = ['event_summary']
display(df)

Unnamed: 0,event_summary
0,the death toll of the suicide bombing in ibb
1,a shooting kills one and injures six people in...
2,lithuania adopts the euro as its official curr...
3,u.s. fast-food restaurant chain chick-fil-a wa...
4,the palestine authority signs a treaty to join...
5,the eurasian economic union between russia
6,vietnam’s new marriage law goes into effect
7,mario cuomo
8,somali al-shabaab militants attack an army bas...
9,boko haram militants attack a bus in waza


In [13]:
 import spacy

In [14]:
nlp = spacy.load("en_core_web_sm")

In [15]:
def extract_named_ents(text):
    """Extract named entities, and beginning, middle and end idx using spaCy's out-of-the-box model. 
    
    Keyword arguments:
    text -- the actual text source from which to extract entities
    
    """
    return [(ent.text, ent.start_char, ent.end_char, ent.label_) for ent in nlp(text).ents]

def add_named_ents(df):
    """Create new column in data frame with named entity tuple extracted.
    
    Keyword arguments:
    df -- a dataframe object
    
    """
    df['named_ents'] = df['event_summary'].apply(extract_named_ents)   

In [16]:
add_named_ents(df)

In [17]:
df['event_summary'].count()

10

In [18]:
df

Unnamed: 0,event_summary,named_ents
0,the death toll of the suicide bombing in ibb,[]
1,a shooting kills one and injures six people in...,"[(one, 17, 20, CARDINAL), (six, 33, 36, CARDIN..."
2,lithuania adopts the euro as its official curr...,[]
3,u.s. fast-food restaurant chain chick-fil-a wa...,"[(9, 112, 113, CARDINAL)]"
4,the palestine authority signs a treaty to join...,[]
5,the eurasian economic union between russia,[]
6,vietnam’s new marriage law goes into effect,[]
7,mario cuomo,[]
8,somali al-shabaab militants attack an army bas...,[]
9,boko haram militants attack a bus in waza,[]


In [19]:
column = 'named_ents'
render_entities(9, df, options=options, column=column) # take a look at one of the abstracts

In [20]:
def extract_nouns(text):
    """Extract a few types of nouns, and beginning, middle and end idx using spaCy's POS (part of speech) tagger. 
    
    Keyword arguments:
    text -- the actual text source from which to extract entities
    
    """
    keep_pos = ['PROPN', 'NOUN']
    return [(tok.text, tok.idx, tok.idx+len(tok.text), tok.pos_) for tok in nlp(text) if tok.pos_ in keep_pos]

def add_nouns(df):
    """Create new column in data frame with nouns extracted.
    
    Keyword arguments:
    df -- a dataframe object
    
    """
    df['nouns'] = df['event_summary'].apply(extract_nouns)

In [21]:
add_nouns(df)
display(df)

Unnamed: 0,event_summary,named_ents,nouns
0,the death toll of the suicide bombing in ibb,[],"[(death, 4, 9, NOUN), (toll, 10, 14, NOUN), (s..."
1,a shooting kills one and injures six people in...,"[(one, 17, 20, CARDINAL), (six, 33, 36, CARDIN...","[(shooting, 2, 10, NOUN), (people, 37, 43, NOU..."
2,lithuania adopts the euro as its official curr...,[],"[(lithuania, 0, 9, PROPN), (euro, 21, 25, NOUN..."
3,u.s. fast-food restaurant chain chick-fil-a wa...,"[(9, 112, 113, CARDINAL)]","[(food, 10, 14, NOUN), (restaurant, 15, 25, NO..."
4,the palestine authority signs a treaty to join...,[],"[(palestine, 4, 13, NOUN), (authority, 14, 23,..."
5,the eurasian economic union between russia,[],"[(union, 22, 27, NOUN), (russia, 36, 42, NOUN)]"
6,vietnam’s new marriage law goes into effect,[],"[(vietnam, 0, 7, PROPN), (’s, 7, 9, PROPN), (m..."
7,mario cuomo,[],"[(mario, 0, 5, NOUN), (cuomo, 6, 11, NOUN)]"
8,somali al-shabaab militants attack an army bas...,[],"[(al, 7, 9, PROPN), (militants, 18, 27, NOUN),..."
9,boko haram militants attack a bus in waza,[],"[(haram, 5, 10, NOUN), (militants, 11, 20, NOU..."


In [22]:
column = 'nouns'
render_entities(0, df, options=options, column=column)

In [23]:
def extract_named_nouns(row_series):
    """Combine nouns and non-numerical entities. 
    
    Keyword arguments:
    row_series -- a Pandas Series object
    
    """
    ents = set()
    idxs = set()
    # remove duplicates and merge two lists together
    for noun_tuple in row_series['nouns']:
        for named_ents_tuple in row_series['named_ents']:
            if noun_tuple[1] == named_ents_tuple[1]: 
                idxs.add(noun_tuple[1])
                ents.add(named_ents_tuple)
        if noun_tuple[1] not in idxs:
            ents.add(noun_tuple)
    
    return sorted(list(ents), key=lambda x: x[1])

def add_named_nouns(df):
    """Create new column in data frame with nouns and named ents.
    
    Keyword arguments:
    df -- a dataframe object
    
    """
    df['named_nouns'] = df.apply(extract_named_nouns, axis=1)

In [24]:
add_named_nouns(df)
display(df)

Unnamed: 0,event_summary,named_ents,nouns,named_nouns
0,the death toll of the suicide bombing in ibb,[],"[(death, 4, 9, NOUN), (toll, 10, 14, NOUN), (s...","[(death, 4, 9, NOUN), (toll, 10, 14, NOUN), (s..."
1,a shooting kills one and injures six people in...,"[(one, 17, 20, CARDINAL), (six, 33, 36, CARDIN...","[(shooting, 2, 10, NOUN), (people, 37, 43, NOU...","[(shooting, 2, 10, NOUN), (people, 37, 43, NOU..."
2,lithuania adopts the euro as its official curr...,[],"[(lithuania, 0, 9, PROPN), (euro, 21, 25, NOUN...","[(lithuania, 0, 9, PROPN), (euro, 21, 25, NOUN..."
3,u.s. fast-food restaurant chain chick-fil-a wa...,"[(9, 112, 113, CARDINAL)]","[(food, 10, 14, NOUN), (restaurant, 15, 25, NO...","[(food, 10, 14, NOUN), (restaurant, 15, 25, NO..."
4,the palestine authority signs a treaty to join...,[],"[(palestine, 4, 13, NOUN), (authority, 14, 23,...","[(palestine, 4, 13, NOUN), (authority, 14, 23,..."
5,the eurasian economic union between russia,[],"[(union, 22, 27, NOUN), (russia, 36, 42, NOUN)]","[(union, 22, 27, NOUN), (russia, 36, 42, NOUN)]"
6,vietnam’s new marriage law goes into effect,[],"[(vietnam, 0, 7, PROPN), (’s, 7, 9, PROPN), (m...","[(vietnam, 0, 7, PROPN), (’s, 7, 9, PROPN), (m..."
7,mario cuomo,[],"[(mario, 0, 5, NOUN), (cuomo, 6, 11, NOUN)]","[(mario, 0, 5, NOUN), (cuomo, 6, 11, NOUN)]"
8,somali al-shabaab militants attack an army bas...,[],"[(al, 7, 9, PROPN), (militants, 18, 27, NOUN),...","[(al, 7, 9, PROPN), (militants, 18, 27, NOUN),..."
9,boko haram militants attack a bus in waza,[],"[(haram, 5, 10, NOUN), (militants, 11, 20, NOU...","[(haram, 5, 10, NOUN), (militants, 11, 20, NOU..."


In [25]:
column = 'named_nouns'
render_entities(1, df, options=options, column=column)

In [26]:
text = "Dr. Abraham is the primary author of this paper, and a physician in the specialty of internal medicine."

spacy.displacy.render(nlp(text), jupyter=True) # generating raw-markup using spacy's built-in renderer

In [27]:
def extract_noun_phrases(text):
    """Combine noun phrases. 
    
    Keyword arguments:
    text -- the actual text source from which to extract entities
    
    """
    return [(chunk.text, chunk.start_char, chunk.end_char, chunk.label_) for chunk in nlp(text).noun_chunks]

def add_noun_phrases(df):
    """Create new column in data frame with noun phrases.
    
    Keyword arguments:
    df -- a dataframe object
    
    """
    df['noun_phrases'] = df['event_summary'].apply(extract_noun_phrases)

In [28]:
def visualize_noun_phrases(text):
    """Create a temporary dataframe to extract and visualize noun phrases. 
    
    Keyword arguments:
    text -- the actual text source from which to extract entities
    
    """
    df = pd.DataFrame([text]) 
    df.columns = ['event_summary']
    add_noun_phrases(df)
    column = 'noun_phrases'
    render_entities(0, df, options=options, column=column)

In [29]:
visualize_noun_phrases(text)



In [30]:
add_noun_phrases(df)
display(df)

Unnamed: 0,event_summary,named_ents,nouns,named_nouns,noun_phrases
0,the death toll of the suicide bombing in ibb,[],"[(death, 4, 9, NOUN), (toll, 10, 14, NOUN), (s...","[(death, 4, 9, NOUN), (toll, 10, 14, NOUN), (s...","[(the death toll, 0, 14, NP), (the suicide bom..."
1,a shooting kills one and injures six people in...,"[(one, 17, 20, CARDINAL), (six, 33, 36, CARDIN...","[(shooting, 2, 10, NOUN), (people, 37, 43, NOU...","[(shooting, 2, 10, NOUN), (people, 37, 43, NOU...","[(six people, 33, 43, NP), (killarney, 47, 56,..."
2,lithuania adopts the euro as its official curr...,[],"[(lithuania, 0, 9, PROPN), (euro, 21, 25, NOUN...","[(lithuania, 0, 9, PROPN), (euro, 21, 25, NOUN...","[(lithuania, 0, 9, NP), (the euro, 17, 25, NP)..."
3,u.s. fast-food restaurant chain chick-fil-a wa...,"[(9, 112, 113, CARDINAL)]","[(food, 10, 14, NOUN), (restaurant, 15, 25, NO...","[(food, 10, 14, NOUN), (restaurant, 15, 25, NO...","[(fast-food restaurant chain, 5, 31, NP), (a s..."
4,the palestine authority signs a treaty to join...,[],"[(palestine, 4, 13, NOUN), (authority, 14, 23,...","[(palestine, 4, 13, NOUN), (authority, 14, 23,...","[(the palestine authority, 0, 23, NP), (a trea..."
5,the eurasian economic union between russia,[],"[(union, 22, 27, NOUN), (russia, 36, 42, NOUN)]","[(union, 22, 27, NOUN), (russia, 36, 42, NOUN)]","[(the eurasian economic union, 0, 27, NP), (ru..."
6,vietnam’s new marriage law goes into effect,[],"[(vietnam, 0, 7, PROPN), (’s, 7, 9, PROPN), (m...","[(vietnam, 0, 7, PROPN), (’s, 7, 9, PROPN), (m...","[(vietnam, 0, 7, NP), (new marriage law, 10, 2..."
7,mario cuomo,[],"[(mario, 0, 5, NOUN), (cuomo, 6, 11, NOUN)]","[(mario, 0, 5, NOUN), (cuomo, 6, 11, NOUN)]","[(mario cuomo, 0, 11, NP)]"
8,somali al-shabaab militants attack an army bas...,[],"[(al, 7, 9, PROPN), (militants, 18, 27, NOUN),...","[(al, 7, 9, PROPN), (militants, 18, 27, NOUN),...","[(somali al-shabaab militants, 0, 27, NP), (an..."
9,boko haram militants attack a bus in waza,[],"[(haram, 5, 10, NOUN), (militants, 11, 20, NOU...","[(haram, 5, 10, NOUN), (militants, 11, 20, NOU...","[(boko haram militants, 0, 20, NP), (a bus, 28..."


In [31]:
column = 'noun_phrases'
render_entities(0, df, options=options, column=column)

In [32]:
def extract_compounds(text):
    """Extract compound noun phrases with beginning and end idxs. 
    
    Keyword arguments:
    text -- the actual text source from which to extract entities
    
    """
    comp_idx = 0
    compound = []
    compound_nps = []
    tok_idx = 0
    for idx, tok in enumerate(nlp(text)):
        if tok.dep_ == 'compound':

            # capture hyphenated compounds
            children = ''.join([c.text for c in tok.children])
            if '-' in children:
                compound.append(''.join([children, tok.text]))
            else:
                compound.append(tok.text)

            # remember starting index of first child in compound or word
            try:
                tok_idx = [c for c in tok.children][0].idx
            except IndexError:
                if len(compound) == 1:
                    tok_idx = tok.idx
            comp_idx = tok.i

        # append the last word in a compound phrase
        if tok.i - comp_idx == 1:
            compound.append(tok.text)
            if len(compound) > 1: 
                compound = ' '.join(compound)
                compound_nps.append((compound, tok_idx, tok_idx+len(compound), 'COMPOUND'))

            # reset parameters
            tok_idx = 0 
            compound = []

    return compound_nps

def add_compounds(df):
    """Create new column in data frame with compound noun phrases.
    
    Keyword arguments:
    df -- a dataframe object
    
    """
    df['compounds'] = df['event_summary'].apply(extract_compounds)

In [33]:
add_compounds(df)
display(df)

Unnamed: 0,event_summary,named_ents,nouns,named_nouns,noun_phrases,compounds
0,the death toll of the suicide bombing in ibb,[],"[(death, 4, 9, NOUN), (toll, 10, 14, NOUN), (s...","[(death, 4, 9, NOUN), (toll, 10, 14, NOUN), (s...","[(the death toll, 0, 14, NP), (the suicide bom...","[(death toll, 4, 14, COMPOUND), (suicide bombi..."
1,a shooting kills one and injures six people in...,"[(one, 17, 20, CARDINAL), (six, 33, 36, CARDIN...","[(shooting, 2, 10, NOUN), (people, 37, 43, NOU...","[(shooting, 2, 10, NOUN), (people, 37, 43, NOU...","[(six people, 33, 43, NP), (killarney, 47, 56,...","[(shooting kills, 2, 16, COMPOUND)]"
2,lithuania adopts the euro as its official curr...,[],"[(lithuania, 0, 9, PROPN), (euro, 21, 25, NOUN...","[(lithuania, 0, 9, PROPN), (euro, 21, 25, NOUN...","[(lithuania, 0, 9, NP), (the euro, 17, 25, NP)...",[]
3,u.s. fast-food restaurant chain chick-fil-a wa...,"[(9, 112, 113, CARDINAL)]","[(food, 10, 14, NOUN), (restaurant, 15, 25, NO...","[(food, 10, 14, NOUN), (restaurant, 15, 25, NO...","[(fast-food restaurant chain, 5, 31, NP), (a s...","[(restaurant chain, 15, 31, COMPOUND), (-chick..."
4,the palestine authority signs a treaty to join...,[],"[(palestine, 4, 13, NOUN), (authority, 14, 23,...","[(palestine, 4, 13, NOUN), (authority, 14, 23,...","[(the palestine authority, 0, 23, NP), (a trea...","[(palestine authority, 4, 23, COMPOUND), (wall..."
5,the eurasian economic union between russia,[],"[(union, 22, 27, NOUN), (russia, 36, 42, NOUN)]","[(union, 22, 27, NOUN), (russia, 36, 42, NOUN)]","[(the eurasian economic union, 0, 27, NP), (ru...",[]
6,vietnam’s new marriage law goes into effect,[],"[(vietnam, 0, 7, PROPN), (’s, 7, 9, PROPN), (m...","[(vietnam, 0, 7, PROPN), (’s, 7, 9, PROPN), (m...","[(vietnam, 0, 7, NP), (new marriage law, 10, 2...","[(marriage law, 14, 26, COMPOUND)]"
7,mario cuomo,[],"[(mario, 0, 5, NOUN), (cuomo, 6, 11, NOUN)]","[(mario, 0, 5, NOUN), (cuomo, 6, 11, NOUN)]","[(mario cuomo, 0, 11, NP)]","[(mario cuomo, 0, 11, COMPOUND)]"
8,somali al-shabaab militants attack an army bas...,[],"[(al, 7, 9, PROPN), (militants, 18, 27, NOUN),...","[(al, 7, 9, PROPN), (militants, 18, 27, NOUN),...","[(somali al-shabaab militants, 0, 27, NP), (an...","[(al -, 7, 11, COMPOUND), (al-shabaab militant..."
9,boko haram militants attack a bus in waza,[],"[(haram, 5, 10, NOUN), (militants, 11, 20, NOU...","[(haram, 5, 10, NOUN), (militants, 11, 20, NOU...","[(boko haram militants, 0, 20, NP), (a bus, 28...","[(boko haram militants, 0, 20, COMPOUND)]"


In [34]:
column = 'compounds'
render_entities(0, df, options=options, column=column)

In [35]:
def extract_comp_nouns(row_series, cols=[]):
    """Combine compound noun phrases and entities. 
    
    Keyword arguments:
    row_series -- a Pandas Series object
    
    """
    return {noun_tuple[0] for col in cols for noun_tuple in row_series[col]}

def add_comp_nouns(df, cols=[]):
    """Create new column in data frame with merged entities.
    
    Keyword arguments:
    df -- a dataframe object
    cols -- a list of column names that need to be merged
    
    """
    df['comp_nouns'] = df.apply(extract_comp_nouns, axis=1, cols=cols)

In [36]:
cols = ['nouns', 'compounds']
add_comp_nouns(df, cols=cols)
display(df)

Unnamed: 0,event_summary,named_ents,nouns,named_nouns,noun_phrases,compounds,comp_nouns
0,the death toll of the suicide bombing in ibb,[],"[(death, 4, 9, NOUN), (toll, 10, 14, NOUN), (s...","[(death, 4, 9, NOUN), (toll, 10, 14, NOUN), (s...","[(the death toll, 0, 14, NP), (the suicide bom...","[(death toll, 4, 14, COMPOUND), (suicide bombi...","{suicide, death toll, ibb, death, toll, suicid..."
1,a shooting kills one and injures six people in...,"[(one, 17, 20, CARDINAL), (six, 33, 36, CARDIN...","[(shooting, 2, 10, NOUN), (people, 37, 43, NOU...","[(shooting, 2, 10, NOUN), (people, 37, 43, NOU...","[(six people, 33, 43, NP), (killarney, 47, 56,...","[(shooting kills, 2, 16, COMPOUND)]","{people, shooting kills, killarney, shooting}"
2,lithuania adopts the euro as its official curr...,[],"[(lithuania, 0, 9, PROPN), (euro, 21, 25, NOUN...","[(lithuania, 0, 9, PROPN), (euro, 21, 25, NOUN...","[(lithuania, 0, 9, NP), (the euro, 17, 25, NP)...",[],"{lithuania, currency, euro}"
3,u.s. fast-food restaurant chain chick-fil-a wa...,"[(9, 112, 113, CARDINAL)]","[(food, 10, 14, NOUN), (restaurant, 15, 25, NO...","[(food, 10, 14, NOUN), (restaurant, 15, 25, NO...","[(fast-food restaurant chain, 5, 31, NP), (a s...","[(restaurant chain, 15, 31, COMPOUND), (-chick...","{credit, credit card details, breach, restaura..."
4,the palestine authority signs a treaty to join...,[],"[(palestine, 4, 13, NOUN), (authority, 14, 23,...","[(palestine, 4, 13, NOUN), (authority, 14, 23,...","[(the palestine authority, 0, 23, NP), (a trea...","[(palestine authority, 4, 23, COMPOUND), (wall...","{palestine, wall, authority, palestine authori..."
5,the eurasian economic union between russia,[],"[(union, 22, 27, NOUN), (russia, 36, 42, NOUN)]","[(union, 22, 27, NOUN), (russia, 36, 42, NOUN)]","[(the eurasian economic union, 0, 27, NP), (ru...",[],"{union, russia}"
6,vietnam’s new marriage law goes into effect,[],"[(vietnam, 0, 7, PROPN), (’s, 7, 9, PROPN), (m...","[(vietnam, 0, 7, PROPN), (’s, 7, 9, PROPN), (m...","[(vietnam, 0, 7, NP), (new marriage law, 10, 2...","[(marriage law, 14, 26, COMPOUND)]","{vietnam, marriage, law, effect, ’s, marriage ..."
7,mario cuomo,[],"[(mario, 0, 5, NOUN), (cuomo, 6, 11, NOUN)]","[(mario, 0, 5, NOUN), (cuomo, 6, 11, NOUN)]","[(mario cuomo, 0, 11, NP)]","[(mario cuomo, 0, 11, COMPOUND)]","{mario cuomo, cuomo, mario}"
8,somali al-shabaab militants attack an army bas...,[],"[(al, 7, 9, PROPN), (militants, 18, 27, NOUN),...","[(al, 7, 9, PROPN), (militants, 18, 27, NOUN),...","[(somali al-shabaab militants, 0, 27, NP), (an...","[(al -, 7, 11, COMPOUND), (al-shabaab militant...","{al, outskirts, army, al -, al-shabaab militan..."
9,boko haram militants attack a bus in waza,[],"[(haram, 5, 10, NOUN), (militants, 11, 20, NOU...","[(haram, 5, 10, NOUN), (militants, 11, 20, NOU...","[(boko haram militants, 0, 20, NP), (a bus, 28...","[(boko haram militants, 0, 20, COMPOUND)]","{bus, boko haram militants, waza, haram, milit..."


In [37]:
# take a look at all the nouns again
column = 'named_nouns'
render_entities(0, df, options=options, column=column)

In [38]:
# take a look at all the compound noun phrases again
column = 'compounds'
render_entities(0, df, options=options, column=column)

In [39]:
# take a look at combined entities
df['comp_nouns'][0] 

{'bombing', 'death', 'death toll', 'ibb', 'suicide', 'suicide bombing', 'toll'}

In [40]:
df.head()


Unnamed: 0,event_summary,named_ents,nouns,named_nouns,noun_phrases,compounds,comp_nouns
0,the death toll of the suicide bombing in ibb,[],"[(death, 4, 9, NOUN), (toll, 10, 14, NOUN), (s...","[(death, 4, 9, NOUN), (toll, 10, 14, NOUN), (s...","[(the death toll, 0, 14, NP), (the suicide bom...","[(death toll, 4, 14, COMPOUND), (suicide bombi...","{suicide, death toll, ibb, death, toll, suicid..."
1,a shooting kills one and injures six people in...,"[(one, 17, 20, CARDINAL), (six, 33, 36, CARDIN...","[(shooting, 2, 10, NOUN), (people, 37, 43, NOU...","[(shooting, 2, 10, NOUN), (people, 37, 43, NOU...","[(six people, 33, 43, NP), (killarney, 47, 56,...","[(shooting kills, 2, 16, COMPOUND)]","{people, shooting kills, killarney, shooting}"
2,lithuania adopts the euro as its official curr...,[],"[(lithuania, 0, 9, PROPN), (euro, 21, 25, NOUN...","[(lithuania, 0, 9, PROPN), (euro, 21, 25, NOUN...","[(lithuania, 0, 9, NP), (the euro, 17, 25, NP)...",[],"{lithuania, currency, euro}"
3,u.s. fast-food restaurant chain chick-fil-a wa...,"[(9, 112, 113, CARDINAL)]","[(food, 10, 14, NOUN), (restaurant, 15, 25, NO...","[(food, 10, 14, NOUN), (restaurant, 15, 25, NO...","[(fast-food restaurant chain, 5, 31, NP), (a s...","[(restaurant chain, 15, 31, COMPOUND), (-chick...","{credit, credit card details, breach, restaura..."
4,the palestine authority signs a treaty to join...,[],"[(palestine, 4, 13, NOUN), (authority, 14, 23,...","[(palestine, 4, 13, NOUN), (authority, 14, 23,...","[(the palestine authority, 0, 23, NP), (a trea...","[(palestine authority, 4, 23, COMPOUND), (wall...","{palestine, wall, authority, palestine authori..."


In [41]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 7 columns):
event_summary    10 non-null object
named_ents       10 non-null object
nouns            10 non-null object
named_nouns      10 non-null object
noun_phrases     10 non-null object
compounds        10 non-null object
comp_nouns       10 non-null object
dtypes: object(7)
memory usage: 640.0+ bytes


In [42]:
df.comp_nouns.value_counts()

{union, russia}                                                                                                                                1
{bus, boko haram militants, waza, haram, militants}                                                                                            1
{people, shooting kills, killarney, shooting}                                                                                                  1
{suicide, death toll, ibb, death, toll, suicide bombing, bombing}                                                                              1
{vietnam, marriage, law, effect, ’s, marriage law}                                                                                             1
{al, outskirts, army, al -, al-shabaab militants, army base, baidoa, base, militants}                                                          1
{palestine, wall, authority, palestine authority, treaty, court, wall street journal, journal, street}                            

In [43]:
df.compounds.value_counts()

[]                                                                                                                                                     2
[(marriage law, 14, 26, COMPOUND)]                                                                                                                     1
[(shooting kills, 2, 16, COMPOUND)]                                                                                                                    1
[(death toll, 4, 14, COMPOUND), (suicide bombing, 22, 37, COMPOUND)]                                                                                   1
[(mario cuomo, 0, 11, COMPOUND)]                                                                                                                       1
[(boko haram militants, 0, 20, COMPOUND)]                                                                                                              1
[(palestine authority, 4, 23, COMPOUND), (wall street journal, 101, 120, COMPOUND)

In [44]:
from sklearn.feature_extraction.text import CountVectorizer
from nltk.tokenize import RegexpTokenizer
#tokenizer to remove unwanted elements from out data like symbols and numbers
token = RegexpTokenizer(r'[a-zA-Z0-9]+')
cv = CountVectorizer(lowercase=True,stop_words='english',ngram_range = (1,1),tokenizer = token.tokenize)
text_counts= cv.fit_transform(df['event_summary'])

In [45]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    text_counts, df['compounds'], test_size=0.3, random_state=1)

In [46]:
# from __future__ import unicode_literals
# import spacy,en_core_web_sm
# import textacy
# nlp = en_core_web_sm.load()
# sentence = 'The author is writing a new book.'
# pattern = r'<VERB>?<ADV>*<VERB>+'
# doc = textacy.Doc(sentence, lang='en_core_web_sm')
# lists = textacy.extract.pos_regex_matches(doc, pattern)
# for list in lists:
#     print(list.text)

In [47]:

text=df['event_summary']
text1 = str(text)
doc = nlp(text1)

In [48]:
#df1=df.columns = ['event_summary']
for i, token in enumerate(doc):
    df.loc[i, 'text'] = token.text
    df.loc[i, 'pos'] = token.pos_
    df.loc[i, 'dep'] = token.dep_

In [49]:
df

Unnamed: 0,event_summary,named_ents,nouns,named_nouns,noun_phrases,compounds,comp_nouns,text,pos,dep
0,the death toll of the suicide bombing in ibb,[],"[(death, 4, 9, NOUN), (toll, 10, 14, NOUN), (s...","[(death, 4, 9, NOUN), (toll, 10, 14, NOUN), (s...","[(the death toll, 0, 14, NP), (the suicide bom...","[(death toll, 4, 14, COMPOUND), (suicide bombi...","{suicide, death toll, ibb, death, toll, suicid...",0,PUNCT,nummod
1,a shooting kills one and injures six people in...,"[(one, 17, 20, CARDINAL), (six, 33, 36, CARDIN...","[(shooting, 2, 10, NOUN), (people, 37, 43, NOU...","[(shooting, 2, 10, NOUN), (people, 37, 43, NOU...","[(six people, 33, 43, NP), (killarney, 47, 56,...","[(shooting kills, 2, 16, COMPOUND)]","{people, shooting kills, killarney, shooting}",,SPACE,
2,lithuania adopts the euro as its official curr...,[],"[(lithuania, 0, 9, PROPN), (euro, 21, 25, NOUN...","[(lithuania, 0, 9, PROPN), (euro, 21, 25, NOUN...","[(lithuania, 0, 9, NP), (the euro, 17, 25, NP)...",[],"{lithuania, currency, euro}",the,DET,det
3,u.s. fast-food restaurant chain chick-fil-a wa...,"[(9, 112, 113, CARDINAL)]","[(food, 10, 14, NOUN), (restaurant, 15, 25, NO...","[(food, 10, 14, NOUN), (restaurant, 15, 25, NO...","[(fast-food restaurant chain, 5, 31, NP), (a s...","[(restaurant chain, 15, 31, COMPOUND), (-chick...","{credit, credit card details, breach, restaura...",death,NOUN,compound
4,the palestine authority signs a treaty to join...,[],"[(palestine, 4, 13, NOUN), (authority, 14, 23,...","[(palestine, 4, 13, NOUN), (authority, 14, 23,...","[(the palestine authority, 0, 23, NP), (a trea...","[(palestine authority, 4, 23, COMPOUND), (wall...","{palestine, wall, authority, palestine authori...",toll,NOUN,ROOT
...,...,...,...,...,...,...,...,...,...,...
117,,,,,,,,event_summary,ADJ,npadvmod
118,,,,,,,,",",PUNCT,punct
119,,,,,,,,dtype,NOUN,ROOT
120,,,,,,,,:,PUNCT,punct


In [50]:
df1 = pd.DataFrame()
for i,token in enumerate(doc):
    df1.loc[i,'text'] = token.text

In [51]:
df1

Unnamed: 0,text
0,0
1,
2,the
3,death
4,toll
...,...
117,event_summary
118,","
119,dtype
120,:


In [52]:
df3 = pd.DataFrame()
for i,token in enumerate(doc):
    df3.loc[i,'text'] = token.text
    df3.loc[i,'lemma_'] = token.lemma_
    df3.loc[i,'pos_'] = token.pos_
    df3.loc[i,'tag_'] = token.tag_
    df3.loc[i,'dep_'] = token.dep_
    df3.loc[i,'shape_'] = token.shape_
    df3.loc[i,'is_alpha'] = token.is_alpha
    df3.loc[i,'is_stop'] = token.is_stop

In [53]:
df3

Unnamed: 0,text,lemma_,pos_,tag_,dep_,shape_,is_alpha,is_stop
0,0,0,PUNCT,NFP,nummod,d,False,False
1,,,SPACE,_SP,,,False,False
2,the,the,DET,DT,det,xxx,True,True
3,death,death,NOUN,NN,compound,xxxx,True,False
4,toll,toll,NOUN,NN,ROOT,xxxx,True,False
...,...,...,...,...,...,...,...,...
117,event_summary,event_summary,ADJ,JJ,npadvmod,xxxx_xxxx,False,False
118,",",",",PUNCT,",",punct,",",False,False
119,dtype,dtype,NOUN,NN,ROOT,xxxx,True,False
120,:,:,PUNCT,:,punct,:,False,False


In [54]:
spacy.displacy.render(doc, style='ent',jupyter=True)

In [55]:
spacy.displacy.render(doc, style='dep',jupyter=True,options = {'compact':60})

https://www.kaggle.com/ganeshn88/faster-nlp-with-spacy-in-python/code

In [56]:
# from __future__ import unicode_literals
# import spacy,en_core_web_sm
# import textacy
# from textacy import io

# nlp = en_core_web_sm.load()
# sentence = 'The author is writing a new book.'
# pattern = r'<VERB>?<ADV>*<VERB>+'
# doc = textacy.Doc.load(sentence, metadata=metadata, lang='en_core_web_sm')
# # doc = textacy.corpus.Corpus(sentence, lang='en_core_web_sm')
# lists = textacy.extract.pos_regex_matches(doc, pattern)
# for list in lists:
#     print(list.text)