In [1]:
import spacy
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

In [2]:
nlp = spacy.load('en_core_web_sm')

doc = nlp("The 22-year-old recently won ATP Challenger tournament.")

for tok in doc:
    print(tok.text, "...", tok.dep_)

The ... det
22 ... nummod
- ... punct
year ... npadvmod
- ... punct
old ... nsubj
recently ... advmod
won ... ROOT
ATP ... compound
Challenger ... compound
tournament ... dobj
. ... punct


In [3]:
!pip install beautifulsoup4





In [4]:
import re
import pandas as pd
import bs4
import requests
import spacy
from spacy import displacy
nlp = spacy.load('en_core_web_sm')

from spacy.matcher import Matcher 
from spacy.tokens import Span 

import networkx as nx

import matplotlib.pyplot as plt
from tqdm import tqdm

pd.set_option('display.max_colwidth', 200)
%matplotlib inline

In [5]:
# import wikipedia sentences
candidate_sentences = pd.read_csv("2_Preprocessed.csv")
candidate_sentences.shape

(30, 1)

In [6]:
candidate_sentences

Unnamed: 0,sentence
0,rabi lamichhane wins chitwan 2 54276 votes 43025 closest rival rabi lamichhane wins chitwan 2 54276 votes 43025 closest rival rastriya swatantra party president rabi lamichhane byelection chitwan ...
1,per officials lamichhane received 54276 votes 43025 closest rival jit narayan shrestha nepali congress .
2,ram prasad neupane cpnuml third order got 10936 votes .
3,november elections lamichhane received 49300 votes 14988 votes received umesh shrestha nepali congress .
4,supreme court revoked house membership citizenship legitimacy issue lamichhane forced obtain back citizenship prepare byelection .
5,rabi lamichhane swarnim wagle take oath lawmakers upendra yadav absent rabi lamichhane swarnim wagle take oath lawmakers upendra yadav absent two three lawmakers elected recent byelections–rabi la...
6,lamichhane wagle leaders rastriya swatantra party elected chitwan 2 tanahun 1 respectively sworn speaker dev raj ghimire function organised kathmandu friday morning .
7,third lawmaker upendra yadav chairman janata samajbadi party nepal elected bara 2 absent .
8,meanwhile rsp president lamichhane said party would join government present circumstances .
9,subscribe onlinekhabar english get notified exclusive news stories .


In [7]:
doc = nlp("confused and frustrated, connie decides to leave on her own.")

for tok in doc:
    print(tok.text, "...", tok.dep_)

confused ... advcl
and ... cc
frustrated ... conj
, ... punct
connie ... nsubj
decides ... ROOT
to ... aux
leave ... xcomp
on ... prep
her ... poss
own ... pobj
. ... punct


In [8]:
def get_entities(sent):
    ## chunk 1
    ent1 = ""
    ent2 = ""

    prv_tok_dep = ""  # dependency tag of previous token in the sentence
    prv_tok_text = ""  # previous token in the sentence

    prefix = ""
    modifier = ""

    #############################################################

    for tok in nlp(sent):
        ## chunk 2
        # if token is a punctuation mark then move on to the next token
        if tok.dep_ != "punct":
            # check: token is a compound word or not
            if tok.dep_ == "compound":
                prefix = tok.text
                # if the previous word was also a 'compound' then add the current word to it
                if prv_tok_dep == "compound":
                    prefix = prv_tok_text + " " + tok.text

            # check: token is a modifier or not
            if tok.dep_.endswith("mod") == True:
                modifier = tok.text
                # if the previous word was also a 'compound' then add the current word to it
                if prv_tok_dep == "compound":
                    modifier = prv_tok_text + " " + tok.text

            ## chunk 3
            if tok.dep_.find("subj") == True:
                ent1 = modifier + " " + prefix + " " + tok.text
                prefix = ""
                modifier = ""
                prv_tok_dep = ""
                prv_tok_text = ""

                ## chunk 4
            if tok.dep_.find("obj") == True:
                ent2 = modifier + " " + prefix + " " + tok.text

            ## chunk 5  
            # update variables
            prv_tok_dep = tok.dep_
            prv_tok_text = tok.text
    #############################################################

    return [ent1.strip(), ent2.strip()]

In [9]:
get_entities("the film had 200 patents")

['film', '200  patents']

In [10]:
entity_pairs = []

for i in tqdm(candidate_sentences["sentence"]):
    entity_pairs.append(get_entities(i))

100%|██████████| 30/30 [00:00<00:00, 192.25it/s]


In [11]:
entity_pairs[0:50]

[['increased number votes', 'closest  rival'],
 ['officials lamichhane', 'rival shrestha nepali congress'],
 ['ram', '10936 neupane cpnuml votes'],
 ['14988  votes', 'umesh shrestha'],
 ['back  citizenship', 'byelection'],
 ['three oath lawmakers', 'recent members house representatives'],
 ['rastriya swatantra party', 'organised ghimire kathmandu'],
 ['third samajbadi party nepal', 'bara'],
 ['party', 'present  circumstances'],
 ['', 'exclusive news stories'],
 ['political  party', 'former journalist rabi lamichhane'],
 ['proposed name party commission', ''],
 ['constitutional  body', 'registration certificate lamichhane'],
 ['independent  party', 'circle election symbol'],
 ['political  party', ''],
 ['far  chance', 'public things party'],
 ['sharing', 'them'],
 ['', 'exclusive news stories'],
 ['independent rastriya sabha', 'formation'],
 ['', ''],
 ['lamichhane', 'parliamentary  elections'],
 ['', 'independent  party'],
 ['form party', 'representational votes country'],
 ['popular p

In [12]:
def get_relation(sent):

    doc = nlp(sent)

    # Matcher class object 
    matcher = Matcher(nlp.vocab)

    #define the pattern 
    pattern = [{'DEP':'ROOT'},
            {'DEP':'prep','OP':"?"},
            {'DEP':'agent','OP':"?"},  
            {'POS':'ADJ','OP':"?"}] 

    matcher.add("matching_1", [pattern]) 

    matches = matcher(doc)
    k = len(matches) - 1

    span = doc[matches[k][1]:matches[k][2]] 

    return(span.text)

In [13]:
get_relation("John completed the task")

'completed'

In [14]:
relations = [get_relation(i) for i in tqdm(candidate_sentences['sentence'])]

100%|██████████| 30/30 [00:00<00:00, 189.76it/s]


In [15]:
pd.Series(relations).value_counts()[:50]

said                  3
notified exclusive    3
says                  2
elected               2
received              2
continue              1
announce              1
kathmandu             1
enter                 1
urged independent     1
say                   1
bell                  1
announces nepal       1
obtained              1
go                    1
registered            1
awarded               1
approved              1
gives                 1
take                  1
revoked               1
got                   1
resigned              1
dtype: int64

In [16]:
# extract subject
source = [i[0] for i in entity_pairs]

# extract object
target = [i[1] for i in entity_pairs]

kg_df = pd.DataFrame({'source':source, 'target':target, 'edge':relations})
kg_df.to_csv('3_kg_df.csv')
kg_df

Unnamed: 0,source,target,edge
0,increased number votes,closest rival,obtained
1,officials lamichhane,rival shrestha nepali congress,received
2,ram,10936 neupane cpnuml votes,got
3,14988 votes,umesh shrestha,received
4,back citizenship,byelection,revoked
5,three oath lawmakers,recent members house representatives,take
6,rastriya swatantra party,organised ghimire kathmandu,elected
7,third samajbadi party nepal,bara,elected
8,party,present circumstances,said
9,,exclusive news stories,notified exclusive
