<a href="https://colab.research.google.com/github/JishnuJayaraj/ML/blob/master/NLP/NamedEntityrecognition/Spacy/SpacyNER.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# SPACY NER TRAINING

[sample code](https://towardsdatascience.com/custom-named-entity-recognition-using-spacy-7140ebbb3718)

[sample tutorial](https://lvngd.com/blog/how-to-train-a-custom-named-entity-recognizer-with-spacy/)

[sample 2](https://medium.com/swlh/python-nlp-tutorial-information-extraction-and-knowledge-graphs-43a2a4c4556c)
[spacy documentation](https://spacy.io/usage/training)

To extract named entities, you pass a piece of text to the NER model and it looks at each word and tries to predict whether the word fits into a named entity category such as person, location, organization, etc.

Problems arise when the text data you're trying to label is too different (yes, this is very subjective) from the text data that was used to train the Named Entity Recognizer you're using; in that case the recognizer might not be very good at labeling your data.

### Training data format: a list of `(text, {"entities": [(start, end, label)]})` tuples
TRAIN_DATA = [
        ("Uber blew through $1 million a week", 
                    {"entities": 
                        [(0, 4, "ORG")]
                        }
                    ),
        ("Google rebrands its business apps", 
                    {"entities": 
                        [(0, 6, "ORG")]
                        }
                    )           
                ]

### save some for test data

[spaCy for triplets: Kaggle](https://www.kaggle.com/shivamb/spacy-text-meta-features-knowledge-graphs)

In [None]:
!python -m spacy download en_core_web_md
import spacy

[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_sm')


In [None]:
import pandas as pd
import sklearn

import nltk
nltk.download('stopwords')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [None]:
# Load the raw dataset and derive two simple length features from the text column.
orig = pd.read_json('/content/drive/My Drive/RokinData/ToBeCleaned.json.gz')
orig['word_count'] = orig['text'].apply(lambda x: len(str(x).split(" ")))
orig['content_len'] = orig['text'].astype(str).apply(len)

# Working subset: seeded so that Restart & Run All yields the same rows, and
# capped at the frame length because sample() raises when n exceeds it.
df = orig.sample(n=min(100000, len(orig)), random_state=0)

# Preview a few rows. Kept as the last expression so the notebook renders it;
# in the middle of the cell the result was silently discarded.
orig.sample(5, random_state=0)


# Language detection

In [None]:
!pip install spacy_cld
import spacy
from spacy_cld import LanguageDetector
# Load the English pipeline and append the language detector from spacy_cld.
# NOTE(review): the 'en' shortcut and passing a component object to add_pipe
# look like the spaCy v2 API — confirm the pinned spaCy version.
nlp = spacy.load('en')
language_detector = LanguageDetector()
nlp.add_pipe(language_detector)

# Sanity check on a known English sentence. These bare expressions are not the
# last line of the cell, so the notebook does not display their values.
doc = nlp('This is some English text.')
doc._.languages  # ['en']
doc._.language_scores['en']  # 0.96

# Detect the first reported language of every text in the working frame.
tweets = df['text']
languages_spacy = []

for e in tweets:
    doc = nlp(e)
    detected = doc._.languages
    # The detector may report no language at all; record 'unknown' in that
    # case, otherwise keep the first language in its list.
    languages_spacy.append(detected[0] if detected else 'unknown')

df['languages_spacy'] = languages_spacy
#df['languages_langdetect'] = languages_langdetect

In [None]:
# Number of documents assigned to each detected language.
df['languages_spacy'].value_counts()

In [None]:
# Inspect the rows whose detected language code is 'sk'.
is_sk = df['languages_spacy'] == 'sk'
de = df.loc[is_sk]
de.head()

Unnamed: 0,date,title,text,url,lastCrawlTimeUTC,word_count,content_len,languages_spacy
399616,1575432000000,Na trh východnej Európy prichádza kompletný so...,"GUANGZHOU, Čína, 4. decembra 2019 /PRNewswire/...",https://www.prnewswire.com/news-releases/na-tr...,1589863905,334,2416,sk


# term freq

## unigram

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords

# scikit-learn >= 1.2 only accepts a list (or 'english' / None) for `stop_words`,
# so materialise the de-duplicated stop words as a list instead of a set.
stops = sorted(set(stopwords.words('english') + ['com']))
co = CountVectorizer(stop_words=stops)
counts = co.fit_transform(df.text)
# `get_feature_names` was removed in scikit-learn 1.2; use its replacement when
# available and fall back to the old method on older installations.
vocab = co.get_feature_names_out() if hasattr(co, 'get_feature_names_out') else co.get_feature_names()
# Total count per term (column sums), transposed so each term is a row; keep the top 50.
table1 = pd.DataFrame(counts.sum(axis=0), columns=vocab).T.sort_values(0, ascending=False).head(50)

In [None]:
# Show the 50 most frequent words in the data; they give a first impression of its content and topics.

# Alternative kept for reference: bar chart of the 20 most common matches of a custom regex.
# data.Tweet_Text.str.extractall(r'(\#\w+)')[0].value_counts().head(20).plot.bar()
table1

## Bigram

In [None]:
# Term-frequency table over two-word phrases (bigrams).
# list(...) keeps this cell valid whether `stops` is a set or a list: newer
# scikit-learn releases reject a set for `stop_words`.
co = CountVectorizer(ngram_range=(2, 2), stop_words=list(stops))
counts = co.fit_transform(df.text)
# `get_feature_names` was removed in scikit-learn 1.2; prefer the replacement when present.
vocab = co.get_feature_names_out() if hasattr(co, 'get_feature_names_out') else co.get_feature_names()
# Total count per bigram, one row per bigram, top 50 by frequency.
table2 = pd.DataFrame(counts.sum(axis=0), columns=vocab).T.sort_values(0, ascending=False).head(50)
table2

## topic modelling LDA

In [None]:
#LDA
from sklearn.decomposition import LatentDirichletAllocation, NMF
# Bag-of-words matrix for LDA. list(...) so that a set of stop words is also
# accepted by scikit-learn versions that validate the parameter type.
vectorizer = CountVectorizer(stop_words=list(stops))
# fit() returns the vectorizer itself, so `model` is an alias for `vectorizer`.
model = vectorizer.fit(df.text)
docs = vectorizer.transform(df.text)
# 20 topics. Seeded, like the second LDA model in this notebook, so the topics
# are reproducible across runs.
lda = LatentDirichletAllocation(n_components=20, random_state=0)
lda.fit(docs)
def print_top_words(model, feature_names, n_top_words):
    """Print one line per topic listing its highest-weighted terms.

    Args:
        model: Object exposing ``components_``, iterated as one weight row per topic.
        feature_names: Sequence mapping a column index to its term.
        n_top_words: Number of terms to show per topic.

    Each line has the form ``Topic #<k>: term term ...`` with terms ordered from
    highest to lowest weight; a blank line is printed after the last topic.
    """
    for topic_idx, weights in enumerate(model.components_):
        # argsort is ascending, so slice it backwards to get the top terms first.
        top_indices = weights.argsort()[:-n_top_words - 1:-1]
        top_terms = [feature_names[i] for i in top_indices]
        print("Topic #%d: " % topic_idx + " ".join(top_terms))
    print()
# Show the 10 strongest terms of every LDA topic. `get_feature_names` was removed
# in scikit-learn 1.2, so use its replacement when the vectorizer provides it.
print_top_words(
    lda,
    vectorizer.get_feature_names_out() if hasattr(vectorizer, 'get_feature_names_out') else vectorizer.get_feature_names(),
    10,
)

## LDA2

In [None]:
#LDA 2 
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from nltk.corpus import stopwords

from sklearn.feature_extraction.text import CountVectorizer

from sklearn.decomposition import LatentDirichletAllocation

# Tokenise and lower-case every document.
df['tokenized'] = df['text'].apply(word_tokenize)
df['lower'] = df['tokenized'].apply(lambda x: [word.lower() for word in x])

import string
punc = string.punctuation
# `word not in punc` on a str is a substring test, so multi-character tokens
# such as '...', '--', '``' or "''" slipped through. Compare character-wise
# against a set instead and drop every token made up solely of punctuation.
punc_chars = set(punc)
df['no_punc'] = df['lower'].apply(
    lambda x: [word for word in x if not all(ch in punc_chars for ch in word)]
)

# Remove English stop words, then POS-tag what is left (needed for lemmatisation).
stop_words = set(stopwords.words('english'))
df['stopwords_removed'] = df['no_punc'].apply(lambda x: [word for word in x if word not in stop_words])
df['pos_tags'] = df['stopwords_removed'].apply(nltk.tag.pos_tag)

def get_wordnet_pos(tag):
    """Translate a POS tag (as produced by ``nltk.tag.pos_tag``) to a WordNet POS constant.

    Tags starting with J, V, N or R map to ``wordnet.ADJ``, ``wordnet.VERB``,
    ``wordnet.NOUN`` and ``wordnet.ADV`` respectively; every other tag falls
    back to ``wordnet.NOUN``.
    """
    first_letter_to_pos = {
        'J': wordnet.ADJ,
        'V': wordnet.VERB,
        'N': wordnet.NOUN,
        'R': wordnet.ADV,
    }
    # tag[:1] is '' for an empty tag, which simply takes the noun fallback.
    return first_letter_to_pos.get(tag[:1], wordnet.NOUN)
def _to_wordnet_tags(tagged_tokens):
    """Swap each token's POS tag for its WordNet equivalent."""
    return [(token, get_wordnet_pos(tag)) for token, tag in tagged_tokens]


df['wordnet_pos'] = df['pos_tags'].apply(_to_wordnet_tags)

wnl = WordNetLemmatizer()


def _lemmatize_tokens(tokens_with_pos):
    """Lemmatise every (token, WordNet POS) pair with the shared lemmatizer."""
    return [wnl.lemmatize(token, wn_pos) for token, wn_pos in tokens_with_pos]


df['lemmatized'] = df['wordnet_pos'].apply(_lemmatize_tokens)

# One space-separated string of lemmas per document.
df['lemma_str'] = [' '.join(str(lemma) for lemma in lemmas) for lemmas in df['lemmatized']]

# add df.drop()


# Bag-of-words over the lemmatised documents with frequency pruning
# (max_df=0.9, min_df=25) and a vocabulary capped at 5000 terms.
tf_vectorizer = CountVectorizer(max_df=0.9, min_df=25, max_features=5000)
tf = tf_vectorizer.fit_transform(df['lemma_str'].values.astype('U'))
# `get_feature_names` was removed in scikit-learn 1.2; use the replacement when
# it exists and fall back to the old method otherwise. list(...) keeps the
# result a plain list, as before.
tf_feature_names = list(
    tf_vectorizer.get_feature_names_out()
    if hasattr(tf_vectorizer, 'get_feature_names_out')
    else tf_vectorizer.get_feature_names()
)
# NOTE(review): toarray() materialises the full dense matrix, which can be very
# large for a big corpus — confirm this table is actually needed.
doc_term_matrix = pd.DataFrame(tf.toarray(), columns=list(tf_feature_names))

# 10-topic LDA (online learning), seeded so the topics are reproducible.
lda_model = LatentDirichletAllocation(n_components=10, learning_method='online', max_iter=500, random_state=0).fit(tf)
no_top_words = 10
def display_topics(model, feature_names, no_top_words):
    """Print each topic index followed by a line with its top terms.

    Args:
        model: Object exposing ``components_``, iterated as one weight row per topic.
        feature_names: Sequence mapping a column index to its term.
        no_top_words: Number of terms to print per topic, strongest first.
    """
    for topic_idx, weights in enumerate(model.components_):
        # argsort is ascending; the reversed slice yields the largest weights first.
        ranked = weights.argsort()[:-no_top_words - 1:-1]
        print("Topic %d:" % (topic_idx))
        print(" ".join(feature_names[i] for i in ranked))
              
# Print the top `no_top_words` terms of each topic of the second LDA model.
display_topics(lda_model, tf_feature_names, no_top_words)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
Topic 0:
use data system software user application device network support design
Topic 1:
de die la und der en el für que para
Topic 2:
use say material researcher research new university could light make
Topic 3:
technology company customer product solution service business industry system new
Topic 4:
year company say robot new percent business 000 manufacturing employee
Topic 5:
say make one get use like work see time go
Topic 6:
market company million 2019 report table global 2020 share growth
Topic 7:
design machine part material tool use 3d new metal company
Topic 8:
power 

### LDA vis

In [None]:
!pip install pyldavis
import pyLDAvis
pyLDAvis.enable_notebook()
panel = pyLDAvis.sklearn.prepare(lda_model, tf, tf_vectorizer, mds='tsne')
panel

## TF-IDF

In [None]:
# TfidfVectorizer is not imported by any earlier cell, so bring it into scope here.
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF features over the lemmatised documents, with the same pruning settings
# as the count vectorizer used for LDA.
tfidf_vectorizer = TfidfVectorizer(max_df=0.90, min_df =25, max_features=5000, use_idf=True)
tfidf = tfidf_vectorizer.fit_transform(df['lemma_str'])
# `get_feature_names` was removed in scikit-learn 1.2; prefer the replacement
# when present. list(...) keeps the result a plain list, as before.
tfidf_feature_names = list(
    tfidf_vectorizer.get_feature_names_out()
    if hasattr(tfidf_vectorizer, 'get_feature_names_out')
    else tfidf_vectorizer.get_feature_names()
)
# NOTE(review): toarray() builds the full dense matrix — confirm it is needed.
doc_term_matrix_tfidf = pd.DataFrame(tfidf.toarray(), columns=list(tfidf_feature_names))

#NMF
# NOTE(review): newer scikit-learn replaced `alpha` with `alpha_W` / `alpha_H` —
# confirm against the pinned version before upgrading.
nmf = NMF(n_components=10, random_state=0, alpha=.1, init='nndsvd').fit(tfidf)
display_topics(nmf, tfidf_feature_names, no_top_words)

# NER

https://stackoverflow.com/questions/16476924/how-to-iterate-over-rows-in-a-dataframe-in-pandas

preprocessing
[simple text processing](https://www.analyticsvidhya.com/blog/2018/02/the-different-methods-deal-text-data-predictive-python/)



In [None]:
import spacy
import pandas as pd

nlp = spacy.load("en_core_web_sm")

In [None]:
# Sample sentence to try the loaded pipeline's entity recognizer on.
sample_text = (
    'India and Iran have agreed to boost the economic viability '
    'of the strategic Chabahar port through various measures, '
    'including larger subsidies to merchant shipping firms using the facility, '
    'people familiar with the development said on Thursday.'
)
doc = nlp(sample_text)

# (entity text, entity label) pairs found in the sample.
[(entity.text, entity.label_) for entity in doc.ents]

![alt text](https://i2.wp.com/neptune.ai/wp-content/uploads/spacy_ner.png?w=647&ssl=1)

In [None]:
# Sample visualization: render the entities of `doc` inline (style='ent').
from spacy import displacy

displacy.render(doc, style='ent', jupyter=True)

In [None]:
# Run NER on our data
from collections import Counter 

# Rebind `df` to the first 10,000 rows of `orig`; this replaces the random
# sample that the earlier cells stored under the same name.
df = orig.head(10000)
def ner(text):
    """Return the label of every entity the global ``nlp`` pipeline finds in ``text``."""
    return [entity.label_ for entity in nlp(text).ents]

# Entity labels of every document, flattened into a single list.
ent = df['text'].apply(ner)
ent = [label for doc_labels in ent for label in doc_labels]

# Tally the labels; `count` holds (label, frequency) pairs, most frequent first.
counter = Counter(ent)
count = counter.most_common()

In [None]:
import seaborn as sns
# Bar chart of entity-label frequencies: counts on the x axis, labels on the y axis.
x, y = (list(column) for column in zip(*count))
sns.barplot(x=y, y=x)

In [None]:
# Check which entities of a given type occur most often.
def ner(text,ent="ORG"):
    """Return the text of every entity in ``text`` whose label equals ``ent``.

    Note: this redefines the one-argument ``ner`` from the earlier cell, which
    returned entity labels rather than entity texts.
    """
    matches = []
    for span in nlp(text).ents:
        if span.label_ == ent:
            matches.append(span.text)
    return matches

# Mentions of the default entity type of `ner` ("ORG"), flattened across documents.
gpe = df['text'].apply(lambda x: ner(x))
gpe = [i for x in gpe for i in x]
counter = Counter(gpe)

# Ten most frequent mentions. The axes are passed by keyword: recent seaborn
# releases no longer accept x/y positionally, and this matches the
# label-frequency plot earlier in the notebook (counts on x, names on y).
x, y = map(list, zip(*counter.most_common(10)))
sns.barplot(x=y, y=x)

In [None]:
# Most frequently mentioned PERSON entities.
per = df['text'].apply(lambda x: ner(x, "PERSON"))
per = [i for x in per for i in x]
counter = Counter(per)

# Top ten names. Keyword arguments are required by recent seaborn releases,
# which no longer accept x/y positionally (counts on x, names on y).
x, y = map(list, zip(*counter.most_common(10)))
sns.barplot(x=y, y=x)

In [None]:
# Most frequently mentioned PRODUCT entities (many products might go unnoticed).
per = df['text'].apply(lambda x: ner(x, "PRODUCT"))
per = [i for x in per for i in x]
counter = Counter(per)

# Top ten products. Keyword arguments are required by recent seaborn releases,
# which no longer accept x/y positionally (counts on x, names on y).
x, y = map(list, zip(*counter.most_common(10)))
sns.barplot(x=y, y=x)

# POS tagging

Noun (NN)- Joseph, London, table, cat, teacher, pen, city

Verb (VB)- read, speak, run, eat, play, live, walk, have, like, are, is

Adjective(JJ)- beautiful, happy, sad, young, fun, three

Adverb(RB)- slowly, quietly, very, always, never, too, well, tomorrow

Preposition (IN)- at, on, in, from, with, near, between, about, under

Conjunction (CC)- and, or, but, because, so, yet, unless, since, if

Pronoun(PRP)- I, you, we, they, he, she, it, me, us, them, him, her, this

Interjection (UH)- Ouch! Wow! Great! Help! Oh! Hey! Hi!

In [None]:
# Render the dependency parse (style='dep') of a short example headline.
doc = nlp('The greatest comeback stories in 2019')
displacy.render(doc, style='dep', jupyter=True, options={'distance': 90})

In [None]:
import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
from nltk.tokenize import word_tokenize

In [None]:
# Run POS on title
def pos(text):
    """Return the POS tag of every token in ``text``, in token order.

    A text with no tokens (for example an empty title) yields an empty list;
    the previous transpose-and-index approach raised IndexError in that case.
    """
    return [tag for _, tag in nltk.pos_tag(word_tokenize(text))]

# POS tags of every title, flattened into one list and tallied.
tags = df['title'].apply(pos)
tags = [tag for title_tags in tags for tag in title_tags]
counter = Counter(tags)

# Seven most common tags: counts on the x axis, tags on the y axis.
x, y = (list(column) for column in zip(*counter.most_common(7)))
sns.barplot(x=y, y=x)

In [None]:
# which singular noun dominates
def get_adjs(text):
    """Return the tokens of ``text`` that are tagged 'NN', in order of appearance.

    Despite its name, this function collects 'NN'-tagged tokens, not adjectives.
    """
    tagged = nltk.pos_tag(word_tokenize(text))
    return [word for word, tag in tagged if tag == 'NN']


# 'NN'-tagged words of every title, flattened into one list and tallied.
words = df['title'].apply(get_adjs)
words = [word for title_words in words for word in title_words]
counter = Counter(words)

# Seven most common words: counts on the x axis, words on the y axis.
x, y = (list(column) for column in zip(*counter.most_common(7)))
sns.barplot(x=y, y=x)

## Person name detection

In [None]:
# Extracting names using spaCy
!pip3 install spacy
import spacy
from spacy.tokens import Span
!pip install dateparser
!python3 -m spacy download en_core_web_md
!python -m spacy download en_core_web_sm
import en_core_web_md
import dateparser

def expand_person_entities(doc):
    """Pipeline component that tidies up the PERSON entities on ``doc``.

    * A PERSON entity directly preceded by a title token (Dr/Mr/Ms, with or
      without a trailing period) is widened to include that title.
    * Any other PERSON entity is kept only if ``dateparser`` cannot parse its
      text as a date.
    * Entities with any other label are passed through unchanged.

    PERSON entities starting at the first token of the document used to be
    dropped: they failed the ``ent.start != 0`` check and were never re-added.
    They are now handled like any other PERSON entity without a title.

    Args:
        doc: The document whose ``ents`` should be rewritten.

    Returns:
        The same ``doc`` object, with ``doc.ents`` replaced.
    """
    titles = ("Dr", "Dr.", "Mr", "Mr.", "Ms", "Ms.")
    new_ents = []
    for ent in doc.ents:
        if ent.label_ != "PERSON":
            new_ents.append(ent)
            continue
        # Only look at the previous token when there is one.
        if ent.start != 0 and doc[ent.start - 1].text in titles:
            # Person preceded by a title: include the title in the entity.
            new_ents.append(Span(doc, ent.start - 1, ent.end, label=ent.label))
        elif dateparser.parse(ent.text) is None:
            # Not parseable as a date, so keep it as a person.
            new_ents.append(ent)
    doc.ents = new_ents
    return doc




# Add the component after the named entity recognizer.
# To re-run this cell on an existing pipeline, remove the component first:
# nlp.remove_pipe('expand_person_entities')
nlp = spacy.load('en_core_web_md')
nlp.add_pipe(expand_person_entities, after='ner')

document_string = 'Jishnu jayaraj was a great person and still is. Mr. Jayaraj was well known for good deeds. Dr. Jishnu Jayaraj and team worked good but Dr. Jayden Green Olivia team failed so bad to him  '
doc = nlp(document_string)
# The entities live on the parsed `doc`; the name `text` used here before was never defined.
[(ent.text, ent.label_) for ent in doc.ents if ent.label_=='PERSON']
