## POS tagging and NER
- Use spacy to do POS tagging
- Case Study to extract product aspects
- Use spacy to do NER tagging
- Case Study to redact names from emails

In [1]:
import pandas as pd
import numpy as np
path = "./data/samsung.txt"
con=open(path,"r",encoding="utf-8")
reviews=con.read()
review_list=reviews.split("\n")

In [2]:
len(review_list)

46355

In [3]:
review_list[0]

"I feel so LUCKY to have found this used (phone to us & not used hard at all), phone on line from someone who upgraded and sold this one. My Son liked his old one that finally fell apart after 2.5+ years and didn't want an upgrade!! Thank you Seller, we really appreciate it & your honesty re: said used phone.I recommend this seller very highly & would but from them again!!"

In [4]:
#### POS extraction #####
import spacy ### pre-trained classifier that can classify pos

In [5]:
nlp=spacy.load("en_core_web_sm") ### lemmatization, pos, NER, grammatical parsing

In [6]:
doc1=nlp(review_list[0])

In [7]:
for token in doc1:
    if token.pos_=="NOUN":
        print(token.text)

phone
phone
line
one
one
years
upgrade
honesty
phone
seller


In [8]:
#### 1000 reviews what are the common nouns:
nouns=[]
for review in review_list[0:100]:
    doc=nlp(review)
    for token in doc:
        if token.pos_=="NOUN":
            nouns.append(token.lemma_)

In [9]:
pd.Series(nouns).value_counts()

phone        107
price         12
problem       11
condition     10
seller         9
            ... 
pic            1
mode           1
host           1
iso            1
breaking       1
Length: 235, dtype: int64

In [10]:
#### phone/phones?
#### battery/batteries
#### root form of a word <===> lemma/lemmatized form

In [11]:
for token in doc1:
    print(token.text,token.lemma_,token.pos_)

I I PRON
feel feel VERB
so so ADV
LUCKY lucky ADJ
to to PART
have have AUX
found find VERB
this this DET
used use VERB
( ( PUNCT
phone phone NOUN
to to ADP
us we PRON
& & CCONJ
not not PART
used use VERB
hard hard ADV
at at ADV
all all ADV
) ) PUNCT
, , PUNCT
phone phone NOUN
on on ADP
line line NOUN
from from ADP
someone someone PRON
who who PRON
upgraded upgrade VERB
and and CCONJ
sold sell VERB
this this DET
one one NOUN
. . PUNCT
My my PRON
Son Son PROPN
liked like VERB
his his PRON
old old ADJ
one one NOUN
that that PRON
finally finally ADV
fell fall VERB
apart apart ADV
after after ADP
2.5 2.5 NUM
+ + NUM
years year NOUN
and and CCONJ
did do AUX
n't not PART
want want VERB
an an DET
upgrade upgrade NOUN
! ! PUNCT
! ! PUNCT
Thank thank VERB
you you PRON
Seller Seller PROPN
, , PUNCT
we we PRON
really really ADV
appreciate appreciate VERB
it it PRON
& & CCONJ
your your PRON
honesty honesty NOUN
re re ADP
: : PUNCT
said say VERB
used used ADJ
phone phone NOUN
. . PUNCT
I I PRON
reco

In [12]:
from tqdm import tqdm

In [13]:
### We will make our code mutithreaded
### Load relevant models
nlp=spacy.load("en_core_web_sm",disable=["parser","ner"])

In [14]:
nouns=[]
for doc in tqdm(nlp.pipe(review_list,batch_size=1,n_process=-1)):
    for token in doc:
        if token.pos_=="NOUN":
            nouns.append(token.lemma_)

46355it [01:02, 740.66it/s] 


In [15]:
pd.Series(nouns).value_counts()

phone        42945
battery       4261
product       3894
screen        3851
time          3817
             ...  
amazed           1
simbetter        1
telefeno         1
soccer           1
gun              1
Length: 8461, dtype: int64

In [16]:
##### What are the things that people mention about the features
##### What are the most common words that occur before or after the aspect word.

In [17]:
import re 

In [18]:
pattern=re.compile("\w+\sphone\s\w+")

In [19]:
sent="This phone is awesome"

In [20]:
re.findall(pattern,sent)

['This phone is']

In [21]:
prefix_suffix=re.findall(pattern,reviews.replace("\n"," "))

In [22]:
prefix_suffix[0]

'android phone but'

In [23]:
prefix_suffix[0].split(" ")[0]

'android'

In [24]:
prefix_suffix[0].split(" ")[-1]

'but'

In [25]:
prefixes=[i.split()[0].lower() for i in prefix_suffix]

In [26]:
stop_words=["a","about","above","after","again","against","ain","all","am","an","and","any","are","aren",
            "aren't","as","at","be","because","been","before","being","below","between","both","but","by",
            "can","couldn","couldn't","d","did","didn","didn't","do","does","doesn","doesn't","doing","don",
            "don't","down","during","each","few","for","from","further","had","hadn","hadn't","has","hasn",
            "hasn't","have","haven","haven't","having","he","her","here","hers","herself","him","himself","his",
            "how","i","if","in","into","is","isn","isn't","it","it's","its","itself","just","ll","m","ma","me",
            "mightn","mightn't","more","most","mustn","mustn't","my","myself","needn","needn't","no","nor","not",
            "now","o","of","off","on","once","only","or","other","our","ours","ourselves","out","over","own",
            "re","s","same","shan","shan't","she","she's","should","should've","shouldn","shouldn't","so","some",
            "such","t","than","that","that'll","the","their","theirs","them","themselves","then","there","these",
            "they","this","those","through","to","too","under","until","up","ve","very","was","wasn","wasn't",
            "we","were","weren","weren't","what","when","where","which","while","who","whom","why","will","with",
            "won","won't","wouldn","wouldn't","y","you","you'd","you'll","you're","you've","your","yours",
            "yourself","yourselves","could","he'd","he'll","he's","here's","how's","i'd","i'll","i'm","i've",
            "let's","ought","she'd","she'll","that's","there's","they'd","they'll","they're","they've","we'd",
            "we'll","we're","we've","what's","when's","where's","who's","why's","would","able","abst",
            "accordance","according","accordingly","across","act","actually","added","adj","affected","affecting","affects","afterwards","ah",
            "almost","alone","along","already","also","although","always","among","amongst","announce","another","anybody","anyhow","anymore",
            "anyone","anything","anyway","anyways","anywhere","apparently","approximately","arent","arise","around","aside","ask","asking","auth",
            "available","away","awfully","b","back","became","become","becomes","becoming","beforehand","begin","beginning","beginnings","begins",
            "behind","believe","beside","besides","beyond","biol","brief","briefly","c","ca","came","cannot","can't","cause","causes","certain",
            "certainly","co","com","come","comes","contain","containing","contains","couldnt","date","different","done","downwards","due","e","ed",
            "edu","effect","eg","eight","eighty","either","else","elsewhere","end","ending","enough","especially","et","etc","even","ever","every",
            "everybody","everyone","everything","everywhere","ex","except","f","far","ff","fifth","first","five","fix","followed","following",
            "follows","former","formerly","forth","found","four","furthermore","g","gave","get","gets","getting","give","given","gives","giving",
            "go","goes","gone","got","gotten","h","happens","hardly","hed","hence","hereafter","hereby","herein","heres","hereupon","hes","hi",
            "hid","hither","home","howbeit","however","hundred","id","ie","im","immediate","immediately","importance","important","inc","indeed",
            "index","information","instead","invention","inward","itd","it'll","j","k","keep","keeps","kept","kg","km","know","known","knows","l",
            "largely","last","lately","later","latter","latterly","least","less","lest","let","lets","like","liked","likely","line","little","'ll",
            "look","looking","looks","ltd","made","mainly","make","makes","many","may","maybe","mean","means","meantime","meanwhile","merely","mg",
            "might","million","miss","ml","moreover","mostly","mr","mrs","much","mug","must","n","na","name","namely","nay","nd","near","nearly",
            "necessarily","necessary","need","needs","neither","never","nevertheless","new","next","nine","ninety","nobody","non","none",
            "nonetheless","noone","normally","nos","noted","nothing","nowhere","obtain","obtained","obviously","often","oh","ok","okay","old",
            "omitted","one","ones","onto","ord","others","otherwise","outside","overall","owing","p","page","pages","part","particular",
            "particularly","past","per","perhaps","placed","please","plus","poorly","possible","possibly","potentially","pp","predominantly",
            "present","previously","primarily","probably","promptly","proud","provides","put","q","que","quickly","quite","qv","r","ran","rather",
            "rd","readily","really","recent","recently","ref","refs","regarding","regardless","regards","related","relatively","research",
            "respectively","resulted","resulting","results","right","run","said","saw","say","saying","says","sec","section","see","seeing",
            "seem","seemed","seeming","seems","seen","self","selves","sent","seven","several","shall","shed","shes","show","showed","shown","showns","shows","significant","significantly","similar","similarly","since","six","slightly","somebody","somehow","someone","somethan","something","sometime","sometimes","somewhat","somewhere","soon","sorry","specifically","specified","specify","specifying","still","stop","strongly","sub","substantially","successfully","sufficiently","suggest","sup","sure","take","taken","taking","tell","tends","th","thank","thanks","thanx","thats","that've","thence","thereafter","thereby","thered","therefore","therein","there'll","thereof","therere","theres","thereto","thereupon","there've","theyd","theyre","think","thou","though","thoughh","thousand","throug","throughout","thru","thus","til","tip","together","took","toward","towards","tried","tries","truly","try","trying","ts","twice","two","u","un","unfortunately","unless","unlike","unlikely","unto","upon","ups","us","use","used","useful","usefully","usefulness","uses","using","usually","v","value","various","'ve","via","viz","vol","vols","vs","w","want","wants","wasnt","way","wed","welcome","went","werent","whatever","what'll","whats","whence","whenever","whereafter","whereas","whereby","wherein","wheres","whereupon","wherever","whether","whim","whither","whod","whoever","whole","who'll","whomever","whos","whose","widely","willing","wish","within","without","wont","words","world","wouldnt","www","x","yes","yet","youd","youre","z","zero","a's","ain't","allow","allows","apart","appear","appreciate","appropriate","associated","best","better","c'mon","c's","cant","changes","clearly","concerning","consequently","consider","considering","corresponding","course","currently","definitely","described","despite","entirely","exactly","example","going","greetings","hello","help","hopefully","ignored","inasmuch","indicate","indicated","indicates","inner","insofar","it'd","keep","keeps","novel","presumably","reasonably","second","secondly","sensible","serious","seriously","sure","t's","third","thorough","thoroughly","three","well","wonder"]

In [27]:
prefix=[word for word in prefixes if not word in stop_words]

In [28]:
pd.Series(prefix).value_counts() ## common words

great       1368
good         655
cell         485
smart        352
nice         334
            ... 
largest        1
rubber         1
mail           1
minutes        1
outgoing       1
Length: 593, dtype: int64

In [29]:
def get_prefixes(keyword):
    pattern=re.compile(r"\w+\s{}\s\w+".format(keyword))
    prefix_suffix=re.findall(pattern,reviews.replace("\n"," "))
    prefixes=[i.split()[0].lower() for i in prefix_suffix]
    prefix=[word for word in prefixes if not word in stop_words]
    prefix=pd.Series(prefix).value_counts().head(5).index
    result=pd.DataFrame({'prefix':prefix})
    result['keyword']=keyword
    return result

In [30]:
get_prefixes("screen")

Unnamed: 0,prefix,keyword
0,touch,screen
1,big,screen
2,great,screen
3,large,screen
4,bigger,screen


In [31]:
def get_suffixes(keyword):
    pattern=re.compile(r"\w+\s{}\s\w+".format(keyword))
    prefix_suffix=re.findall(pattern,reviews.replace("\n"," "))
    suffixes=[i.split()[-1].lower() for i in prefix_suffix]
    suffix=[word for word in suffixes if not word in stop_words]
    suffix=pd.Series(suffix).value_counts().head(5).index
    result=pd.DataFrame({'suffix':suffix})
    result['keyword']=keyword
    result=result[['keyword','suffix']]
    return result

In [32]:
get_suffixes("battery")

Unnamed: 0,keyword,suffix
0,battery,life
1,battery,lasts
2,battery,runs
3,battery,drains
4,battery,charge


In [35]:
### NER tagging
nlp=spacy.load("en_core_web_sm")
doc = nlp("Apple is looking at buying U.K. startup for $1 billion")

In [36]:
for ent in doc.ents:
    print(ent.text, ent.start_char, ent.end_char, ent.label_)

Apple 0 5 ORG
U.K. 27 31 GPE
$1 billion 44 54 MONEY


Data [link](https://drive.google.com/file/d/1oGoJrXYcBrNkAvWXmDkwtHJkZTAFb5lN/view?usp=sharing)

In [51]:
with open('./data/data/4625.txt','r') as reader:
    text = reader.read()

In [52]:
print(text)

Karen,

Thank you for the update.  It looks like we'll plan on having the EBS/Avaya 
meetings on January 10th and 11th, 2001.  The first day will be a full day, 
the second will be 1/2 day, a.m. session.  You have asked me to provide a 
list of Enron attendees, titles, which day(s) they would likely attend, and 
some background information on the meeting(s) purposes.  An explanation of 
the meetings' proposed focus and probable attendees is in the attached 
meeting notes.  

The notes are from the November meeting which we coordinated and held for 
Enron Broadband Services and Dave Johnson.  By copy of this note to Kim 
Godfrey, we'll update the EBS executives on the meetings, and work on 
arranging their calendar availability.  So far, we have had the EBS execs' 
calendars penciled in for the time slot of January 9-11.  At this point, I 
would expect that the EBS attendee list would look something like this:  

Jim Crowder, VP, Enterprise Services; Enron Broadband Services - day 2 
Ev

In [53]:
doc = nlp(text)

In [54]:
start,end,instances = [],[],[]
for ent in doc.ents:
    #print(ent.text, ent.start_char, ent.end_char, ent.label_)
    if ent.label_=='PERSON':
        start.append(ent.start_char)
        end.append(ent.end_char)
        instances.append(ent.text)    

In [55]:
for instance in instances:
    text = text.replace(instance,"<redacted>")

In [56]:
print(text)

<redacted>,

Thank you for the update.  It looks like we'll plan on having the EBS/Avaya 
meetings on January 10th and 11th, 2001.  The first day will be a full day, 
the second will be 1/2 day, a.m. session.  You have asked me to provide a 
list of Enron attendees, titles, which day(s) they would likely attend, and 
some background information on the meeting(s) purposes.  An explanation of 
the meetings' proposed focus and probable attendees is in the attached 
meeting notes.  

The notes are from the November meeting which we coordinated and held for 
Enron Broadband Services and <redacted>.  By copy of this note to <redacted>, we'll update the EBS executives on the meetings, and work on 
arranging their calendar availability.  So far, we have had the EBS execs' 
calendars penciled in for the time slot of January 9-11.  At this point, I 
would expect that the EBS attendee list would look something like this:  

<redacted>, VP, Enterprise Services; Enron Broadband Services - day 2 
<r

### Topic Modelling NMF

In [58]:
review_list[0]

"I feel so LUCKY to have found this used (phone to us & not used hard at all), phone on line from someone who upgraded and sold this one. My Son liked his old one that finally fell apart after 2.5+ years and didn't want an upgrade!! Thank you Seller, we really appreciate it & your honesty re: said used phone.I recommend this seller very highly & would but from them again!!"

In [59]:
from tqdm import tqdm
processed_reviews=[]
for review in tqdm(review_list):
    doc=nlp(review)
    one_review=[]
    for word in doc:
        if word.lemma_!='-PRON-':
            one_review.append(word.lemma_)
    review_processed=" ".join(one_review)
    processed_reviews.append(review_processed)

100%|████████████████████████████████████| 46355/46355 [05:34<00:00, 138.46it/s]


In [61]:
### create a tfidf representation 
from sklearn.feature_extraction import text
tfidf=text.TfidfVectorizer(input = processed_reviews,stop_words='english')
tfidf_matrix=tfidf.fit_transform(processed_reviews)
tfidf_matrix.shape

(46355, 15942)

In [62]:
from sklearn.decomposition import NMF

In [63]:
mod=NMF(n_components=4)

In [64]:
%%time
A=mod.fit_transform(tfidf_matrix)



CPU times: user 860 ms, sys: 590 ms, total: 1.45 s
Wall time: 371 ms


In [65]:
A.shape ###

(46355, 4)

In [66]:
B=mod.components_###

In [67]:
B.shape

(4, 15942)

In [69]:
import numpy as np
tokens=np.array(tfidf.get_feature_names_out())
for row in B:
    idx=row.argsort()[-6:-1][::-1]
    print(tokens[idx])

['phone' 'product' 'price' 'work' 'far']
['phone' 'work' 'price' 'product' 'use']
['product' 'recommend' 'thank' 'phone' 'seller']
['phone' 'new' 'perfect' 'buy' 'use']


In [72]:
### Lets extract nouns ###
from tqdm import tqdm
processed_reviews=[]
nlp=spacy.load("en_core_web_sm",disable=["parser","ner"])
for review in tqdm(review_list):
    doc=nlp(review)
    one_review=[]
    for word in doc:
        if word.pos_=='NOUN':
            one_review.append(word.lemma_)
    review_processed=" ".join(one_review)
    processed_reviews.append(review_processed)

100%|████████████████████████████████████| 46355/46355 [02:24<00:00, 320.54it/s]


In [73]:
processed_reviews[0]

'phone phone line one one year upgrade honesty phone seller'

In [75]:
#### What topics we can see if we use only nouns ####
tfidf=text.TfidfVectorizer(input=processed_reviews,stop_words='english')
tfidf_matrix=tfidf.fit_transform(processed_reviews)

In [76]:
mod=NMF(n_components=4)

In [77]:
%%time
A=mod.fit_transform(tfidf_matrix)

CPU times: user 459 ms, sys: 325 ms, total: 785 ms
Wall time: 153 ms




In [78]:
A.shape ###

(46355, 4)

In [79]:
B=mod.components_###

In [80]:
B.shape

(4, 7479)

In [81]:
tokens=np.array(tfidf.get_feature_names_out())
for row in B:
    idx=row.argsort()[-6:-1][::-1]
    print(tokens[idx])

['price' 'problem' 'time' 'battery' 'screen']
['time' 'seller' 'price' 'service' 'quality']
['producto' 'celular' 'recomendado' 'fono' 'price']
['thank' 'condition' 'cellphone' 'seller' 'work']


![](regex.png)