In [1]:
import spacy
import pandas as pd
import numpy as np

In [2]:
text = "The battery life on this phone is really good."

In [3]:
nlp = spacy.load("en_core_web_sm")

In [4]:
doc = nlp(text)

In [5]:
for word in doc:
    print(f"Text:{word.text}, POS:{word.pos_}")

Text:The, POS:DET
Text:battery, POS:NOUN
Text:life, POS:NOUN
Text:on, POS:ADP
Text:this, POS:DET
Text:phone, POS:NOUN
Text:is, POS:AUX
Text:really, POS:ADV
Text:good, POS:ADJ
Text:., POS:PUNCT


In [17]:
### Extract all the nouns used in a corpus of comment data
with open("../codes/data/samsung.txt","r",encoding='utf-8') as r:
    review_text = r.read()

In [18]:
review_text[0:300]

"I feel so LUCKY to have found this used (phone to us & not used hard at all), phone on line from someone who upgraded and sold this one. My Son liked his old one that finally fell apart after 2.5+ years and didn't want an upgrade!! Thank you Seller, we really appreciate it & your honesty re: said us"

In [19]:
reviews = review_text.split("\n")

In [20]:
reviews[0]

"I feel so LUCKY to have found this used (phone to us & not used hard at all), phone on line from someone who upgraded and sold this one. My Son liked his old one that finally fell apart after 2.5+ years and didn't want an upgrade!! Thank you Seller, we really appreciate it & your honesty re: said used phone.I recommend this seller very highly & would but from them again!!"

In [21]:
len(reviews)

46355

In [22]:
def print_pos(doc):
    for word in doc:
        print(f"Text:{word.text}, POS:{word.pos_}")

In [23]:
doc = nlp(reviews[0])

In [29]:
from tqdm import tqdm
nouns = []
for review in tqdm(reviews[0:10000]):
    doc = nlp(review)
    for word in doc:
        if word.pos_=="NOUN":
            nouns.append(word.lemma_)           

100%|████████████████████████████████| 10000/10000 [01:15<00:00, 132.99it/s]


## Reasons we are slow
- We are processing each word on a single thread, if we process on multiple threads
- We are also running predictions from models that we don't need (NER, Dependency Parse)

In [30]:
pd.Series(nouns).value_counts().head(10)

phone      10774
price       1174
battery      958
card         952
time         926
screen       903
camera       845
app          836
product      747
problem      671
dtype: int64

In [31]:
### Disable certain models in spacy
nlp = spacy.load("en_core_web_sm",disable = ['ner','parser'])

In [32]:
nouns = []
for review in tqdm(reviews[0:10000]):
    doc = nlp(review)
    for word in doc:
        if word.pos_=="NOUN":
            nouns.append(word.lemma_)    

100%|████████████████████████████████| 10000/10000 [00:32<00:00, 310.15it/s]


In [37]:
nouns = []
for doc in tqdm(nlp.pipe(reviews[0:10000],batch_size=1,n_process=-1),total=10000):
    for word in doc:
        if word.pos_=="NOUN":
            nouns.append(word.lemma_)

100%|████████████████████████████████| 10000/10000 [00:18<00:00, 551.05it/s]


In [38]:
nouns = []
for doc in tqdm(nlp.pipe(reviews,batch_size=1,n_process=-1),total=len(reviews)):
    for word in doc:
        if word.pos_=="NOUN":
            nouns.append(word.lemma_)

100%|████████████████████████████████| 46355/46355 [01:05<00:00, 705.03it/s]


In [40]:
pd.Series(nouns).value_counts().head(10)

phone      42945
battery     4261
product     3894
screen      3851
time        3817
card        3376
price       3142
problem     3137
camera      2918
app         2500
dtype: int64

In [None]:
### Now that we have the product features, we would want to know in which 
## context these features are being talked about?
"The battery life is bad"
"The battery life is awesome"
"The screen resolution is good"
### Find out the prefixes and suffixes to each keyword and then we will find out which are the most 
## freq prefixes or suffixes
# How will you find prefixes and suffixes (Hint: Read about regular expressions)