In [1]:
import pandas as pd
import numpy as np
import os
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.feature_selection import VarianceThreshold

from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn import metrics
from sklearn.utils import resample
import xgboost as xgb 
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SelectPercentile
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
import shap
import spacy


In [7]:
# Load the review DataFrame from the project's data folder.
# NOTE(review): the path is absolute and machine-specific, and pickle files
# should only be loaded when they come from a trusted source.
data_en = pd.read_pickle(
    r"D:\sep22_10_supply_chain\data" + r"\data_en2.pickle"
)

In [3]:

# Load the spaCy pipeline that the `nlp(...)` calls below rely on.
nlp = spacy.load("en_core_web_lg")

In [6]:
def lemmatize_and_pos_tag(review):
    """Turn a review text into a space-separated string of ``lemma_POS`` tokens.

    The text is parsed with the module-level spaCy pipeline ``nlp``. A token
    is kept only if all of the following hold:

    * it is tagged neither ``PUNCT`` nor ``SPACE``;
    * it is purely alphabetic, or it is tagged ``PART`` (presumably so that
      clitics which fail ``is_alpha`` survive -- TODO confirm intent);
    * it does not belong to a named entity (``ent_type_`` is empty).

    Parameters
    ----------
    review : str
        Raw review text. ``None`` and float ``NaN`` are treated as an empty
        review instead of being handed to spaCy.

    Returns
    -------
    str
        The kept tokens rendered as ``<lemma>_<POS>`` and joined by single
        spaces; an empty string when nothing is kept.
    """
    # Missing values cannot be parsed; map them to "no tokens".
    if review is None or (isinstance(review, float) and review != review):
        return ""

    doc = nlp(review)
    kept = [
        tok.lemma_ + "_" + tok.pos_
        for tok in doc
        # Filter on the POS *label*. The previous check was `tok.pos != 97`,
        # a hard-coded integer symbol ID that ties the filter to one
        # particular numbering of spaCy's symbols.
        if tok.pos_ not in ("PUNCT", "SPACE")
        and (tok.is_alpha or tok.pos_ == "PART")
        and tok.ent_type_ == ""
    ]
    return " ".join(kept)
    

In [5]:
# Show which columns the loaded frame provides.
data_en.columns

Index(['marketplace', 'customer_id', 'review_id', 'product_id',
       'product_parent', 'product_title', 'product_category', 'star_rating',
       'helpful_votes', 'total_votes', 'vine', 'verified_purchase',
       'review_headline', 'review_body', 'review_date', 'language',
       'processed_reviews'],
      dtype='object')

In [9]:
# Tag every review headline (each value is coerced to str first) and store
# the result in a new column, then persist the enriched frame to a new file.
data_en["head_lem_pos_ner_rem"] = data_en["review_headline"].apply(
    lambda headline: lemmatize_and_pos_tag(str(headline))
)
data_en.to_pickle(r"D:\sep22_10_supply_chain\data\data_en3.pickle")

In [115]:
# Spot-check the tagger on the review body stored under index label 0.
lemmatize_and_pos_tag(data_en.review_body[0])

'use_VERB this_PRON for_ADP on_ADP my_PRON mac_PROPN an_DET amazing_ADJ joystick_NOUN I_PRON especially_ADV love_VERB that_PRON you_PRON can_AUX twist_VERB the_DET stick_NOUN for_ADP different_ADJ movement_NOUN binding_NOUN as_ADV well_ADV as_ADP move_VERB it_PRON in_ADP the_DET normal_ADJ way_NOUN'

In [116]:
# Tag every review body and store the result in a new column. The function is
# handed to `apply` directly; values are passed through unchanged (no str()).
data_en["lem_pos_ner_rem"] = data_en["review_body"].apply(lemmatize_and_pos_tag)

In [None]:
# Persist the frame, including the columns added above.
# NOTE(review): this is the same data_en2.pickle path the load cell reads, so
# the input file is overwritten in place.
data_en.to_pickle(r"D:\sep22_10_supply_chain\data\data_en2.pickle")

In [111]:
# Parse the review body stored under index label 0 and print the document's
# `sentiment` attribute.
# NOTE(review): the recorded output is 0.0 -- presumably this pipeline does
# not populate the attribute; confirm before relying on it.
doc = nlp(data_en["review_body"][0])
print(doc.sentiment)

0.0


In [107]:
# Display the raw review body stored under index label 1.
data_en.review_body[1]

"Loved it,  I didn't even realise it was a gaming mouse,  I typed in &#34;silent mouse&#34; and selected this one. It is perfect and looks pretty cool as well. Now my boyfriend's gaming is wonderfully comfortably silent :) . Think I might just get one for myself."

In [112]:
# Render the named entities of `doc` inline in the notebook.
spacy.displacy.render(doc, style="ent", jupyter=True)

In [113]:
# Materialise the tokens of `doc` into a list for later indexing, then print
# text, POS label, dependency label, entity type and lemma for each one.
tl = list(doc)
for token in tl:
    print(token.text, token.pos_, token.dep_, token.ent_type_, token.lemma_)

Used VERB ROOT  use
this PRON dobj  this
for ADP prep  for
Elite PROPN compound ORG Elite
Dangerous PROPN pobj ORG Dangerous
on ADP prep  on
my PRON poss  my
mac PROPN pobj  mac
, PUNCT punct  ,
an DET det  an
amazing ADJ amod  amazing
joystick NOUN appos  joystick
. PUNCT punct  .
I PRON nsubj  I
especially ADV advmod  especially
love VERB ROOT  love
that PRON mark  that
you PRON nsubj  you
can AUX aux  can
twist VERB ccomp  twist
the DET det  the
stick NOUN dobj  stick
for ADP prep  for
different ADJ amod  different
movement NOUN compound  movement
bindings NOUN pobj  binding
as ADV advmod  as
well ADV advmod  well
as ADP cc  as
move VERB conj  move
it PRON dobj  it
in ADP prep  in
the DET det  the
normal ADJ amod  normal
way NOUN pobj  way
. PUNCT punct  .


In [51]:
# Keep the fourth element of `tl` around for ad-hoc inspection.
a = tl[3]

# Show the named-entity spans of `doc`.
doc.ents

(Elite Dangerous,)

In [52]:
# List the attributes available on `doc`.
# NOTE(review): exploratory introspection; consider removing this cell (and
# its long output) before sharing the notebook.
dir(doc)


['_',
 '__bytes__',
 '__class__',
 '__delattr__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getitem__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__iter__',
 '__le__',
 '__len__',
 '__lt__',
 '__ne__',
 '__new__',
 '__pyx_vtable__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__unicode__',
 '_bulk_merge',
 '_context',
 '_get_array_attrs',
 '_realloc',
 '_vector',
 '_vector_norm',
 'cats',
 'char_span',
 'copy',
 'count_by',
 'doc',
 'ents',
 'extend_tensor',
 'from_array',
 'from_bytes',
 'from_dict',
 'from_disk',
 'from_docs',
 'from_json',
 'get_extension',
 'get_lca_matrix',
 'has_annotation',
 'has_extension',
 'has_unknown_spaces',
 'has_vector',
 'is_nered',
 'is_parsed',
 'is_sentenced',
 'is_tagged',
 'lang',
 'lang_',
 'mem',
 'noun_chunks',
 'noun_chunks_iterator',
 'remove_extension',
 'retokenize',
 'sentiment',
 'sents',
 'set

In [81]:
# Check whether the first token has an empty `ent_id_`.
# NOTE(review): lemmatize_and_pos_tag filters on `ent_type_`, not `ent_id_` --
# confirm which attribute this probe was meant to look at.
tl[0].ent_id_ == ''


True

In [96]:
# Probe: print each token's text, POS label, integer POS ID and `ent_id_`
# for a short hand-written sample.
test = nlp("this is great. I like it. This tastes like shit, I played Civilization a lot! Yes?")
for t in test:
    print(f"{t.text}_{t.pos_} {t.pos}_{t.ent_id_} - ")

this_PRON 95_ - 
is_AUX 87_ - 
great_ADJ 84_ - 
._PUNCT 97_ - 
I_PRON 95_ - 
like_VERB 100_ - 
it_PRON 95_ - 
._PUNCT 97_ - 
This_PRON 95_ - 
tastes_VERB 100_ - 
like_ADP 85_ - 
shit_NOUN 92_ - 
,_PUNCT 97_ - 
I_PRON 95_ - 
played_VERB 100_ - 
Civilization_PROPN 96_ - 
a_DET 90_ - 
lot_NOUN 92_ - 
!_PUNCT 97_ - 
Yes_INTJ 91_ - 
?_PUNCT 97_ - 


In [57]:
# Print the sentences spaCy found in `doc`, one per line.
for sentence in doc.sents:
    print(sentence)

Used this for Elite Dangerous on my mac, an amazing joystick.
I especially love that you can twist the stick for different movement bindings as well as move it in the normal way.


In [59]:
# Print the text of `doc` via its `text_with_ws` attribute.
print(doc.text_with_ws)

Used this for Elite Dangerous on my mac, an amazing joystick. I especially love that you can twist the stick for different movement bindings as well as move it in the normal way.
