# [Access the article here](https://www.dataquest.io/blog/tutorial-text-classification-in-python-using-spacy/)

In [1]:
# install spaCy
!pip install spacy
!python -m spacy download en

Collecting en_core_web_sm==2.1.0 from https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.1.0/en_core_web_sm-2.1.0.tar.gz#egg=en_core_web_sm==2.1.0
[?25l  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.1.0/en_core_web_sm-2.1.0.tar.gz (11.1MB)
[K    100% |████████████████████████████████| 11.1MB 19.0MB/s ta 0:00:011
[?25hInstalling collected packages: en-core-web-sm
  Running setup.py install for en-core-web-sm ... [?25ldone
[?25hSuccessfully installed en-core-web-sm-2.1.0
[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_sm')
[38;5;2m✔ Linking successful[0m
/Users/joeai/miniconda3/lib/python3.6/site-packages/en_core_web_sm -->
/Users/joeai/miniconda3/lib/python3.6/site-packages/spacy/data/en
You can now load the model via spacy.load('en')


In [2]:
# word tokenization
from spacy.lang.en import English

# load English tokenizer, tagger, parser, NER and word vectors
nlp = English()

text = """When learning data science, you shouldn't get discouraged!
Challenges and setbacks aren't failures, they're just part of the journey. You've got this!"""

# 'nlp' Object is used to create documents with linguistic annotations
my_doc = nlp(text)

# create list of word toeksn
token_list = []
for token in my_doc:
    token_list.append(token.text)
    
print(token_list)

['When', 'learning', 'data', 'science', ',', 'you', 'should', "n't", 'get', 'discouraged', '!', '\n', 'Challenges', 'and', 'setbacks', 'are', "n't", 'failures', ',', 'they', "'re", 'just', 'part', 'of', 'the', 'journey', '.', 'You', "'ve", 'got', 'this', '!']


In [3]:
# sentence tokenization

# load English tokenizer, tagger, parser, NER and word vectors
nlp = English()

# create the pipeline 'sentencizer' component
sbd = nlp.create_pipe('sentencizer')

# add the component to the pipeline
nlp.add_pipe(sbd)

text = """When learning data science, you shouldn't get discouraged!
Challenges and setbacks aren't failures, they're just part of the journey. You've got this!"""

# 'nlp' Object is used to create documents with linguistic annotations
doc = nlp(text)

# create list of sentence tokens
sents_list = []
for sent in doc.sents:
    sents_list.append(sent.text)
    
print(sents_list)

["When learning data science, you shouldn't get discouraged!", "\nChallenges and setbacks aren't failures, they're just part of the journey.", "You've got this!"]


# Cleaning Text Data: Removing Stopwords

In [5]:
# stop words
# importing stop words from English language
import spacy
spacy_stopwords = spacy.lang.en.stop_words.STOP_WORDS

# print the totla number of stop words
print('Number of stop words: %d' % len(spacy_stopwords))

# print first ten stop words
print('First ten stop words: %s' % list(spacy_stopwords)[:20])

Number of stop words: 312
First ten stop words: ['eleven', 'either', 'everything', 'your', 'herein', 'sometime', 'whither', 'upon', 'other', 'next', 'down', 'is', 'the', 'well', 'amongst', 'one', 'for', 'ourselves', 'she', 'therein']


# Removing Stopwords from Our Data

In [6]:
from spacy.lang.en.stop_words import STOP_WORDS

# implementation of stop words
filtered_sent = []

# 'nlp' Object is used to create documents with linguistic annotations
doc = nlp(text)

# filtering stop words
for word in doc:
    if word.is_stop == False:
        filtered_sent.append(word)

print('Filtered Sentence: ', filtered_sent)

Filtered Sentence:  [learning, data, science, ,, discouraged, !, 
, Challenges, setbacks, failures, ,, journey, ., got, !]


# Lemmatization

In [7]:
# implementing lemmatization
lem = nlp('run runs running runner')

# finding lemma for each word
for word in lem:
    print(word.text, word.lemma_)

run run
runs run
running run
runner runner


# Part of Speech (POS) Tagging

In [8]:
# POS tagging
# import the model en_core_web_sm of English for vocab, syntax & entities
import en_core_web_sm

# load en_core_web_sm of English for vocab, syntax & entities
nlp = en_core_web_sm.load()

# 'nlp' Object is used to create documents with linguistic annotations
docs = nlp(u'All is well that ends well.')

In [9]:
for word in docs:
    print(word.text, word.pos_)

All DET
is VERB
well ADV
that DET
ends VERB
well ADV
. PUNCT


# Entity Detection

In [12]:
from spacy import displacy

nytimes = nlp(u"""New York City on Tuesday declared a public health emergency and ordered mandatory measles vaccinations amid an outbreak, becoming the latest national flash point over refusals to inoculate against dangerous diseases. 
At least 285 people have contracted measles in the city since September, mostly in Brooklyn’s Williamsburg neighborhood. The order covers four Zip codes there, Mayor Bill de Blasio (D) said Tuesday. 
The mandate orders all unvaccinated people in the area, including a concentration of Orthodox Jews, to receive inoculations, including for children as young as 6 months old. Anyone who resists could be fined up to $1,000.""")

entities = [(i, i.label_, i.label) for i in nytimes.ents]

entities

[(New York City, 'GPE', 384),
 (Tuesday, 'DATE', 391),
 (At least 285, 'CARDINAL', 397),
 (September, 'DATE', 391),
 (Brooklyn, 'GPE', 384),
 (Williamsburg, 'GPE', 384),
 (four, 'CARDINAL', 397),
 (Bill de Blasio, 'PERSON', 380),
 (Tuesday, 'DATE', 391),
 (Orthodox Jews, 'NORP', 381),
 (6 months old, 'DATE', 391),
 (up to $1,000, 'MONEY', 394)]

In [13]:
displacy.render(nytimes, style = 'ent', jupyter = True)

# Dependency Parsing

- allows us to better determine the meaning of a sentence by analyzing how it's constructed to determine how the individual words relate to each other

In [14]:
docp = nlp('In pursuit of a wall, President Trump ran into one.')

for chunk in docp.noun_chunks:
    print(chunk.text, chunk.root.text, chunk.root.dep_, chunk.root.head.text)

pursuit pursuit pobj In
a wall wall pobj of
President Trump Trump nsubj ran


The above output is a little hard to follow but we can use `displaCy` visualizer to view a dependency diagram that'll make it easier to understand.

In [15]:
displacy.render(docp, style='dep', jupyter=True)

# Word Vector Representation

- numeric representation of a word that communicates its relationship with other words
- each word is interpreted as unique and lengthy array of numbers

In [16]:
import en_core_web_sm
nlp = en_core_web_sm.load()
mango = nlp(u'mango')
print(mango.vector.shape)
print(mango.vector)

(96,)
[ 1.0466383  -1.5323697  -0.72177905 -2.4700649  -0.2715162   1.1589639
  1.7113379  -0.31615403 -2.0978343   1.837553    1.4681302   2.728043
 -2.3457408  -5.17184    -4.6110015  -0.21236466 -0.3029521   4.220028
 -0.6813917   2.4016762  -1.9546705  -0.85086954  1.2456163   1.5107994
  0.4684736   3.1612053   0.15542296  2.0598564   3.780035    4.6110964
  0.6375268  -1.078107   -0.96647096 -1.3939928  -0.56914186  0.51434743
  2.3150034  -0.93199825 -2.7970662  -0.8540115  -3.4250052   4.2857723
  2.5058174  -2.2150877   0.7860181   3.496335   -0.62606215 -2.0213525
 -4.47421     1.6821622  -6.0789204   0.22800982 -0.36950028 -4.5340714
 -1.7978683  -2.080299    4.125556    3.1852438  -3.286446    1.0892276
  1.017115    1.2736416  -0.10613725  3.5102775   1.1902348   0.05483437
 -0.06298041  0.8280688   0.05514218  0.94817173 -0.49377063  1.1512338
 -0.81374085 -1.6104267   1.8233354  -2.278403   -2.1321895   0.3029334
 -1.4510616  -1.0584296  -3.5698352  -0.13046083 -0.266833

- no way a human could look at this array and identify it as meaning 'mango
    - works well for machines though
    - allwos us to represent both word's meaning and its proximity to similiar words using the coordinates in the array

# Text Classification Walkthrough w/ Amazon Alexa product reviews

In [18]:
# importin libraries
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.base import TransformerMixin
from sklearn.pipeline import Pipeline

### Loading data

- data is tab-separated file (.tsv)
- has five columns
    - `rating`: denotes rating user gave the Alexa (out of 5)
    - `date`: date of the review
    - `variation`: describes model the user reviewed
    - `verified_reviews`: contains the text of each review
    - `feedback`: contains a sentiment label (1 - indicates positive sentiment, 0 - indicates negative sentiment)
- Goal: produce an accurate model that we can use to process new user reviews and quickly determine whether they are positive/negative

In [19]:
# loading TSV file
df_amazon = pd.read_csv('amazon_alexa.tsv', sep='\t')

In [20]:
# top 5 records
df_amazon.head(5)

Unnamed: 0,rating,date,variation,verified_reviews,feedback
0,5,31-Jul-18,Charcoal Fabric,Love my Echo!,1
1,5,31-Jul-18,Charcoal Fabric,Loved it!,1
2,4,31-Jul-18,Walnut Finish,"Sometimes while playing a game, you can answer...",1
3,5,31-Jul-18,Charcoal Fabric,I have had a lot of fun with this thing. My 4 ...,1
4,5,31-Jul-18,Charcoal Fabric,Music,1


In [21]:
# shape of dataframe
df_amazon.shape

(3150, 5)

In [22]:
# view dataframe info
df_amazon.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3150 entries, 0 to 3149
Data columns (total 5 columns):
rating              3150 non-null int64
date                3150 non-null object
variation           3150 non-null object
verified_reviews    3150 non-null object
feedback            3150 non-null int64
dtypes: int64(2), object(3)
memory usage: 123.1+ KB


In [23]:
# feedback value count
df_amazon.feedback.value_counts()

1    2893
0     257
Name: feedback, dtype: int64

### Tokenizing the Data With `spaCy`

In [24]:
import string
from spacy.lang.en.stop_words import STOP_WORDS
from spacy.lang.en import English

# create our list of punctuation marks
punctuations = string.punctuation

# create list of stopwords
nlp = spacy.load('en')
stop_words = spacy.lang.en.stop_words.STOP_WORDS

# load English tokenizer, tagger, parser, NER and word vectors
parser = English()

# creating our tokenizer function
def spacy_tokenizer(sentence):
    # create token object, which is used to create documents with linguistic annotations
    mytokens = parser(sentence)
    
    # lemmatizing each token and converting each token into lowercase
    mytokens = [word.lemma_.lower().strip() if word.lemma_ != '-PRON-' else word.lower_ for word in mytokens]
    
    # removing stop words
    mytokens = [word for word in mytokens if word not in stop_words and word not in punctuations]
    
    # return preprocessed list of tokens
    return mytokens

### Defining a Custom Transformer

In [25]:
# custom transformer using spaCy
class predictors(TransformerMixin):
    def transform(self, X, **transform_params):
        # cleaning text
        return [clean_text(text) for text in X]
    
    def fit(self, X, y=None, **fit_params):
        return self
    
    def get_params(self, deep=True):
        return {}
    
    
# basic function to clean the text
def clean_text(text):
    # removing spaces and converting text into lowercase
    return text.strip().lower()

### Vectorization Feature Engineering (TF-IDF)

In [26]:
bow_vector = CountVectorizer(tokenizer=spacy_tokenizer, ngram_range=(1,1))

We’ll also want to look at the TF-IDF (Term Frequency-Inverse Document Frequency) for our terms. This sounds complicated, but it’s simply a way of normalizing our Bag of Words(BoW) by looking at each word’s frequency in comparison to the document frequency. 

- can use `scikit-learn`'s TfidVectorizer

In [27]:
tfidf_vector = TfidfVectorizer(tokenizer = spacy_tokenizer)

### Splitting The Data into Training and Test Sets

In [28]:
from sklearn.model_selection import train_test_split

X = df_amazon['verified_reviews'] # features that we want to analyze
y = df_amazon['feedback'] # answers we want to test against

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3)

### Creating a Pipeline and Generating the Model

In [31]:
# logistic regression classifier
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(solver = 'liblinear')

# create pipeline using Bag of Words
pipe = Pipeline([('cleaner', predictors()),
                 ('vectorizer', bow_vector),
                 ('classifier', classifier)
                ])

# model generation
pipe.fit(X_train, y_train)

Pipeline(memory=None,
     steps=[('cleaner', <__main__.predictors object at 0x1a25069c18>), ('vectorizer', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngra...ty='l2', random_state=None, solver='liblinear',
          tol=0.0001, verbose=0, warm_start=False))])

### Evaluating the Model

- `Accuracy` refers to the percentage of the total predictions our model makes that are completely correct.
- `Precision` describes the ratio of true positives to true positives plus false positives in our predictions.
- `Recall` describes the ratio of true positives to true positives plus false negatives in our predictions.

In [32]:
from sklearn import metrics

#predicting with a test dataset
predicted = pipe.predict(X_test)

# model accuracy
print('Logistic Regression Accuracy: ', metrics.accuracy_score(y_test, predicted))
print('Logistic Regression Precision: ', metrics.precision_score(y_test, predicted))
print('Logistic Regression Recall: ', metrics.recall_score(y_test, predicted))

Logistic Regression Accuracy:  0.9259259259259259
Logistic Regression Precision:  0.935093509350935
Logistic Regression Recall:  0.9872241579558653


In other words, overall, our model correctly identified a comment’s sentiment 94.1% of the time. When it predicted a review was positive, that review was actually positive 95% of the time. When handed a positive review, our model identified it as positive 98.6% of the time