In [1]:
import spacy
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.base import TransformerMixin
from sklearn.pipeline import Pipeline

In [2]:


df = pd.read_csv("train.csv")

In [3]:
df["text"] = df["text"].astype(str)

In [4]:
text = df.drop(["title", "id"], axis = 1)

In [7]:
import string
from spacy.lang.en.stop_words import STOP_WORDS
from spacy.lang.en import English 

# Create our list of punctuation marks
punctuations = string.punctuation

In [9]:
# Create our list of stopwords
nlp = English()
stop_words = spacy.lang.en.stop_words.STOP_WORDS

# Load English tokenizer, tagger, parser, NER and word vectors
parser = English()

# Creating our tokenizer function
def spacy_tokenizer(sentence):
    # Creating our token object, which is used to create documents with linguistic annotations.
    mytokens = parser(sentence)

    # Lemmatizing each token and converting each token into lowercase
    mytokens = [ word.lemma_.lower().strip() if word.lemma_ != "-PRON-" else word.lower_ for word in mytokens ]

    # Removing stop words
    mytokens = [ word for word in mytokens if word not in stop_words and word not in punctuations ]

    # return preprocessed list of tokens
    return mytokens

In [10]:
# Custom transformer using spaCy
class predictors(TransformerMixin):
    def transform(self, X, **transform_params):
        # Cleaning Text
        return [clean_text(text) for text in X]

    def fit(self, X, y=None, **fit_params):
        return self

    def get_params(self, deep=True):
        return {}

# Basic function to clean the text
def clean_text(text):
    # Removing spaces and converting text into lowercase
    return text.strip().lower()

In [11]:
bow_vector = CountVectorizer(tokenizer = spacy_tokenizer, ngram_range=(1,1))

In [12]:
tfidf_vector = TfidfVectorizer(tokenizer = spacy_tokenizer)

In [13]:
from sklearn.model_selection import train_test_split

X = text["text"] # the features we want to analyze
ylabels = text['label'] # the labels, or answers, we want to test against

X_train, X_test, y_train, y_test = train_test_split(X, ylabels, test_size=0.3)

In [20]:
# Logistic Regression Classifier
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression()

# Create pipeline using Bag of Words
pipe = Pipeline([("cleaner", predictors()),
                 ('vectorizer', tfidf_vector),
                 ('classifier', classifier)])

# model generation
pipe.fit(X_train,y_train)



Pipeline(memory=None,
         steps=[('cleaner', <__main__.predictors object at 0x0000024CF2976A48>),
                ('vectorizer',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop...
                                 tokenizer=<function spacy_tokenizer at 0x0000024CD66BB5E8>,
                                 use_idf=True, vocabulary=None)),
                ('classifier',
                 LogisticRegression(C=1.0, class_weight=None, dual=False,
                                    fit_intercept=True, intercept_scaling=1,
         

In [21]:
from sklearn import metrics
# Predicting with a test dataset
predicted = pipe.predict(X_test)

# Model Accuracy
print("Logistic Regression Accuracy:",metrics.accuracy_score(y_test, predicted))
print("Logistic Regression Precision:",metrics.precision_score(y_test, predicted))
print("Logistic Regression Recall:",metrics.recall_score(y_test, predicted))

Logistic Regression Accuracy: 0.9570512820512821
Logistic Regression Precision: 0.9515723270440252
Logistic Regression Recall: 0.9636942675159236


In [23]:
# Predicting with a test dataset
predicted_train = pipe.predict(X_train)

# Model Accuracy
print("Logistic Regression Accuracy:",metrics.accuracy_score(y_train, predicted_train))
print("Logistic Regression Precision:",metrics.precision_score(y_train, predicted_train))
print("Logistic Regression Recall:",metrics.recall_score(y_train, predicted_train))

Logistic Regression Accuracy: 0.9809752747252747
Logistic Regression Precision: 0.9802306425041186
Logistic Regression Recall: 0.9817131857555341


In [None]:
print("Logistic Regression matrix:",confusion_matrix(y_test, predicted))

In [52]:
nlp = spacy.load("en_core_web_sm")
doc = nlp(df["text"][1])
for token in doc:
    print(token.text, token.pos_, token.dep_)

Ever ADV advmod
get AUX ROOT
the DET det
feeling VERB dobj
your DET poss
life NOUN compound
circles VERB relcl
the DET det
roundabout NOUN dobj
rather ADV advmod
than SCONJ cc
heads NOUN pobj
in ADP prep
a DET det
straight ADJ amod
line NOUN pobj
toward ADP prep
the DET det
intended ADJ amod
destination NOUN pobj
? PUNCT punct
[ PUNCT punct
Hillary PROPN compound
Clinton PROPN nsubj
remains VERB ROOT
the DET det
big ADJ amod
woman NOUN attr
on ADP prep
campus NOUN pobj
in ADP prep
leafy PROPN pobj
, PUNCT punct
liberal ADJ amod
Wellesley PROPN conj
, PUNCT punct
Massachusetts PROPN appos
. PUNCT punct
Everywhere ADV advmod
else ADV advmod
votes VERB ROOT
her PRON dobj
most ADV advmod
likely ADJ amod
to PART aux
don VERB xcomp
her DET poss
inauguration NOUN compound
dress NOUN dobj
for ADP prep
the DET det
remainder NOUN pobj
of ADP prep
her DET poss
days NOUN pobj
the DET det
way NOUN npadvmod
Miss PROPN compound
Havisham PROPN nsubj
forever ADV advmod
wore VERB relcl
that DET det
wedd

and CCONJ cc
the DET det
challenge NOUN nsubj
now ADV advmod
is AUX conj
to PART aux
practice VERB xcomp
politics NOUN dobj
as SCONJ prep
the DET det
art NOUN pobj
of ADP prep
making VERB pcomp
what PRON nsubj
appears VERB ccomp
to PART aux
be AUX xcomp
impossible ADJ advmod
possible ADJ acomp
. PUNCT punct
” PUNCT punct
Now ADV advmod
, PUNCT punct
as SCONJ prep
the DET det
big ADJ amod
woman NOUN pobj
on ADP prep
campus NOUN pobj
but CCONJ cc
the DET det
odd ADJ amod
woman NOUN conj
out SCONJ prep
of ADP prep
the DET det
White PROPN compound
House PROPN pobj
, PUNCT punct
she PRON nsubj
wonders VERB ROOT
how ADV advmod
her DET poss
current ADJ amod
station NOUN nsubj
is AUX ccomp
even ADV advmod
possible ADJ acomp
. PUNCT punct
“ PUNCT punct
Why ADV advmod
are AUX ROOT
n’t PART neg
I PRON nsubj
50 NUM nummod
points NOUN npadvmod
ahead ADV advmod
? PUNCT punct
” PUNCT punct
she PRON nsubj
asked VERB ROOT
in ADP prep
September PROPN pobj
. PUNCT punct
In ADP prep
May PROPN pobj
she PRO

In [53]:
for token in doc:
    print(token.text, token.lemma_, token.pos_, token.tag_, token.dep_,
            token.shape_, token.is_alpha, token.is_stop)

Ever ever ADV RB advmod Xxxx True True
get get AUX VB ROOT xxx True True
the the DET DT det xxx True True
feeling feel VERB VBG dobj xxxx True False
your -PRON- DET PRP$ poss xxxx True True
life life NOUN NN compound xxxx True False
circles circle VERB VBZ relcl xxxx True False
the the DET DT det xxx True True
roundabout roundabout NOUN NN dobj xxxx True False
rather rather ADV RB advmod xxxx True True
than than SCONJ IN cc xxxx True True
heads head NOUN NNS pobj xxxx True False
in in ADP IN prep xx True True
a a DET DT det x True True
straight straight ADJ JJ amod xxxx True False
line line NOUN NN pobj xxxx True False
toward toward ADP IN prep xxxx True True
the the DET DT det xxx True True
intended intended ADJ JJ amod xxxx True False
destination destination NOUN NN pobj xxxx True False
? ? PUNCT . punct ? False False
[ [ PUNCT -LRB- punct [ False False
Hillary Hillary PROPN NNP compound Xxxxx True False
Clinton Clinton PROPN NNP nsubj Xxxxx True False
remains remain VERB VBZ ROOT xx

in in ADP IN prep xx True True
her -PRON- DET PRP$ poss xxx True True
senior senior ADJ JJ amod xxxx True False
thesis thesis NOUN NN pobj xxxx True False
on on ADP IN prep xx True True
Saul Saul PROPN NNP compound Xxxx True False
Alinsky Alinsky PROPN NNP pobj Xxxxx True False
lamented lament VERB VBD ROOT xxxx True False
“ " PUNCT `` punct “ False False
Black Black PROPN NNP compound Xxxxx True False
Power Power PROPN NNP compound Xxxxx True False
demagogues demagogue NOUN NNS nsubj xxxx True False
” " PUNCT '' punct ” False False
and and CCONJ CC cc xxx True True
“ " PUNCT `` punct “ False False
elitist elitist ADJ JJ amod xxxx True False
arrogance arrogance NOUN NN conj xxxx True False
and and CCONJ CC cc xxx True True
repressive repressive ADJ JJ amod xxxx True False
intolerance intolerance NOUN NN conj xxxx True False
” " PUNCT '' punct ” False False
within within ADP IN prep xxxx True True
the the DET DT det xxx True True
New New PROPN NNP compound Xxx True False
Left Left PROPN

and and CCONJ CC cc xxx True True
his -PRON- DET PRP$ poss xxx True True
supporters supporter NOUN NNS conj xxxx True False
as as SCONJ IN prep xx True True
embracing embrace VERB VBG pcomp xxxx True False
a a DET DT det x True True
“ " PUNCT `` punct “ False False
lie lie NOUN NN dobj xxx True False
, , PUNCT , punct , False False
” " PUNCT '' punct ” False False
a a DET DT det x True True
“ " PUNCT `` punct “ False False
con con NOUN NN appos xxx True False
, , PUNCT , punct , False False
” " PUNCT '' punct ” False False
“ " PUNCT `` punct “ False False
alternative alternative ADJ JJ amod xxxx True False
facts fact NOUN NNS appos xxxx True False
, , PUNCT , punct , False False
” " PUNCT '' punct ” False False
and and CCONJ CC cc xxx True True
“ " PUNCT `` punct “ False False
a a DET DT det x True True
      SPACE _SP     False False
assault assault NOUN NN conj xxxx True False
on on ADP IN prep xx True True
truth truth NOUN NN pobj xxxx True False
and and CCONJ CC cc xxx True True
re

” " PUNCT '' punct ” False False
she -PRON- PRON PRP nsubj xxx True True
asked ask VERB VBD ROOT xxxx True False
in in ADP IN prep xx True True
September September PROPN NNP pobj Xxxxx True False
. . PUNCT . punct . False False
In in ADP IN prep Xx True True
May May PROPN NNP pobj Xxx True True
she -PRON- PRON PRP nsubj xxx True True
asks ask VERB VBZ ROOT xxxx True False
why why ADV WRB advmod xxx True True
she -PRON- PRON PRP nsubj xxx True True
is be AUX VBZ ccomp xx True True
n’t not PART RB neg x’x False True
president president NOUN NN attr xxxx True False
. . PUNCT . punct . False False
The the DET DT det Xxx True True
woman woman NOUN NN nsubj xxxx True False
famously famously ADV RB advmod xxxx True False
dubbed dub VERB VBD ROOT xxxx True False
a a DET DT det x True True
“ " PUNCT `` punct “ False False
congenital congenital ADJ JJ amod xxxx True False
liar liar NOUN NN oprd xxxx True False
” " PUNCT '' punct ” False False
by by ADP IN prep xx True True
Bill Bill PROPN NNP co

In [54]:
for ent in doc.ents:
    print(ent.text, ent.start_char, ent.end_char, ent.label_)

Hillary Clinton 125 140 PERSON
leafy 176 181 GPE
Wellesley 191 200 GPE
Massachusetts 202 215 GPE
Havisham 328 336 PERSON
Hillary Rodham 403 417 PERSON
48 years ago 439 451 DATE
first 461 466 ORDINAL
Wellesley 479 488 GPE
1969 563 567 DATE
Democratic 691 701 NORP
2016 715 719 DATE
Seven 761 766 CARDINAL
Adams 810 815 PERSON
today 858 863 DATE
400 884 887 CARDINAL
Rodham 901 907 PERSON
Edger Bergen 954 966 PERSON
Charlie McCarthys 974 991 PERSON
Mortimer Snerds 996 1011 PERSON
John Lennon 1110 1121 PERSON
Edward Brooke 1265 1278 PERSON
Clinton 1388 1395 PERSON
Saul Alinsky 1420 1432 PERSON
Black Power 1443 1454 WORK_OF_ART
the New Left 1525 1537 LOC
Republican 1569 1579 NORP
Rodham 1620 1626 PERSON
1969 1650 1654 DATE
one 1665 1668 CARDINAL
Clinton 2109 2116 PERSON
2016 2119 2123 DATE
Whitewater 2203 2213 ORG
Travelgate 2215 2225 ORG
Benghazi 2250 2258 PERSON
Clinton 2284 2291 PERSON
Friday 2330 2336 DATE
American 2450 2458 NORP
Trump 2469 2474 PERSON
48 percent 2485 2495 PERCENT
38 perc

In [55]:
from spacy.lang.en import English

In [58]:
from spacy import displacy

displacy.serve(doc, style="ent")

  "__main__", mod_spec)



Using the 'ent' visualizer
Serving on http://0.0.0.0:5000 ...

Shutting down server on port 5000.
