In [1]:
# !pip install scikit-learn
# !pip install -U spacy
# !python -m spacy download en
# !python -m spacy download en_core_web_sm

In [2]:
import spacy 
from spacy import displacy

In [3]:
nlp = spacy.load("en_core_web_sm")

In [4]:
text = "Data Science plays a major part in monitoring a patient’s overall health and notifying necessary steps to be taken in order to prevent potential diseases from occurring. Data Scientists are using powerful predictive analytical tools to detect chronic diseases at an early level.In most extreme cases, there are instances where diseases are not caught at an early stage due to negligibility. This proves to be highly detrimental to not only the patient’s health but also the economic costs. As the disease grows, the cost of curing it also increases. Therefore, data science plays a huge role in optimizing the economic spending on healthcare."

In [5]:
doc1 = nlp(text)

In [6]:
type(doc1)

spacy.tokens.doc.Doc

In [7]:
for token in doc1:
    print(token)

Data
Science
plays
a
major
part
in
monitoring
a
patient
’s
overall
health
and
notifying
necessary
steps
to
be
taken
in
order
to
prevent
potential
diseases
from
occurring
.
Data
Scientists
are
using
powerful
predictive
analytical
tools
to
detect
chronic
diseases
at
an
early
level
.
In
most
extreme
cases
,
there
are
instances
where
diseases
are
not
caught
at
an
early
stage
due
to
negligibility
.
This
proves
to
be
highly
detrimental
to
not
only
the
patient
’s
health
but
also
the
economic
costs
.
As
the
disease
grows
,
the
cost
of
curing
it
also
increases
.
Therefore
,
data
science
plays
a
huge
role
in
optimizing
the
economic
spending
on
healthcare
.


In [8]:
for sent in doc1.sents:
    print(sent)

Data Science plays a major part in monitoring a patient’s overall health and notifying necessary steps to be taken in order to prevent potential diseases from occurring.
Data Scientists are using powerful predictive analytical tools to detect chronic diseases at an early level.
In most extreme cases, there are instances where diseases are not caught at an early stage due to negligibility.
This proves to be highly detrimental to not only the patient’s health but also the economic costs.
As the disease grows, the cost of curing it also increases.
Therefore, data science plays a huge role in optimizing the economic spending on healthcare.


In [9]:
from spacy.lang.en.stop_words import STOP_WORDS

In [10]:
stopwords = list(STOP_WORDS) 

In [11]:
for token in doc1:
    if token.is_stop == False:
        print(token)

Data
Science
plays
major
monitoring
patient
overall
health
notifying
necessary
steps
taken
order
prevent
potential
diseases
occurring
.
Data
Scientists
powerful
predictive
analytical
tools
detect
chronic
diseases
early
level
.
extreme
cases
,
instances
diseases
caught
early
stage
negligibility
.
proves
highly
detrimental
patient
health
economic
costs
.
disease
grows
,
cost
curing
increases
.
,
data
science
plays
huge
role
optimizing
economic
spending
healthcare
.


In [12]:
import string

In [13]:
punct = string.punctuation

In [14]:
punct

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [15]:
def text_clean(sentences):
    doc = nlp(sentences)
    
    tokens = []
    for token in doc:
        if token.lemma_!= "-PRON-":
            temp = token.lemma_.lower().strip()
        else:
            temp = token.lower_
        tokens.append(temp)
    cleaned_tokens = []
    for token in tokens:
        if token not in stopwords and token not in punct:
            cleaned_tokens.append(token)
    return cleaned_tokens

In [16]:
text_clean('data science plays a huge role in optimizing the economic spending on healthcare.')

['datum',
 'science',
 'play',
 'huge',
 'role',
 'optimize',
 'economic',
 'spending',
 'healthcare']

In [17]:
import pandas as pd
from  sklearn.svm import LinearSVC   
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [18]:
tfidf = TfidfVectorizer(tokenizer= text_clean)
cls = LinearSVC()

In [19]:
data = pd.read_csv("C:/Users/jashwini_d/NLP_aj/amazon_cells_labelled.txt", sep = "\t", header=None)

In [20]:
data.head()

Unnamed: 0,0,1
0,So there is no way for me to plug it in here i...,0
1,"Good case, Excellent value.",1
2,Great for the jawbone.,1
3,Tied to charger for conversations lasting more...,0
4,The mic is great.,1


In [21]:
col_list = ["Review", "Sentiment"]
data.columns = col_list

In [22]:
data.head()

Unnamed: 0,Review,Sentiment
0,So there is no way for me to plug it in here i...,0
1,"Good case, Excellent value.",1
2,Great for the jawbone.,1
3,Tied to charger for conversations lasting more...,0
4,The mic is great.,1


In [23]:
data.shape

(1000, 2)

In [24]:
data["Sentiment"].value_counts()

0    500
1    500
Name: Sentiment, dtype: int64

In [25]:
x = data["Review"]

In [26]:
y  = data["Sentiment"]

In [27]:
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size = 0.2, random_state = 42)

In [28]:
x_train.shape, x_test.shape

((800,), (200,))

In [29]:
clf = Pipeline([("tfidf", tfidf), ("clf", cls)], )

In [30]:
clf.fit(x_train, y_train)

Pipeline(memory=None,
         steps=[('tfidf',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents=None,
                                 sublinear_tf=False,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=<function text_clean at 0x000001826CD09F78>,
                                 use_idf=True, vocabulary=None)),
                ('clf',
                 LinearSVC(C=1.0, class_weight=None, dual=True,
                           fit_intercept=Tru

In [32]:
y_pred = clf.predict(x_test)

In [33]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.77      0.77      0.77        93
           1       0.80      0.79      0.80       107

    accuracy                           0.79       200
   macro avg       0.78      0.78      0.78       200
weighted avg       0.79      0.79      0.79       200



In [34]:
confusion_matrix(y_test, y_pred)

array([[72, 21],
       [22, 85]], dtype=int64)

In [35]:
clf.predict(["the quality is very good"])

array([1], dtype=int64)

In [36]:
clf.predict(["the material is cheap"])

array([0], dtype=int64)

In [37]:
clf.predict(["the cost and delivery time is high"])

array([0], dtype=int64)

In [38]:
clf.predict(["money deducted but item not delivered"])

array([0], dtype=int64)