# Text Classification for sentiments

We will work with Yelp,IMDB and Amazon datasets

In [1]:
import pandas as pd

In [2]:
# Loading data
df_yelp = pd.read_table('yelp_labelled.txt')
df_imdb = pd.read_table('imdb_labelled.txt')
df_amz = pd.read_table('amazon_cells_labelled.txt')

### Combining datasets

In [3]:
# Concatenate our Datasets
frames = [df_yelp,df_imdb,df_amz]

In [4]:
# Renaming Column Headers
for colname in frames:
    colname.columns = ["Message","Target"]

In [5]:
# Column names
for colname in frames:
    print(colname.columns)

Index(['Message', 'Target'], dtype='object')
Index(['Message', 'Target'], dtype='object')
Index(['Message', 'Target'], dtype='object')


In [6]:
# Assign a Key to Make it Easier
keys = ['Yelp','IMDB','Amazon']

In [7]:
# Merge or Concat our Datasets
df = pd.concat(frames,keys=keys)

In [8]:
# Length and Shape 
df.shape

(2745, 2)

In [9]:
df.head()

Unnamed: 0,Unnamed: 1,Message,Target
Yelp,0,Crust is not good.,0
Yelp,1,Not tasty and the texture was just nasty.,0
Yelp,2,Stopped by during the late May bank holiday of...,1
Yelp,3,The selection on the menu was great and so wer...,1
Yelp,4,Now I am getting angry and I want my damn pho.,0


In [10]:
# saving
df.to_csv("sentimentdataset.csv")

In [11]:
# Data Cleaning
df.columns

Index(['Message', 'Target'], dtype='object')

In [12]:
#checking missing values
df[df.isnull().any(axis=1)].head()

Unnamed: 0,Unnamed: 1,Message,Target


## Cleaning text data

In [13]:
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
nlp = spacy.load('en')

In [14]:
# Build a list of stopwords to use to filter
stopwords = list(STOP_WORDS)
len(stopwords)

312

In [15]:
print(stopwords)

['behind', "'ll", 'before', 'others', 'neither', 'an', 'when', 'himself', "'ve", 'about', 'now', 'than', 'seeming', 'during', 'along', 'down', 'if', 'thereupon', 'three', 'thus', 'would', 'keep', 'into', 'only', 'so', 'her', 'alone', 'make', 'nor', 'few', 'hereby', 'serious', 'get', 'therein', 'often', 'how', 'whatever', 'after', 'up', 'until', 'formerly', 'myself', 'from', 'becomes', 'noone', 'no', 'above', 'besides', 'of', 'do', 'ca', 'else', 'among', 'his', 'name', 'amongst', 'doing', 'quite', 'these', 'anyhow', 'being', 'already', 'sometime', 'was', 'indeed', 'i', 'nothing', 'cannot', 'does', 'just', 'he', 'has', 'were', 'all', 'someone', 'fifty', 'latter', 'or', 'under', 'both', 'whereas', 'done', 'hers', 'wherever', 'whither', 'some', 'over', 'nowhere', 'whoever', 'moreover', 'amount', 'here', 'same', 'such', 'using', 'between', 'because', 'many', 'though', 'becoming', 'front', 'yet', 'whole', 'we', 'again', 'much', 'unless', 'elsewhere', 'by', 'and', 'empty', 'this', 'latterly',

### Getting Lemma and Stop words

In [16]:
#example before we apply to our dataset
docx = nlp("This is how John Walker was walking. He was also running beside the lawn.")

In [17]:
# Lemmatizing of tokens
for word in docx:
    print(word.text,"Lemma =>",word.lemma_)

This Lemma => This
is Lemma => be
how Lemma => how
John Lemma => John
Walker Lemma => Walker
was Lemma => be
walking Lemma => walk
. Lemma => .
He Lemma => -PRON-
was Lemma => be
also Lemma => also
running Lemma => run
beside Lemma => beside
the Lemma => the
lawn Lemma => lawn
. Lemma => .


In [18]:
# Lemma that are not pronouns
for word in docx:
    if word.lemma_ != "-PRON-":
        print(word.lemma_.lower().strip())

this
be
how
john
walker
be
walk
.
be
also
run
beside
the
lawn
.


In [19]:
# List Comprehensions of our Lemma
[word.lemma_.lower().strip() if word.lemma_ != "-PRON-" else word.lower_ for word in docx]

['this',
 'be',
 'how',
 'john',
 'walker',
 'be',
 'walk',
 '.',
 'he',
 'be',
 'also',
 'run',
 'beside',
 'the',
 'lawn',
 '.']

In [20]:
# Filtering out Stopwords and Punctuations
for word in docx:
    if word.is_stop == False and not word.is_punct:
#     if word.is_stop != True and not word.is_punct:
        print(word)

John
Walker
walking
running
lawn


In [21]:
# Stop words and Punctuation In List Comprehension
[ word for word in docx if word.is_stop == False and not word.is_punct ]

[John, Walker, walking, running, lawn]

In [22]:
# Use the punctuations of string module
import string
punctuations = string.punctuation

In [23]:
# Creating a Spacy Parser
from spacy.lang.en import English
parser = English()

In [24]:
def spacy_tokenizer(sentence):
    mytokens = parser(sentence)
    mytokens = [ word.lemma_.lower().strip() if word.lemma_ != "-PRON-" else word.lower_ for word in mytokens ]
    mytokens = [ word for word in mytokens if word not in stopwords and word not in punctuations ]
    return mytokens

## Machine Learning With SKlearn

In [25]:
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.metrics import accuracy_score 
from sklearn.base import TransformerMixin 
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC

In [26]:
#Custom transformer using spaCy 
class predictors(TransformerMixin):
    def transform(self, X, **transform_params):
        return [clean_text(text) for text in X]
    def fit(self, X, y=None, **fit_params):
        return self
    def get_params(self, deep=True):
        return {}

# Basic function to clean the text 
def clean_text(text):     
    return text.strip().lower()

In [27]:
# Vectorization
vectorizer = CountVectorizer(tokenizer = spacy_tokenizer, ngram_range=(1,1)) 
classifier = LinearSVC()

In [28]:
# Using Tfidf
tfvectorizer = TfidfVectorizer(tokenizer = spacy_tokenizer)

In [29]:
# Splitting Data Set
from sklearn.model_selection import train_test_split
# Features and Labels
X = df['Message']
ylabels = df['Target']
X_train, X_test, y_train, y_test = train_test_split(X, ylabels, test_size=0.2, random_state=2019)

In [30]:
# Create the  pipeline to clean, tokenize, vectorize, and classify 
pipe = Pipeline([("cleaner", predictors()),
                 ('vectorizer', vectorizer),
                 ('classifier', classifier)])

In [31]:
# Fit our data
pipe.fit(X_train,y_train)

Pipeline(memory=None,
     steps=[('cleaner', <__main__.predictors object at 0x0000019F252DEBE0>), ('vectorizer', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
      ...ax_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0))])

In [32]:
# Predicting with a test dataset
sample_prediction = pipe.predict(X_test)

# Prediction Results

 1 = Positive review<br>
 0 = Negative review

In [33]:
for (sample,pred) in zip(X_test,sample_prediction):
    print(sample,"Prediction=>",pred)

Everything worked on the first try.The device was certainly engineered in a clever way and the construction feels good. Prediction=> 0
Great it was new packaged nice works good, no problems and it came in less time then I expected!!!! Prediction=> 1
I must say I have taped most of the episodes and i find myself watching them over and over again.   Prediction=> 1
I was VERY disappointed!! Prediction=> 0
It's pretty surprising that this wonderful film was made in 1949, as Hollywood generally had its collective heads in the sand concerning black and white issues at that time.   Prediction=> 1
if you simply want a small flip phone -- look elsewhere as the extra bells & whistles are mediocre. Prediction=> 0
It is indescribably the most annoying and idiotic show I have ever seen.   Prediction=> 0
This is one of Peter Watkins most accessible films.   Prediction=> 1
Internet is excrutiatingly slow. Prediction=> 0
It is light, easy to use, and has very clear reception and transmission. Predicti

Every single character was hilarious and deserved to be called a lead.   Prediction=> 0
First time going but I think I will quickly become a regular. Prediction=> 1
I love that they put their food in nice plastic containers as opposed to cramming it in little paper takeout boxes. Prediction=> 1
(It wasn't busy either) Also, the building was FREEZING cold. Prediction=> 0
Nothing special. Prediction=> 1
Phone now holds charge like it did when it was new. Prediction=> 0
This mostly routine fact-based TV drama gets a boost from the fine performance by Cole.   Prediction=> 1
This product is very High quality Chinese CRAP!!!!!! Prediction=> 0
The worst was the salmon sashimi. Prediction=> 0
I believe the screenwriter did a good job of tying up the loose ends.   Prediction=> 1
We were promptly greeted and seated. Prediction=> 1
I don't have very many words to say about this place, but it does everything pretty well. Prediction=> 1
That's how I'd describe this painfully dreary time-waster of a

Every time I eat here, I see caring teamwork to a professional degree. Prediction=> 0
DO NOT BUY DO NOT BUYIT SUCKS Prediction=> 0
IMDB ratings only go as low 1 for awful, it's time to get some negative numbers in there for cases such as these.   Prediction=> 0
The case is great and works fine with the 680. Prediction=> 1
By this point, my friends and I had basically figured out this place was a joke and didn't mind making it publicly and loudly known. Prediction=> 0
Great Earpiece. Prediction=> 1
VERY funny!   Prediction=> 1
But the convoluted plot just didn't convince me, and much of the film was watched with a weird, questioning glance.   Prediction=> 0
I connected my wife's bluetooth,(Motorola HS850) to my phone and it worked like a charm whether the phone was in my pocket or the case. Prediction=> 1
The building itself seems pretty neat; the bathroom is pretty trippy, but I wouldn't eat here again. Prediction=> 1
Yet Plantronincs continues to use the same flawed charger design. Pr

Main thing I didn't enjoy is that the crowd is of older crowd, around mid 30s and up. Prediction=> 1
Hayao Miyazaki's latest and eighth film for Studio Ghibili, "Gake No Ue No Ponyo" (Ponyo on the Cliff by the Sea) is a wonderfully fun and imaginative look at childhood.   Prediction=> 1
No ear loop needed, it's tiny and the sound is great. Prediction=> 1
I would highly recommend this. Prediction=> 1
I consider this theft. Prediction=> 1
Unfortunately, 'Cover Girl' is an example of how Hollywood used to exploit women for financial gain.   Prediction=> 0
The transfers are very good.   Prediction=> 1
I liked this movie way too much.   Prediction=> 0
My boyfriend tried the Mediterranean Chicken Salad and fell in love. Prediction=> 1
However, the ear pads come off easily and after only one week I lost one. Prediction=> 0
The deal included 5 tastings and 2 drinks, and Jeff went above and beyond what we expected. Prediction=> 1
Your staff spends more time talking to themselves than me. Predic

But when someone strives for greatness and poetry, but delivers a muddled (and often ridiculous) story, a bunch of disparate scenes, pretentious dialogue... Then you get the worst kind of a movie that some other reviewer very accurately defined as pretentious crap".   Prediction=> 1
This place deserves no stars. Prediction=> 0
It feels more comfortable than most headsets because I wear glasses and that gets in the way sometimes. Prediction=> 1
The item received was Counterfeit. Prediction=> 0
Works great!. Prediction=> 1
It's so bad it's actually worth seeing just for that reason.   Prediction=> 0
This scene is very strong and unpleasant.   Prediction=> 1
good item, low price. Prediction=> 0
very disappointed. Prediction=> 0
All the characters in this film are tremendously well played.   Prediction=> 1
Insults, profound deuchebaggery, and had to go outside for a smoke break while serving just to solidify it. Prediction=> 0
Things that went wrong: - They burned the saganaki. Prediction=

The range is very decent, I've been able to roam around my house with the phone in the living room with no reception/sound quality issues. Prediction=> 1
) some great music, and terrific scenery.   Prediction=> 1
$50 Down the drain. Prediction=> 0
Really good product. Prediction=> 1
I love this bluetooth! Prediction=> 1
Same evening, him and I are both drastically sick. Prediction=> 0
This phone is pretty sturdy and I've never had any large problems with it. Prediction=> 1
Director Paul Matthews, who also wrote/directed the weak 1995 monster movie "Grim", clearly doesn't know how to pace his films.   Prediction=> 1
However, there was so much garlic in the fondue, it was barely edible. Prediction=> 0
Freezes frequently4. Prediction=> 0
We definately enjoyed ourselves. Prediction=> 1
There is so much good food in Vegas that I feel cheated for wasting an eating opportunity by going to Rice and Company. Prediction=> 0
A world better than 95% of the garbage in the theatres today.   Predicti

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



# Accuracy

In [34]:
print("Accuracy: ",pipe.score(X_test,y_test))
print("Accuracy: ",pipe.score(X_test,sample_prediction))

Accuracy:  0.7868852459016393
Accuracy:  1.0


In [35]:
# Accuracy on train set
print("Accuracy: ",pipe.score(X_train,y_train))

Accuracy:  0.9785974499089253


## Using Tf-idf

In [36]:
# Create the  pipeline to clean, tokenize, vectorize, and classify 
pipe_tfid = Pipeline([("cleaner", predictors()),
                 ('vectorizer', tfvectorizer),
                 ('classifier', classifier)])

In [37]:
pipe_tfid.fit(X_train,y_train)

Pipeline(memory=None,
     steps=[('cleaner', <__main__.predictors object at 0x0000019F254C15C0>), ('vectorizer', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
    ...ax_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0))])

In [38]:
sample_prediction1 = pipe_tfid.predict(X_test)

In [39]:
for (sample,pred) in zip(X_test,sample_prediction1):
    print(sample,"Prediction=>", pred)

Everything worked on the first try.The device was certainly engineered in a clever way and the construction feels good. Prediction=> 0
Great it was new packaged nice works good, no problems and it came in less time then I expected!!!! Prediction=> 1
I must say I have taped most of the episodes and i find myself watching them over and over again.   Prediction=> 1
I was VERY disappointed!! Prediction=> 0
It's pretty surprising that this wonderful film was made in 1949, as Hollywood generally had its collective heads in the sand concerning black and white issues at that time.   Prediction=> 1
if you simply want a small flip phone -- look elsewhere as the extra bells & whistles are mediocre. Prediction=> 0
It is indescribably the most annoying and idiotic show I have ever seen.   Prediction=> 0
This is one of Peter Watkins most accessible films.   Prediction=> 1
Internet is excrutiatingly slow. Prediction=> 0
It is light, easy to use, and has very clear reception and transmission. Predicti

Every single character was hilarious and deserved to be called a lead.   Prediction=> 1
First time going but I think I will quickly become a regular. Prediction=> 1
I love that they put their food in nice plastic containers as opposed to cramming it in little paper takeout boxes. Prediction=> 1
(It wasn't busy either) Also, the building was FREEZING cold. Prediction=> 0
Nothing special. Prediction=> 1
Phone now holds charge like it did when it was new. Prediction=> 0
This mostly routine fact-based TV drama gets a boost from the fine performance by Cole.   Prediction=> 1
This product is very High quality Chinese CRAP!!!!!! Prediction=> 0
The worst was the salmon sashimi. Prediction=> 0
I believe the screenwriter did a good job of tying up the loose ends.   Prediction=> 0
We were promptly greeted and seated. Prediction=> 1
I don't have very many words to say about this place, but it does everything pretty well. Prediction=> 1
That's how I'd describe this painfully dreary time-waster of a

Every time I eat here, I see caring teamwork to a professional degree. Prediction=> 0
DO NOT BUY DO NOT BUYIT SUCKS Prediction=> 0
IMDB ratings only go as low 1 for awful, it's time to get some negative numbers in there for cases such as these.   Prediction=> 0
The case is great and works fine with the 680. Prediction=> 1
By this point, my friends and I had basically figured out this place was a joke and didn't mind making it publicly and loudly known. Prediction=> 0
Great Earpiece. Prediction=> 1
VERY funny!   Prediction=> 1
But the convoluted plot just didn't convince me, and much of the film was watched with a weird, questioning glance.   Prediction=> 0
I connected my wife's bluetooth,(Motorola HS850) to my phone and it worked like a charm whether the phone was in my pocket or the case. Prediction=> 1
The building itself seems pretty neat; the bathroom is pretty trippy, but I wouldn't eat here again. Prediction=> 1
Yet Plantronincs continues to use the same flawed charger design. Pr

Main thing I didn't enjoy is that the crowd is of older crowd, around mid 30s and up. Prediction=> 0
Hayao Miyazaki's latest and eighth film for Studio Ghibili, "Gake No Ue No Ponyo" (Ponyo on the Cliff by the Sea) is a wonderfully fun and imaginative look at childhood.   Prediction=> 1
No ear loop needed, it's tiny and the sound is great. Prediction=> 1
I would highly recommend this. Prediction=> 1
I consider this theft. Prediction=> 1
Unfortunately, 'Cover Girl' is an example of how Hollywood used to exploit women for financial gain.   Prediction=> 0
The transfers are very good.   Prediction=> 1
I liked this movie way too much.   Prediction=> 0
My boyfriend tried the Mediterranean Chicken Salad and fell in love. Prediction=> 1
However, the ear pads come off easily and after only one week I lost one. Prediction=> 0
The deal included 5 tastings and 2 drinks, and Jeff went above and beyond what we expected. Prediction=> 1
Your staff spends more time talking to themselves than me. Predic

But when someone strives for greatness and poetry, but delivers a muddled (and often ridiculous) story, a bunch of disparate scenes, pretentious dialogue... Then you get the worst kind of a movie that some other reviewer very accurately defined as pretentious crap".   Prediction=> 1
This place deserves no stars. Prediction=> 0
It feels more comfortable than most headsets because I wear glasses and that gets in the way sometimes. Prediction=> 1
The item received was Counterfeit. Prediction=> 0
Works great!. Prediction=> 1
It's so bad it's actually worth seeing just for that reason.   Prediction=> 0
This scene is very strong and unpleasant.   Prediction=> 1
good item, low price. Prediction=> 0
very disappointed. Prediction=> 0
All the characters in this film are tremendously well played.   Prediction=> 1
Insults, profound deuchebaggery, and had to go outside for a smoke break while serving just to solidify it. Prediction=> 0
Things that went wrong: - They burned the saganaki. Prediction=

The range is very decent, I've been able to roam around my house with the phone in the living room with no reception/sound quality issues. Prediction=> 1
) some great music, and terrific scenery.   Prediction=> 1
$50 Down the drain. Prediction=> 0
Really good product. Prediction=> 1
I love this bluetooth! Prediction=> 1
Same evening, him and I are both drastically sick. Prediction=> 0
This phone is pretty sturdy and I've never had any large problems with it. Prediction=> 1
Director Paul Matthews, who also wrote/directed the weak 1995 monster movie "Grim", clearly doesn't know how to pace his films.   Prediction=> 0
However, there was so much garlic in the fondue, it was barely edible. Prediction=> 0
Freezes frequently4. Prediction=> 0
We definately enjoyed ourselves. Prediction=> 1
There is so much good food in Vegas that I feel cheated for wasting an eating opportunity by going to Rice and Company. Prediction=> 0
A world better than 95% of the garbage in the theatres today.   Predicti

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [40]:
print("Accuracy: ",pipe_tfid.score(X_test,y_test))
print("Accuracy: ",pipe_tfid.score(X_test,sample_prediction1))

Accuracy:  0.8069216757741348
Accuracy:  1.0


In [41]:
# Accuracy on train set
print("Accuracy: ",pipe_tfid.score(X_train,y_train))

Accuracy:  0.9735883424408015
