In [None]:
!pip install scikit-learn
!pip install U spacy
!python -m spacy download en
!python -m spacy download en_core_web_sm

What is a Bag of Words in NLP?
Bag of words is a Natural Language Processing technique of text modelling. In technical terms, we can say that it is a method of feature extraction with text data. This approach is a simple and flexible way of extracting features from documents.

A bag of words is a representation of text that describes the occurrence of words within a document. We just keep track of word counts and disregard the grammatical details and the word order. It is called a “bag” of words because any information about the order or structure of words in the document is discarded. The model is only concerned with whether (and how often) known words occur in the document, not where in the document they occur.

What Is TF-IDF?
TF-IDF is a statistical measure that evaluates how relevant a word is to a document in a collection of documents. This is done by multiplying two metrics: how many times a word appears in a document, and the inverse document frequency of the word across a set of documents.

It has many uses, most importantly in automated text analysis, and is very useful for scoring words in machine learning algorithms for Natural Language Processing (NLP).

TF-IDF (term frequency-inverse document frequency) was invented for document search and information retrieval. A word's TF-IDF score increases proportionally to the number of times the word appears in a document, but is offset by the number of documents that contain the word. So, words that are common in every document, such as "this", "what", and "if", rank low even though they may appear many times, since they don’t mean much to that document in particular.

However, if the word Bug appears many times in a document, while not appearing many times in others, it probably means that it’s very relevant. For example, if what we’re doing is trying to find out which topics some NPS responses belong to, the word Bug would probably end up being tied to the topic Reliability, since most responses containing that word would be about that topic.

In [None]:
import spacy
from spacy import displacy

In [None]:
nlp=spacy.load('en_core_web_sm')

In [None]:
text="This is first sentence Google, and Yahoo this is another one. Here's Hotmail the third."

In [None]:
doc=nlp(text)
doc

In [None]:
for token in doc:
    print(token)

In [None]:
# Add a rule-based sentence splitter ahead of the dependency parser.
# The membership guard keeps this cell re-runnable: registering a second
# component under the same name would raise an error.
if "sentencizer" not in nlp.pipe_names:
    if int(spacy.__version__.split(".")[0]) >= 3:
        # spaCy v3+: components are added by their registered string name.
        nlp.add_pipe("sentencizer", before="parser")
    else:
        # spaCy v2: build the component object first, then add it.
        nlp.add_pipe(nlp.create_pipe("sentencizer"), before="parser")

# Re-process the text so the new component takes effect.
doc = nlp(text)

# A distinct loop name avoids shadowing the pipeline component.
for sentence in doc.sents:
    print(sentence)

In [None]:
#Removing stop words
from spacy.lang.en.stop_words import STOP_WORDS
stopwords=list(STOP_WORDS)
print(stopwords)

In [None]:
len(stopwords)

In [None]:
# Show every token of `doc` that spaCy does not flag as a stop word.
content_tokens = [tok for tok in doc if not tok.is_stop]
for tok in content_tokens:
    print(tok)

In [None]:
#Lemmatization
doc=nlp("run runs running runner")
for lem in doc:
    print(lem.text,lem.lemma_)

In [None]:
#Part of Speech POS
doc=nlp("All is well at your end!")
for token in doc:
    print(token.text, token.pos_)

In [None]:
displacy.render(doc,style='dep')

In [None]:
#Entity Detection
doc=nlp("New York City on Tuesday declares a public health emergency and ordered mandatory measles vaccinations amid an outbreak, becoming the latest national flash point over refusals to innoculate against dangerous diseases. At least 25 people have contracted measles in the city since September, mostly in the Brooklyn'S Williamburg neighbourhood. The order covers four Zip codes there, Mayor Bill de Biasio (D) said on Tuesday. The mandate orders all unvaccinated people in the area, including a concentration of Orthodox Jews, to receive innoculations, including for children as young as 6 months old. Anyone who resists could be fined up to $1000.")
doc

In [None]:
# Highlight the named entities found in `doc` using displaCy's entity view.
displacy.render(doc,style='ent')
# NOTE: `doc` must be the object returned by nlp("..."); per the original
# author's note, rendering does not work otherwise.

In [None]:
#Text classification
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [None]:
# Load the Yelp file: tab-separated, with no header row (columns are named later).
data_yelp = pd.read_csv(
    "../input/yelplabelled-sentences/yelp_labelled.txt",
    sep="\t",
    header=None,
)
data_yelp.head()

In [None]:
columns_name=['Review','Sentiment']
data_yelp.columns=columns_name
data_yelp.head()

In [None]:
data_yelp.shape

In [None]:
# Load the Amazon file (tab-separated, no header row) and give it the
# same column names as the other review frames.
data_amazon = pd.read_csv(
    "../input/imdb-and-amazon-reviews-dataset/amazon_cells_labelled.txt",
    sep="\t",
    header=None,
)
data_amazon.columns = columns_name
data_amazon.head()

In [None]:
data_amazon.shape

In [None]:
# Load the IMDb file (tab-separated, no header row) and give it the
# same column names as the other review frames.
# NOTE(review): read_csv's default quoting treats '"' as a field quote; if any
# review contains an unbalanced double quote, adjacent lines could be merged
# into one row. Verify the shape in the next cell against the raw line count
# (quoting=csv.QUOTE_NONE would disable quote handling) -- TODO confirm.
data_imdb = pd.read_csv(
    "../input/imdb-and-amazon-reviews-dataset/imdb_labelled.txt",
    sep="\t",
    header=None,
)
data_imdb.columns = columns_name
data_imdb.head()

In [None]:
data_imdb.shape

In [None]:
# Stack the three review frames into one, renumbering rows from 0.
# pd.concat replaces DataFrame.append, which was deprecated in pandas 1.4
# and removed in pandas 2.0.
data = pd.concat([data_yelp, data_amazon, data_imdb], ignore_index=True)

In [None]:
data.head()

In [None]:
data.shape

In [None]:
data['Sentiment'].value_counts()

In [None]:
data.isnull().sum()

In [None]:
#Tokenization
import string
punct=string.punctuation
punct

In [None]:
def text_data_cleaning(sentence):
    """Tokenize a sentence into lower-cased lemmas, dropping noise tokens.

    The sentence is run through the module-level spaCy pipeline ``nlp``.
    Each token is replaced by its lower-cased, whitespace-stripped lemma;
    tokens whose lemma is the "-PRON-" placeholder keep their lower-cased
    surface form instead.

    A token is discarded when it is empty, appears in the module-level
    ``stopwords`` collection, or consists solely of characters from the
    module-level ``punct`` string.

    Parameters
    ----------
    sentence : str
        Raw text to process.

    Returns
    -------
    list of str
        The cleaned tokens, in their original order.
    """
    # Test punctuation character by character. A plain `token in punct`
    # on a string is a substring test, which lets multi-character
    # punctuation such as "..." or "!!" through.
    punct_chars = set(punct)

    cleaned_tokens = []
    for token in nlp(sentence):
        if token.lemma_ != "-PRON-":
            normalized = token.lemma_.lower().strip()
        else:
            normalized = token.lower_

        # Whitespace-only tokens normalize to the empty string.
        if not normalized:
            continue
        if normalized in stopwords:
            continue
        if all(ch in punct_chars for ch in normalized):
            continue
        cleaned_tokens.append(normalized)
    return cleaned_tokens
text_data_cleaning("Hello how are you today?! I like this video")

In [None]:
#Vectorization Feature Engineering (TF-IDF)
from sklearn.svm import LinearSVC

# A custom tokenizer makes the default `token_pattern` unused; passing None
# states that explicitly and avoids scikit-learn's unused-parameter warning.
tfidf = TfidfVectorizer(tokenizer=text_data_cleaning, token_pattern=None)

# Seed the classifier so repeated runs train the same model; the split
# below was already seeded with the same value.
classifier = LinearSVC(random_state=42)

X = data['Review']
y = data['Sentiment']

# Hold out 20% of the reviews for evaluation.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
X_train.shape, X_test.shape

In [None]:
# Chain the vectorizer and the classifier into one estimator, then train it
# on the training split.
steps = [("tfidf", tfidf), ("clf", classifier)]
clf = Pipeline(steps)
clf.fit(X_train, y_train)

In [None]:
# Predict labels for the held-out reviews and print the classification report.
y_pred = clf.predict(X_test)
report = classification_report(y_test, y_pred)
print(report)

In [None]:
confusion_matrix(y_test,y_pred)

In [None]:
clf.predict(['Wow, this is an amazing lesson'])

In [None]:
clf.predict(['Wow, this sucks'])

In [None]:
clf.predict(['Worth of watching it. Please like it'])

In [None]:
clf.predict(['Loved it. amazing'])