In [74]:
import nltk
import pandas as pd
import numpy as np
import re
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.corpus import wordnet
from nltk.corpus import stopwords

CC: Coordinating conjunction

CD: Cardinal number

DT: Determiner

EX: Existential there

FW: Foreign word

IN: Preposition or subordinating conjunction

JJ: Adjective

VP: Verb Phrase

JJR: Adjective, comparative

JJS: Adjective, superlative

LS: List item marker

MD: Modal

NN: Noun, singular or mass

NNS: Noun, plural

PP: Prepositional Phrase

NNP: Proper noun, singular

NNPS: Proper noun, plural

PDT: Predeterminer

POS: Possessive ending

PRP: Personal pronoun

PRP\$: Possessive pronoun

RB: Adverb

RBR: Adverb, comparative

RBS: Adverb, superlative

RP: Particle

S: Simple declarative clause

SBAR: Clause introduced by a (possibly empty) subordinating conjunction

SBARQ: Direct question introduced by a wh-word or a wh-phrase.

SINV: Inverted declarative sentence, i.e. one in which the subject follows the tensed verb or modal.

SQ: Inverted yes/no question, or main clause of a wh-question, following the wh-phrase in SBARQ.

SYM: Symbol

VBD: Verb, past tense

VBG: Verb, gerund or present participle

VBN: Verb, past participle

VBP: Verb, non-3rd person singular present

VBZ: Verb, 3rd person singular present

WDT: Wh-determiner

WP: Wh-pronoun

WP\$: Possessive wh-pronoun

WRB: Wh-adverb

In [102]:
# Step 1: Sentence segmentation.
reviews = ["This movie is the best! Will watch it again!", "Great Movie! Best one ever.", "Love it so much! Will watch, best of the best"]
word_list = []     # every token from every review, in reading order
tagged_words = []  # (token, POS tag) pairs, in the same order as word_list
for review in reviews:
    # Lower-case before splitting so all downstream tokens are case-normalised.
    sentences = nltk.sent_tokenize(review.lower())
    for sentence in sentences:
        # Replace every character that is not an ASCII letter with a space,
        # which drops punctuation and digits before tokenizing.
        sentence = re.sub("[^a-zA-Z]", " ", sentence)
        # Step 2: Word tokenization
        words = nltk.word_tokenize(sentence)
        # Step 3: Predict the part of speech of each token.
        # The tags are accumulated instead of being overwritten on every
        # sentence, so the result of this step stays available afterwards.
        tags = nltk.pos_tag(words)
        tagged_words.extend(tags)
        word_list.extend(words)

In [103]:
#Step 4: Text Lemmatization
def compare_stemmer_and_lemmatizer(stemmer, lemmatizer, word, pos):
    """
    Return the lemma of `word` for the given part of speech.

    Despite its name, this function no longer prints or compares anything:
    it only delegates to the lemmatizer and returns the result.

    Parameters
    ----------
    stemmer : unused; kept in the signature so existing calls keep working.
    lemmatizer : object exposing a `lemmatize(word, pos)` method.
    word : the token to lemmatize.
    pos : part-of-speech value forwarded unchanged to `lemmatizer.lemmatize`.

    Returns
    -------
    Whatever `lemmatizer.lemmatize(word, pos)` returns.
    """
    return lemmatizer.lemmatize(word, pos)

# Build the NLTK helpers once; the stemmer is only handed through to the helper.
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

# Overwrite each token in place with its lemma.
# NOTE(review): every token is lemmatized as a verb (wordnet.VERB), regardless
# of its actual part of speech - confirm this is intended.
for position, token in enumerate(word_list):
    word_list[position] = compare_stemmer_and_lemmatizer(
        stemmer, lemmatizer, word=token, pos=wordnet.VERB
    )
print(word_list)

['this', 'movie', 'be', 'the', 'best', 'will', 'watch', 'it', 'again', 'great', 'movie', 'best', 'one', 'ever', 'love', 'it', 'so', 'much', 'will', 'watch', 'best', 'of', 'the', 'best']


In [104]:
# Step 5: Remove stop words
# Load the English stop-word list once and keep it as a set: calling
# stopwords.words("english") inside the loop re-reads the corpus for every
# token and then does a linear search through the resulting list.
english_stop_words = set(stopwords.words("english"))
removed_stop_words = [word for word in word_list if word not in english_stop_words]
print(removed_stop_words)

['movie', 'best', 'watch', 'great', 'movie', 'best', 'one', 'ever', 'love', 'much', 'watch', 'best', 'best']


In [109]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words model: one row per review, one column per vocabulary term,
# each cell holding how often the term occurs in that review.
count_vectorizer = CountVectorizer()
bag_of_words = count_vectorizer.fit_transform(reviews)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement (requires scikit-learn >= 1.0).
feature_names = count_vectorizer.get_feature_names_out()

# fit_transform returns a sparse matrix; densify it so it can be shown as a table.
pd.DataFrame(bag_of_words.toarray(), columns = feature_names)

Unnamed: 0,again,best,ever,great,is,it,love,movie,much,of,one,so,the,this,watch,will
0,1,1,0,0,1,1,0,1,0,0,0,0,1,1,1,1
1,0,1,1,1,0,0,0,1,0,0,1,0,0,0,0,0
2,0,2,0,0,0,1,1,0,1,1,0,1,1,0,1,1


In [110]:
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd

# TF-IDF model: same vocabulary columns as a bag of words, but each cell is a
# TF-IDF weight instead of a raw count.
tfidf_vectorizer = TfidfVectorizer()
values = tfidf_vectorizer.fit_transform(reviews)

# Show the model as a pandas DataFrame.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement (requires scikit-learn >= 1.0).
feature_names = tfidf_vectorizer.get_feature_names_out()
pd.DataFrame(values.toarray(), columns = feature_names)

Unnamed: 0,again,best,ever,great,is,it,love,movie,much,of,one,so,the,this,watch,will
0,0.400294,0.23642,0.0,0.0,0.400294,0.304434,0.0,0.304434,0.0,0.0,0.0,0.0,0.304434,0.400294,0.304434,0.304434
1,0.0,0.298032,0.504611,0.504611,0.0,0.0,0.0,0.38377,0.0,0.0,0.504611,0.0,0.0,0.0,0.0,0.0
2,0.0,0.425441,0.0,0.0,0.0,0.273916,0.360167,0.0,0.360167,0.360167,0.0,0.360167,0.273916,0.0,0.273916,0.273916
