In [1]:
import nltk

# Sentence tokenization below relies on the Punkt models. Newer NLTK releases
# load them from the 'punkt_tab' resource, older ones from 'punkt'; fetch both
# so nltk.sent_tokenize works on a fresh kernel with either version.
# nltk.download returns False (without raising) if a resource id is unknown.
nltk.download('punkt_tab')
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [2]:
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer

In [5]:
# Fetch the stop-word lists; stopwords.words('english') is used when cleaning the sentences.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [3]:
paragraph = """
              When I was 17, I read a quote that went something like: “If you live each day as if it was your last,
              someday you’ll most certainly be right.” It made an impression on me, and since then, for the past 33 years,
              I have looked in the mirror every morning and asked myself: “If today were the last day of my life,
              would I want to do what I am about to do today?” And whenever the answer has been “No” for too many days in a row,
               I know I need to change something.
               No one wants to die. Even people who want to go to heaven don’t want to
               die to get there.
               And yet death is the destination we all share.
               No one has ever escaped it. And that is as it should be, because
               Death is very likely the single best invention of Life. It is Life’s change agent.
               It clears out the old to make way for the new. Right now the new is you, but
               someday not too long from now,
               you will gradually become the old and be cleared away. Sorry to be so dramatic,
               but it is quite true.
              Your time is limited, so don’t waste it living someone else’s life.
              Don’t be trapped by dogma — which is living with the results of other people’s thinking.
              Don’t let the noise of others’ opinions drown out your own inner voice.
              And most important, have the courage to follow your heart and intuition.
              They somehow already know what you truly want to become. Everything else is secondary.
              When I was young, there was an amazing publication called The Whole Earth Catalog,
               which was one of the bibles of my generation.
               It was created by a fellow named Stewart Brand not far from here in Menlo Park, and
               he brought it to life with his poetic touch. This was in the late 1960s, before
                personal computers and desktop publishing, so it was all made with typewriters,
                scissors and Polaroid cameras.
                It was sort of like Google in paperback form, 35 years before Google came along:
                It was idealistic, and overflowing with neat tools and great notions.
              """


In [4]:
# Create the stemmer and lemmatizer objects.
# NOTE(review): only `stemmer` is used in the cells visible here; `lemma` may be
# used further down or may be left over - confirm before removing.
stemmer = PorterStemmer()
lemma = WordNetLemmatizer()

# Sentence tokenization: split the paragraph into a list of sentence strings.
# Despite the singular name, `sentence` is a list with one entry per sentence.
sentence = nltk.sent_tokenize(paragraph)
sentence

['\n              When I was 17, I read a quote that went something like: “If you live each day as if it was your last, \n              someday you’ll most certainly be right.” It made an impression on me, and since then, for the past 33 years, \n              I have looked in the mirror every morning and asked myself: “If today were the last day of my life, \n              would I want to do what I am about to do today?” And whenever the answer has been “No” for too many days in a row,\n               I know I need to change something.',
 'No one wants to die.',
 'Even people who want to go to heaven don’t want to \n               die to get there.',
 'And yet death is the destination we all share.',
 'No one has ever escaped it.',
 'And that is as it should be, because \n               Death is very likely the single best invention of Life.',
 'It is Life’s change agent.',
 'It clears out the old to make way for the new.',
 'Right now the new is you, but \n               someday not 

In [12]:
import re  # regular expressions

# Build the English stop-word set once. It does not change between words, so
# there is no reason to rebuild the list and the set for every token.
stop_words = set(stopwords.words('english'))

corpus = []  # one cleaned, stemmed, space-joined string per sentence

for sent in sentence:
    # Replace everything that is not an ASCII letter with a space. This removes
    # digits and punctuation, including curly quotes/apostrophes, so a
    # contraction like "don’t" is split into "don" and "t".
    rev = re.sub('[^a-zA-Z]', ' ', sent)
    rev = rev.lower()
    rev = rev.split()
    # Drop stop words, then reduce each remaining word to its Porter stem.
    rev = [stemmer.stem(word) for word in rev if word not in stop_words]
    rev = ' '.join(rev)
    corpus.append(rev)

In [13]:
# Display the cleaned, stemmed sentences.
corpus

['read quot went someth like live day last someday certainli right made impress sinc past year look mirror everi morn ask today last day life would want today whenev answer mani day row know need chang someth',
 'one want die',
 'even peopl want go heaven want die get',
 'yet death destin share',
 'one ever escap',
 'death like singl best invent life',
 'life chang agent',
 'clear old make way new',
 'right new someday long gradual becom old clear away',
 'sorri dramat quit true',
 'time limit wast live someon els life',
 'trap dogma live result peopl think',
 'let nois other opinion drown inner voic',
 'import courag follow heart intuit',
 'somehow alreadi know truli want becom',
 'everyth els secondari',
 'young amaz public call whole earth catalog one bibl gener',
 'creat fellow name stewart brand far menlo park brought life poetic touch',
 'late person comput desktop publish made typewrit scissor polaroid camera',
 'sort like googl paperback form year googl came along idealist over

In [7]:
# Creating the Bag of Words model
# CountVectorizer learns a vocabulary from `corpus` and encodes each sentence
# as a row of per-word occurrence counts.
from sklearn.feature_extraction.text import CountVectorizer
# max_features keeps at most the 1500 most frequent terms across the corpus.
cv = CountVectorizer(max_features = 1500)
# fit_transform returns a sparse matrix; toarray() converts it to a dense array.
X = cv.fit_transform(corpus).toarray()

In [8]:
X  # Each entry is how many times a vocabulary word occurs in a sentence (0 => absent).
   # These are counts, not 0/1 flags: CountVectorizer only binarizes when binary=True.

array([[0, 0, 0, ..., 1, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 1, 0, ..., 1, 0, 0]])