In [1]:
# Sample restaurant review used as the raw input text for every step below.
# The literal starts with a newline, so the first tokenized sentence keeps a leading "\n".
paragraph = """
Came for lunch with my sister. We loved our Thai-style mains which were amazing with lots of flavour, very impressive for a vegetarian restaurant.

But the service was below average and the chips were too terrible to finish.

When we arrived at 1.40, we had to wait 20 minutes while they got our table ready. OK, so we didn't have a reservation, but the restaurant was only half full. There was no reason to make us wait at all.

We ordered the chips as a side dish and they looked delicious. But, when we tasted them, they were overcooked and swimming in oil so we left most of them. We expected a lot more for $10!

When the waiter asked if everything was ok, we said we really didn't like the chips and he said 'That's funny, I love them' and that was it. He didn't offer us anything else or take them off our bill. Also, when we didn't leave a tip, he looked annoyed.

I was really excited about visiting Vega, and the mains were just fantastic, but the rest of the experience was really disappointing.
"""

In [2]:
import nltk

## Breaking Paragraph at sentence level

In [3]:
# Split the review into sentences, then show one sentence per output line.
sentences = nltk.sent_tokenize(paragraph)
for sentence in sentences:
    print(sentence)


Came for lunch with my sister.
We loved our Thai-style mains which were amazing with lots of flavour, very impressive for a vegetarian restaurant.
But the service was below average and the chips were too terrible to finish.
When we arrived at 1.40, we had to wait 20 minutes while they got our table ready.
OK, so we didn't have a reservation, but the restaurant was only half full.
There was no reason to make us wait at all.
We ordered the chips as a side dish and they looked delicious.
But, when we tasted them, they were overcooked and swimming in oil so we left most of them.
We expected a lot more for $10!
When the waiter asked if everything was ok, we said we really didn't like the chips and he said 'That's funny, I love them' and that was it.
He didn't offer us anything else or take them off our bill.
Also, when we didn't leave a tip, he looked annoyed.
I was really excited about visiting Vega, and the mains were just fantastic, but the rest of the experience was really disappointin

### Using a regular expression to replace every character except letters (a-z, A-Z) and digits (0-9) with a space

In [4]:
import re
# Normalise each sentence: every character that is not a letter or digit
# becomes a single space, and the result is lower-cased.
corpus = [
    re.sub('[^a-zA-Z0-9]', " ", sentence).lower()
    for sentence in sentences
]
print(corpus)

[' came for lunch with my sister ', 'we loved our thai style mains which were amazing with lots of flavour  very impressive for a vegetarian restaurant ', 'but the service was below average and the chips were too terrible to finish ', 'when we arrived at 1 40  we had to wait 20 minutes while they got our table ready ', 'ok  so we didn t have a reservation  but the restaurant was only half full ', 'there was no reason to make us wait at all ', 'we ordered the chips as a side dish and they looked delicious ', 'but  when we tasted them  they were overcooked and swimming in oil so we left most of them ', 'we expected a lot more for  10 ', 'when the waiter asked if everything was ok  we said we really didn t like the chips and he said  that s funny  i love them  and that was it ', 'he didn t offer us anything else or take them off our bill ', 'also  when we didn t leave a tip  he looked annoyed ', 'i was really excited about visiting vega  and the mains were just fantastic  but the rest of 

## Printing each doc in corpus separately just to view

In [5]:
# Show every cleaned document next to its zero-based position in `corpus`.
for position, document in enumerate(corpus):
    print(f"{position} {document}")

0  came for lunch with my sister 
1 we loved our thai style mains which were amazing with lots of flavour  very impressive for a vegetarian restaurant 
2 but the service was below average and the chips were too terrible to finish 
3 when we arrived at 1 40  we had to wait 20 minutes while they got our table ready 
4 ok  so we didn t have a reservation  but the restaurant was only half full 
5 there was no reason to make us wait at all 
6 we ordered the chips as a side dish and they looked delicious 
7 but  when we tasted them  they were overcooked and swimming in oil so we left most of them 
8 we expected a lot more for  10 
9 when the waiter asked if everything was ok  we said we really didn t like the chips and he said  that s funny  i love them  and that was it 
10 he didn t offer us anything else or take them off our bill 
11 also  when we didn t leave a tip  he looked annoyed 
12 i was really excited about visiting vega  and the mains were just fantastic  but the rest of the exper

## Using a stemmer to reduce each word to its stem, and filtering out stop words

In [6]:
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
nltk.download('stopwords')

stemmer = PorterStemmer()

# Build the stop-word set once. Calling stopwords.words('english') inside the
# token loop re-reads the list for every token and does a linear list search;
# a set built up front gives the same membership answers far more cheaply.
english_stopwords = set(stopwords.words('english'))

stemmed_corpus = []  # one string of space-terminated stems per sentence
stop_words = []      # every stop word that was dropped, in order of occurrence

# Separate stop words from content words and stem only the content words.
for sent in corpus:
    stemmed_doc = ""
    for word in nltk.word_tokenize(sent):
        if word in english_stopwords:
            stop_words.append(word)
        else:
            # Each stem is followed by a space, so every document keeps a
            # trailing space (later cells display it that way).
            stemmed_doc += stemmer.stem(word) + " "
    stemmed_corpus.append(stemmed_doc)
stemmed_corpus

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


['came lunch sister ',
 'love thai style main amaz lot flavour impress vegetarian restaur ',
 'servic averag chip terribl finish ',
 'arriv 1 40 wait 20 minut got tabl readi ',
 'ok reserv restaur half full ',
 'reason make us wait ',
 'order chip side dish look delici ',
 'tast overcook swim oil left ',
 'expect lot 10 ',
 'waiter ask everyth ok said realli like chip said funni love ',
 'offer us anyth els take bill ',
 'also leav tip look annoy ',
 'realli excit visit vega main fantast rest experi realli disappoint ']

## All the words other than stop words, after applying stemming

In [7]:
# Print all stemmed documents on a single line, separated by a space.
# Each item is a whole document (a string of stems), not a single word.
for stemmed_sentence in stemmed_corpus:
    print(stemmed_sentence, end=" ")

came lunch sister  love thai style main amaz lot flavour impress vegetarian restaur  servic averag chip terribl finish  arriv 1 40 wait 20 minut got tabl readi  ok reserv restaur half full  reason make us wait  order chip side dish look delici  tast overcook swim oil left  expect lot 10  waiter ask everyth ok said realli like chip said funni love  offer us anyth els take bill  also leav tip look annoy  realli excit visit vega main fantast rest experi realli disappoint  

## Example to understand what a stemmer does

In [9]:
# Example: the Porter stemmer reduces "history" to its stem. The result shown
# below is a stem, which is not necessarily a dictionary word.
stemmer.stem("history")

'histori'

## All the stop words that were filtered out (collected as-is; they are not stemmed)

In [10]:
# Print every stop word that was filtered out, space-separated on one line.
# The loop variable must not be called `stopwords`: that name is the
# nltk.corpus stop-word reader imported earlier, and rebinding it to a string
# would break any later `stopwords.words(...)` call until it is re-imported.
for stop_word in stop_words:
    print(stop_word, end=" ")

for with my we our which were with of very for a but the was below and the were too to when we at we had to while they our so we didn t have a but the was only there was no to at all we the as a and they but when we them they were and in so we most of them we a more for when the if was we we didn t the and he that s i them and that was it he didn t or them off our when we didn t a he i was about and the were just but the of the was 

## Using a lemmatizer to reduce each word to its dictionary form (lemma)

In [11]:
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer

# Create the lemmatizer that later cells call as `lemmatizer.lemmatize(...)`.
# Without this line, a fresh "Restart Kernel -> Run All" fails with
# NameError, because no other cell in the notebook defines `lemmatizer`.
lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [13]:
# Re-display the stemmed documents (one string of stems per sentence).
stemmed_corpus

['came lunch sister ',
 'love thai style main amaz lot flavour impress vegetarian restaur ',
 'servic averag chip terribl finish ',
 'arriv 1 40 wait 20 minut got tabl readi ',
 'ok reserv restaur half full ',
 'reason make us wait ',
 'order chip side dish look delici ',
 'tast overcook swim oil left ',
 'expect lot 10 ',
 'waiter ask everyth ok said realli like chip said funni love ',
 'offer us anyth els take bill ',
 'also leav tip look annoy ',
 'realli excit visit vega main fantast rest experi realli disappoint ']

In [14]:
# Lemmatize every token of the already-stemmed documents. Each lemma is
# followed by a space, so every document keeps a trailing space.
# NOTE: lemmatizing stems can shorten them further; the printed result
# contains 'u' and 'el' where the stemmed corpus had 'us' and 'els'.
stemmed_lemmatized_corpus = [
    "".join(
        lemmatizer.lemmatize(token) + " "
        for token in nltk.word_tokenize(stemmed_doc)
    )
    for stemmed_doc in stemmed_corpus
]

print(stemmed_lemmatized_corpus)
    

['came lunch sister ', 'love thai style main amaz lot flavour impress vegetarian restaur ', 'servic averag chip terribl finish ', 'arriv 1 40 wait 20 minut got tabl readi ', 'ok reserv restaur half full ', 'reason make u wait ', 'order chip side dish look delici ', 'tast overcook swim oil left ', 'expect lot 10 ', 'waiter ask everyth ok said realli like chip said funni love ', 'offer u anyth el take bill ', 'also leav tip look annoy ', 'realli excit visit vega main fantast rest experi realli disappoint ']


In [15]:
from sklearn.feature_extraction.text import CountVectorizer

# Binary bag-of-words: with binary=True each feature records presence (1) or
# absence (0) of a term instead of its count; scikit-learn's built-in English
# stop-word list is applied as well.
binary_BOW_vectorizer = CountVectorizer(
    binary=True,
    stop_words='english',
)

# N-gram features can be added to the bag-of-words by passing the
# `ngram_range` parameter to the vectorizer.

In [16]:
# Test code: the same stemming / stop-word filtering that is implemented above.
from nltk.stem import PorterStemmer  # reduces each word to its stem
from nltk.corpus import stopwords    # words that have no direct impact on the semantics of the documents

stemmer = PorterStemmer()

# Build the stop-word set once instead of calling stopwords.words('english')
# for every token; membership answers are identical, lookups are much cheaper.
english_stopwords = set(stopwords.words('english'))

stemmed_corpus = []  # one string of space-terminated stems per sentence
stop_words = []      # every stop word that was dropped, in order of occurrence

# Separate stop words from content words and stem only the content words.
for sent in corpus:
    doc = ""
    for word in nltk.word_tokenize(sent):
        if word in english_stopwords:
            stop_words.append(word)
        else:
            doc += stemmer.stem(word) + " "
    stemmed_corpus.append(doc)

print(stemmed_corpus)

['came lunch sister ', 'love thai style main amaz lot flavour impress vegetarian restaur ', 'servic averag chip terribl finish ', 'arriv 1 40 wait 20 minut got tabl readi ', 'ok reserv restaur half full ', 'reason make us wait ', 'order chip side dish look delici ', 'tast overcook swim oil left ', 'expect lot 10 ', 'waiter ask everyth ok said realli like chip said funni love ', 'offer us anyth els take bill ', 'also leav tip look annoy ', 'realli excit visit vega main fantast rest experi realli disappoint ']


In [17]:
# Learn the vocabulary from the stemmed + lemmatized documents and encode
# each document as one row of a sparse document-term matrix.
X = binary_BOW_vectorizer.fit_transform(stemmed_lemmatized_corpus)
X

<13x62 sparse matrix of type '<class 'numpy.int64'>'
	with 72 stored elements in Compressed Sparse Row format>

In [18]:
# `vocabulary_` maps each learned term to its column index in the matrix.
vocabulary  = binary_BOW_vectorizer.vocabulary_
vocabulary

{'came': 9,
 'lunch': 32,
 'sister': 49,
 'love': 31,
 'thai': 55,
 'style': 50,
 'main': 33,
 'amaz': 3,
 'lot': 30,
 'flavour': 21,
 'impress': 25,
 'vegetarian': 58,
 'restaur': 46,
 'servic': 48,
 'averag': 8,
 'chip': 10,
 'terribl': 54,
 'finish': 20,
 'arriv': 6,
 '40': 2,
 'wait': 60,
 '20': 1,
 'minut': 35,
 'got': 23,
 'tabl': 52,
 'readi': 41,
 'ok': 38,
 'reserv': 44,
 'half': 24,
 'reason': 43,
 'make': 34,
 'order': 39,
 'dish': 13,
 'look': 29,
 'delici': 11,
 'tast': 53,
 'overcook': 40,
 'swim': 51,
 'oil': 37,
 'left': 27,
 'expect': 17,
 '10': 0,
 'waiter': 61,
 'ask': 7,
 'everyth': 15,
 'said': 47,
 'realli': 42,
 'like': 28,
 'funni': 22,
 'offer': 36,
 'anyth': 5,
 'el': 14,
 'leav': 26,
 'tip': 56,
 'annoy': 4,
 'excit': 16,
 'visit': 59,
 'vega': 57,
 'fantast': 19,
 'rest': 45,
 'experi': 18,
 'disappoint': 12}

In [19]:
# Number of distinct terms learned by the vectorizer.
len(vocabulary)

62

In [20]:
# Spot-check a few rows of the document-term matrix: print the shape of the
# dense form of each selected row.
for row_index in (0, 10, 12, 3):
    print(X[row_index].toarray().shape)

(1, 62)
(1, 62)
(1, 62)
(1, 62)


In [21]:
# Show the first four cleaned documents, labelled with 1-based numbers.
for offset in range(4):
    print(offset + 1, corpus[offset])

1  came for lunch with my sister 
2 we loved our thai style mains which were amazing with lots of flavour  very impressive for a vegetarian restaurant 
3 but the service was below average and the chips were too terrible to finish 
4 when we arrived at 1 40  we had to wait 20 minutes while they got our table ready 


In [22]:
# List the columns of the first document's row whose value is exactly 1,
# i.e. the vocabulary indices of the terms present in that document.
first_doc_vector = X[0].toarray()[0]
for column_index, flag in enumerate(first_doc_vector):
    if flag != 1:
        continue
    print(column_index, flag)

9 1
32 1
49 1


In [23]:
# Display the raw sentence list produced by sent_tokenize; it has not been
# modified by any of the cleaning steps.
sentences

['\nCame for lunch with my sister.',
 'We loved our Thai-style mains which were amazing with lots of flavour, very impressive for a vegetarian restaurant.',
 'But the service was below average and the chips were too terrible to finish.',
 'When we arrived at 1.40, we had to wait 20 minutes while they got our table ready.',
 "OK, so we didn't have a reservation, but the restaurant was only half full.",
 'There was no reason to make us wait at all.',
 'We ordered the chips as a side dish and they looked delicious.',
 'But, when we tasted them, they were overcooked and swimming in oil so we left most of them.',
 'We expected a lot more for $10!',
 "When the waiter asked if everything was ok, we said we really didn't like the chips and he said 'That's funny, I love them' and that was it.",
 "He didn't offer us anything else or take them off our bill.",
 "Also, when we didn't leave a tip, he looked annoyed.",
 'I was really excited about visiting Vega, and the mains were just fantastic, but

## All of the above steps combined in a single cell

In [24]:
import re
from nltk.corpus import stopwords
# Full preprocessing pipeline in one pass: clean -> drop stop words ->
# lemmatize -> stem -> re-join. Relies on `sentences`, `lemmatizer` and
# `stemmer` from earlier cells.
# NOTE: this rebinds `corpus`, replacing the cleaned-but-unstemmed list
# built earlier in the notebook.

# Build the stop-word set once rather than once per word.
english_stopwords = set(stopwords.words('english'))

corpus = []
for sentence in sentences:
    # Replace everything except letters and digits with spaces, lower-case, tokenize.
    tokens = re.sub('[^a-zA-Z0-9]', ' ', sentence).lower().split()

    # Lemmatize the non-stop-word tokens first, then stem the lemmas.
    lemmas = [lemmatizer.lemmatize(token) for token in tokens if token not in english_stopwords]
    stems = [stemmer.stem(lemma) for lemma in lemmas]

    corpus.append(' '.join(stems))
corpus

['came lunch sister',
 'love thai style main amaz lot flavour impress vegetarian restaur',
 'servic averag chip terribl finish',
 'arriv 1 40 wait 20 minut got tabl readi',
 'ok reserv restaur half full',
 'reason make u wait',
 'order chip side dish look delici',
 'tast overcook swim oil left',
 'expect lot 10',
 'waiter ask everyth ok said realli like chip said funni love',
 'offer u anyth els take bill',
 'also leav tip look annoy',
 'realli excit visit vega main fantast rest experi realli disappoint']

# TFIDF

In [25]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [26]:
# Fit a TF-IDF vectorizer with default settings on the stemmed + lemmatized
# documents. `X` is rebound here, replacing the bag-of-words matrix.
cv = TfidfVectorizer()
X = cv.fit_transform(stemmed_lemmatized_corpus)

In [27]:
# First document, exactly as it was fed to the TF-IDF vectorizer.
stemmed_lemmatized_corpus[0]

'came lunch sister '

In [28]:
# Dense TF-IDF row for the first document.
X[0].toarray()

array([[0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.57735027, 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.57735027, 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.57735027, 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        ]])

In [29]:
# ngram_range=(3,3) makes the vectorizer use only word trigrams as features.
# `cv` and `X` are rebound, replacing the default TF-IDF model and matrix.
cv = TfidfVectorizer(ngram_range=(3,3))
X = cv.fit_transform(stemmed_lemmatized_corpus)
X[0].toarray()

array([[0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0.]])

In [30]:
# Trigram TF-IDF again, with max_features=10 capping the vocabulary at 10
# trigrams (per the scikit-learn docs, the most frequent across the corpus).
# NOTE(review): the row shown for the first document is all zeros, presumably
# because none of its trigrams made it into the retained 10 -- verify.
cv = TfidfVectorizer(ngram_range=(3,3),max_features=10)
X = cv.fit_transform(stemmed_lemmatized_corpus)
X[0].toarray()

array([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]])