### Bag of words model

In [27]:
# load all necessary libraries
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer

pd.set_option('max_colwidth', 100)

#### Let's build a basic bag of words model on three sample documents

In [28]:
documents = ["Gangs of Wasseypur is a great movie.", "The success of a movie depends on the performance of the actors.", "There are no new movies releasing this week."]
print(documents)

['Gangs of Wasseypur is a great movie.', 'The success of a movie depends on the performance of the actors.', 'There are no new movies releasing this week.']


In [29]:
def preprocess(document):
    'changes document to lower case and removes stopwords'

    # change sentence to lower case
    document = document.lower()

    # tokenize into words
    words = word_tokenize(document)

    # remove stop words
    words = [word for word in words if word not in stopwords.words("english")]

    # join words to make sentence
    document = " ".join(words)
    
    return document

documents = [preprocess(document) for document in documents]
print(documents)


['gangs wasseypur great movie .', 'success movie depends performance actors .', 'new movies releasing week .']


#### Creating bag of words model using count vectorizer function

In [30]:
vectorizer = CountVectorizer()
bow_model = vectorizer.fit_transform(documents)
print(bow_model)  # returns the row number and column number of the cells which have 1 as value

  (0, 4)	1
  (0, 3)	1
  (0, 10)	1
  (0, 2)	1
  (1, 0)	1
  (1, 7)	1
  (1, 1)	1
  (1, 9)	1
  (1, 4)	1
  (2, 11)	1
  (2, 8)	1
  (2, 5)	1
  (2, 6)	1


In [31]:
# print the full sparse matrix
print(bow_model.toarray())

[[0 0 1 1 1 0 0 0 0 0 1 0]
 [1 1 0 0 1 0 0 1 0 1 0 0]
 [0 0 0 0 0 1 1 0 1 0 0 1]]


In [32]:
print(bow_model.shape)
print(vectorizer.get_feature_names())

(3, 12)
['actors', 'depends', 'gangs', 'great', 'movie', 'movies', 'new', 'performance', 'releasing', 'success', 'wasseypur', 'week']


### Let's create a bag of words model on the spam dataset.

In [33]:
from pathlib import Path

In [34]:
folder = Path("C:/Upgrad Projects/NLP-M1")

In [35]:
# load data
spam = pd.read_csv(folder/"SMSSpamCollection.txt", sep = "\t", names=["label", "message"])
spam.head()

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there g..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive ...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives around here though"


##### Let's take a subset of data (first 50 rows only) and create bag of word model on that.

In [36]:
spam = spam.iloc[0:50,:]
print(spam)

   label  \
0    ham   
1    ham   
2   spam   
3    ham   
4    ham   
5   spam   
6    ham   
7    ham   
8   spam   
9   spam   
10   ham   
11  spam   
12  spam   
13   ham   
14   ham   
15  spam   
16   ham   
17   ham   
18   ham   
19  spam   
20   ham   
21   ham   
22   ham   
23   ham   
24   ham   
25   ham   
26   ham   
27   ham   
28   ham   
29   ham   
30   ham   
31   ham   
32   ham   
33   ham   
34  spam   
35   ham   
36   ham   
37   ham   
38   ham   
39   ham   
40   ham   
41   ham   
42  spam   
43   ham   
44   ham   
45   ham   
46   ham   
47   ham   
48   ham   
49   ham   

                                                                                                message  
0   Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there g...  
1                                                                         Ok lar... Joking wif u oni...  
2   Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 200

In [37]:
# extract the messages from the dataframe
messages = spam.message
#print(messages)

In [38]:
# convert messages into list
messages = [message for message in messages]
#print(messages)

In [39]:
# preprocess messages using the preprocess function
messages = [preprocess(message) for message in messages]
#print(messages)

In [40]:
# bag of words model
vectorizer = CountVectorizer()
bow_model = vectorizer.fit_transform(messages)
print(bow_model.toarray())

[[0 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]
 ..., 
 [0 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]]


In [41]:
print(bow_model.shape)
print(vectorizer.get_feature_names())

(50, 381)
['000', '07732584351', '08000930705', '08002986030', '08452810075over18', '09061701461', '100', '11', '12', '150p', '16', '20', '2005', '21st', '2nd', '4403ldnw1a7rw18', '4txt', '50', '6days', '81010', '87077', '87121', '87575', '8am', '900', 'abiola', 'actin', 'aft', 'ahead', 'ahhh', 'aids', 'already', 'alright', 'always', 'amore', 'amp', 'anymore', 'anything', 'apologetic', 'apply', 'arabian', 'ard', 'around', 'ask', 'available', 'back', 'badly', 'bit', 'blessing', 'breather', 'brother', 'buffet', 'bugis', 'burns', 'bus', 'ca', 'call', 'callers', 'callertune', 'calls', 'camcorder', 'camera', 'car', 'cash', 'catch', 'caught', 'chances', 'charged', 'cheers', 'chgs', 'child', 'cine', 'claim', 'clear', 'click', 'co', 'code', 'colour', 'com', 'comin', 'comp', 'confirm', 'convincing', 'copy', 'cost', 'could', 'crave', 'crazy', 'credit', 'cried', 'csh11', 'cup', 'cuppa', 'customer', 'da', 'darling', 'date', 'day', 'dbuk', 'decide', 'decided', 'delivery', 'dinner', 'done', 'dont', 

* A lot of duplicate tokens such as 'win'and 'winner'; 'reply' and 'replying'; 'want' and 'wanted' etc. 

## Stemming and lemmatising

In [42]:
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer

stemmer = PorterStemmer()
wordnet_lemmatizer = WordNetLemmatizer()

# add stemming and lemmatisation in the preprocess function
def preprocess(document, stem=True):
    'changes document to lower case and removes stopwords'

    # change sentence to lower case
    document = document.lower()

    # tokenize into words
    words = word_tokenize(document)

    # remove stop words
    words = [word for word in words if word not in stopwords.words("english")]
    
    if stem:
        words = [stemmer.stem(word) for word in words]
    else:
        words = [wordnet_lemmatizer.lemmatize(word, pos='v') for word in words]

    # join words to make sentence
    document = " ".join(words)
    
    return document

### Bag of words model on stemmed messages

In [43]:
# stem messages
messages = [preprocess(message, stem=True) for message in spam.message]

# bag of words model
vectorizer = CountVectorizer()
bow_model = vectorizer.fit_transform(messages)

In [44]:
# look at the dataframe
bowstemdf = pd.DataFrame(bow_model.toarray(), columns = vectorizer.get_feature_names())

In [45]:
bowstemdf.shape

(50, 359)

In [46]:
# token names
print(vectorizer.get_feature_names())

['000', '07732584351', '08000930705', '08002986030', '08452810075over18', '09061701461', '100', '11', '12', '150p', '16', '20', '2005', '21st', '2nd', '4403ldnw1a7rw18', '4txt', '50', '6day', '81010', '87077', '87121', '87575', '8am', '900', 'abiola', 'actin', 'aft', 'ahead', 'ahhh', 'aid', 'alreadi', 'alright', 'alway', 'amor', 'amp', 'anymor', 'anyth', 'apologet', 'appli', 'arabian', 'ard', 'around', 'ask', 'avail', 'back', 'badli', 'bit', 'bless', 'breather', 'brother', 'bu', 'buffet', 'bugi', 'burn', 'ca', 'call', 'caller', 'callertun', 'calls', 'camcord', 'camera', 'car', 'cash', 'catch', 'caught', 'chanc', 'charg', 'cheer', 'chg', 'child', 'cine', 'claim', 'clear', 'click', 'co', 'code', 'colour', 'com', 'comin', 'comp', 'confirm', 'convinc', 'copi', 'cost', 'could', 'crave', 'crazy', 'credit', 'cri', 'csh11', 'cup', 'cuppa', 'custom', 'da', 'darl', 'date', 'day', 'dbuk', 'decid', 'deliveri', 'dinner', 'done', 'dont', 'dun', 'earli', 'eat', 'eg', 'egg', 'eh', 'endow', 'england', 

### 359 tokens after stemming the messages as compared to 381 tokens without stemming.

### Let's try lemmatizing the messages.

In [47]:
# lemmatise messages
messages = [preprocess(message, stem=False) for message in spam.message]

# bag of words model
vectorizer = CountVectorizer()
bow_model = vectorizer.fit_transform(messages)

In [49]:
# look at the dataframe
bowlemdf = pd.DataFrame(bow_model.toarray(), columns = vectorizer.get_feature_names())

In [50]:
bowlemdf.shape

(50, 363)

In [51]:
# token names
print(vectorizer.get_feature_names())

['000', '07732584351', '08000930705', '08002986030', '08452810075over18', '09061701461', '100', '11', '12', '150p', '16', '20', '2005', '21st', '2nd', '4403ldnw1a7rw18', '4txt', '50', '6days', '81010', '87077', '87121', '87575', '8am', '900', 'abiola', 'actin', 'aft', 'ahead', 'ahhh', 'aid', 'already', 'alright', 'always', 'amore', 'amp', 'anymore', 'anything', 'apologetic', 'apply', 'arabian', 'ard', 'around', 'ask', 'available', 'back', 'badly', 'bite', 'bless', 'breather', 'brother', 'buffet', 'bugis', 'burn', 'bus', 'ca', 'call', 'callers', 'callertune', 'calls', 'camcorder', 'camera', 'car', 'cash', 'catch', 'chance', 'charge', 'cheer', 'chgs', 'child', 'cine', 'claim', 'clear', 'click', 'co', 'code', 'colour', 'com', 'comin', 'comp', 'confirm', 'convince', 'copy', 'cost', 'could', 'crave', 'crazy', 'credit', 'cry', 'csh11', 'cup', 'cuppa', 'customer', 'da', 'darling', 'date', 'day', 'dbuk', 'decide', 'delivery', 'dinner', 'do', 'dont', 'dun', 'early', 'eat', 'eg', 'egg', 'eh', 'e

### 363 tokens after lemmatizing the messages as compared to 381 tokens without lemmatising. But, on the other hand, stemmer reduces the token count to 359. Lemmatization doesn't work as expected because the data is very unclean.

In [52]:
print(len(vectorizer.get_feature_names()))

363


In [71]:
Document1 = "Vapour, Bangalore has a really great terrace seating and an awesome view of the Bangalore skyline"
Document2 = "The beer at Vapour, Bangalore was amazing. My favourites are the wheat beer and the ale beer."
Document3 = "Vapour, Bangalore has the best view in Bangalore."

In [72]:
documents = [Document1, Document2, Document3]

In [73]:
def preprocess(document):
    'changes document to lower case and removes stopwords'

    # change sentence to lower case
    document = document.lower()

    # tokenize into words
    words = word_tokenize(document)

    # remove stop words
    words = [word for word in words if word not in stopwords.words("english")]

    # join words to make sentence
    document = " ".join(words)
    
    return document



In [74]:
documents = [preprocess(document) for document in documents]
print(documents)

['vapour , bangalore really great terrace seating awesome view bangalore skyline', 'beer vapour , bangalore amazing . favourites wheat beer ale beer .', 'vapour , bangalore best view bangalore .']


In [75]:
vectorizer = CountVectorizer()

In [76]:
bow_model = vectorizer.fit_transform(documents)

In [102]:
# look at the dataframe
bowdf = pd.DataFrame(bow_model.toarray(), columns = vectorizer.get_feature_names())
bowdf.head()

Unnamed: 0,ale,amazing,awesome,bangalore,beer,best,favourites,great,really,seating,skyline,terrace,vapour,view,wheat
0,0,0,1,2,0,0,0,1,1,1,1,1,1,1,0
1,1,1,0,1,3,0,1,0,0,0,0,0,1,0,1
2,0,0,0,2,0,1,0,0,0,0,0,0,1,1,0


In [129]:
tot_terms = bowdf.iloc[0].astype(bool).sum()
tot_terms

9

In [130]:
tfreq = bowdf.loc[0,'bangalore']
tfreq

2

In [132]:
tf = round(tfreq/tot_terms,2)
tf

0.22

In [133]:
tot_docs = len(bowdf.index)
tot_docs

3

In [134]:
def term_count(term):
    sum = 0
    if term > 0:
        sum = sum + 1
    
    return sum
            

In [135]:
dterm_cnt = bowdf.apply(lambda row: term_count(row['bangalore']), axis=1).sum()
dterm_cnt

3

In [137]:
dterm_cnt = bowdf['bangalore'].astype(bool).sum()

3

In [142]:
import math

In [144]:
idf = round(math.log10(tot_docs/dterm_cnt),2)
idf

0.0

In [146]:
tf_idf = round(tf * idf, 2)
tf_idf

0.0

In [241]:
def ctf(df, document_no,term):
    print("Calculating tf")
    tfreq     = df.loc[document_no,term]
    print(tfreq)
    tot_terms = df.iloc[document_no].sum()
    print(tot_terms)
    tf = round(tfreq/tot_terms,3)
    return tf

In [242]:
def cidf(df, term):
    print("Calculating idf")
    tot_docs = len(df.index)
    print(tot_docs)
    term_in_docs_cnt = df[term].astype(bool).sum()
    print(term_in_docs_cnt)
    idf = round(math.log10(tot_docs/term_in_docs_cnt),3)
    return idf

In [243]:
tf = ctf(bowdf, 0, 'bangalore')
tf

Calculating tf
2
10


0.20000000000000001

In [244]:
idf = cidf(bowdf, 'bangalore')
idf

Calculating idf
3
3


0.0

In [245]:
tf_idf = round(tf * idf, 2)
tf_idf

0.0

In [246]:
tf = ctf(bowdf, 1, 'beer')
print(tf)
idf = cidf(bowdf, 'beer')
print(idf)
tf_idf = float(format(tf * idf, '.3f'))
tf_idf

Calculating tf
3
9
0.333
Calculating idf
3
1
0.477


0.159

In [247]:
bowdf.iloc[1].sum()

9

In [248]:
bowdf.loc[1,'beer']

3

In [249]:
tf = bowdf.loc[1,'beer']/bowdf.iloc[1].sum()
tf

0.33333333333333331

In [250]:
len(bowdf.index)

3

In [251]:
bowdf['beer'].astype(bool).sum()

1

In [252]:
idf = math.log10((len(bowdf.index)/bowdf['beer'].astype(bool).sum()))
idf

0.47712125471966244

In [253]:
tf_idf = format(tf * idf, '.3f')
tf_idf

'0.159'

In [254]:
tf = ctf(bowdf, 1, 'bangalore')
print(tf)
idf = cidf(bowdf, 'bangalore')
print(idf)
tf_idf = float(format(tf * idf, '.3f'))
tf_idf

Calculating tf
1
9
0.111
Calculating idf
3
3
0.0


0.0

In [255]:
tf = ctf(bowdf, 1, 'vapour')
print(tf)
idf = cidf(bowdf, 'vapour')
print(idf)
tf_idf = float(format(tf * idf, '.3f'))
tf_idf

Calculating tf
1
9
0.111
Calculating idf
3
3
0.0


0.0

In [256]:
tf = ctf(bowdf, 0, 'bangalore')
print(tf)
idf = cidf(bowdf, 'bangalore')
print(idf)
tf_idf = float(format(tf * idf, '.3f'))
tf_idf

Calculating tf
2
10
0.2
Calculating idf
3
3
0.0


0.0

In [257]:
tf = ctf(bowdf, 1, 'bangalore')
print(tf)
idf = cidf(bowdf, 'bangalore')
print(idf)
tf_idf = float(format(tf * idf, '.3f'))
tf_idf

Calculating tf
1
9
0.111
Calculating idf
3
3
0.0


0.0

In [258]:
tf = ctf(bowdf, 2, 'bangalore')
print(tf)
idf = cidf(bowdf, 'bangalore')
print(idf)
tf_idf = float(format(tf * idf, '.3f'))
tf_idf

Calculating tf
2
5
0.4
Calculating idf
3
3
0.0


0.0