### TF-IDF model

In [1]:
# load all necessary libraries
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer

# widen text columns so long messages are not truncated too early when displayed
pd.set_option("display.max_colwidth", 100)

#### Let's build a basic TF-IDF model on three sample documents

In [6]:
# toy corpus: three short documents, one string per document
documents = [
    "Gangs of Wasseypur is a great movie. Wasseypur is a town in Bihar.",
    "The success of a song depends on the music.",
    "There is a new movie releasing this week. The movie is fun to watch.",
]
print(documents)

['Gangs of Wasseypur is a great movie. Wasseypur is a town in Bihar.', 'The success of a song depends on the music.', 'There is a new movie releasing this week. The movie is fun to watch.']


In [1]:
# Alternative sample corpus, intentionally left commented out.
# Uncomment both lines to rebind `documents` to these three texts instead.
# documents = ["Vapour, Bangalore has a really great terrace seating and an awesome view of the Bangalore skyline","The beer at Vapour, Bangalore was amazing. My favorites are the wheat beer and the ale beer.",             "Vapour, Bangalore has the best view in Bangalore."]
# print(documents)

In [8]:
from nltk.stem.porter import PorterStemmer

stemmer = PorterStemmer()

# Evaluate the English stop-word list once and keep it as a set, instead of
# calling stopwords.words("english") (and scanning a list) for every token.
STOP_WORDS = frozenset(stopwords.words("english"))


def preprocess(document):
    """Normalise one document for vectorisation.

    Steps, in order: lower-case the text, tokenize it with ``word_tokenize``,
    drop English stop words, stem each remaining token with the Porter
    stemmer, and join the stems back into a single space-separated string.
    Punctuation tokens produced by the tokenizer are kept as-is.

    Parameters
    ----------
    document : str
        Raw text of a single document.

    Returns
    -------
    str
        The stemmed, stop-word-free tokens joined by single spaces.
    """
    # lower-case first so the stop-word comparison below is case-insensitive
    tokens = word_tokenize(document.lower())

    # filter stop words and stem the survivors in a single pass
    stems = [stemmer.stem(token) for token in tokens if token not in STOP_WORDS]

    return " ".join(stems)

In [9]:
# run every sample document through preprocess(); `documents` is rebound to the
# cleaned text, so re-running this cell alone would preprocess it a second time
documents = list(map(preprocess, documents))
print(documents)

['gang wasseypur great movi . wasseypur town bihar .', 'success song depend music .', 'new movi releas week . movi fun watch .']


#### Creating the TF-IDF model using scikit-learn's `TfidfVectorizer`

In [10]:
# fit the vectorizer on the preprocessed sample documents and transform them in one step
vectorizer = TfidfVectorizer()
tfidf_model = vectorizer.fit_transform(documents)
print(tfidf_model)  # prints the (row, column) position of each stored entry together with its tf-idf weight

  (0, 0)	0.3414262179382391
  (0, 11)	0.3414262179382391
  (0, 5)	0.2596634391575384
  (0, 4)	0.3414262179382391
  (0, 12)	0.6828524358764781
  (0, 3)	0.3414262179382391
  (1, 6)	0.5
  (1, 1)	0.5
  (1, 9)	0.5
  (1, 10)	0.5
  (2, 13)	0.369772375024391
  (2, 2)	0.369772375024391
  (2, 14)	0.369772375024391
  (2, 8)	0.369772375024391
  (2, 7)	0.369772375024391
  (2, 5)	0.5624428445132056


In [11]:
# convert the sparse tf-idf matrix to a dense array and print it in full
print(tfidf_model.toarray())

[[0.34142622 0.         0.         0.34142622 0.34142622 0.25966344
  0.         0.         0.         0.         0.         0.34142622
  0.68285244 0.         0.        ]
 [0.         0.5        0.         0.         0.         0.
  0.5        0.         0.         0.5        0.5        0.
  0.         0.         0.        ]
 [0.         0.         0.36977238 0.         0.         0.56244284
  0.         0.36977238 0.36977238 0.         0.         0.
  0.         0.36977238 0.36977238]]


In [12]:
# Label each tf-idf column with its vocabulary term.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() returns the same terms in column order.
pd.DataFrame(tfidf_model.toarray(), columns=vectorizer.get_feature_names_out())

Unnamed: 0,bihar,depend,fun,gang,great,movi,music,new,releas,song,success,town,wasseypur,watch,week
0,0.341426,0.0,0.0,0.341426,0.341426,0.259663,0.0,0.0,0.0,0.0,0.0,0.341426,0.682852,0.0,0.0
1,0.0,0.5,0.0,0.0,0.0,0.0,0.5,0.0,0.0,0.5,0.5,0.0,0.0,0.0,0.0
2,0.0,0.0,0.369772,0.0,0.0,0.562443,0.0,0.369772,0.369772,0.0,0.0,0.0,0.0,0.369772,0.369772


### Let's create a tf-idf model on the spam dataset.

In [13]:
# load the tab-separated SMS file; column names are supplied here, so the
# first line of the file is read as data rather than as a header
spam = pd.read_csv(
    "SMSSpamCollection.txt",
    sep="\t",
    header=None,
    names=["label", "message"],
)
spam.head()

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there g..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive ...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives around here though"


##### Let's take a subset of the data (first 50 rows only) and create a TF-IDF model on that.

In [14]:
# keep only the first 50 rows (all columns); `spam` is rebound to that subset
spam = spam.head(50)
print(spam)

   label  \
0    ham   
1    ham   
2   spam   
3    ham   
4    ham   
5   spam   
6    ham   
7    ham   
8   spam   
9   spam   
10   ham   
11  spam   
12  spam   
13   ham   
14   ham   
15  spam   
16   ham   
17   ham   
18   ham   
19  spam   
20   ham   
21   ham   
22   ham   
23   ham   
24   ham   
25   ham   
26   ham   
27   ham   
28   ham   
29   ham   
30   ham   
31   ham   
32   ham   
33   ham   
34  spam   
35   ham   
36   ham   
37   ham   
38   ham   
39   ham   
40   ham   
41   ham   
42  spam   
43   ham   
44   ham   
45   ham   
46   ham   
47   ham   
48   ham   
49   ham   

                                                                                                message  
0   Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there g...  
1                                                                         Ok lar... Joking wif u oni...  
2   Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 200

In [15]:
# pull the `message` column out of the dataframe as a plain Python list
messages = spam["message"].tolist()
print(messages)

['Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...', 'Ok lar... Joking wif u oni...', "Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's", 'U dun say so early hor... U c already then say...', "Nah I don't think he goes to usf, he lives around here though", "FreeMsg Hey there darling it's been 3 week's now and no word back! I'd like some fun you up for it still? Tb ok! XxX std chgs to send, £1.50 to rcv", 'Even my brother is not like to speak with me. They treat me like aids patent.', "As per your request 'Melle Melle (Oru Minnaminunginte Nurungu Vettam)' has been set as your callertune for all Callers. Press *9 to copy your friends Callertune", 'WINNER!! As a valued network customer you have been selected to receivea £900 prize reward! To claim call 09061701461. Claim code KL341. Valid 12 hours only.', 'Had your mobile 

In [16]:
# apply preprocess() to every message; `messages` is rebound to the cleaned
# text, so re-running this cell alone would preprocess it a second time
messages = list(map(preprocess, messages))
print(messages)

['go jurong point , crazi .. avail bugi n great world la e buffet ... cine got amor wat ...', 'ok lar ... joke wif u oni ...', "free entri 2 wkli comp win fa cup final tkt 21st may 2005. text fa 87121 receiv entri question ( std txt rate ) & c 's appli 08452810075over18 's", 'u dun say earli hor ... u c alreadi say ...', "nah n't think goe usf , live around though", "freemsg hey darl 's 3 week 's word back ! 'd like fun still ? tb ok ! xxx std chg send , £1.50 rcv", 'even brother like speak . treat like aid patent .', "per request 'mell mell ( oru minnaminungint nurungu vettam ) ' set callertun caller . press * 9 copi friend callertun", 'winner ! ! valu network custom select receivea £900 prize reward ! claim call 09061701461. claim code kl341 . valid 12 hour .', 'mobil 11 month ? u r entitl updat latest colour mobil camera free ! call mobil updat co free 08002986030', "'m gon na home soon n't want talk stuff anymor tonight , k ? 've cri enough today .", 'six chanc win cash ! 100 20,00

In [17]:
# tf-idf model fitted on the preprocessed messages
# (rebinds `vectorizer` and `tfidf_model` from the three-document example)
vectorizer = TfidfVectorizer()
tfidf_model = vectorizer.fit_transform(messages)

In [18]:
# Let's look at the dataframe: one row per message, one column per vocabulary term.
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# returns the same terms in column order.
tfidf = pd.DataFrame(tfidf_model.toarray(), columns=vectorizer.get_feature_names_out())
tfidf

Unnamed: 0,000,07732584351,08000930705,08002986030,08452810075over18,09061701461,100,11,12,150p,...,worri,www,xuhui,xxx,xxxmobilemovieclub,ye,yeah,yummi,yup,ú1
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.198998,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.257878,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.231227,0.0,0.0,0.231227,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.217862,0.0,0.0,0.0,0.217862,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [19]:
# token names
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# returns an array, so convert it to a list to keep the printed output a plain list
print(vectorizer.get_feature_names_out().tolist())

['000', '07732584351', '08000930705', '08002986030', '08452810075over18', '09061701461', '100', '11', '12', '150p', '16', '20', '2005', '21st', '2nd', '4403ldnw1a7rw18', '4txt', '50', '6day', '81010', '87077', '87121', '87575', '8am', '900', 'abiola', 'actin', 'aft', 'ahead', 'ahhh', 'aid', 'alreadi', 'alright', 'alway', 'amor', 'amp', 'anymor', 'anyth', 'apologet', 'appli', 'arabian', 'ard', 'around', 'ask', 'avail', 'back', 'badli', 'bit', 'bless', 'breather', 'brother', 'bu', 'buffet', 'bugi', 'burn', 'ca', 'call', 'caller', 'callertun', 'camcord', 'camera', 'car', 'cash', 'catch', 'caught', 'chanc', 'charg', 'cheer', 'chg', 'child', 'cine', 'claim', 'clear', 'click', 'co', 'code', 'colour', 'com', 'comin', 'comp', 'confirm', 'convinc', 'copi', 'cost', 'could', 'crave', 'crazi', 'credit', 'cri', 'csh11', 'cup', 'cuppa', 'custom', 'da', 'darl', 'date', 'day', 'dbuk', 'decid', 'deliveri', 'dinner', 'done', 'dont', 'dun', 'earli', 'eat', 'eg', 'egg', 'eh', 'endow', 'england', 'enough',