# Bag of Words

In [1]:
import os

In [2]:
import numpy as np
import pandas as pd

import warnings
# Silence every warning for the rest of the session (no category or module
# filter is given, so nothing is exempt).
# NOTE(review): this also hides deprecation and copy/view warnings from the
# libraries used below — consider narrowing the filter.
warnings.filterwarnings('ignore')

In [3]:
# Location of the spam CSV, relative to the notebook's working directory.
path = '../All_data_sets/nlp/spam.csv'

In [4]:
# Column labels are supplied explicitly: the first two hold the label and the
# message text, the remaining three are placeholder names for extra columns.
# NOTE: because `names` is passed without `header=0`, the file's own header
# line ('v1', 'v2') is parsed as the first data row (index 0).
column_names = ['target', 'text', '0', '1', '2']
df = pd.read_csv(path, names=column_names, encoding='iso-8859-1')
df.head()

Unnamed: 0,target,text,0,1,2
0,v1,v2,,,
1,ham,"Go until jurong point, crazy.. Available only ...",,,
2,ham,Ok lar... Joking wif u oni...,,,
3,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
4,ham,U dun say so early hor... U c already then say...,,,


In [5]:
# (rows, columns) of the loaded frame; the row count still includes the
# file's header line, which was read in as a data row.
df.shape

(5573, 5)

In [6]:
# Confirm the column labels that were assigned when the CSV was read.
df.columns

Index(['target', 'text', '0', '1', '2'], dtype='object')

In [7]:
# Placeholder-named columns to discard; they are empty in the rows
# previewed by df.head().
drop_columns = ['0', '1', '2']

In [8]:
# Remove the columns listed in drop_columns. Rebinding `df` (instead of
# mutating it with inplace=True) and passing errors='ignore' lets this cell
# be re-run without raising KeyError once the columns are already gone.
df = df.drop(columns=drop_columns, errors='ignore')

In [9]:
# Inspect the frame after the column drop: only `target` and `text` remain.
df

Unnamed: 0,target,text
0,v1,v2
1,ham,"Go until jurong point, crazy.. Available only ..."
2,ham,Ok lar... Joking wif u oni...
3,spam,Free entry in 2 a wkly comp to win FA Cup fina...
4,ham,U dun say so early hor... U c already then say...
...,...,...
5568,spam,This is the 2nd time we have tried 2 contact u...
5569,ham,Will Ì_ b going to esplanade fr home?
5570,ham,"Pity, * was in mood for that. So...any other s..."
5571,ham,The guy did some bitching but I acted like i'd...


In [10]:
# Drop the CSV's own header line ('v1', 'v2'), which was read as a data row
# because explicit column names were supplied when loading. Filtering by
# value rather than by position keeps this cell safe to re-run (a positional
# slice would discard one more real message on every execution), and .copy()
# yields an independent frame so later column assignments do not target a
# slice of the original.
df = df.loc[df['target'] != 'v1'].copy()

df

Unnamed: 0,target,text
1,ham,"Go until jurong point, crazy.. Available only ..."
2,ham,Ok lar... Joking wif u oni...
3,spam,Free entry in 2 a wkly comp to win FA Cup fina...
4,ham,U dun say so early hor... U c already then say...
5,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5568,spam,This is the 2nd time we have tried 2 contact u...
5569,ham,Will Ì_ b going to esplanade fr home?
5570,ham,"Pity, * was in mood for that. So...any other s..."
5571,ham,The guy did some bitching but I acted like i'd...


### Data cleaning and preprocessing

In [11]:
import nltk
import re

# remove the stop words
from nltk.corpus import stopwords

# reduce the words to its root word
from nltk.stem.porter import PorterStemmer

In [12]:
# Single Porter stemmer instance, reused for every token by preprocessing().
ps = PorterStemmer()

In [13]:
# Load NLTK's English stop-word list. On a fresh environment the corpus may
# not be installed yet, in which case NLTK raises LookupError; fetch it once
# and retry so the notebook survives Restart & Run All.
try:
    stop_words = stopwords.words('english')
except LookupError:
    nltk.download('stopwords')
    stop_words = stopwords.words('english')

In [14]:
def preprocessing( message ):
    """Turn a raw message into a space-separated string of stemmed tokens.

    The text is reduced to ASCII letters, lower-cased, split on whitespace,
    filtered against the module-level ``stop_words`` collection and stemmed
    with the module-level ``ps`` stemmer.

    Parameters
    ----------
    message : str
        Raw message text.

    Returns
    -------
    str
        The surviving stemmed tokens joined by single spaces; an empty
        string when no token survives.
    """
    # Build the lookup set once per call instead of once per token.
    stop_word_set = set(stop_words)

    # Anything that is not an ASCII letter (digits, punctuation, accented
    # characters) becomes a space so it acts as a token separator.
    letters_only = re.sub( '[^a-zA-Z]' , ' ' , message )

    tokens = letters_only.lower().split()

    # Drop stop words first, then stem what is left.
    stemmed = [ps.stem(token) for token in tokens if token not in stop_word_set]

    return ' '.join(stemmed)

In [15]:
# Clean every message. The function is passed directly: wrapping it in a
# lambda that only forwards its argument adds nothing.
df['cleaned_text'] = df['text'].apply(preprocessing)

In [16]:
# Compare the raw `text` column with the new `cleaned_text` column.
df

Unnamed: 0,target,text,cleaned_text
1,ham,"Go until jurong point, crazy.. Available only ...",go jurong point crazi avail bugi n great world...
2,ham,Ok lar... Joking wif u oni...,ok lar joke wif u oni
3,spam,Free entry in 2 a wkly comp to win FA Cup fina...,free entri wkli comp win fa cup final tkt st m...
4,ham,U dun say so early hor... U c already then say...,u dun say earli hor u c alreadi say
5,ham,"Nah I don't think he goes to usf, he lives aro...",nah think goe usf live around though
...,...,...,...
5568,spam,This is the 2nd time we have tried 2 contact u...,nd time tri contact u u pound prize claim easi...
5569,ham,Will Ì_ b going to esplanade fr home?,b go esplanad fr home
5570,ham,"Pity, * was in mood for that. So...any other s...",piti mood suggest
5571,ham,The guy did some bitching but I acted like i'd...,guy bitch act like interest buy someth els nex...


# Bag of Words

In [17]:
from sklearn.feature_extraction.text import CountVectorizer

# Cap the vocabulary at 3000 terms; when the corpus has more, CountVectorizer
# keeps the ones with the highest term frequency across the corpus.
cv = CountVectorizer( max_features=3000 )

In [18]:
# Learn the vocabulary from the cleaned messages and count term occurrences,
# then convert the sparse result into a dense NumPy array.
bow_counts = cv.fit_transform(df['cleaned_text'])
X = bow_counts.toarray()

X.shape

(5572, 3000)

In [19]:
# Small hand-written corpus used to illustrate the vectorizer on a
# vocabulary that is easy to read by eye.
sentences = [
    'Marry likes to watch movies',
    'John likes movies too and john also likes to watch cricket',
    'Marry likes to watch football',
    'John like pizza',
    'John also like biryani',
    'But marry like only burgar',
    'John do not like burgar',
    'I love this movie',
    'I hate this movie',
    'This movie is fantastic',
]

In [20]:
# Run each example sentence through the same cleaning function that was
# applied to the spam messages.
cleaned_sentences = [preprocessing(sentence) for sentence in sentences]

In [21]:
# Show the example sentences after stop-word removal and stemming.
cleaned_sentences

['marri like watch movi',
 'john like movi john also like watch cricket',
 'marri like watch footbal',
 'john like pizza',
 'john also like biryani',
 'marri like burgar',
 'john like burgar',
 'love movi',
 'hate movi',
 'movi fantast']

In [22]:
# NOTE: rebinds `cv`, replacing the vectorizer created earlier with a fresh
# one for the example sentences. max_features=100 is only an upper bound on
# the vocabulary size.
cv = CountVectorizer(max_features=100)

In [23]:
# Fit on the cleaned example sentences and densify the sparse count matrix.
sentence_counts = cv.fit_transform(cleaned_sentences)
X_vector = sentence_counts.toarray()

X_vector.shape

(10, 14)

In [24]:
# Dense count matrix: one row per sentence, one column per vocabulary term.
X_vector

array([[0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1],
       [1, 0, 0, 1, 0, 0, 0, 2, 2, 0, 0, 1, 0, 1],
       [0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1],
       [0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0],
       [1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0],
       [0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0],
       [0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0],
       [0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0],
       [0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0]], dtype=int64)

In [25]:
# Mapping from each learned term to its column index in the count matrix.
cv.vocabulary_

{'marri': 10,
 'like': 8,
 'watch': 13,
 'movi': 11,
 'john': 7,
 'also': 0,
 'cricket': 3,
 'footbal': 5,
 'pizza': 12,
 'biryani': 1,
 'burgar': 2,
 'love': 9,
 'hate': 6,
 'fantast': 4}

In [26]:
# Learned terms listed in column order of the count matrix.
cv.get_feature_names_out()

array(['also', 'biryani', 'burgar', 'cricket', 'fantast', 'footbal',
       'hate', 'john', 'like', 'love', 'marri', 'movi', 'pizza', 'watch'],
      dtype=object)

# Using n-grams

In [27]:
# ngram_range=(min_n, max_n) sets the sizes of the word sequences used as
# features. With (2, 3) every contiguous 2-word and 3-word sequence becomes
# a feature and single words are excluded.
# NOTE: rebinds `cv` again, replacing the unigram vectorizer.
cv = CountVectorizer( max_features=100 , ngram_range=(2,3) )

In [28]:
# Re-fit on the same cleaned sentences with the n-gram vectorizer and
# densify the sparse count matrix.
ngram_counts = cv.fit_transform(cleaned_sentences)
X_vector = ngram_counts.toarray()

X_vector.shape

(10, 29)

In [29]:
# Learned 2- and 3-word features listed in column order of the count matrix.
cv.get_feature_names_out()

array(['also like', 'also like biryani', 'also like watch', 'hate movi',
       'john also', 'john also like', 'john like', 'john like burgar',
       'john like movi', 'john like pizza', 'like biryani', 'like burgar',
       'like movi', 'like movi john', 'like pizza', 'like watch',
       'like watch cricket', 'like watch footbal', 'like watch movi',
       'love movi', 'marri like', 'marri like burgar', 'marri like watch',
       'movi fantast', 'movi john', 'movi john also', 'watch cricket',
       'watch footbal', 'watch movi'], dtype=object)

In [30]:
# Mapping from each learned n-gram to its column index in the count matrix.
cv.vocabulary_

{'marri like': 20,
 'like watch': 15,
 'watch movi': 28,
 'marri like watch': 22,
 'like watch movi': 18,
 'john like': 6,
 'like movi': 12,
 'movi john': 24,
 'john also': 4,
 'also like': 0,
 'watch cricket': 26,
 'john like movi': 8,
 'like movi john': 13,
 'movi john also': 25,
 'john also like': 5,
 'also like watch': 2,
 'like watch cricket': 16,
 'watch footbal': 27,
 'like watch footbal': 17,
 'like pizza': 14,
 'john like pizza': 9,
 'like biryani': 10,
 'also like biryani': 1,
 'like burgar': 11,
 'marri like burgar': 21,
 'john like burgar': 7,
 'love movi': 19,
 'hate movi': 3,
 'movi fantast': 23}