In [4]:
import pandas as pd

In [5]:
# Load the SMS corpus. The file is tab-separated and has no header row,
# so the two column names are supplied explicitly.
df = pd.read_table(
    'smsspamcollection/SMSSpamCollection',
    sep='\t',
    header=None,
    names=['label', 'sms_message'],
)
df.head()

Unnamed: 0,label,sms_message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [6]:
# Numeric encoding of the two class labels: 'ham' -> 0, 'spam' -> 1.
category_mapping = {
    'ham' : 0,
    'spam' : 1
}

# Look each label up with the value itself as the fallback, so that re-running
# this cell leaves already-encoded 0/1 labels untouched. A plain
# Series.map(dict) turns every value that is not a key of the dict into NaN,
# which wiped the whole column on a second execution of the cell.
df['label'] = df['label'].map(lambda label: category_mapping.get(label, label))
df.head()

Unnamed: 0,label,sms_message
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


### Implementing Bag of Words

In [7]:
# Toy corpus used to walk through the Bag of Words steps by hand.
documents = [
    'Hello, how are you!',
    'Win money, win from home.',
    'Call me now.',
    'Hello, Call hello you tomorrow?',
]

#### Convert all text to lowercase

In [8]:
# Normalise case so that, for example, 'Hello' and 'hello' count as the same word.
lower_case_documents = [text.lower() for text in documents]

lower_case_documents

['hello, how are you!',
 'win money, win from home.',
 'call me now.',
 'hello, call hello you tomorrow?']

#### Remove punctuation

In [9]:
import string

# Deletion table covering every ASCII punctuation character. The earlier chain
# of str.replace calls only stripped ',', '.', '!' and '?', so any other mark
# (quotes, colons, brackets, ...) would have stayed attached to its word.
punctuation_table = str.maketrans('', '', string.punctuation)

sans_punctuation_documents = [
    text.translate(punctuation_table) for text in lower_case_documents
]

sans_punctuation_documents

['hello how are you',
 'win money win from home',
 'call me now',
 'hello call hello you tomorrow']

#### Tokenization

In [10]:
# Split every cleaned document on whitespace to obtain its list of word tokens.
preprocessed_documents = [text.split() for text in sans_punctuation_documents]

preprocessed_documents

[['hello', 'how', 'are', 'you'],
 ['win', 'money', 'win', 'from', 'home'],
 ['call', 'me', 'now'],
 ['hello', 'call', 'hello', 'you', 'tomorrow']]

#### Count frequencies

In [11]:
from collections import Counter
# One Counter per document, mapping each token to how often it occurs there.
frequency_list = [Counter(tokens) for tokens in preprocessed_documents]

frequency_list

[Counter({'hello': 1, 'how': 1, 'are': 1, 'you': 1}),
 Counter({'win': 2, 'money': 1, 'from': 1, 'home': 1}),
 Counter({'call': 1, 'me': 1, 'now': 1}),
 Counter({'hello': 2, 'call': 1, 'you': 1, 'tomorrow': 1})]

### Implementing Bag of Words with scikit-learn

In [12]:
# The four sample sentences that are passed to scikit-learn's CountVectorizer.
documents = [
    'Hello, how are you!',
    'Win money, win from home.',
    'Call me now.',
    'Hello, Call hello you tomorrow?',
]

In [13]:
from sklearn.feature_extraction.text import CountVectorizer
# Instantiate with the default settings; the repr displayed below lists them
# (e.g. lowercase=True, stop_words=None, and a token_pattern that only keeps
# tokens made of two or more word characters).
count_vector = CountVectorizer()
count_vector

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=1.0, max_features=None, min_df=1,
                ngram_range=(1, 1), preprocessor=None, stop_words=None,
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)

In [14]:
# Learn the vocabulary from the sample documents.
count_vector.fit(documents)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2 in
# favour of get_feature_names_out(). Prefer the new method and fall back to the
# old one so the cell still runs on older releases. list() keeps the displayed
# value a plain list of strings in both cases.
if hasattr(count_vector, 'get_feature_names_out'):
    feature_names = list(count_vector.get_feature_names_out())
else:
    feature_names = count_vector.get_feature_names()

feature_names

['are',
 'call',
 'from',
 'hello',
 'home',
 'how',
 'me',
 'money',
 'now',
 'tomorrow',
 'win',
 'you']

In [15]:
# Encode each document as a row of token counts using the fitted vocabulary,
# then convert the result to a dense array so it can be displayed.
doc_term_matrix = count_vector.transform(documents)
doc_array = doc_term_matrix.toarray()
doc_array

array([[1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1],
       [0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 2, 0],
       [0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0],
       [0, 1, 0, 2, 0, 0, 0, 0, 0, 1, 0, 1]], dtype=int64)

### Splitting the data into training and testing sets

In [22]:
from sklearn.model_selection import train_test_split

# Hold out part of the messages for evaluation. random_state is fixed so the
# same split is produced on every run.
x_train, x_test, y_train, y_test = train_test_split(
    df['sms_message'],
    df['label'],
    random_state=1,
)

# Report the size of the full data set and of each partition.
for caption, rows in (
    ('total rows : ', df),
    ('training set : ', x_train),
    ('testing set : ', x_test),
):
    print(caption, rows.shape[0])

total rows :  5572
training set :  4179
testing set :  1393


In [28]:
# Fresh vectorizer for the SMS data (rebinds the name used for the toy example).
count_vector = CountVectorizer()

# Learn the vocabulary from the training messages only and encode them as counts.
training_data = count_vector.fit_transform(x_train)

# Encode the test messages with that same vocabulary. The vectorizer is not
# re-fitted here, so nothing is learned from the test set.
testing_data = count_vector.transform(x_test)
