## Count vectorization

## Read in text 

In [3]:
import re
import pandas as pd
import nltk
import string

In [8]:
# Allow up to 100 characters per column when pandas displays a DataFrame.
pd.set_option('display.max_colwidth', 100)

# English stopword list and Porter stemmer; both are read as globals by clean_text.
stopwords = nltk.corpus.stopwords.words('english')
ps = nltk.PorterStemmer()

# The file is read as tab-separated with no header row, so the columns are
# named explicitly afterwards.
data = pd.read_csv("SMSSpamCollection.tsv",sep='\t', header=None)
data.columns = ['Label', 'body_text']

## Create function to remove punctuation, tokenize, remove stopwords and stem

In [9]:
def clean_text(text):
    """Turn a raw message string into a list of stemmed tokens.

    Steps, in order: drop every character found in ``string.punctuation``,
    split the result on runs of non-word characters, discard tokens that
    appear in the module-level ``stopwords``, and stem the remaining tokens
    with the module-level stemmer ``ps``.

    Args:
        text: The raw message as a single string.

    Returns:
        A list of stemmed token strings. Tokens are compared against
        ``stopwords`` as-is (no lowercasing is applied first), and empty
        strings produced by ``re.split`` at the edges of the text are not
        explicitly removed.
    """
    # Iterating a string yields characters, so this strips punctuation
    # character by character.
    no_punct = ''.join(char for char in text if char not in string.punctuation)
    # Raw string avoids the invalid-escape warning that '\W+' triggers.
    tokens = re.split(r'\W+', no_punct)
    kept = [word for word in tokens if word not in stopwords]
    # The original iterated over the undefined name `ttext` here, which
    # raised NameError on every call.
    return [ps.stem(word) for word in kept]

## Apply CountVectorizer

In [10]:
from sklearn.feature_extraction.text import CountVectorizer

# Passing clean_text as the analyzer makes the vectorizer call it on each
# message to obtain that message's tokens.
count_vect = CountVectorizer(analyzer=clean_text)
# Learn the vocabulary from all messages and build the token-count matrix.
x_count = count_vect.fit_transform(data['body_text'])
# Shape is (number of messages, number of distinct tokens).
print(x_count.shape)

(5568, 8337)


## Apply CountVectorizer to smaller sample

In [15]:
# Repeat the vectorization on the first 20 messages only, so the resulting
# vocabulary is small enough to print and inspect.
data_sample = data[0:20]
count_vect_sample = CountVectorizer(analyzer=clean_text)
x_count_sample = count_vect_sample.fit_transform(data_sample['body_text'])
print(x_count_sample.shape)
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# use its replacement when present and fall back on older releases.
if hasattr(count_vect_sample, 'get_feature_names_out'):
    # .tolist() converts the returned array into a plain list of str so the
    # printed output matches the old list-based format.
    print(count_vect_sample.get_feature_names_out().tolist())
else:
    print(count_vect_sample.get_feature_names())

(20, 221)
['08002986030', '08452810075over18', '09061701461', '1', '100', '100000', '11', '12', '150pday', '16', '2', '20000', '2005', '21st', '3', '4', '4403ldnw1a7rw18', '4txtú120', '6day', '81010', '87077', '87121', '87575', '9', '900', 'A', 'As', 'Co', 'Eh', 'FA', 'HL', 'He', 'I', 'Im', 'Is', 'No', 'ON', 'Oh', 'R', 'So', 'TC', 'To', 'U', 'aft', 'aid', 'alreadi', 'anymor', 'appli', 'ard', 'around', 'b', 'bless', 'breather', 'brother', 'call', 'caller', 'callertun', 'camera', 'cash', 'chanc', 'claim', 'click', 'code', 'colour', 'comin', 'comp', 'copi', 'cost', 'credit', 'cri', 'csh11', 'cup', 'custom', 'da', 'date', 'dont', 'eg', 'england', 'enough', 'entitl', 'entri', 'even', 'feel', 'final', 'fine', 'finish', 'first', 'free', 'friend', 'from', 'fulfil', 'go', 'goalsteam', 'goe', 'gonna', 'gota', 'grant', 'ha', 'had', 'have', 'help', 'home', 'hour', 'httpwap', 'info', 'ive', 'jackpot', 'joke', 'k', 'kim', 'kl341', 'lar', 'latest', 'lccltd', 'like', 'link', 'live', 'lor', 'lunch', 'm



## Vectorizers output sparse matrices

In [17]:
# Evaluating the matrix on its own displays only a sparse-matrix summary
# (shape, dtype, stored-element count), not the individual counts.
x_count_sample

<20x221 sparse matrix of type '<class 'numpy.int64'>'
	with 253 stored elements in Compressed Sparse Row format>

In [19]:
# Expand the sparse matrix into a dense array so it can be wrapped in a
# DataFrame; columns are numbered positionally at this point.
x_count_df = pd.DataFrame(x_count_sample.toarray())
x_count_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,211,212,213,214,215,216,217,218,219,220
0,0,0,0,0,0,0,0,0,0,0,...,0,1,1,1,0,0,0,0,1,0
1,0,1,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [20]:
# Label each count column with the vocabulary term it represents.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# use its replacement when present and fall back on older releases.
x_count_df.columns = (
    count_vect_sample.get_feature_names_out()
    if hasattr(count_vect_sample, 'get_feature_names_out')
    else count_vect_sample.get_feature_names()
)



In [21]:
# Preview the first rows again, after the columns were renamed to the
# vocabulary terms in the previous cell.
x_count_df.head()

Unnamed: 0,08002986030,08452810075over18,09061701461,1,100,100000,11,12,150pday,16,...,wkli,wonder,wont,word,wwwdbuknet,xxxmobilemovieclub,xxxmobilemovieclubcomnqjkgighjjgcbl,ye,you,ü
0,0,0,0,0,0,0,0,0,0,0,...,0,1,1,1,0,0,0,0,1,0
1,0,1,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
