# Vectorization

### Vectorization is the process of encoding text as integers to create feature vectors

There are multiple ways of performing vectorization 

In [8]:
import pandas as pd
import re
import string
import nltk
import os
base_path = "datasets"

pd.set_option("display.max_colwidth",100)

stopwords = nltk.corpus.stopwords.words('english')
ps = nltk.PorterStemmer()
data = pd.read_csv(os.path.join(base_path,"SMSSpamCollection.tsv"),sep='\t',names=['labels','body_text'])
data.head()

Unnamed: 0,labels,body_text
0,ham,I've been searching for the right words to thank you for this breather. I promise i wont take yo...
1,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive ...
2,ham,"Nah I don't think he goes to usf, he lives around here though"
3,ham,Even my brother is not like to speak with me. They treat me like aids patent.
4,ham,I HAVE A DATE ON SUNDAY WITH WILL!!


In [9]:
def clean_text(text):
    text = "".join([char for char in text if char not in string.punctuation])
    tokens = re.split('\W+',text)
    text = [ps.stem(word) for word in tokens if word not in stopwords]
    return text    

### 1. Count Vectorization

In [12]:
from sklearn.feature_extraction.text import CountVectorizer

count_vect = CountVectorizer(analyzer=clean_text)
# fit only does the learning but does not change the data. fit_transofrm, also change the data - change the data to vectors
x_count = count_vect.fit_transform(data['body_text'])
# (rows,columns) - rows is the length of the dataset, and columns is the number of unique words in the dataset
print(x_count.shape)
print(count_vect.get_feature_names_out())

(5568, 8191)
['' '0' '008704050406' ... 'ü' 'üll' '〨ud']


In [14]:
# Do the same process as the cell above but on a smaller dataset
data_sample = data[0:20]
count_vect_sample = CountVectorizer(analyzer=clean_text)
x_count_sample = count_vect_sample.fit_transform(data_sample['body_text'])
print(x_count_sample.shape)
print(count_vect_sample.get_feature_names_out())

(20, 220)
['08002986030' '08452810075over18' '09061701461' '1' '100' '100000' '11'
 '12' '150pday' '16' '2' '20000' '2005' '21st' '3' '4' '4403ldnw1a7rw18'
 '4txtú120' '6day' '81010' '87077' '87121' '87575' '9' '900' 'a' 'aft'
 'aid' 'alreadi' 'anymor' 'appli' 'ard' 'around' 'as' 'b' 'bless'
 'breather' 'brother' 'call' 'caller' 'callertun' 'camera' 'cash' 'chanc'
 'claim' 'click' 'co' 'code' 'colour' 'comin' 'comp' 'copi' 'cost'
 'credit' 'cri' 'csh11' 'cup' 'custom' 'da' 'date' 'dont' 'eg' 'eh'
 'england' 'enough' 'entitl' 'entri' 'even' 'fa' 'feel' 'final' 'fine'
 'finish' 'first' 'free' 'friend' 'from' 'fulfil' 'go' 'goalsteam' 'goe'
 'gonna' 'gota' 'grant' 'ha' 'had' 'have' 'he' 'help' 'hl' 'home' 'hour'
 'httpwap' 'i' 'im' 'info' 'is' 'ive' 'jackpot' 'joke' 'k' 'kim' 'kl341'
 'lar' 'latest' 'lccltd' 'like' 'link' 'live' 'lor' 'lunch' 'macedonia'
 'make' 'may' 'mell' 'membership' 'messag' 'minnaminungint' 'miss' 'mobil'
 'month' 'nah' 'name' 'nation' 'naughti' 'network' 'news' 'ne

In [23]:
# The data is a sparse matrix - A sparse matrix is a matrix which most of the entries are 0. It saves 1 only in the cells with non-zero elements and thus make is more storage efficient
x_counts_df = pd.DataFrame(x_count_sample.toarray())
x_counts_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,210,211,212,213,214,215,216,217,218,219
0,0,0,0,0,0,0,0,0,0,0,...,0,1,1,1,0,0,0,0,1,0
1,0,1,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,0,0,1,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
7,1,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,0,0,0,0,1,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0


In [24]:
# Set the column names in the x_count_df
x_counts_df.columns = count_vect_sample.get_feature_names_out()
x_counts_df

Unnamed: 0,08002986030,08452810075over18,09061701461,1,100,100000,11,12,150pday,16,...,wkli,wonder,wont,word,wwwdbuknet,xxxmobilemovieclub,xxxmobilemovieclubcomnqjkgighjjgcbl,ye,you,ü
0,0,0,0,0,0,0,0,0,0,0,...,0,1,1,1,0,0,0,0,1,0
1,0,1,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,0,0,1,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
7,1,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,0,0,0,0,1,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0


### N-grams

N-Grams creates a matrix like we've seen before, but instead of the columns represents a single term, it represents all combinations of sentence length n

* N can be any number - it will construct a sentence from n tokens, and the tokens will be a substring of a sentence

In [38]:
def clean_text_2(text):
    text = "".join([char for char in text if char not in string.punctuation])
    tokens = re.split('\W+',text)
    # instead of joining just individual characters, we are joining tokens rebuild the sentence
    text = " ".join([ps.stem(word) for word in tokens if word not in stopwords])
    return text
data['cleaned_text'] = data['body_text'].apply(lambda x: clean_text_2(x))
data.head()

Unnamed: 0,labels,body_text,cleaned_text
0,ham,I've been searching for the right words to thank you for this breather. I promise i wont take yo...,ive search right word thank breather i promis wont take help grant fulfil promis you wonder bles...
1,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive ...,free entri 2 wkli comp win fa cup final tkt 21st may 2005 text fa 87121 receiv entri questionstd...
2,ham,"Nah I don't think he goes to usf, he lives around here though",nah i dont think goe usf live around though
3,ham,Even my brother is not like to speak with me. They treat me like aids patent.,even brother like speak they treat like aid patent
4,ham,I HAVE A DATE ON SUNDAY WITH WILL!!,i have a date on sunday with will


In [43]:
data_sample = data[0:20]
# Initializing a count vectorizer with a range of n-grams to look for. sklearn will find the optimal number for us
ngram_vect_sample = CountVectorizer(ngram_range=(2,2))
x_counts_sample = ngram_vect_sample.fit_transform(data_sample['cleaned_text'])
print(x_counts_sample.shape)
print(ngram_vect_sample.get_feature_names_out())

(20, 229)
['09061701461 claim' '100 20000' '100000 prize' '11 month' '12 hour'
 '150pday 6day' '16 tsandc' '20000 pound' '2005 text' '21st may'
 '4txtú120 poboxox36504w45wq' '6day 16' '81010 tc' '87077 eg'
 '87077 trywal' '87121 receiv' '87575 cost' '900 prize' 'aft finish'
 'aid patent' 'anymor tonight' 'appli 08452810075over18' 'appli repli'
 'ard smth' 'around though' 'as per' 'as valu' 'bless time'
 'breather promis' 'brother like' 'call 09061701461' 'call the'
 'caller press' 'callertun caller' 'camera free' 'cash from' 'chanc win'
 'claim call' 'claim code' 'claim no' 'click httpwap' 'click wap'
 'co free' 'code kl341' 'colour mobil' 'comp win' 'copi friend'
 'cost 150pday' 'credit click' 'cri enough' 'csh11 send' 'cup final'
 'custom select' 'da stock' 'date on' 'dont miss' 'dont think' 'dont want'
 'eg england' 'eh rememb' 'england 87077' 'england macedonia'
 'enough today' 'entitl updat' 'entri questionstd' 'entri wkli'
 'even brother' 'fa 87121' 'fa cup' 'feel that' 'final tk

In [44]:
x_counts_df = pd.DataFrame(x_counts_sample.toarray())
x_counts_df.columns = ngram_vect_sample.get_feature_names()
x_counts_df



Unnamed: 0,09061701461 claim,100 20000,100000 prize,11 month,12 hour,150pday 6day,16 tsandc,20000 pound,2005 text,21st may,...,wkli comp,wonder bless,wont take,word claim,word thank,wwwdbuknet lccltd,xxxmobilemovieclub to,ye he,you week,you wonder
0,0,0,0,0,0,0,0,0,0,0,...,0,1,1,0,1,0,0,0,0,1
1,0,0,0,0,0,0,0,0,1,1,...,1,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,1,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,0,1,0,0,0,1,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0


### 3. TF-IDF

Uses a formula to count how important is a word to a sentence. It measures the frequency of every word in a sentence.The rarer the word is, it makes the number be very HIGH. Rare words might have a more important meaning that frequent words.

In [48]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vect = TfidfVectorizer(analyzer=clean_text)
x_tfidf = tfidf_vect.fit_transform(data['body_text'])

print(x_tfidf.shape)
print(tfidf_vect.get_feature_names_out())

(5568, 8191)
['' '0' '008704050406' ... 'ü' 'üll' '〨ud']


In [53]:
data_sample = data[0:20]

tfidf_vect_sample = TfidfVectorizer(analyzer=clean_text)
x_tfidf_sample = tfidf_vect_sample.fit_transform(data_sample['body_text'])

print(x_tfidf_sample.shape)
print(tfidf_vect_sample.get_feature_names_out())


(20, 220)
['08002986030' '08452810075over18' '09061701461' '1' '100' '100000' '11'
 '12' '150pday' '16' '2' '20000' '2005' '21st' '3' '4' '4403ldnw1a7rw18'
 '4txtú120' '6day' '81010' '87077' '87121' '87575' '9' '900' 'a' 'aft'
 'aid' 'alreadi' 'anymor' 'appli' 'ard' 'around' 'as' 'b' 'bless'
 'breather' 'brother' 'call' 'caller' 'callertun' 'camera' 'cash' 'chanc'
 'claim' 'click' 'co' 'code' 'colour' 'comin' 'comp' 'copi' 'cost'
 'credit' 'cri' 'csh11' 'cup' 'custom' 'da' 'date' 'dont' 'eg' 'eh'
 'england' 'enough' 'entitl' 'entri' 'even' 'fa' 'feel' 'final' 'fine'
 'finish' 'first' 'free' 'friend' 'from' 'fulfil' 'go' 'goalsteam' 'goe'
 'gonna' 'gota' 'grant' 'ha' 'had' 'have' 'he' 'help' 'hl' 'home' 'hour'
 'httpwap' 'i' 'im' 'info' 'is' 'ive' 'jackpot' 'joke' 'k' 'kim' 'kl341'
 'lar' 'latest' 'lccltd' 'like' 'link' 'live' 'lor' 'lunch' 'macedonia'
 'make' 'may' 'mell' 'membership' 'messag' 'minnaminungint' 'miss' 'mobil'
 'month' 'nah' 'name' 'nation' 'naughti' 'network' 'news' 'ne

In [54]:
x_tfidf_df = pd.DataFrame(x_tfidf_sample.toarray())
x_tfidf_df.columns = tfidf_vect_sample.get_feature_names_out()
x_tfidf_df

Unnamed: 0,08002986030,08452810075over18,09061701461,1,100,100000,11,12,150pday,16,...,wkli,wonder,wont,word,wwwdbuknet,xxxmobilemovieclub,xxxmobilemovieclubcomnqjkgighjjgcbl,ye,you,ü
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.230352,0.230352,0.202483,0.0,0.0,0.0,0.0,0.202483,0.0
1,0.0,0.198986,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.198986,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.2226,0.0,0.0,0.0,0.0,0.2226,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.190382,0.0,0.0,0.0,0.0,0.0,0.190382,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.219424,0.0,0.0,0.0,0.219424,0.192877,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
