# Tokenization

In [1]:
import nltk
from nltk import sent_tokenize, word_tokenize
from wandb import sklearn

In [2]:
corpus = """This is a testing paragraph. I'm feeling very excited about the mistake of joining St. Joseph's College in Chennai.\nLet's see how this mistake impacts my life"""

In [3]:
documents = sent_tokenize(corpus)

In [4]:
documents

['This is a testing paragraph.',
 "I'm feeling very excited about the mistake of joining St. Joseph's College in Chennai.",
 "Let's see how this mistake impacts my life"]

In [5]:
for doc in  documents:
    print(word_tokenize(doc))

['This', 'is', 'a', 'testing', 'paragraph', '.']
['I', "'m", 'feeling', 'very', 'excited', 'about', 'the', 'mistake', 'of', 'joining', 'St.', 'Joseph', "'s", 'College', 'in', 'Chennai', '.']
['Let', "'s", 'see', 'how', 'this', 'mistake', 'impacts', 'my', 'life']


In [6]:
from nltk.tokenize import wordpunct_tokenize


for doc in documents:
    print(wordpunct_tokenize(doc))

['This', 'is', 'a', 'testing', 'paragraph', '.']
['I', "'", 'm', 'feeling', 'very', 'excited', 'about', 'the', 'mistake', 'of', 'joining', 'St', '.', 'Joseph', "'", 's', 'College', 'in', 'Chennai', '.']
['Let', "'", 's', 'see', 'how', 'this', 'mistake', 'impacts', 'my', 'life']


In [7]:
treebankTokenizer = nltk.tokenize.TreebankWordTokenizer()
treebankDeTokenizer = nltk.tokenize.treebank.TreebankWordDetokenizer()
for doc in documents:
    tokens = treebankTokenizer.tokenize(doc)
    print(tokens)
    print(treebankDeTokenizer.detokenize(tokens))
    print()
    print()

['This', 'is', 'a', 'testing', 'paragraph', '.']
This is a testing paragraph.


['I', "'m", 'feeling', 'very', 'excited', 'about', 'the', 'mistake', 'of', 'joining', 'St.', 'Joseph', "'s", 'College', 'in', 'Chennai', '.']
I'm feeling very excited about the mistake of joining St. Joseph's College in Chennai.


['Let', "'s", 'see', 'how', 'this', 'mistake', 'impacts', 'my', 'life']
Let's see how this mistake impacts my life




# Stemming

In [8]:
words = ["fairly", "amazingly", "wonderfully", "successfully", "awesome", "gluey", "history", "wordly", "graffiti", "happily", "flying", "denied", "agreed", "owned"]

In [9]:
from nltk.stem import *

In [10]:
stemmers = {
    "Porter": PorterStemmer(),
    "Lancaster": LancasterStemmer(),
    "Snowball": SnowballStemmer("english"),
    "ISRI": ISRIStemmer(),
    "RSLP": RSLPStemmer(),
    "Regexp": RegexpStemmer("ing$|s$|y$"),
    "ARLStem": ARLSTem(),
    "ARLSTem2": ARLSTem2(),
    "Cistem": Cistem()
}

In [11]:
for word in words:
    for stemmer in stemmers:
        print(f"Original Word: {word} | Stemmer: {stemmer} | Stemmed Word: {stemmers[stemmer].stem(word)}")
    print()

Original Word: fairly | Stemmer: Porter | Stemmed Word: fairli
Original Word: fairly | Stemmer: Lancaster | Stemmed Word: fair
Original Word: fairly | Stemmer: Snowball | Stemmed Word: fair
Original Word: fairly | Stemmer: ISRI | Stemmed Word: fairly
Original Word: fairly | Stemmer: RSLP | Stemmed Word: fairly
Original Word: fairly | Stemmer: Regexp | Stemmed Word: fairl
Original Word: fairly | Stemmer: ARLStem | Stemmed Word: fairly
Original Word: fairly | Stemmer: ARLSTem2 | Stemmed Word: fairly
Original Word: fairly | Stemmer: Cistem | Stemmed Word: fairly

Original Word: amazingly | Stemmer: Porter | Stemmed Word: amazingli
Original Word: amazingly | Stemmer: Lancaster | Stemmed Word: amaz
Original Word: amazingly | Stemmer: Snowball | Stemmed Word: amaz
Original Word: amazingly | Stemmer: ISRI | Stemmed Word: amazingly
Original Word: amazingly | Stemmer: RSLP | Stemmed Word: amazingly
Original Word: amazingly | Stemmer: Regexp | Stemmed Word: amazingl
Original Word: amazingly | St

# Lemmatization

In [12]:
from nltk.stem import WordNetLemmatizer

In [13]:
wordnet = WordNetLemmatizer()

In [14]:
for doc in documents:
    print(wordnet.lemmatize(doc)    )

This is a testing paragraph.
I'm feeling very excited about the mistake of joining St. Joseph's College in Chennai.
Let's see how this mistake impacts my life




# Bag Of Words

In [23]:
import pandas as pd
import sklearn
from sklearn.feature_extraction.text import CountVectorizer
import re

In [20]:
df = pd.read_csv("SpamClassifier-master/smsspamcollection/SMSSpamCollection", sep='\t', names=["label", "message"])

In [21]:
df.head()

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [18]:
stopwords_list = nltk.corpus.stopwords.words('english')

In [52]:
def preprocess_text(text):
    text = re.sub('[^a-zA-Z]', ' ', text)
    text = text.lower()
    tokenizer = nltk.tokenize.TreebankWordTokenizer()
    lemmatizer = nltk.stem.WordNetLemmatizer()
    tokens_list = tokenizer.tokenize(text)
    tokens_list = [lemmatizer.lemmatize(token) for token in tokens_list if token not in stopwords_list]
    return ' '.join(tokens_list)


In [54]:
vectorizer = CountVectorizer(lowercase=True, stop_words='english', binary=True, max_features=2000)

In [39]:
X = df['message']

In [53]:
X[0], preprocess_text(X[0])

('Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...',
 'go jurong point crazy available bugis n great world la e buffet cine got amore wat')

In [55]:
X_bow = vectorizer.fit_transform(X)

In [56]:
X_bow.shape

(5572, 2000)

In [57]:
print(X_bow[0])

<Compressed Sparse Row sparse matrix of dtype 'int64'
	with 10 stored elements and shape (1, 2000)>
  Coords	Values
  (0, 1332)	1
  (0, 466)	1
  (0, 234)	1
  (0, 325)	1
  (0, 774)	1
  (0, 1955)	1
  (0, 977)	1
  (0, 398)	1
  (0, 766)	1
  (0, 1894)	1


In [62]:
vectorizer.vocabulary_

{'point': 1332,
 'crazy': 466,
 'available': 234,
 'bugis': 325,
 'great': 774,
 'world': 1955,
 'la': 977,
 'cine': 398,
 'got': 766,
 'wat': 1894,
 'ok': 1237,
 'lar': 985,
 'joking': 945,
 'wif': 1924,
 'oni': 1244,
 'free': 701,
 'entry': 609,
 'wkly': 1939,
 'comp': 426,
 'win': 1928,
 'cup': 477,
 'final': 673,
 'tkts': 1759,
 'text': 1720,
 '87121': 123,
 'receive': 1412,
 'question': 1379,
 'std': 1643,
 'txt': 1807,
 'rate': 1394,
 'apply': 202,
 'dun': 577,
 'say': 1493,
 'early': 581,
 'nah': 1186,
 'don': 558,
 'think': 1735,
 'goes': 755,
 'usf': 1843,
 'lives': 1033,
 'freemsg': 703,
 'hey': 828,
 'darling': 497,
 'week': 1910,
 'word': 1949,
 'like': 1023,
 'fun': 722,
 'tb': 1700,
 'xxx': 1973,
 'send': 1521,
 '50': 92,
 'brother': 317,
 'speak': 1620,
 'treat': 1787,
 'request': 1442,
 'oru': 1260,
 'set': 1531,
 'callertune': 342,
 'callers': 341,
 'press': 1355,
 'copy': 450,
 'friends': 710,
 'winner': 1930,
 'valued': 1850,
 'network': 1198,
 'customer': 482,
 'sel

# N Grams Implementation


Let us implement N Grams with the CountVectorizer

In [65]:
n_gram_vectorizer = CountVectorizer(lowercase=True, stop_words='english', ngram_range=(1,2), binary=True, preprocessor=preprocess_text)

In [66]:
texts = [
    "This is a beautiful day",
    "I love her very much, but she doesn't love me back",
    "This is all about practice"
]

In [67]:
text_processed =n_gram_vectorizer.fit_transform(texts)

In [71]:
print(text_processed[0])

<Compressed Sparse Row sparse matrix of dtype 'int64'
	with 3 stored elements and shape (1, 6)>
  Coords	Values
  (0, 0)	1
  (0, 2)	1
  (0, 1)	1


In [73]:
for i in texts:
    print(preprocess_text(i))

beautiful day
love much love back
practice


In [72]:
n_gram_vectorizer.vocabulary_

{'beautiful': 0,
 'day': 2,
 'beautiful day': 1,
 'love': 3,
 'love love': 4,
 'practice': 5}

In [78]:
n_gram_vectorizer = CountVectorizer(lowercase=True, stop_words='english', ngram_range=(1, 3), binary=True,
                                    preprocessor=preprocess_text)
text_processed = n_gram_vectorizer.fit_transform(texts)


In [79]:
print(text_processed[0])

<Compressed Sparse Row sparse matrix of dtype 'int64'
	with 3 stored elements and shape (1, 6)>
  Coords	Values
  (0, 0)	1
  (0, 2)	1
  (0, 1)	1


In [80]:
for i in texts:
    print(preprocess_text(i))

beautiful day
love much love back
practice


In [81]:
n_gram_vectorizer.vocabulary_

{'beautiful': 0,
 'day': 2,
 'beautiful day': 1,
 'love': 3,
 'love love': 4,
 'practice': 5}