# NLP Intro

In [15]:
import numpy as np

review1 = "I LOVE this book about love"
review2 = "No this book was okay"

all_words = [text.lower().split() for text in [review1, review2]]
print(all_words)


[['i', 'love', 'this', 'book', 'about', 'love'], ['no', 'this', 'book', 'was', 'okay']]


In [16]:
# wants to flatten the 2D list to 1D list
all_words = [word for text in all_words for word in text]
all_words

['i',
 'love',
 'this',
 'book',
 'about',
 'love',
 'no',
 'this',
 'book',
 'was',
 'okay']

In [22]:
unique_words = set(all_words)
type(unique_words), unique_words

(set, {'about', 'book', 'i', 'love', 'no', 'okay', 'this', 'was'})

In [24]:
vocabulary = {word: index for index, word in enumerate(unique_words)}
print(vocabulary)



{'about': 0, 'love': 1, 'this': 2, 'i': 3, 'okay': 4, 'was': 5, 'no': 6, 'book': 7}


In [27]:
def term_frequency_vectorizer(document, vocabulary = vocabulary):
    term_frequency = np.zeros(len(vocabulary))

    for word in document.lower().split():
        index = vocabulary[word]
        term_frequency[index] += 1
    
    return term_frequency

review1_term_frequency = term_frequency_vectorizer(review1)
review2_term_frequency = term_frequency_vectorizer(review2)

print(vocabulary)
print(review1)
print(review2)
review1_term_frequency, review2_term_frequency

{'about': 0, 'love': 1, 'this': 2, 'i': 3, 'okay': 4, 'was': 5, 'no': 6, 'book': 7}
I LOVE this book about love
No this book was okay


(array([1., 2., 1., 1., 0., 0., 0., 1.]),
 array([0., 0., 1., 0., 1., 1., 1., 1.]))

In [30]:
import pandas as pd

bag_of_words = pd.DataFrame([review1_term_frequency, review2_term_frequency], columns=vocabulary.keys())
bag_of_words

Unnamed: 0,about,love,this,i,okay,was,no,book
0,1.0,2.0,1.0,1.0,0.0,0.0,0.0,1.0
1,0.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0


## Bag of words sklearn

In [35]:
from sklearn.feature_extraction.text import CountVectorizer

count_vectorizer = CountVectorizer()
bag_of_words_sparse = count_vectorizer.fit_transform([review1, review2])
bag_of_words_sparse.todense(), count_vectorizer.get_feature_names_out()



(matrix([[1, 1, 2, 0, 0, 1, 0],
         [0, 1, 0, 1, 1, 1, 1]], dtype=int64),
 array(['about', 'book', 'love', 'no', 'okay', 'this', 'was'], dtype=object))

In [36]:
bag_of_words = pd.DataFrame(bag_of_words_sparse.todense(), columns=count_vectorizer.get_feature_names_out())
bag_of_words

Unnamed: 0,about,book,love,no,okay,this,was
0,1,1,2,0,0,1,0
1,0,1,0,1,1,1,1


## TF-IDF

In [37]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vectorizer = TfidfVectorizer()
tfidf_vectorizer.fit_transform([review1, review2]).todense()

# the more common the word is in a specific but not common in the corpus of documents will have higher tf-idf value


matrix([[0.4078241 , 0.29017021, 0.81564821, 0.        , 0.        ,
         0.29017021, 0.        ],
        [0.        , 0.35520009, 0.        , 0.49922133, 0.49922133,
         0.35520009, 0.49922133]])