# NLP Intro

In [22]:
import numpy as np

review1 = "I LOVE this book about love"
review2 = "No this book was okay"

all_words = [text.lower().split() for text in [review1, review2]]
print(all_words)

# flatten this list
words = [word for text in all_words for word in text]
unique_words = set(words)

[['i', 'love', 'this', 'book', 'about', 'love'], ['no', 'this', 'book', 'was', 'okay']]


In [25]:
vocabulary = {word : index for index, word in enumerate(unique_words)}
print(vocabulary)

def term_frequency_vectorizer(document, vocabulary):
    term_frequency = np.zeros(len(vocabulary))

    for word in document.lower().split():
        index = vocabulary[word]
        term_frequency[index] += 1

    return term_frequency

{'book': 0, 'i': 1, 'this': 2, 'love': 3, 'no': 4, 'was': 5, 'okay': 6, 'about': 7}


In [26]:
r1_f = term_frequency_vectorizer(review1,vocabulary)
r2_f = term_frequency_vectorizer(review2,vocabulary)


array([1., 0., 1., 0., 1., 1., 1., 0.])

In [29]:
import pandas as pd

bag_of_word = pd.DataFrame(
    [r1_f, r2_f], columns=vocabulary.keys(), dtype= int
)

bag_of_word

Unnamed: 0,book,i,this,love,no,was,okay,about
0,1,1,1,2,0,0,0,1
1,1,0,1,0,1,1,1,0


## Feature extraction with sklearn

In [33]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer

count_vectorizer = CountVectorizer()
bag_of_words_sparse = count_vectorizer.fit_transform([review1, review2])
bag_of_words_sparse.todense(), count_vectorizer.get_feature_names_out()

(matrix([[1, 1, 2, 0, 0, 1, 0],
         [0, 1, 0, 1, 1, 1, 1]]),
 array(['about', 'book', 'love', 'no', 'okay', 'this', 'was'], dtype=object))

In [35]:
bag_of_words = pd.DataFrame(bag_of_words_sparse.todense(), columns= count_vectorizer.get_feature_names_out())

## TF-IDF

In [36]:
tfidf_transformer = TfidfTransformer()
tfidf = tfidf_transformer.fit_transform(bag_of_words_sparse)
tfidf.todense()

matrix([[0.4078241 , 0.29017021, 0.81564821, 0.        , 0.        ,
         0.29017021, 0.        ],
        [0.        , 0.35520009, 0.        , 0.49922133, 0.49922133,
         0.35520009, 0.49922133]])