# Module 12 Feature Vectors 

- 

In [1]:
from nltk.corpus import stopwords 
import string

### Consider the following two reviews.

In [2]:
review_1 = "This blender is fantastic! It blends smoothly, is easy to clean, and delivers consistent results every time. Highly recommended!"

review_2 = "A great blender! It blends effortlessly, cleans easily, and produces smooth results consistently. Highly recommended for everyday use!"

In [3]:
# Load NLTK's English stopword list (the NLTK 'stopwords' corpus must be available locally).
english_stops = stopwords.words('english')

# Also treat 'br' as a stopword -- presumably residue of HTML <br> tags in review text; TODO confirm.
english_stops.append('br')

### Function returns a clean word list.

In [4]:
def my_clean_words(s):
    """Return the lowercase, punctuation-free, non-stopword words of ``s``.

    Every character in ``string.punctuation`` is deleted, the remaining text
    is split on whitespace, each word is lowercased, and words found in the
    module-level ``english_stops`` list are dropped. Word order and
    duplicates are preserved.

    NOTE: punctuation is removed before the stopword comparison, so any
    stopword entry that itself contains punctuation (e.g. an apostrophe)
    can never match a cleaned word.
    """
    # Build the lookup set once per call: set membership is O(1), whereas
    # scanning the stopword list for every word is O(len(english_stops)).
    stop_set = set(english_stops)

    # Delete (rather than replace) punctuation: "smoothly," -> "smoothly".
    review = s.translate(str.maketrans('', '', string.punctuation))

    lowered = (w.lower() for w in review.split())
    return [w for w in lowered if w not in stop_set]

In [5]:
review_1_cleaned = my_clean_words(review_1)

In [6]:
review_2_cleaned = my_clean_words(review_2)

### Two cleaned word lists from two reviews

In [7]:
print(review_1_cleaned)

print(review_2_cleaned)

['blender', 'fantastic', 'blends', 'smoothly', 'easy', 'clean', 'delivers', 'consistent', 'results', 'every', 'time', 'highly', 'recommended']
['great', 'blender', 'blends', 'effortlessly', 'cleans', 'easily', 'produces', 'smooth', 'results', 'consistently', 'highly', 'recommended', 'everyday', 'use']


### Check individual size

In [8]:
print(len(review_1_cleaned))

print(len(review_2_cleaned))

13
14


In [9]:
review_1_cleaned + review_2_cleaned

['blender',
 'fantastic',
 'blends',
 'smoothly',
 'easy',
 'clean',
 'delivers',
 'consistent',
 'results',
 'every',
 'time',
 'highly',
 'recommended',
 'great',
 'blender',
 'blends',
 'effortlessly',
 'cleans',
 'easily',
 'produces',
 'smooth',
 'results',
 'consistently',
 'highly',
 'recommended',
 'everyday',
 'use']

### Count the distinct words across both reviews.

In [10]:
# Vocabulary size: number of distinct words across both cleaned reviews.
len(set(review_1_cleaned) | set(review_2_cleaned))

22

### Find the words common to both reviews.

In [11]:
# Words that occur in both cleaned reviews.
set(review_1_cleaned) & set(review_2_cleaned)

{'blender', 'blends', 'highly', 'recommended', 'results'}

### Create attribute words

In [12]:
# Vocabulary: one entry per distinct word across both cleaned reviews.
# NOTE: set iteration order is arbitrary, so the order of this list (and hence
# the column order of the vectors built from it) can differ between runs.
attribute_words = list(set(review_1_cleaned+review_2_cleaned))

In [13]:
attribute_words

['produces',
 'cleans',
 'every',
 'recommended',
 'fantastic',
 'consistent',
 'blender',
 'highly',
 'easily',
 'great',
 'consistently',
 'delivers',
 'use',
 'everyday',
 'easy',
 'blends',
 'clean',
 'smoothly',
 'smooth',
 'effortlessly',
 'results',
 'time']

### Create a vector (one hot encoding) for review 1: 1 if the attribute word appears in the review, 0 otherwise

In [14]:
# Binary presence vector for review 1, aligned with attribute_words:
# 1 where the vocabulary word occurs in the review, 0 where it does not.
vector = [1 if word in review_1_cleaned else 0 for word in attribute_words]

In [15]:
vector

[0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1]

In [16]:
reviews_cleaned = [review_1_cleaned, review_2_cleaned]

### Create word vectors for two reviews.

In [17]:
# Build one binary presence vector per review, in the order of reviews_cleaned.
# Iterating over the list itself (instead of a hard-coded range(2)) keeps this
# cell correct when more reviews are added to reviews_cleaned.
vectors = []

for cleaned in reviews_cleaned:
    # Set membership is O(1); the vocabulary is scanned once per review.
    present = set(cleaned)

    # 1 where the vocabulary word occurs in this review, 0 otherwise.
    vector = [1 if w in present else 0 for w in attribute_words]

    vectors.append(vector)

In [18]:
vectors

[[0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1],
 [1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0]]

### Show the words that are present (value 1) in both vectors

In [19]:
# For each pair of consecutive review vectors, print the index and the word of
# every vocabulary entry that is present in BOTH reviews.
# Both entries must be non-zero: merely testing equality would also report
# words that are absent from both reviews (0 == 0), which can happen as soon
# as the vocabulary is built from more than two reviews.
for first, second in zip(vectors, vectors[1:]):

    for j, (a, b) in enumerate(zip(first, second)):

        if a > 0 and b > 0:
            print(j)
            print(attribute_words[j])

3
recommended
6
blender
7
highly
15
blends
20
results


## Summary:

- Notice that we only have 1s and 0s in this example.
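- If we counted duplicate words (which are more likely in longer reviews), an entry could be greater than 1: each value would then be the word's frequency in the review rather than a presence flag.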