In [115]:
# Imports
import nltk
import numpy as np

In [None]:
# Download WebText Corpus If Not Already Downloaded
nltk.download('webtext')
from nltk.corpus import webtext

## Basic Parsing And Parsing Testing

In [17]:
# Get Raw Data
wine_raw_text = webtext.raw("wine.txt")

In [24]:
# Break Raw Data Into Individual Reviews
wine_reviews_raw = wine_raw_text.split("\n")

print("Number of Reviews: " + str(len(wine_reviews_raw)))
print("Review 1: " + wine_reviews_raw[0])

Number of Reviews: 1348
Review 1: Lovely delicate, fragrant Rhone wine. Polished leather and strawberries. Perhaps a bit dilute, but good for drinking now. ***


## Creating A Bag of Words From Each Review

In [25]:
# Attempt Splitting A Single Review
print(wine_reviews_raw[0].split())

['Lovely', 'delicate,', 'fragrant', 'Rhone', 'wine.', 'Polished', 'leather', 'and', 'strawberries.', 'Perhaps', 'a', 'bit', 'dilute,', 'but', 'good', 'for', 'drinking', 'now.', '***']


Two Issues
1. Capitalization 
2. Punctuation

In [27]:
# Lowercase Conversion
print(wine_reviews_raw[0].lower())

lovely delicate, fragrant rhone wine. polished leather and strawberries. perhaps a bit dilute, but good for drinking now. ***


In [32]:
# Remove Punctuation
translator = str.maketrans('', '', string.punctuation)
print(wine_reviews_raw[0].translate(translator))

Lovely delicate fragrant Rhone wine Polished leather and strawberries Perhaps a bit dilute but good for drinking now 


Note - We will need to remove the score before removing all punctuation

## Create Classification Labels For Reviews

In [96]:
scores = []
for review in wine_reviews_raw:
    
    if "*" in review:
        
        asterisk_count = review.count("*")
        
        if "(*)" in review:
            asterisk_count -= 1
        
        scores.append(asterisk_count)
    else:
        scores.append(0)
        
        
print("Number of scores: " + str(len(scores)))

Number of scores: 1348


In [97]:
# Create Two Level Labels
# 0 If Score Is 2 or Lower (Bad Wine)
# 1 If Score Is 3 Or Higher (Good Wine)

labels_two_level = []
for score in scores:
    if score <= 2:
        labels_two_level.append(0)
    else:
        labels_two_level.append(1)
        
        
print("Number of 0 Labels: " + str(len(list(filter(lambda x: x == 0, labels_two_level)))))
print("Number of 1 Labels: " + str(len(list(filter(lambda x: x == 1, labels_two_level)))))

Number of 0 Labels: 470
Number of 1 Labels: 878


##  Create Bag of Words For Each Classification Label

In [71]:
label_bag_of_words = ["", ""]

for i, review in enumerate(wine_reviews_raw):
    
    if labels_two_level[i] == 0:
        
        # Remove "No Stars" From Reviews With 0 Stars
        if "No Stars" in review:
            review = review.replace('No Stars','')
            
        label_bag_of_words[0] += review.lower().translate(translator) + " "
    else:
        label_bag_of_words[1] += review.lower().translate(translator) + " "
        

## Tokenize Each Bag of Words And Create Frequency Distribution

In [99]:
# Label 0 Tokens and Distribution
label_0_tokens = nltk.tokenize.word_tokenize(label_bag_of_words[0])

freq_dist_0 = nltk.FreqDist(label_0_tokens)
print(freq_dist_0)
print("Top 50 Frequent Words For Label 0:\n" + str(freq_dist_0.most_common(50)))

<FreqDist with 1430 samples and 5800 outcomes>
Top 50 Frequent Words:
[('a', 263), ('and', 192), ('the', 186), ('but', 124), ('of', 118), ('not', 100), ('i', 91), ('it', 80), ('to', 80), ('fruit', 78), ('good', 77), ('bit', 73), ('quite', 71), ('this', 68), ('wine', 63), ('very', 59), ('in', 56), ('with', 53), ('rated', 48), ('top', 48), ('is', 43), ('rather', 42), ('for', 40), ('that', 38), ('on', 36), ('some', 33), ('nice', 33), ('dry', 32), ('touch', 31), ('at', 29), ('be', 29), ('from', 28), ('was', 26), ('more', 25), ('pleasant', 25), ('bottle', 24), ('just', 23), ('an', 23), ('nose', 23), ('slightly', 20), ('wines', 20), ('its', 20), ('than', 19), ('have', 19), ('one', 19), ('so', 18), ('palate', 18), ('pure', 17), ('rich', 17), ('like', 17)]


In [100]:
# Label 1 Tokens and Distribution
label_1_tokens = nltk.tokenize.word_tokenize(label_bag_of_words[1])

freq_dist_1 = nltk.FreqDist(label_1_tokens)
print(freq_dist_1)
print("Top 50 Frequent Words For Label 1:\n" + str(freq_dist_1.most_common(50)))

<FreqDist with 2395 samples and 18058 outcomes>
Top 50 Frequent Words For Label 1:
[('a', 788), ('and', 596), ('the', 558), ('but', 367), ('of', 364), ('very', 321), ('good', 286), ('i', 245), ('it', 236), ('quite', 232), ('to', 228), ('this', 221), ('fruit', 217), ('with', 206), ('wine', 167), ('top', 167), ('in', 163), ('lovely', 159), ('bit', 144), ('is', 138), ('touch', 129), ('bare', 128), ('nose', 127), ('not', 124), ('more', 120), ('on', 117), ('nice', 115), ('for', 115), ('that', 112), ('dry', 110), ('at', 107), ('palate', 97), ('rather', 91), ('fine', 91), ('drinking', 84), ('be', 83), ('still', 79), ('rich', 79), ('from', 79), ('its', 77), ('long', 75), ('than', 74), ('perhaps', 72), ('time', 69), ('some', 66), ('really', 65), ('finish', 64), ('too', 62), ('so', 58), ('balance', 57)]


## Create Dictionary Of All Words

In [102]:
wines_dictionary = set(freq_dist_0.keys()).union(set(freq_dist_1.keys()))
print("Number Of Unique Words: " + str(len(wines_dictionary)))

Number Of Unique Words: 2946


## Remove Top N Frequent Words From Each Label  From The Dictionary

In [103]:
n = 50
label_0_top_50 = {word for word, freq in freq_dist_0.most_common(n)}
label_1_top_50 = {word for word, freq in freq_dist_1.most_common(n)}

print(label_0_top_50)
print(label_1_top_50)

{'the', 'nose', 'rich', 'nice', 'is', 'touch', 'slightly', 'pleasant', 'rated', 'dry', 'bit', 'but', 'very', 'on', 'to', 'of', 'more', 'and', 'with', 'be', 'at', 'fruit', 'one', 'bottle', 'its', 'was', 'an', 'so', 'some', 'this', 'wines', 'wine', 'it', 'from', 'just', 'a', 'have', 'palate', 'good', 'i', 'quite', 'pure', 'rather', 'top', 'for', 'that', 'not', 'in', 'than', 'like'}
{'the', 'long', 'nose', 'rich', 'nice', 'finish', 'still', 'is', 'touch', 'time', 'dry', 'really', 'bit', 'but', 'very', 'on', 'to', 'of', 'more', 'and', 'with', 'be', 'at', 'fruit', 'its', 'so', 'some', 'fine', 'this', 'wine', 'drinking', 'lovely', 'it', 'from', 'perhaps', 'a', 'palate', 'good', 'i', 'quite', 'rather', 'top', 'too', 'for', 'that', 'not', 'balance', 'bare', 'in', 'than'}


In [104]:
wine_dict_top_removed = (wines_dictionary-label_0_top_50)-label_1_top_50
print("Number Of Unique Words After Top N From Each Label Removed: " + str(len(wine_dict_top_removed)))

Number Of Unique Words After Top N From Each Label Removed: 2884


In [106]:
for word_to_remove in label_0_top_50.union(label_1_top_50):
    del freq_dist_0[word_to_remove]
    del freq_dist_1[word_to_remove]
    
print(freq_dist_0)
print(freq_dist_1)

<FreqDist with 1368 samples and 2958 outcomes>
<FreqDist with 2333 samples and 9077 outcomes>


## Final Naive Bayes Data Format

This is what im currently expecting as input to the classifier.

In [107]:
wine_data = {}
wine_data[0] = freq_dist_0
wine_data[1] = freq_dist_1
win_dict = wine_dict_top_removed

In [112]:
from functools import reduce
dictionary = reduce(lambda key_a, key_b: set(wine_data[key_a].keys()).union(set(wine_data[key_b].keys())), wine_data)
print(len(dictionary))

2884


# Get Vectorized Data

In [130]:
# Hold Vectors
data_as_vectors = np.zeros((len(wine_reviews_raw), len(dictionary)))

In [131]:
# Create map
word_map = {}
for i, key in enumerate(dictionary):
    word_map[key] = i
    
print(word_map)

{'oxidative': 0, 'warmclimatelike': 3, 'bananas': 4, 'wellstructured': 6, 'plus': 7, 'everybody': 9, 'insight': 10, 'somewhat': 1, 'carry': 11, 'probably': 12, 'floury': 13, 'except': 14, 'joyfully': 501, 'article': 15, 'fragile': 16, '98': 18, 'corky': 19, 'side': 20, 'group': 21, 'pleasnat': 2, 'contrived': 22, 'etheriel': 23, 'lazy': 24, 'handled': 25, 'leaner': 983, 'car': 28, 'always': 27, 'glimpse': 29, 'june': 2791, 'syrahlike': 31, 'oddly': 502, 'silkytextured': 32, '939495': 34, 'hours': 36, 'chapel': 35, 'spring': 37, 'rancio': 38, 'surfing': 39, 'expression': 8, 'recommendable': 40, 'floral': 41, 'middle': 43, 'supple': 44, 'syrah': 1059, 'berryish': 45, 'bbq': 46, 'none': 47, 'infants': 48, 'oldvine': 49, 'honeysuckle': 50, 'fuit': 51, 'warms': 52, 'corruption': 53, 'game': 54, 'citric': 55, 'cigary': 56, 'stages': 57, 'dull': 1336, 'greenrimmed': 506, 'hour': 58, 'sherberty': 59, 'lightmedium': 60, 'dark': 61, 'nicely': 63, 'condition': 64, 'worked': 65, 'tokaji': 68, 'ove

In [132]:
for i, review in enumerate(wine_reviews_raw):
    
    if labels_two_level[i] == 0:
        
        # Remove "No Stars" From Reviews With 0 Stars
        if "No Stars" in review:
            review = review.replace('No Stars','')
            
    words =  nltk.tokenize.word_tokenize(review.lower().translate(translator))
    
    for word in words:
        if word in word_map:
            data_as_vectors[i, word_map[word]] += 1

In [133]:
print(data_as_vectors[0])
print(np.sum(data_as_vectors[0]))


[0. 0. 0. ... 0. 0. 0.]
8.0
