In [1]:
# We use the Natural Language Toolkit Library (NLTK) to look at individual words and
# sentences in a text, and to clean unnecessary features from the text data in
# order to prepare for Sentiment Analysis. Text Analysis is this. (Yoda)

# The NLTK library was built to separate punctuation from words
# when tokenizing (splitting into parts).

import nltk
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
from nltk.tokenize import TweetTokenizer
from nltk.probability import FreqDist
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# this is sample data
from nltk.corpus import names

from string import punctuation

In [11]:
# the comma is now its own token when the sentence is split
# the sentence is broken down into its most meaningful word parts

text3 = "My best girls are Vanessa, Kayla, and Maude!"
word_tokenize(text3)

['My',
 'best',
 'girls',
 'are',
 'Vanessa',
 ',',
 'Kayla',
 ',',
 'and',
 'Maude',
 '!']

In [12]:
# Multi-sentence texts can be tokenized by sentence. 
# Each sentence is an item in the list.

text4 = "Mona, Erma, and Hefe are done. They've eaten the beans, rice, and gravy!"

sent_tokenize(text4)

['Mona, Erma, and Hefe are done.', "They've eaten the beans, rice, and gravy!"]

In [13]:
# How do we deal with tweets and common social media signs?
# Watch how word_tokenize() butchers this...

tweet = "@phattygirlz OMG u ODB U R so #stupid ;>"

word_tokenize(tweet)

['@',
 'phattygirlz',
 'OMG',
 'u',
 'ODB',
 'U',
 'R',
 'so',
 '#',
 'stupid',
 ';',
 '>']

In [14]:
# Instead, let's keep the @ sign, # sign, and 
# [Note: It missed here.] the wink-at-you emoji...

TweetTokenizer().tokenize(tweet)

['@phattygirlz', 'OMG', 'u', 'ODB', 'U', 'R', 'so', '#stupid', ';', '>']

In [7]:
#list of english stopwords
# stopwords are also filler words

eng_stopwords = stopwords.words('english')
eng_stopwords

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [15]:
# SENTIMENT ANALYSIS
# In order to understand how people feel about something, we need to do sentiment analysis
# on text data that contains their opinion.

# VALENCE AWARE DICTIONARY and SENTIMENT REASONER or V.A.D.E.R.

# The VADER Sentiment Intensity Analyzer returns a score between -1 and 1.
# Scores closer to -1 have a negative sentiment, scores closer to 1 have
# a positive sentiment, and scores around 0 are considered neutral.

In [16]:
# initialize function to do sentiment analysis

sid = SentimentIntensityAnalyzer()

In [17]:
# make a variable of the phrase as a string

goofyquote = "He was decent in college, but he sure stinks in the Pros"

In [18]:
# the polarity score function returns a dictionary, and
# neg, neu, and pos are the ratio of the number of words that are 
# negative (neg), neutral (neu), and positive (pos).

# compound is the sentiment score, which is a given text's polarity

sid.polarity_scores(goofyquote)

{'neg': 0.162, 'neu': 0.647, 'pos': 0.191, 'compound': 0.1154}

In [19]:
# extract the sentiment value from the dictionary of scores

sid.polarity_scores(goofyquote)['compound']

0.1154

In [20]:
vd_comp = sid.polarity_scores(goofyquote)['compound']
type(vd_comp)

float

In [21]:
# Make a Sentiment Value Column in a DataFrame

# Using 'women_clothing_review.csv' dataset, we add a New Column to the
# dataset that will have a Numerical Value for the sentiment of each review.

In [22]:
import pandas as pd

# load the data from the women_clothing_review.csv file

filepath = r"women_clothing_review.csv"
df = pd.read_csv(filepath, encoding = "latin-1") # this file is encoded differently

df.head()

Unnamed: 0.1,Unnamed: 0,Clothing ID,Age,Title,Review Text,Rating,Recommended IND,Positive Feedback Count,Division Name,Department Name,Class Name
0,0,767,33,,Absolutely wonderful - silky and sexy and comf...,4,1,0,Initmates,Intimate,Intimates
1,1,1080,34,,Love this dress! it's sooo pretty. i happene...,5,1,4,General,Dresses,Dresses
2,2,1077,60,Some major design flaws,I had such high hopes for this dress and reall...,3,0,0,General,Dresses,Dresses
3,3,1049,50,My favorite buy!,"I love, love, love this jumpsuit. it's fun, fl...",5,1,0,General Petite,Bottoms,Pants
4,4,847,47,Flattering shirt,This shirt is very flattering to all due to th...,5,1,6,General,Tops,Blouses


In [51]:
# Where are the columns or fields whose values are EMPTY?
# Let's Remove the Missing Values from the "Review Text" column.

df.isnull().sum()

Unnamed: 0                    0
Clothing ID                   0
Age                           0
Title                      3810
Review Text                 845
Rating                        0
Recommended IND               0
Positive Feedback Count       0
Division Name                14
Department Name              14
Class Name                   14
review_sentiment              0
dtype: int64

In [52]:
# Keep the DataFrame with valid entries in the same variable.

df.dropna(inplace=True)

In [53]:
df.isnull().sum()

Unnamed: 0                 0
Clothing ID                0
Age                        0
Title                      0
Review Text                0
Rating                     0
Recommended IND            0
Positive Feedback Count    0
Division Name              0
Department Name            0
Class Name                 0
review_sentiment           0
dtype: int64

In [54]:
# Create a function to clean up each review
# then it will anlayze and assign a sentiment polarity

def reviewSentiment(review):
    
    # make text lowercase
    review = str(review).lower()
    
    # tokenize the review
    tknz_review = word_tokenize(review)
    
    # remove punctuation
    for token in tknz_review:
        if token in punctuation:
            tknz_review.remove(token)
            
    # empty list to hold "cleaned" tokens
    clean_tokens = []
    
    # remove filler words
    for token in tknz_review:
        if token not in eng_stopwords:
            clean_tokens.append(token)
            
    # put sentence back together with remaining clean words
    clean_review = ' '.join(clean_tokens)
    
    # get the polarity scores dictionary
    sid_rev = sid.polarity_scores(clean_review)
    
    # get sentiment polarity from the "compound" key in the sid_rev dictionary
    r_comp = sid_rev['compound']
    
    # return the sentiment value
    return r_comp

In [55]:
# creat a new column to hold sentiment value from function


df['review_sentiment'] = df['Review Text'].apply(reviewSentiment)

In [56]:
# Verify sentiment values in new column

df.head()

Unnamed: 0.1,Unnamed: 0,Clothing ID,Age,Title,Review Text,Rating,Recommended IND,Positive Feedback Count,Division Name,Department Name,Class Name,review_sentiment
2,2,1077,60,Some major design flaws,I had such high hopes for this dress and reall...,3,0,0,General,Dresses,Dresses,0.9062
3,3,1049,50,My favorite buy!,"I love, love, love this jumpsuit. it's fun, fl...",5,1,0,General Petite,Bottoms,Pants,0.9464
4,4,847,47,Flattering shirt,This shirt is very flattering to all due to th...,5,1,6,General,Tops,Blouses,0.9117
5,5,1080,49,Not for the very petite,"I love tracy reese dresses, but this one is no...",2,0,4,General,Dresses,Dresses,0.9153
6,6,858,39,Cagrcoal shimmer fun,I aded this in my basket at hte last mintue to...,5,1,1,General Petite,Tops,Knits,0.6361
