<a href="https://colab.research.google.com/github/amkayhani/DSML24/blob/main/3_4_Natural_Language_Processing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Natural Language Processing

In [1]:
# import packages and stopwords list
import pandas as pd

import nltk
nltk.download('stopwords')

from nltk.corpus import stopwords

# use English stopwords
stops = set(stopwords.words("english"))

# read the data - replace with your own data
df = pd.read_csv("https://raw.githubusercontent.com/amkayhani/DSML24/main/reviews.csv")

# drop data with missing values in the 'Review' column
df = df.dropna(axis=0, subset=['Review'])
df.head()

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,Clothing ID,Age,Title,Review,Rating,Recommended,PositiveFeedbackCount,Division,Department,Class,Date
1,1,50,Love this under-all cami tank ?,Originally i bought this in black and white. r...,5,1,0,Initmates,Intimate,Layering,07/06/2018
2,1,36,Staple tank!,Love this tank. material and fit are great. lo...,5,1,0,Initmates,Intimate,Layering,01/09/2018
3,1,24,Love but also very annoying,"I love this tank, it is so comfortable but it ...",2,0,1,Initmates,Intimate,Layering,30/06/2018
4,2,28,"Gorgeous top, straps way too long",I just adore this top! it is so comfy and styl...,4,1,0,General,Tops,Knits,18/07/2018
5,3,36,,I love this sweater. definite classic. i get l...,5,1,0,General,Tops,Sweaters,01/07/2018


In [2]:
# convert the relevant column to lowercase
df['Review'] = df.Review.str.lower()
df.head()

Unnamed: 0,Clothing ID,Age,Title,Review,Rating,Recommended,PositiveFeedbackCount,Division,Department,Class,Date
1,1,50,Love this under-all cami tank ?,originally i bought this in black and white. r...,5,1,0,Initmates,Intimate,Layering,07/06/2018
2,1,36,Staple tank!,love this tank. material and fit are great. lo...,5,1,0,Initmates,Intimate,Layering,01/09/2018
3,1,24,Love but also very annoying,"i love this tank, it is so comfortable but it ...",2,0,1,Initmates,Intimate,Layering,30/06/2018
4,2,28,"Gorgeous top, straps way too long",i just adore this top! it is so comfy and styl...,4,1,0,General,Tops,Knits,18/07/2018
5,3,36,,i love this sweater. definite classic. i get l...,5,1,0,General,Tops,Sweaters,01/07/2018


In [3]:
# create a list of the reviews from the 'Reviews' column
words = df['Review'].tolist()

# tokenise the words
nltk.download('punkt')
from nltk.tokenize import word_tokenize

word_tokens = []
for review in words:
    word_tokens.append(word_tokenize(review))

# create a new list with stop words removed
filtered_reviews = []
for review in word_tokens:
    filtered_reviews.append([w for w in review if not w in stops])

# add the tokens to the dataframe
df['tokens'] = filtered_reviews
df.head()

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Unnamed: 0,Clothing ID,Age,Title,Review,Rating,Recommended,PositiveFeedbackCount,Division,Department,Class,Date,tokens
1,1,50,Love this under-all cami tank ?,originally i bought this in black and white. r...,5,1,0,Initmates,Intimate,Layering,07/06/2018,"[originally, bought, black, white, ., recently..."
2,1,36,Staple tank!,love this tank. material and fit are great. lo...,5,1,0,Initmates,Intimate,Layering,01/09/2018,"[love, tank, ., material, fit, great, ., love,..."
3,1,24,Love but also very annoying,"i love this tank, it is so comfortable but it ...",2,0,1,Initmates,Intimate,Layering,30/06/2018,"[love, tank, ,, comfortable, flaws, ., rolls, ..."
4,2,28,"Gorgeous top, straps way too long",i just adore this top! it is so comfy and styl...,4,1,0,General,Tops,Knits,18/07/2018,"[adore, top, !, comfy, stylish, ., wear, littl..."
5,3,36,,i love this sweater. definite classic. i get l...,5,1,0,General,Tops,Sweaters,01/07/2018,"[love, sweater, ., definite, classic, ., get, ..."


In [4]:
# stem the tokens
from nltk.stem import PorterStemmer
ps = PorterStemmer()

stemmed = []
for review in filtered_reviews:
    stemmed.append([ps.stem(w) for w in review])

# add to the dataframe
df['tokens'] = stemmed
df.head()

Unnamed: 0,Clothing ID,Age,Title,Review,Rating,Recommended,PositiveFeedbackCount,Division,Department,Class,Date,tokens
1,1,50,Love this under-all cami tank ?,originally i bought this in black and white. r...,5,1,0,Initmates,Intimate,Layering,07/06/2018,"[origin, bought, black, white, ., recent, purc..."
2,1,36,Staple tank!,love this tank. material and fit are great. lo...,5,1,0,Initmates,Intimate,Layering,01/09/2018,"[love, tank, ., materi, fit, great, ., love, '..."
3,1,24,Love but also very annoying,"i love this tank, it is so comfortable but it ...",2,0,1,Initmates,Intimate,Layering,30/06/2018,"[love, tank, ,, comfort, flaw, ., roll, everi,..."
4,2,28,"Gorgeous top, straps way too long",i just adore this top! it is so comfy and styl...,4,1,0,General,Tops,Knits,18/07/2018,"[ador, top, !, comfi, stylish, ., wear, littl,..."
5,3,36,,i love this sweater. definite classic. i get l...,5,1,0,General,Tops,Sweaters,01/07/2018,"[love, sweater, ., definit, classic, ., get, l..."


In [5]:
# put the tokens back together as text
import string
rejoin = []
for review in stemmed:
    x = ",".join(review) # join the text back together
    x = x.replace(",", " ") # replace commas with spaces
    # remove punctuation from the reviews using the string package
    rejoin.append(x.translate(str.maketrans('', '', string.punctuation)))

# add the reformed text to the data frame
df['filtered_review'] = rejoin
df.head()

Unnamed: 0,Clothing ID,Age,Title,Review,Rating,Recommended,PositiveFeedbackCount,Division,Department,Class,Date,tokens,filtered_review
1,1,50,Love this under-all cami tank ?,originally i bought this in black and white. r...,5,1,0,Initmates,Intimate,Layering,07/06/2018,"[origin, bought, black, white, ., recent, purc...",origin bought black white recent purchas seve...
2,1,36,Staple tank!,love this tank. material and fit are great. lo...,5,1,0,Initmates,Intimate,Layering,01/09/2018,"[love, tank, ., materi, fit, great, ., love, '...",love tank materi fit great love s revers hi...
3,1,24,Love but also very annoying,"i love this tank, it is so comfortable but it ...",2,0,1,Initmates,Intimate,Layering,30/06/2018,"[love, tank, ,, comfort, flaw, ., roll, everi,...",love tank comfort flaw roll everi time walk...
4,2,28,"Gorgeous top, straps way too long",i just adore this top! it is so comfy and styl...,4,1,0,General,Tops,Knits,18/07/2018,"[ador, top, !, comfi, stylish, ., wear, littl,...",ador top comfi stylish wear littl purpl card...
5,3,36,,i love this sweater. definite classic. i get l...,5,1,0,General,Tops,Sweaters,01/07/2018,"[love, sweater, ., definit, classic, ., get, l...",love sweater definit classic get lot complim...


In [6]:
nltk.download('tagsets')
print(nltk.help.upenn_tagset())

nltk.download('averaged_perceptron_tagger')
from nltk import pos_tag
tags = pos_tag(word_tokens[0]) # POS tag the first review (before filtering)
tags

$: dollar
    $ -$ --$ A$ C$ HK$ M$ NZ$ S$ U.S.$ US$
'': closing quotation mark
    ' ''
(: opening parenthesis
    ( [ {
): closing parenthesis
    ) ] }
,: comma
    ,
--: dash
    --
.: sentence terminator
    . ! ?
:: colon or ellipsis
    : ; ...
CC: conjunction, coordinating
    & 'n and both but either et for less minus neither nor or plus so
    therefore times v. versus vs. whether yet
CD: numeral, cardinal
    mid-1890 nine-thirty forty-two one-tenth ten million 0.5 one forty-
    seven 1987 twenty '79 zero two 78-degrees eighty-four IX '60s .025
    fifteen 271,124 dozen quintillion DM2,000 ...
DT: determiner
    all an another any both del each either every half la many much nary
    neither no some such that the them these this those
EX: existential there
    there
FW: foreign word
    gemeinschaft hund ich jeux habeas Haementeria Herr K'ang-si vous
    lutihaw alai je jour objets salutaris fille quibusdam pas trop Monte
    terram fiche oui corporis ...
IN: preposition or

[nltk_data] Downloading package tagsets to /root/nltk_data...
[nltk_data]   Package tagsets is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


[('originally', 'RB'),
 ('i', 'JJ'),
 ('bought', 'VBD'),
 ('this', 'DT'),
 ('in', 'IN'),
 ('black', 'JJ'),
 ('and', 'CC'),
 ('white', 'JJ'),
 ('.', '.'),
 ('recently', 'RB'),
 ('purchased', 'VBD'),
 ('several', 'JJ'),
 ('more', 'RBR'),
 ('in', 'IN'),
 ('different', 'JJ'),
 ('colorways', 'NNS'),
 ('.', '.'),
 ('why', 'WRB'),
 ('?', '.'),
 ('it', 'PRP'),
 ("'s", 'VBZ'),
 ('just', 'RB'),
 ('perfect', 'JJ'),
 ('to', 'TO'),
 ('wear', 'VB'),
 ('as', 'IN'),
 ('an', 'DT'),
 ('under', 'IN'),
 ('tank/cami', 'NN'),
 ('.', '.'),
 ('i', 'VB'),
 ("'m", 'VBP'),
 ('a', 'DT'),
 ('busty', 'JJ'),
 ('gal', 'NN'),
 ('and', 'CC'),
 ('sometimes', 'RB'),
 ('cami', 'JJ'),
 ('straps', 'NNS'),
 ('can', 'MD'),
 ('be', 'VB'),
 ('too', 'RB'),
 ('thin', 'JJ'),
 ('and', 'CC'),
 ('dig', 'NN'),
 ('into', 'IN'),
 ('my', 'PRP$'),
 ('shoulders', 'NNS'),
 ('.', '.'),
 ('this', 'DT'),
 ('is', 'VBZ'),
 ('soft', 'JJ'),
 (',', ','),
 ('smooth', 'JJ'),
 ('and', 'CC'),
 ('provides', 'VBZ'),
 ('just', 'RB'),
 ('enough', 'JJ'),
 (

In [7]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# set vectorizer - CountVectorizer for word counts and TfidfVectorizer for TF-IDF
#vectorizer = CountVectorizer()
vectorizer = TfidfVectorizer()

# create an array of word counts / TF-IDF scores
doc_vec = vectorizer.fit_transform(df.Review)

# convert this to a dataframe
df2 = pd.DataFrame(doc_vec.toarray(), columns=vectorizer.get_feature_names_out())


# join the two datasets together
df = df.join(df2, how='left')
df.head()

Unnamed: 0,Clothing ID,Age,Title,Review,Rating,Recommended,PositiveFeedbackCount,Division,Department,Class,...,zippie,zipping,zips,zombie,zone,zoolander,zoom,zooming,zuma,ã¼ber
1,1,50,Love this under-all cami tank ?,originally i bought this in black and white. r...,5,1,0,Initmates,Intimate,Layering,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1,36,Staple tank!,love this tank. material and fit are great. lo...,5,1,0,Initmates,Intimate,Layering,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1,24,Love but also very annoying,"i love this tank, it is so comfortable but it ...",2,0,1,Initmates,Intimate,Layering,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,2,28,"Gorgeous top, straps way too long",i just adore this top! it is so comfy and styl...,4,1,0,General,Tops,Knits,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,3,36,,i love this sweater. definite classic. i get l...,5,1,0,General,Tops,Sweaters,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [8]:
# Find all the bigrams
from nltk import bigrams

string = "I like PSWS and eBF but they are not as good as DSML."
bigrm = list(nltk.bigrams(string.split())) # split the sentence to words
bigrm

[('I', 'like'),
 ('like', 'PSWS'),
 ('PSWS', 'and'),
 ('and', 'eBF'),
 ('eBF', 'but'),
 ('but', 'they'),
 ('they', 'are'),
 ('are', 'not'),
 ('not', 'as'),
 ('as', 'good'),
 ('good', 'as'),
 ('as', 'DSML.')]

In [9]:
# Find the best 'best' bigrams / trigrams
from nltk.collocations import *
bigram_measures = nltk.collocations.BigramAssocMeasures()
trigram_measures = nltk.collocations.TrigramAssocMeasures()

# create a single string that combines 200 reviews
# words is the list of reviews created earlier
long_string = word_tokenize(",".join(words[0:200]))

# Bigrams
finder = BigramCollocationFinder.from_words(long_string)
# only bigrams that appear >= 2x
finder.apply_freq_filter(2)
# return the 10 bigrams with the highest likelihood
print(finder.nbest(bigram_measures.likelihood_ratio, 10))

[('.', 'i'), ('it', "'s"), ('5', "'"), ('i', "'m"), (',', 'but'), ('a', 'little'), (',', 'i'), ('they', 'are'), ('i', 'am'), ('it', 'is')]


In [10]:
# Trigrams
finder = TrigramCollocationFinder.from_words(long_string)
# only bigrams that appear >= 2x
finder.apply_freq_filter(2)
# return the 10 trigrams with the highest likelihood
print(finder.nbest(trigram_measures.likelihood_ratio, 10))

[('.', 'i', "'m"), ('.', 'i', 'am'), ('.', 'i', 'love'), ('.', 'i', 'ordered'), ('.', 'i', 'bought'), ('.', 'i', "'ve"), ('.', 'i', 'have'), ('.', 'i', 'had'), ('.', 'i', 'was'), ('.', 'i', 'got')]


In [11]:
# import packages
from textblob import TextBlob

# calculate polarity and subjectivity
df['Polarity'] = df['Review'].apply(lambda review: TextBlob(str(review)).sentiment.polarity)
df['Subjectivity'] = df['Review'].apply(lambda review: TextBlob(str(review)).sentiment.subjectivity)

# sort by polarity (low to high)
sorted_df = df.sort_values(by=['Polarity'])

# print top 5 positive and negative
print("Most positive #5 reviews ")
print(sorted_df.Review.tail())
print("\n") # print line break
print("Most negative #5 reviews ")
print(sorted_df.Review.head())

Most positive #5 reviews 
13942    perfect for lunch with the girls, pta, or satu...
5181     this is a great blouse for all shapes!   you c...
19970    this is so flowing and perfect to hide any fla...
17670    i bought this dress as a surprise for my daugh...
13495                      made my wedding outfit perfect!
Name: Review, dtype: object


Most negative #5 reviews 
8437     received this product with a gaping hole in it...
8215     cut out design, no seems or hems.\r\n very dis...
22527    awful color, horribly wrinkled and just a mess...
6938     what a disappointment and for the price, it's ...
12932    the button fell off when i took it out of the ...
Name: Review, dtype: object
