# Chapter 24 - Project: Mining product reviews

In [2]:
import pandas as pd

#### Data set

In [4]:
# Load the reviews CSV from the pythonforaccounting/workspace GitHub repository
url = "https://raw.githubusercontent.com/pythonforaccounting/workspace/refs/heads/main/P2%20-%20Working%20with%20tables/project_data/reviews.csv"
reviews_df = pd.read_csv(url)

# Preview the first five rows
reviews_df.head()

Unnamed: 0,ProductID,Product Name,Rating,Review,Date
0,K&D/WMF-26982,"WMF Manaos / Bistro Ice Cream Spoon, Set of 4",5,I love it,43660
1,K&D/CUI-82621,Cuisinart DCC-2600 Brew Central 14-Cup Program...,5,Worth the money,42600
2,K&D/HAM-06147,Hamilton Beach 22708 Toastation 2-Slice Toaste...,5,toaster/oven,43738
3,K&D/FOX-77328,"Fox Run Salt and Pepper Shaker Set, Glass",5,Nice Shaker Set.,44118
4,K&D/OST-12291,"Oster TSSTTVVG01 4-Slice Toaster Oven, Black",5,PRETTY & EASY TO USE,43758


In [5]:
reviews_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 73907 entries, 0 to 73906
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   ProductID     73907 non-null  object
 1   Product Name  73907 non-null  object
 2   Rating        73907 non-null  int64 
 3   Review        73907 non-null  object
 4   Date          73907 non-null  int64 
dtypes: int64(2), object(3)
memory usage: 2.8+ MB


### Keeping recent reviews

In [7]:
reviews_df['Date']

0        43660
1        42600
2        43738
3        44118
4        43758
         ...  
73902    42920
73903    43639
73904    43685
73905    43888
73906    43961
Name: Date, Length: 73907, dtype: int64

In [8]:
# Interpret each integer as a number of days counted from 1899-12-30
# (presumably Excel-style serial dates — confirm against the data source)
pd.to_datetime(reviews_df['Date'], unit='D', origin='1899-12-30')

0       2019-07-14
1       2016-08-18
2       2019-09-30
3       2020-10-14
4       2019-10-20
           ...    
73902   2017-07-04
73903   2019-06-23
73904   2019-08-08
73905   2020-02-27
73906   2020-05-10
Name: Date, Length: 73907, dtype: datetime64[ns]

In [9]:
# Overwrite the integer 'Date' column with the converted datetimes
# (day counts measured from 1899-12-30)
reviews_df['Date'] = pd.to_datetime(reviews_df['Date'], unit='D', origin='1899-12-30')

reviews_df.head()

Unnamed: 0,ProductID,Product Name,Rating,Review,Date
0,K&D/WMF-26982,"WMF Manaos / Bistro Ice Cream Spoon, Set of 4",5,I love it,2019-07-14
1,K&D/CUI-82621,Cuisinart DCC-2600 Brew Central 14-Cup Program...,5,Worth the money,2016-08-18
2,K&D/HAM-06147,Hamilton Beach 22708 Toastation 2-Slice Toaste...,5,toaster/oven,2019-09-30
3,K&D/FOX-77328,"Fox Run Salt and Pepper Shaker Set, Glass",5,Nice Shaker Set.,2020-10-14
4,K&D/OST-12291,"Oster TSSTTVVG01 4-Slice Toaster Oven, Black",5,PRETTY & EASY TO USE,2019-10-20


One option is to extract the year from each date, then count how many times each year appears in the data:

In [11]:
reviews_df['Date'].dt.year.value_counts().sort_index()

Date
2006        6
2007       28
2008       33
2009       86
2010       80
2011      151
2012      322
2013      794
2014     1285
2015     1722
2016     2680
2017     4483
2018     7229
2019    20279
2020    33308
2021     1421
Name: count, dtype: int64

Let’s filter `reviews_df` and keep reviews from 2020 or later only:

In [13]:
# Keep only reviews dated 2020 or later. The explicit .copy() makes the result an
# independent DataFrame, so the column assignments in later cells modify it
# directly instead of triggering pandas' SettingWithCopyWarning.
reviews_df = reviews_df[reviews_df['Date'].dt.year >= 2020].copy()

reviews_df

Unnamed: 0,ProductID,Product Name,Rating,Review,Date
3,K&D/FOX-77328,"Fox Run Salt and Pepper Shaker Set, Glass",5,Nice Shaker Set.,2020-10-14
7,K&D/PRO-00930,Progressive International GMMC-68 Microwavable...,5,Works great,2020-09-08
8,K&D/HIG-61873,Highwin P1001-8 8-Cup Doublewall Stainless Ste...,5,Loved it :),2020-12-11
9,K&D/BAR-93217,Baratza Encore - Conical Burr Coffee Grinder (...,5,Great deal,2020-04-30
12,K&D/BAM-78441,Bamboo Cutting Board Set - 3 Piece All In One ...,5,I'm oiling them now. Nice! Lavish and durable.,2020-10-15
...,...,...,...,...,...
73895,K&D/TRU-30514,"Trudeau Melamine Mixing Bowls, Set of 3",1,Broke easily,2020-11-26
73896,K&D/MOR-18621,Morning Mug (1),1,NOT GOOD,2020-07-15
73900,K&D/NIF-37634,Nifty Ice Cream Cone Cupcake Baking Rack,4,Works well,2020-02-11
73905,K&D/CUI-32446,Cuisinart CPT-142 Compact 4-Slice Toaster,2,Wildly Inaccurate Settings,2020-02-27


### Processing review text

Making text uniform typically involves the following steps:

- Converting text to lowercase only;
- Removing leading and trailing whitespace;
- Removing punctuation characters;
- Splitting text into its constituent words;
- Removing words that don’t carry meaning (e.g., “a”, “the”).

In [16]:
# Normalize the review text: lowercase it, then strip leading and trailing whitespace
reviews_df['Review'] = reviews_df['Review'].str.lower().str.strip()

reviews_df

Unnamed: 0,ProductID,Product Name,Rating,Review,Date
3,K&D/FOX-77328,"Fox Run Salt and Pepper Shaker Set, Glass",5,nice shaker set.,2020-10-14
7,K&D/PRO-00930,Progressive International GMMC-68 Microwavable...,5,works great,2020-09-08
8,K&D/HIG-61873,Highwin P1001-8 8-Cup Doublewall Stainless Ste...,5,loved it :),2020-12-11
9,K&D/BAR-93217,Baratza Encore - Conical Burr Coffee Grinder (...,5,great deal,2020-04-30
12,K&D/BAM-78441,Bamboo Cutting Board Set - 3 Piece All In One ...,5,i'm oiling them now. nice! lavish and durable.,2020-10-15
...,...,...,...,...,...
73895,K&D/TRU-30514,"Trudeau Melamine Mixing Bowls, Set of 3",1,broke easily,2020-11-26
73896,K&D/MOR-18621,Morning Mug (1),1,not good,2020-07-15
73900,K&D/NIF-37634,Nifty Ice Cream Cone Cupcake Baking Rack,4,works well,2020-02-11
73905,K&D/CUI-32446,Cuisinart CPT-142 Compact 4-Slice Toaster,2,wildly inaccurate settings,2020-02-27


In [17]:
import string

string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [18]:
def remove_punctuation(review):
    """Return `review` with every character in `string.punctuation` removed.

    All other characters, including whitespace, are kept as they are, so
    deleting punctuation between two spaces leaves both spaces in place.
    """
    # A translation table whose third argument lists the characters to delete
    strip_table = str.maketrans('', '', string.punctuation)
    return review.translate(strip_table)

In [19]:
remove_punctuation('Great quality!!! Much nicer than expected -- but expensive.')

'Great quality Much nicer than expected  but expensive'

In [20]:
# Run remove_punctuation on every review string and store the cleaned text back
reviews_df['Review'] = reviews_df['Review'].apply(remove_punctuation)

reviews_df.head()

Unnamed: 0,ProductID,Product Name,Rating,Review,Date
3,K&D/FOX-77328,"Fox Run Salt and Pepper Shaker Set, Glass",5,nice shaker set,2020-10-14
7,K&D/PRO-00930,Progressive International GMMC-68 Microwavable...,5,works great,2020-09-08
8,K&D/HIG-61873,Highwin P1001-8 8-Cup Doublewall Stainless Ste...,5,loved it,2020-12-11
9,K&D/BAR-93217,Baratza Encore - Conical Burr Coffee Grinder (...,5,great deal,2020-04-30
12,K&D/BAM-78441,Bamboo Cutting Board Set - 3 Piece All In One ...,5,im oiling them now nice lavish and durable,2020-10-15


Now, we need to split each review into a list of words. 

In [22]:
# With no arguments, .str.split() breaks each review on runs of whitespace,
# producing one list of words per row
reviews_df['Review Words'] = reviews_df['Review'].str.split()

reviews_df[['Review', 'Review Words']]

Unnamed: 0,Review,Review Words
3,nice shaker set,"[nice, shaker, set]"
7,works great,"[works, great]"
8,loved it,"[loved, it]"
9,great deal,"[great, deal]"
12,im oiling them now nice lavish and durable,"[im, oiling, them, now, nice, lavish, and, dur..."
...,...,...
73895,broke easily,"[broke, easily]"
73896,not good,"[not, good]"
73900,works well,"[works, well]"
73905,wildly inaccurate settings,"[wildly, inaccurate, settings]"


In [23]:
# Load the stopword file (no header row assumed), collapse the single-column
# DataFrame into a Series, and turn that into a plain Python list.
# NOTE(review): the first entry shown below is 'word' — possibly a header line
# read as data because of header=None; confirm against data/stopwords.csv.
stopwords = list(pd.read_csv('data/stopwords.csv', header=None).squeeze())

stopwords[:5]  # Show the first 5 elements

['word', 're', 'put', 'say', 'who']

In [24]:
'it' in stopwords

True

In [25]:
'product' in stopwords

False

In [26]:
def remove_stopwords(word_list):
    """Return a new list holding the items of `word_list` that are not in `stopwords`.

    `stopwords` is the module-level collection loaded earlier in the notebook;
    the order of the remaining words is preserved.
    """
    kept_words = []
    for word in word_list:
        if word in stopwords:
            continue
        kept_words.append(word)
    return kept_words

In [27]:
remove_stopwords(['imperfect', 'but', 'so', 'handy'])

['imperfect', 'handy']

In [28]:
reviews_df[['Review', 'Review Words']]

Unnamed: 0,Review,Review Words
3,nice shaker set,"[nice, shaker, set]"
7,works great,"[works, great]"
8,loved it,"[loved, it]"
9,great deal,"[great, deal]"
12,im oiling them now nice lavish and durable,"[im, oiling, them, now, nice, lavish, and, dur..."
...,...,...
73895,broke easily,"[broke, easily]"
73896,not good,"[not, good]"
73900,works well,"[works, well]"
73905,wildly inaccurate settings,"[wildly, inaccurate, settings]"


In [29]:
# Filter each review's word list through remove_stopwords and store the result back in the column
reviews_df['Review Words'] = reviews_df['Review Words'].apply(remove_stopwords)

reviews_df[['Review', 'Review Words']]

Unnamed: 0,Review,Review Words
3,nice shaker set,"[nice, shaker, set]"
7,works great,"[works, great]"
8,loved it,[loved]
9,great deal,"[great, deal]"
12,im oiling them now nice lavish and durable,"[oiling, nice, lavish, durable]"
...,...,...
73895,broke easily,"[broke, easily]"
73896,not good,[good]
73900,works well,[works]
73905,wildly inaccurate settings,"[wildly, inaccurate, settings]"


### Counting words

In [31]:
reviews_df.head()

Unnamed: 0,ProductID,Product Name,Rating,Review,Date,Review Words
3,K&D/FOX-77328,"Fox Run Salt and Pepper Shaker Set, Glass",5,nice shaker set,2020-10-14,"[nice, shaker, set]"
7,K&D/PRO-00930,Progressive International GMMC-68 Microwavable...,5,works great,2020-09-08,"[works, great]"
8,K&D/HIG-61873,Highwin P1001-8 8-Cup Doublewall Stainless Ste...,5,loved it,2020-12-11,[loved]
9,K&D/BAR-93217,Baratza Encore - Conical Burr Coffee Grinder (...,5,great deal,2020-04-30,"[great, deal]"
12,K&D/BAM-78441,Bamboo Cutting Board Set - 3 Piece All In One ...,5,im oiling them now nice lavish and durable,2020-10-15,"[oiling, nice, lavish, durable]"


In [32]:
# Split reviews by rating extreme: 1-star reviews ("haters") and 5-star reviews ("lovers")
haters_reviews_df = reviews_df.loc[reviews_df['Rating'].eq(1)]
lovers_reviews_df = reviews_df.loc[reviews_df['Rating'].eq(5)]

haters_reviews_df.head()

Unnamed: 0,ProductID,Product Name,Rating,Review,Date,Review Words
28320,K&D/AER-36021,Aero Knife,1,dont fall for it,2020-06-13,[fall]
28340,K&D/BLA-83109,Black &amp; Decker 3-in-1 Waffle Maker &amp; I...,1,not what they used to be,2020-07-07,[]
28345,K&D/BAL-09899,Ball Dissolvable Labels - (Set Of 60) (by Jar...,1,look great but dont seal down well on jar,2020-01-01,"[look, great, seal, jar]"
28358,K&D/OST-79825,Oster 2500 Inspire 240-Watt 5-Speed Hand Mixer...,1,didnt work the first time i used it,2020-07-11,"[didnt, work, time]"
28368,K&D/KOL-54568,"Kolder Original Mini Measure, Multi-Purpose Me...",1,markings quickly disappear,2020-07-01,"[markings, quickly, disappear]"


In [33]:
# Flatten the per-review word lists of the 1-star reviews into one Series of words.
# A single pass over the lists avoids the repeated list concatenation done by
# Series.sum(), which copies the accumulated list again for every review, and it
# yields an empty Series (rather than the scalar 0) when no reviews are selected.
negative_words = pd.Series(
    [word for words in haters_reviews_df['Review Words'] for word in words]
)

# Same flattening for the 5-star reviews
positive_words = pd.Series(
    [word for words in lovers_reviews_df['Review Words'] for word in words]
)

negative_words.head(10)

0        fall
1        look
2       great
3        seal
4         jar
5       didnt
6        work
7        time
8    markings
9     quickly
dtype: object

In [34]:
# Count the frequency of each unique word in the 'negative_words' series
negative_words.value_counts()

money           96
work            91
buy             85
junk            83
use             82
                ..
human            1
loses            1
cannister        1
compartment      1
terribledont     1
Name: count, Length: 2043, dtype: int64

In [35]:
# Count the frequency of each unique word in the 'positive_words' series
positive_words.value_counts()

great         2721
love          1091
good           727
works          725
perfect        703
              ... 
fingerhand       1
cubed            1
mothera          1
jarjar           1
cutest           1
Name: count, Length: 4787, dtype: int64

For instance, you can try to find out what’s so 'great' about the
products that people review. To do that, you can use a simple
regular expression and extract all words that follow 'great' in
positive reviews.

In [37]:
# Regex with one capture group: the word 'great', a space, and whatever follows it
positive_pattern = '(great .*)'

# Pull the captured phrase out of each 5-star review and tally identical phrases
(
    lovers_reviews_df['Review']
    .str.extract(positive_pattern)
    .value_counts()
)

0                                                
great product                                        159
great quality                                         30
great value                                           29
great price                                           25
great gift                                            23
                                                    ... 
great for pizza                                        1
great for personal use or too entertain or a gift      1
great for people with arthritis                        1
great for parties                                      1
great zucchini noodles                                 1
Name: count, Length: 1352, dtype: int64

In [38]:
# Regex with one capture group: the word 'poor', a space, and whatever follows it
negative_pattern = '(poor .*)'

# Pull the captured phrase out of each 1-star review and tally identical phrases
(
    haters_reviews_df['Review']
    .str.extract(negative_pattern)
    .value_counts()
)

0                                                          
poor quality                                                   22
poor design                                                     4
poor product                                                    3
poor and dangerous workmanship                                  1
poor quality not what i expected                                1
poor quality casting                                            1
poor quality control                                            1
poor quality control warped lid marked lid surface              1
poor quality do not buy                                         1
poor quality dont buy                                           1
poor quality like all kitchenaid products  i have purchased     1
poor quality no longevity                                       1
poor quality poor materials poor design                         1
poor quality baskets                                            1
poor quality pro