In [50]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import nltk
nltk.download('punkt')
import unicodedata
from collections import Counter

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/jeonsubeen/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [51]:
data = pd.read_csv('processed_data.csv',encoding ='utf-8')
data.head()

Unnamed: 0,order_id,customer_id,order_status,order_purchase_timestamp,order_approved_at,order_delivered_carrier_date,order_delivered_customer_date,order_estimated_delivery_date,payment_sequential,payment_type,...,price,freight_value,product_category_name,product_name_lenght,product_description_lenght,product_photos_qty,product_weight_g,product_length_cm,product_height_cm,product_width_cm
0,e481f51cbdc54678b7cc49136f2d6af7,9ef432eb6251297304e76186b10a928d,delivered,2017-10-02 10:56:33,2017-10-02 11:07:15,2017-10-04 19:55:00,2017-10-10 21:25:13,2017-10-18 00:00:00,1,credit_card,...,29.99,8.72,utilidades_domesticas,40.0,268.0,4.0,500.0,19.0,8.0,13.0
1,53cdb2fc8bc7dce0b6741e2150273451,b0830fb4747a6c6d20dea0b8c802d7ef,delivered,2018-07-24 20:41:37,2018-07-26 03:24:27,2018-07-26 14:31:00,2018-08-07 15:27:45,2018-08-13 00:00:00,1,boleto,...,118.7,22.76,perfumaria,29.0,178.0,1.0,400.0,19.0,13.0,19.0
2,47770eb9100c2d0c44946d9cf07ec65d,41ce2a54c0b03bf3443c3d931a367089,delivered,2018-08-08 08:38:49,2018-08-08 08:55:23,2018-08-08 13:50:00,2018-08-17 18:06:29,2018-09-04 00:00:00,1,credit_card,...,159.9,19.22,automotivo,46.0,232.0,1.0,420.0,24.0,19.0,21.0
3,949d5b44dbf5de918fe9c16f97b45f8a,f88197465ea7920adcdbec7375364d82,delivered,2017-11-18 19:28:06,2017-11-18 19:45:59,2017-11-22 13:39:59,2017-12-02 00:28:42,2017-12-15 00:00:00,1,credit_card,...,45.0,27.2,pet_shop,59.0,468.0,3.0,450.0,30.0,10.0,20.0
4,ad21c59c0840e6cb83a9ceb5573f8159,8ab97904e6daea8866dbdbc4fb7aad2c,delivered,2018-02-13 21:18:39,2018-02-13 22:20:29,2018-02-14 19:46:34,2018-02-16 18:17:02,2018-02-26 00:00:00,1,credit_card,...,19.9,8.72,papelaria,38.0,316.0,4.0,250.0,51.0,15.0,15.0


In [52]:
data[data['review_comment_message'] == 'O cliente não comentou'].shape

(0, 36)

In portuguese, O cliente não comentou means the customer didn't comment, so let's try to make new dataframe which only consists commented orders.

In [53]:
data['comment_present'] = data.review_comment_message != 'O cliente não comentou'
orders_commented = data[data['comment_present'] == True]
orders_commented = orders_commented[(orders_commented['review_score'] == 1) | (orders_commented['review_score'] == 5)]

In [54]:
orders_commented.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 66190 entries, 2 to 95977
Data columns (total 37 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   order_id                       66190 non-null  object 
 1   customer_id                    66190 non-null  object 
 2   order_status                   66190 non-null  object 
 3   order_purchase_timestamp       66190 non-null  object 
 4   order_approved_at              66190 non-null  object 
 5   order_delivered_carrier_date   66190 non-null  object 
 6   order_delivered_customer_date  66190 non-null  object 
 7   order_estimated_delivery_date  66190 non-null  object 
 8   payment_sequential             66190 non-null  int64  
 9   payment_type                   66190 non-null  object 
 10  payment_installments           66190 non-null  int64  
 11  payment_value                  66190 non-null  float64
 12  customer_unique_id             66190 non-null 

In [55]:
orders_commented = orders_commented.astype({"order_status" : 'category', \
                       "order_purchase_timestamp" : 'datetime64[ns]', "order_approved_at" : 'datetime64[ns]', \
                       "order_delivered_carrier_date" : 'datetime64[ns]', "order_delivered_customer_date" : 'datetime64[ns]', \
                       "order_estimated_delivery_date" : 'datetime64[ns]', 'review_creation_date':'datetime64[ns]', \
                        "review_answer_timestamp" : 'datetime64[ns]', "shipping_limit_date" : 'datetime64[ns]', \
                        "product_category_name" : 'category', 
                       })

Now, we want to analyse the review comment

In [56]:
# Source
#https://stackoverflow.com/questions/14682397/how-does-unicodedata-normalizeform-unistr-work
#https://en.wikipedia.org/wiki/Unicode_equivalence
def change_str(sentence):
    return str(sentence)

def normalize_form(sentence):
    return unicodedata.normalize('NFKD', sentence).encode('ascii', errors='ignore').decode('utf-8')

stopw = set(normalize_form(w) for w in nltk.corpus.stopwords.words('portuguese'))
stopw.remove('nao') #useful to understand sentiment


def sentence_to_words(sentence):
    normalized = normalize_form(sentence.lower())
    tokens = nltk.tokenize.word_tokenize(normalized)
    words = tuple(t for t in tokens if t not in stopw and t.isalpha())
    return words

def words_to_ngrams(words):
    trigrams = []
    for w in words:
        trigrams.extend(' '.join(trigram) for trigram in nltk.trigrams(w))
    
    return trigrams

In [57]:
orders_commented['review_comment_message']= orders_commented['review_comment_message'].apply(change_str)
orders_commented['review_comment_words'] = orders_commented['review_comment_message'].apply(sentence_to_words)

positive_reviews = orders_commented[orders_commented['review_score'] == 5]
negative_reviews = orders_commented[orders_commented['review_score'] == 1]

positive_trigrams = words_to_ngrams(positive_reviews['review_comment_words'])
negative_trigrams = words_to_ngrams(negative_reviews['review_comment_words'])

In [58]:
top20_positive = dict(Counter(positive_trigrams).most_common(20))
top20_negative = dict(Counter(negative_trigrams).most_common(20))


trigrams = pd.DataFrame(list(zip(top20_positive.keys(), top20_positive.values(), \
                                top20_negative.keys(),top20_negative.values())), \
                                columns =['positive_trigram','positive_count', 'negative_trigram', 'negative_count'])

In [59]:
trigrams

Unnamed: 0,positive_trigram,positive_count,negative_trigram,negative_count
0,chegou antes prazo,915,nao recebi produto,642
1,bem antes prazo,585,ainda nao recebi,313
2,entregue antes prazo,513,produto nao entregue,175
3,entrega antes prazo,368,produto nao chegou,113
4,produto chegou antes,356,produto ainda nao,111
5,chegou bem antes,346,ainda nao chegou,87
6,produto entregue antes,309,momento nao recebi,81
7,entrega super rapida,289,ainda nao entregue,75
8,produto otima qualidade,251,agora nao recebi,71
9,antes prazo previsto,226,produto pessima qualidade,53


try to change to english?

In [87]:
from google_trans_new import google_translator  
  
translator = google_translator()  

for i in tqdm(range(trigrams.shape[0])) :
    trigrams.iloc[i,0] = translator.translate(trigrams.iloc[i,0], lang_tgt='en')
    trigrams.iloc[i,2] = translator.translate(trigrams.iloc[i,2], lang_tgt='en')
    
    
trigrams

100%|██████████| 20/20 [00:02<00:00,  9.19it/s]


Unnamed: 0,positive_trigram,positive_count,negative_trigram,negative_count
0,arrived before deadline,915,I did not receive the product,642
1,well-term,585,I still have not received,313
2,delivered before time,513,product not delivered,175
3,Delivery before time,368,product has not arrived,113
4,product has come before,356,product not yet,111
5,arrived well before,346,it did not arrive yet,87
6,product delivered before,309,moment I did not receive,81
7,Super fast delivery,289,not delivered,75
8,product otima quality,251,now I did not receive,71
9,before expected,226,Product bad quality,53
