# Analysis of Reviews on Olist

🎯 Now that you are familiar with NLP, let's analyze the reviews of Olist.

👇 Run the following cell to load the reviews dataset and install `unidecode`

In [216]:
#!pip install -q unidecode

import pandas as pd

url = "https://wagon-public-datasets.s3.amazonaws.com/Machine%20Learning%20Datasets/ml_olist_nlp_reviews.csv"
df = pd.read_csv(url, low_memory = False)

df.head()

Unnamed: 0.1,Unnamed: 0,review_id,length_review,review_score,order_id,product_category_name,review_comment_title,review_comment_message,review_creation_date,review_answer_timestamp,customer_id,order_status,order_purchase_timestamp,order_approved_at,order_delivered_carrier_date,order_delivered_customer_date,order_estimated_delivery_date
0,0,7bc2406110b926393aa56f80a40eba40,0,4,73fc7af87114b39712e6da79b0a377eb,esporte_lazer,,,2018-01-18 00:00:00,2018-01-18 21:46:59,41dcb106f807e993532d446263290104,delivered,2018-01-11 15:30:49,2018-01-11 15:47:59,2018-01-12 21:57:22,2018-01-17 18:42:41,2018-02-02 00:00:00
1,1,80e641a11e56f04c1ad469d5645fdfde,0,5,a548910a1c6147796b98fdf73dbeba33,informatica_acessorios,,,2018-03-10 00:00:00,2018-03-11 03:05:13,8a2e7ef9053dea531e4dc76bd6d853e6,delivered,2018-02-28 12:25:19,2018-02-28 12:48:39,2018-03-02 19:08:15,2018-03-09 23:17:20,2018-03-14 00:00:00
2,2,228ce5500dc1d8e020d8d1322874b6f0,0,5,f9e4b658b201a9f2ecdecbb34bed034b,informatica_acessorios,,,2018-02-17 00:00:00,2018-02-18 14:36:24,e226dfed6544df5b7b87a48208690feb,delivered,2018-02-03 09:56:22,2018-02-03 10:33:41,2018-02-06 16:18:28,2018-02-16 17:28:48,2018-03-09 00:00:00
3,3,e64fb393e7b32834bb789ff8bb30750e,37,5,658677c97b385a9be170737859d3511b,ferramentas_jardim,,Recebi bem antes do prazo estipulado.,2017-04-21 00:00:00,2017-04-21 22:02:06,de6dff97e5f1ba84a3cd9a3bc97df5f6,delivered,2017-04-09 17:41:13,2017-04-09 17:55:19,2017-04-10 14:24:47,2017-04-20 09:08:35,2017-05-10 00:00:00
4,4,f7c4243c7fe1938f181bec41a392bdeb,100,5,8e6bfb81e283fa7e4f11123a3fb894f1,esporte_lazer,,Parabéns lojas lannister adorei comprar pela I...,2018-03-01 00:00:00,2018-03-02 10:26:53,5986b333ca0d44534a156a52a8e33a83,delivered,2018-02-10 10:59:03,2018-02-10 15:48:21,2018-02-15 19:36:14,2018-02-28 16:33:35,2018-03-09 00:00:00


In [217]:
df.shape

(98657, 17)

In [218]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 98657 entries, 0 to 98656
Data columns (total 17 columns):
 #   Column                         Non-Null Count  Dtype 
---  ------                         --------------  ----- 
 0   Unnamed: 0                     98657 non-null  int64 
 1   review_id                      98657 non-null  object
 2   length_review                  98657 non-null  int64 
 3   review_score                   98657 non-null  int64 
 4   order_id                       98657 non-null  object
 5   product_category_name          98657 non-null  object
 6   review_comment_title           11486 non-null  object
 7   review_comment_message         40439 non-null  object
 8   review_creation_date           98657 non-null  object
 9   review_answer_timestamp        98657 non-null  object
 10  customer_id                    98657 non-null  object
 11  order_status                   98657 non-null  object
 12  order_purchase_timestamp       98657 non-null  object
 13  o

In [219]:
df.isnull().sum() 

Unnamed: 0                           0
review_id                            0
length_review                        0
review_score                         0
order_id                             0
product_category_name                0
review_comment_title             87171
review_comment_message           58218
review_creation_date                 0
review_answer_timestamp              0
customer_id                          0
order_status                         0
order_purchase_timestamp             0
order_approved_at                   13
order_delivered_carrier_date       985
order_delivered_customer_date     2098
order_estimated_delivery_date        0
dtype: int64

❓ **Question: Analyse the reviews to understand what could be the causes of the bad review scores** ❓

This challenge is not as guided as the previous ones. But here are some questions to ask yourself:

- Are all the reviews relevant ? 
- What about combining the title and the body of a review ?
- What cleaning operations would you apply to the reviews ?

🇧🇷 Some Brazilian expressions and their translations:

- `producto errado` = wrong product
- `ainda nao` = not yet
- `nao entregue` = not delivered
- `nao veio` = did not come
- `nao gostei` = did not like it
- `produto defeito` = defective product
- `nao functiona` = not working
- `produto diferente` = different product
- `pessima qualidade` = poor quality
- `veio defeito` = came defect
- `veio faltando` = came missing
- `veio errado` = came wrong

In [220]:
df_negative = df[df['review_score'] < 3]
df_negative['combined_review'] = df_negative['review_comment_title'].fillna('') + " " + df_negative['review_comment_message'].fillna('')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_negative['combined_review'] = df_negative['review_comment_title'].fillna('') + " " + df_negative['review_comment_message'].fillna('')


In [225]:
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import string

In [226]:
def cleaning(sentence):
    sentence = sentence.strip()
    sentence = sentence.lower()
    sentence = ''.join(char for char in sentence if not char.isdigit())
    sentence = ''.join(char for char in sentence if char not in string.punctuation)
    tokenized_sentence = word_tokenize(sentence)
    lemmatized = [WordNetLemmatizer().lemmatize(word, pos='v') for word in tokenized_sentence]
    return ' '.join(lemmatized)
df_negative['cleaned_review'] = df_negative['combined_review'].apply(clean_text)
df_negative[['review_score', 'cleaned_review']].head()

NameError: name 'clean_text' is not defined

In [227]:
df_negative['processed_review'] = df_negative['cleaned_review'].apply(cleaning)
df

KeyError: 'cleaned_review'

In [176]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import GridSearchCV, train_test_split

In [177]:
X = df['cleaned_review']
y = df['review_score']

# Crear el Pipeline
pipeline = Pipeline([
    ('vectorizer', TfidfVectorizer()),
    ('naive_bayes', MultinomialNB())
])

param_grid = {
    'vectorizer__ngram_range': [(1, 1), (1, 2)],
    'vectorizer__max_df': [0.9, 1.0],
    'vectorizer__min_df': [1, 2],
    'naive_bayes__alpha': [0.1, 1.0]
}


grid_search = GridSearchCV(
    pipeline,
    param_grid,
    scoring='recall',
    cv=3,
    n_jobs=-1
)

In [179]:
print(y.value_counts())

1    4
2    1
Name: review_score, dtype: int64


In [178]:
grid_search.fit(X, y)



In [180]:
grid_search.best_params_

{'naive_bayes__alpha': 0.1,
 'vectorizer__max_df': 0.9,
 'vectorizer__min_df': 1,
 'vectorizer__ngram_range': (1, 1)}

In [181]:
grid_search.best_score_

1.0

In [182]:
grid_search.best_estimator_

from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(max_df=0.9, min_df=1, max_features=1000)
df_v = vectorizer.fit_transform(df['full_review'])
df_v = pd.DataFrame(
    df_v.toarray(),
    columns=vectorizer.get_feature_names_out()
)
df_v

from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix

X = df_v
y = df['review_score'] 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = MultinomialNB()
model.fit(X_train, y_train)

predictions = model.predict(X_test)

print("Reporte de clasificación:")
print(classification_report(y_test, predictions))

print("Matriz de confusión:")
print(confusion_matrix(y_test, predictions))

🏁 Congratulations. Instead of reading 90K+ reviews, you were able to detect the main reasons of dissatisfactions on Olist.

💾 Don't forget to `git add/commit/push`