In [16]:
import pandas as pd

orders_reviews_df = pd.read_csv("data/olist_order_reviews_dataset.csv")

orders_reviews_df.head()

Unnamed: 0,review_id,order_id,review_score,review_comment_title,review_comment_message,review_creation_date,review_answer_timestamp
0,7bc2406110b926393aa56f80a40eba40,73fc7af87114b39712e6da79b0a377eb,4,,,2018-01-18 00:00:00,2018-01-18 21:46:59
1,80e641a11e56f04c1ad469d5645fdfde,a548910a1c6147796b98fdf73dbeba33,5,,,2018-03-10 00:00:00,2018-03-11 03:05:13
2,228ce5500dc1d8e020d8d1322874b6f0,f9e4b658b201a9f2ecdecbb34bed034b,5,,,2018-02-17 00:00:00,2018-02-18 14:36:24
3,e64fb393e7b32834bb789ff8bb30750e,658677c97b385a9be170737859d3511b,5,,Recebi bem antes do prazo estipulado.,2017-04-21 00:00:00,2017-04-21 22:02:06
4,f7c4243c7fe1938f181bec41a392bdeb,8e6bfb81e283fa7e4f11123a3fb894f1,5,,Parabéns lojas lannister adorei comprar pela I...,2018-03-01 00:00:00,2018-03-02 10:26:53


In [17]:

print(orders_reviews_df.shape) #check row and column
print('\n')
print(orders_reviews_df.info()) #check dataframe information number of entries, to col names
print('\n')
print(orders_reviews_df.describe()) #check dataframe statitical summary

(100000, 7)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 7 columns):
 #   Column                   Non-Null Count   Dtype 
---  ------                   --------------   ----- 
 0   review_id                100000 non-null  object
 1   order_id                 100000 non-null  object
 2   review_score             100000 non-null  int64 
 3   review_comment_title     11715 non-null   object
 4   review_comment_message   41753 non-null   object
 5   review_creation_date     100000 non-null  object
 6   review_answer_timestamp  100000 non-null  object
dtypes: int64(1), object(6)
memory usage: 5.3+ MB
None


        review_score
count  100000.000000
mean        4.070890
std         1.359663
min         1.000000
25%         4.000000
50%         5.000000
75%         5.000000
max         5.000000


### Amending the review_creation_date and review_answer_timestamp to datetime format

In [18]:
# amending to date time for column review_creation_date
orders_reviews_df['review_creation_date']=pd.to_datetime(orders_reviews_df['review_creation_date'])

# amending to date time for column review_answer_timestamp
orders_reviews_df['review_answer_timestamp']=pd.to_datetime(orders_reviews_df['review_answer_timestamp'])

print(orders_reviews_df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 7 columns):
 #   Column                   Non-Null Count   Dtype         
---  ------                   --------------   -----         
 0   review_id                100000 non-null  object        
 1   order_id                 100000 non-null  object        
 2   review_score             100000 non-null  int64         
 3   review_comment_title     11715 non-null   object        
 4   review_comment_message   41753 non-null   object        
 5   review_creation_date     100000 non-null  datetime64[ns]
 6   review_answer_timestamp  100000 non-null  datetime64[ns]
dtypes: datetime64[ns](2), int64(1), object(4)
memory usage: 5.3+ MB
None


## Initial Discovery
- Total number of rows is 100,000
- No null values found except for the review_comment_title and review_comment_message, which is mostly the comments provided. Will need to scrutinise this 2 columns in later stages with text analysis
- Average rating from users is 4.07 from the ~ 100,000 reviews
- Creation date is the date the review was created
- Assumption: The Answer date timestamp is the reply by the seller can be analysed on how fast they respond
- There is duplicated review_id need to drop if all columns the same as there are 99173 unique review_id. Dropping duplicated rows

### Creating a function called translate_text

In [None]:
from deep_translator import GoogleTranslator
import pandas as pd
import numpy as np

# Function to translate text
def translate_text(text, dest_lang='en'):
    '''
    Function to translate text if it's not null, otherwise return the original text
    parameters: take in text to translate
    destinantion will be to english
    '''
    if pd.notnull(text):
        translation = GoogleTranslator(source='pt', target=dest_lang).translate(text)
        return translation
    else:
        return text   

    
    # # If the input is a list, concatenate the items into a single string
    # if isinstance(text, list):
    #     text = ' '.join(text)
    # return GoogleTranslator(source='pt', target=dest_lang).translate(text)


### Cleaning the dataframe column from special characters like \r \n

In [None]:
import re

# Function to remove special characters
def remove_special_characters(text):
    if pd.notnull(text):
        cleaned_text=text.replace('\r', '').replace('\n', '')
        return cleaned_text
    else:
        return text   
    
    
# Identify special characters in the 'Text' column
orders_reviews_df['cleaned_review_comment'] = orders_reviews_df['review_comment_message'].apply(remove_special_characters)

orders_reviews_df.head()

Unnamed: 0,review_id,order_id,review_score,review_comment_title,review_comment_message,review_creation_date,review_answer_timestamp,cleaned_review_comment
0,7bc2406110b926393aa56f80a40eba40,73fc7af87114b39712e6da79b0a377eb,4,,,2018-01-18,2018-01-18 21:46:59,
1,80e641a11e56f04c1ad469d5645fdfde,a548910a1c6147796b98fdf73dbeba33,5,,,2018-03-10,2018-03-11 03:05:13,
2,228ce5500dc1d8e020d8d1322874b6f0,f9e4b658b201a9f2ecdecbb34bed034b,5,,,2018-02-17,2018-02-18 14:36:24,
3,e64fb393e7b32834bb789ff8bb30750e,658677c97b385a9be170737859d3511b,5,,Recebi bem antes do prazo estipulado.,2017-04-21,2017-04-21 22:02:06,Recebi bem antes do prazo estipulado.
4,f7c4243c7fe1938f181bec41a392bdeb,8e6bfb81e283fa7e4f11123a3fb894f1,5,,Parabéns lojas lannister adorei comprar pela I...,2018-03-01,2018-03-02 10:26:53,Parabéns lojas lannister adorei comprar pela I...


### Since the dataset is 41753 rows occured alot of premature termination therefore doing translation by each row

In [None]:
print(len(orders_reviews_df))

100000


### Translate from row 0 till 15900

In [None]:
import time
batch_size=15000
total_rows = len(orders_reviews_df)

for idx, row in orders_reviews_df.iterrows():
    try:
        if idx < batch_size:
            # translate the text of each row of cleaned_review_comment
            translated_comment = translate_text(row['cleaned_review_comment'])
            
            # Update the DataFrame with translated texts
            orders_reviews_df.loc[idx, 'translated_review_comment'] = translated_comment
            
            #----------------------------------   
            #translate the review_comment_title
            translated_title = translate_text(row['review_comment_title'])
            # Update the DataFrame with translated texts
            orders_reviews_df.loc[idx, 'translated_comment_title'] = translated_title
        else:
            print(f"Translation complete for {idx} rows out of {len(orders_reviews_df)}")
            # time.sleep(60) # wait for 1mins before proceeding the next 5000
            batch_size += 5000
    except IndexError:
        print("Warning: Occuring out of bounds error")
        pass

print("All Translation complete")

orders_reviews_df.head()

Translation complete for 15000 rows out of 100000
Translation complete for 20000 rows out of 100000
Translation complete for 25000 rows out of 100000
Translation complete for 30000 rows out of 100000
Translation complete for 35000 rows out of 100000
Translation complete for 40000 rows out of 100000
Translation complete for 45000 rows out of 100000
Translation complete for 50000 rows out of 100000
Translation complete for 55000 rows out of 100000
Translation complete for 60000 rows out of 100000
Translation complete for 65000 rows out of 100000
Translation complete for 70000 rows out of 100000
Translation complete for 75000 rows out of 100000
Translation complete for 80000 rows out of 100000
Translation complete for 85000 rows out of 100000
Translation complete for 90000 rows out of 100000
Translation complete for 95000 rows out of 100000
All Translation complete


Unnamed: 0,review_id,order_id,review_score,review_comment_title,review_comment_message,review_creation_date,review_answer_timestamp,cleaned_review_comment,translated_review_comment,translated_comment_title
0,7bc2406110b926393aa56f80a40eba40,73fc7af87114b39712e6da79b0a377eb,4,,,2018-01-18,2018-01-18 21:46:59,,,
1,80e641a11e56f04c1ad469d5645fdfde,a548910a1c6147796b98fdf73dbeba33,5,,,2018-03-10,2018-03-11 03:05:13,,,
2,228ce5500dc1d8e020d8d1322874b6f0,f9e4b658b201a9f2ecdecbb34bed034b,5,,,2018-02-17,2018-02-18 14:36:24,,,
3,e64fb393e7b32834bb789ff8bb30750e,658677c97b385a9be170737859d3511b,5,,Recebi bem antes do prazo estipulado.,2017-04-21,2017-04-21 22:02:06,Recebi bem antes do prazo estipulado.,I received it well before the stipulated deadl...,
4,f7c4243c7fe1938f181bec41a392bdeb,8e6bfb81e283fa7e4f11123a3fb894f1,5,,Parabéns lojas lannister adorei comprar pela I...,2018-03-01,2018-03-02 10:26:53,Parabéns lojas lannister adorei comprar pela I...,Congratulations lannister stores I loved shopp...,


### translating from row 15900 manage to translate till 95900 row

In [None]:
# batch_size=20900
# num_rows=len(orders_reviews_df)
# for idx in range(15900, num_rows+1):
#     if idx < batch_size:
#         # translate the text of each row of cleaned_review_comment
#         translated_comment = translate_text(orders_reviews_df.iloc[idx]['cleaned_review_comment'])

#         # Update the DataFrame with translated texts
#         orders_reviews_df.loc[idx, 'translated_review_comment'] = translated_comment

#         #----------------------------------   
#         #translate the review_comment_title
#         translated_title = translate_text(row['review_comment_title'])

#         # Update the DataFrame with translated texts
#         orders_reviews_df.loc[idx, 'translated_comment_title'] = translated_title
#     else:
#         print(f"Translation complete for {idx} rows out of {len(orders_reviews_df)}")
#         time.sleep(60) # wait for 3mins before proceeding the next 1000
#         batch_size += 5000


In [None]:
# orders_reviews_df.iloc[95890],['translated_review_comment']

translating from 95900 till 100000

In [None]:
# for idx in range(95900, num_rows+1):
#         # translate the text of each row of cleaned_review_comment
#         translated_comment = translate_text(orders_reviews_df.iloc[idx]['cleaned_review_comment'])

#         # Update the DataFrame with translated texts
#         orders_reviews_df.loc[idx, 'translated_review_comment'] = translated_comment

#         #----------------------------------   
#         #translate the review_comment_title
#         translated_title = translate_text(row['review_comment_title'])

#         # Update the DataFrame with translated texts
#         orders_reviews_df.loc[idx, 'translated_comment_title'] = translated_title
#         print(f"Translation complete for {idx} rows out of {len(orders_reviews_df)}")

# print(f"Completed translation")

In [49]:
orders_reviews_df.tail()

Unnamed: 0,review_id,order_id,review_score,review_comment_title,review_comment_message,review_creation_date,review_answer_timestamp
99995,f3897127253a9592a73be9bdfdf4ed7a,22ec9f0669f784db00fa86d035cf8602,5,,,2017-12-09,2017-12-11 20:06:42
99996,b3de70c89b1510c4cd3d0649fd302472,55d4004744368f5571d1f590031933e4,5,,"Excelente mochila, entrega super rápida. Super...",2018-03-22,2018-03-23 09:10:43
99997,1adeb9d84d72fe4e337617733eb85149,7725825d039fc1f0ceb7635e3f7d9206,4,,,2018-07-01,2018-07-02 12:59:13
99998,be360f18f5df1e0541061c87021e6d93,f8bd3f2000c28c5342fedeb5e50f2e75,1,,Solicitei a compra de uma capa de retrovisor c...,2017-12-15,2017-12-16 01:29:43
99999,efe49f1d6f951dd88b51e6ccd4cc548f,90531360ecb1eec2a1fbb265a0db0508,1,,"meu produto chegou e ja tenho que devolver, po...",2017-07-03,2017-07-03 21:01:49


## Uploading to Postgresql

### Since we are only focussing on review comment message then we are dropping the rest and store it into a new dataframe

In [None]:
#drop the null values that are in review_comment_message and reset index and drop the old index column
text_analysis_df = orders_reviews_df.dropna(subset=['translated_review_comment']).reset_index(drop=True)

#text_analysis_df.head()

text_analysis_df = text_analysis_df.loc[:,["review_id","order_id", "review_score", "translated_comment_title" ,"translated_review_comment", "review_creation_date"]]

text_analysis_df.head()

#orders_reviews_df_no_nulls.info()

Unnamed: 0,review_id,order_id,review_score,translated_comment_title,translated_review_comment,review_creation_date
0,e64fb393e7b32834bb789ff8bb30750e,658677c97b385a9be170737859d3511b,5,,I received it well before the stipulated deadl...,2017-04-21
1,f7c4243c7fe1938f181bec41a392bdeb,8e6bfb81e283fa7e4f11123a3fb894f1,5,,Congratulations lannister stores I loved shopp...,2018-03-01
2,8670d52e15e00043ae7de4c01cc2fe06,b9bf720beb4ab3728760088589c62129,4,I recommend,efficient device. On the website the brand of ...,2018-05-22
3,4b49719c8a200003f700d3d986ea1a19,9d6f15f95d01e79bd1349cc208361f09,4,,"But a little slow...for the price, it's good.",2018-02-16
4,3948b09f7c818e2d86c9a546758b2335,e51478e7e277a83743b6f9991dbfa3fb,5,I highly recommend,"Reliable seller, ok product and delivery on time.",2018-05-23


## Sentiment Analysis

In [None]:
from textblob import TextBlob

# Function to perform sentiment analysis
def analyze_sentiment(text):
    blob = TextBlob(text)
    sentiment_score = blob.sentiment.polarity
    if sentiment_score > 0:
        return 'Positive'
    elif sentiment_score < 0:
        return 'Negative'
    else:
        return 'Neutral'

# Apply sentiment analysis to the 'Comment' column
text_analysis_df['Sentiment'] = text_analysis_df['translated_review_comment'].apply(analyze_sentiment)


In [None]:
text_analysis_df.head(20)

Unnamed: 0,review_id,order_id,review_score,translated_comment_title,translated_review_comment,review_creation_date,Sentiment
0,e64fb393e7b32834bb789ff8bb30750e,658677c97b385a9be170737859d3511b,5,,I received it well before the stipulated deadl...,2017-04-21,Neutral
1,f7c4243c7fe1938f181bec41a392bdeb,8e6bfb81e283fa7e4f11123a3fb894f1,5,,Congratulations lannister stores I loved shopp...,2018-03-01,Positive
2,8670d52e15e00043ae7de4c01cc2fe06,b9bf720beb4ab3728760088589c62129,4,I recommend,efficient device. On the website the brand of ...,2018-05-22,Neutral
3,4b49719c8a200003f700d3d986ea1a19,9d6f15f95d01e79bd1349cc208361f09,4,,"But a little slow...for the price, it's good.",2018-02-16,Positive
4,3948b09f7c818e2d86c9a546758b2335,e51478e7e277a83743b6f9991dbfa3fb,5,I highly recommend,"Reliable seller, ok product and delivery on time.",2018-05-23,Positive
5,9314d6f9799f5bfba510cc7bcd468c01,0dacf04c5ad59fd5a0cc1faa07c34e39,2,,"I WOULD LIKE TO KNOW WHAT HAPPENED, I ALWAYS R...",2018-01-18,Negative
6,373cbeecea8286a2b66c97b1b157ec46,583174fbe37d3d5f0d6661be3aad1786,1,My product didn't arrive,Terrible,2018-08-15,Negative
7,d21bbc789670eab777d27372ab9094cc,4fc44d78867142c627497b60a7e0228a,5,Excellent,Store note 10,2018-07-10,Neutral
8,0e0190b9db53b689b285d3f3916f8441,79832b7cb59ac6f887088ffd686e1d5e,5,,thank you for the attention you gave me,2017-12-01,Neutral
9,fe3db7c069d694bab50cc43463f91608,2ca73e2ff9e3a186ad1e1ffb9b1d9c10,5,,The purchase was made easily. Delivery was mad...,2018-03-23,Positive


or

## Keyword Extraction

In [None]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

from collections import Counter

# Download NLTK resources (run once)
nltk.download('punkt')
nltk.download('stopwords')

# Function to extract keywords from the text
def extract_keywords(text, num_keywords=5):
    # Tokenization
    tokens = word_tokenize(text.lower())
    
    # Removing stopwords removing words that is non-aphabetical as well as words that exist in stop words like i.e the
    stop_words = set(stopwords.words('english'))
    filtered_tokens = [word for word in tokens if word.isalnum() and word not in stop_words]
    
    # Stemming
    stemmer = PorterStemmer()
    tokens = [stemmer.stem(word) for word in tokens]

    # Frequency count
    word_freq = nltk.FreqDist(tokens)

    # Create dictionary
    word_dict = {word: idx for idx, (word, _) in enumerate(word_freq.items())}

    return word_dict

    # # keyword_counter = Counter(filtered_tokens)
    # return keyword_counter.most_common(num_keywords)

# Apply keyword extraction to the 'Comment' column
text_analysis_df['Keywords'] = text_analysis_df['translated_review_comment'].apply(extract_keywords)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\T470\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\T470\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
text_analysis_df.head(10)

Unnamed: 0,review_id,order_id,review_score,translated_comment_title,translated_review_comment,review_creation_date,Sentiment,Keywords
0,e64fb393e7b32834bb789ff8bb30750e,658677c97b385a9be170737859d3511b,5,,I received it well before the stipulated deadl...,2017-04-21,Neutral,"{'i': 0, 'receiv': 1, 'it': 2, 'well': 3, 'bef..."
1,f7c4243c7fe1938f181bec41a392bdeb,8e6bfb81e283fa7e4f11123a3fb894f1,5,,Congratulations lannister stores I loved shopp...,2018-03-01,Positive,"{'congratul': 0, 'lannist': 1, 'store': 2, 'i'..."
2,8670d52e15e00043ae7de4c01cc2fe06,b9bf720beb4ab3728760088589c62129,4,I recommend,efficient device. On the website the brand of ...,2018-05-22,Neutral,"{'effici': 0, 'devic': 1, '.': 2, 'on': 3, 'th..."
3,4b49719c8a200003f700d3d986ea1a19,9d6f15f95d01e79bd1349cc208361f09,4,,"But a little slow...for the price, it's good.",2018-02-16,Positive,"{'but': 0, 'a': 1, 'littl': 2, 'slow': 3, '......"
4,3948b09f7c818e2d86c9a546758b2335,e51478e7e277a83743b6f9991dbfa3fb,5,I highly recommend,"Reliable seller, ok product and delivery on time.",2018-05-23,Positive,"{'reliabl': 0, 'seller': 1, ',': 2, 'ok': 3, '..."
5,9314d6f9799f5bfba510cc7bcd468c01,0dacf04c5ad59fd5a0cc1faa07c34e39,2,,"I WOULD LIKE TO KNOW WHAT HAPPENED, I ALWAYS R...",2018-01-18,Negative,"{'i': 0, 'would': 1, 'like': 2, 'to': 3, 'know..."
6,373cbeecea8286a2b66c97b1b157ec46,583174fbe37d3d5f0d6661be3aad1786,1,My product didn't arrive,Terrible,2018-08-15,Negative,{'terribl': 0}
7,d21bbc789670eab777d27372ab9094cc,4fc44d78867142c627497b60a7e0228a,5,Excellent,Store note 10,2018-07-10,Neutral,"{'store': 0, 'note': 1, '10': 2}"
8,0e0190b9db53b689b285d3f3916f8441,79832b7cb59ac6f887088ffd686e1d5e,5,,thank you for the attention you gave me,2017-12-01,Neutral,"{'thank': 0, 'you': 1, 'for': 2, 'the': 3, 'at..."
9,fe3db7c069d694bab50cc43463f91608,2ca73e2ff9e3a186ad1e1ffb9b1d9c10,5,,The purchase was made easily. Delivery was mad...,2018-03-23,Positive,"{'the': 0, 'purchas': 1, 'wa': 2, 'made': 3, '..."


# wordcloud

In [None]:
from wordcloud import WordCloud
import matplotlib.pyplot as plt

def visualize_word_cloud(text):
    # Generate word cloud
    wordcloud = WordCloud(width=800, height=400, background_color='white').generate(text)
    
    # Display the generated image:
    plt.figure(figsize=(10, 5))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis('off')
    plt.show()

# Visualize word cloud
visualize_word_cloud(text_analysis_df['Keywords'])

TypeError: expected string or bytes-like object, got 'Series'

## Topic Modelling

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

# Function to perform topic modeling using LDA
def perform_topic_modeling(texts, num_topics=5):
    vectorizer = CountVectorizer(max_df=0.95, min_df=2, stop_words='english')
    X = vectorizer.fit_transform(texts)
    lda = LatentDirichletAllocation(n_components=num_topics, random_state=42)
    lda.fit(X)
    return lda

# Apply topic modeling to the 'Comment' column
lda_model = perform_topic_modeling(text_analysis_df['translated_review_comment'])
