In [1]:
import pandas as pd

orders_reviews_df = pd.read_csv("data/olist_order_reviews_dataset.csv")

orders_reviews_df.head()

Unnamed: 0,review_id,order_id,review_score,review_comment_title,review_comment_message,review_creation_date,review_answer_timestamp
0,7bc2406110b926393aa56f80a40eba40,73fc7af87114b39712e6da79b0a377eb,4,,,2018-01-18 00:00:00,2018-01-18 21:46:59
1,80e641a11e56f04c1ad469d5645fdfde,a548910a1c6147796b98fdf73dbeba33,5,,,2018-03-10 00:00:00,2018-03-11 03:05:13
2,228ce5500dc1d8e020d8d1322874b6f0,f9e4b658b201a9f2ecdecbb34bed034b,5,,,2018-02-17 00:00:00,2018-02-18 14:36:24
3,e64fb393e7b32834bb789ff8bb30750e,658677c97b385a9be170737859d3511b,5,,Recebi bem antes do prazo estipulado.,2017-04-21 00:00:00,2017-04-21 22:02:06
4,f7c4243c7fe1938f181bec41a392bdeb,8e6bfb81e283fa7e4f11123a3fb894f1,5,,Parabéns lojas lannister adorei comprar pela I...,2018-03-01 00:00:00,2018-03-02 10:26:53


In [2]:

print(orders_reviews_df.shape) #check row and column
print('\n')
print(orders_reviews_df.info()) #check dataframe information number of entries, to col names
print('\n')
print(orders_reviews_df.describe()) #check dataframe statitical summary

(100000, 7)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 7 columns):
 #   Column                   Non-Null Count   Dtype 
---  ------                   --------------   ----- 
 0   review_id                100000 non-null  object
 1   order_id                 100000 non-null  object
 2   review_score             100000 non-null  int64 
 3   review_comment_title     11715 non-null   object
 4   review_comment_message   41753 non-null   object
 5   review_creation_date     100000 non-null  object
 6   review_answer_timestamp  100000 non-null  object
dtypes: int64(1), object(6)
memory usage: 5.3+ MB
None


        review_score
count  100000.000000
mean        4.070890
std         1.359663
min         1.000000
25%         4.000000
50%         5.000000
75%         5.000000
max         5.000000


### Amending the review_creation_date and review_answer_timestamp to datetime format

In [3]:
# amending to date time for column review_creation_date
orders_reviews_df['review_creation_date']=pd.to_datetime(orders_reviews_df['review_creation_date'])

# amending to date time for column review_answer_timestamp
orders_reviews_df['review_answer_timestamp']=pd.to_datetime(orders_reviews_df['review_answer_timestamp'])

print(orders_reviews_df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 7 columns):
 #   Column                   Non-Null Count   Dtype         
---  ------                   --------------   -----         
 0   review_id                100000 non-null  object        
 1   order_id                 100000 non-null  object        
 2   review_score             100000 non-null  int64         
 3   review_comment_title     11715 non-null   object        
 4   review_comment_message   41753 non-null   object        
 5   review_creation_date     100000 non-null  datetime64[ns]
 6   review_answer_timestamp  100000 non-null  datetime64[ns]
dtypes: datetime64[ns](2), int64(1), object(4)
memory usage: 5.3+ MB
None


## Initial Discovery
- Total number of rows is 100,000
- No null values found except for the review_comment_title and review_comment_message, which is mostly the comments provided. Will need to scrutinise this 2 columns in later stages with text analysis
- Average rating from users is 4.07 from the ~ 100,000 reviews
- Creation date is the date the review was created
- Assumption: The Answer date timestamp is the reply by the seller can be analysed on how fast they respond
- There is duplicated review_id need to drop if all columns the same as there are 99173 unique review_id. Dropping duplicated rows

### Creating a function called translate_text

In [4]:
from deep_translator import GoogleTranslator
import pandas as pd
import numpy as np

# Function to translate text
def translate_text(text, dest_lang='en'):
    '''
    Function to translate text if it's not null, otherwise return the original text
    parameters: take in text to translate
    destinantion will be to english
    '''
    if pd.notnull(text):
        translation = GoogleTranslator(source='pt', target=dest_lang).translate(text)
        return translation
    else:
        return text   



### Cleaning the dataframe column from special characters like \r \n

In [5]:
import re

# Function to remove special characters
def remove_special_characters(text):
    if pd.notnull(text):
        cleaned_text=text.replace('\r', '').replace('\n', '')
        return cleaned_text
    else:
        return text   
    
    
# Identify special characters in the 'Text' column
orders_reviews_df['cleaned_review_comment'] = orders_reviews_df['review_comment_message'].apply(remove_special_characters)

orders_reviews_df.head()

Unnamed: 0,review_id,order_id,review_score,review_comment_title,review_comment_message,review_creation_date,review_answer_timestamp,cleaned_review_comment
0,7bc2406110b926393aa56f80a40eba40,73fc7af87114b39712e6da79b0a377eb,4,,,2018-01-18,2018-01-18 21:46:59,
1,80e641a11e56f04c1ad469d5645fdfde,a548910a1c6147796b98fdf73dbeba33,5,,,2018-03-10,2018-03-11 03:05:13,
2,228ce5500dc1d8e020d8d1322874b6f0,f9e4b658b201a9f2ecdecbb34bed034b,5,,,2018-02-17,2018-02-18 14:36:24,
3,e64fb393e7b32834bb789ff8bb30750e,658677c97b385a9be170737859d3511b,5,,Recebi bem antes do prazo estipulado.,2017-04-21,2017-04-21 22:02:06,Recebi bem antes do prazo estipulado.
4,f7c4243c7fe1938f181bec41a392bdeb,8e6bfb81e283fa7e4f11123a3fb894f1,5,,Parabéns lojas lannister adorei comprar pela I...,2018-03-01,2018-03-02 10:26:53,Parabéns lojas lannister adorei comprar pela I...


### Since the dataset is 41753 rows occured alot of premature termination therefore doing translation by each row

### Translate from row 0 till 15900

In [11]:
import time
batch_size=500
total_rows = len(orders_reviews_df)

for idx, row in orders_reviews_df.iterrows():
    try:
        if idx < batch_size:
            # translate the text of each row of cleaned_review_comment
            translated_comment = translate_text(row['cleaned_review_comment'])
            
            # Update the DataFrame with translated texts
            # orders_reviews_df.loc[idx, 'translated_review_comment'] = translated_comment
            orders_reviews_df.at[idx, 'translated_review_comment'] = translated_comment

            #----------------------------------   
            #translate the review_comment_title
            translated_title = translate_text(row['review_comment_title'])
            # Update the DataFrame with translated texts
            orders_reviews_df.loc[idx, 'translated_comment_title'] = translated_title
        else:
            print(f"Translation complete for {idx} rows out of {len(orders_reviews_df)}")
            #time.sleep(30) # wait for 1mins before proceeding the next 5000
            batch_size += 500
    except IndexError:
        print("Warning: Occuring out of bounds error")
        pass

print("All Translation complete")

orders_reviews_df.head()

Translation complete for 500 rows out of 100000
Translation complete for 1000 rows out of 100000
Translation complete for 1500 rows out of 100000


ConnectionError: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))

In [55]:
batch_size=52740               
num_rows=len(orders_reviews_df)
for idx in range(52740 , num_rows+1):
    if idx < batch_size:
        # translate the text of each row of cleaned_review_comment
        translated_comment = translate_text(orders_reviews_df.iloc[idx]['cleaned_review_comment'])

        # Update the DataFrame with translated texts
        orders_reviews_df.loc[idx, 'translated_review_comment'] = translated_comment

        #----------------------------------   
        #translate the review_comment_title
        translated_title = translate_text(row['review_comment_title'])

        # Update the DataFrame with translated texts
        orders_reviews_df.loc[idx, 'translated_comment_title'] = translated_title
    else:
        print(f"Translation complete for {idx} rows out of {len(orders_reviews_df)}")
        #time.sleep(20) # wait for 3mins before proceeding the next 1000
        batch_size += 200


Translation complete for 52740 rows out of 100000
Translation complete for 52940 rows out of 100000
Translation complete for 53140 rows out of 100000
Translation complete for 53340 rows out of 100000
Translation complete for 53540 rows out of 100000
Translation complete for 53740 rows out of 100000
Translation complete for 53940 rows out of 100000
Translation complete for 54140 rows out of 100000
Translation complete for 54340 rows out of 100000
Translation complete for 54540 rows out of 100000
Translation complete for 54740 rows out of 100000
Translation complete for 54940 rows out of 100000
Translation complete for 55140 rows out of 100000
Translation complete for 55340 rows out of 100000
Translation complete for 55540 rows out of 100000
Translation complete for 55740 rows out of 100000
Translation complete for 55940 rows out of 100000
Translation complete for 56140 rows out of 100000
Translation complete for 56340 rows out of 100000
Translation complete for 56540 rows out of 100000


In [None]:
orders_reviews_df.head()

Unnamed: 0,review_id,order_id,review_score,review_comment_title,review_comment_message,review_creation_date,review_answer_timestamp,cleaned_review_comment,translated_review_comment,translated_comment_title
0,7bc2406110b926393aa56f80a40eba40,73fc7af87114b39712e6da79b0a377eb,4,,,2018-01-18,2018-01-18 21:46:59,,,
1,80e641a11e56f04c1ad469d5645fdfde,a548910a1c6147796b98fdf73dbeba33,5,,,2018-03-10,2018-03-11 03:05:13,,,
2,228ce5500dc1d8e020d8d1322874b6f0,f9e4b658b201a9f2ecdecbb34bed034b,5,,,2018-02-17,2018-02-18 14:36:24,,,
3,e64fb393e7b32834bb789ff8bb30750e,658677c97b385a9be170737859d3511b,5,,Recebi bem antes do prazo estipulado.,2017-04-21,2017-04-21 22:02:06,Recebi bem antes do prazo estipulado.,I received it well before the stipulated deadl...,
4,f7c4243c7fe1938f181bec41a392bdeb,8e6bfb81e283fa7e4f11123a3fb894f1,5,,Parabéns lojas lannister adorei comprar pela I...,2018-03-01,2018-03-02 10:26:53,Parabéns lojas lannister adorei comprar pela I...,Congratulations lannister stores I loved shopp...,


## Convert to csv file to be later uploaded to postgresql

In [None]:
# Save DataFrame to a CSV file
orders_reviews_df.to_csv('cleaned_orders_reviews.csv', index=False) 