In [1]:
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin

from data_process import data_process

In [None]:
# reference: https://github.com/CaptainE/RNN-LSTM-in-numpy/blob/master/RNN_LSTM_from_scratch.ipynb


In [2]:
# Load the working DataFrame through the project-local `data_process` helper
# (imported from data_process.py at the top of the notebook).
df = data_process()

In [3]:
# Sanity check: report (rows, columns) of the loaded frame.
print(df.shape)

(17658865, 44)


In [4]:
# Start of the sentiment analysis on customer reviews:
# preview the first rows of the loaded frame.
df.head()

Unnamed: 0,customer_id,customer_unique_id,customer_zip_code_prefix,customer_city,customer_state,geolocation_lat,geolocation_lng,geolocation_city,geolocation_state,order_id,...,product_description_lenght,product_photos_qty,product_weight_g,product_length_cm,product_height_cm,product_width_cm,seller_zip_code_prefix,seller_city,seller_state,product_category_name_english
0,06b8999e2fba1a1fbc88172c00ba8bc7,861eff4711a542e4b93843c6dd7febb0,14409,franca,SP,-20.509897,-47.397866,franca,SP,00e7ee1b050b8499577073aeb2a297a1,...,1141.0,1.0,8683.0,54.0,64.0,31.0,8577.0,itaquaquecetuba,SP,office_furniture
1,06b8999e2fba1a1fbc88172c00ba8bc7,861eff4711a542e4b93843c6dd7febb0,14409,franca,SP,-20.497396,-47.399241,franca,SP,00e7ee1b050b8499577073aeb2a297a1,...,1141.0,1.0,8683.0,54.0,64.0,31.0,8577.0,itaquaquecetuba,SP,office_furniture
2,06b8999e2fba1a1fbc88172c00ba8bc7,861eff4711a542e4b93843c6dd7febb0,14409,franca,SP,-20.510459,-47.399553,franca,SP,00e7ee1b050b8499577073aeb2a297a1,...,1141.0,1.0,8683.0,54.0,64.0,31.0,8577.0,itaquaquecetuba,SP,office_furniture
3,06b8999e2fba1a1fbc88172c00ba8bc7,861eff4711a542e4b93843c6dd7febb0,14409,franca,SP,-20.48094,-47.394161,franca,SP,00e7ee1b050b8499577073aeb2a297a1,...,1141.0,1.0,8683.0,54.0,64.0,31.0,8577.0,itaquaquecetuba,SP,office_furniture
4,06b8999e2fba1a1fbc88172c00ba8bc7,861eff4711a542e4b93843c6dd7febb0,14409,franca,SP,-20.515413,-47.398194,franca,SP,00e7ee1b050b8499577073aeb2a297a1,...,1141.0,1.0,8683.0,54.0,64.0,31.0,8577.0,itaquaquecetuba,SP,office_furniture


In [5]:
# List every column name to locate the review_* fields selected in the next cell.
df.columns

Index(['customer_id', 'customer_unique_id', 'customer_zip_code_prefix',
       'customer_city', 'customer_state', 'geolocation_lat', 'geolocation_lng',
       'geolocation_city', 'geolocation_state', 'order_id', 'order_status',
       'order_purchase_timestamp', 'order_approved_at',
       'order_delivered_carrier_date', 'order_delivered_customer_date',
       'order_estimated_delivery_date', 'order_item_id', 'product_id',
       'seller_id', 'shipping_limit_date', 'price', 'freight_value',
       'payment_sequential', 'payment_type', 'payment_installments',
       'payment_value', 'review_id', 'review_score', 'review_comment_title',
       'review_comment_message', 'review_creation_date',
       'review_answer_timestamp', 'product_category_name',
       'product_name_lenght', 'product_description_lenght',
       'product_photos_qty', 'product_weight_g', 'product_length_cm',
       'product_height_cm', 'product_width_cm', 'seller_zip_code_prefix',
       'seller_city', 'seller_state', 

In [10]:
# Map the review-related source columns to the shorter names used from here on.
review_columns = {
    'review_id': 'id',
    'review_score': 'score',
    'review_comment_title': 'comment_title',
    'review_comment_message': 'comment',
}

# Select the review columns, drop rows without a written comment, then rename.
# NOTE(review): the df.head() preview shows the same order repeated with only the
# geolocation values changing, so reviews may be duplicated many times here --
# confirm, and consider de-duplicating on 'id' before modelling.
df_review = (
    df[list(review_columns)]
    .dropna(subset=['review_comment_message'])
    .rename(columns=review_columns)
)

print(f'Dataset shape: {df_review.shape}')

Dataset shape: (7277068, 4)


In [1]:
# Option 1: Translate all Portuguese reviews into English, then process the English text
from deep_translator import GoogleTranslator
def translate_text(text):
    """Translate ``text`` into English with Google Translate (source language auto-detected).

    Parameters
    ----------
    text : str
        The text to translate.

    Returns
    -------
    str
        The English translation. Values that cannot be translated -- anything
        that is not a string (e.g. ``None`` or ``NaN`` coming from a DataFrame
        column) and empty / whitespace-only strings -- are returned unchanged
        without contacting the translation service.
    """
    # Nothing meaningful to translate: skip the network round-trip and avoid
    # handing the translator a payload it is not designed for.
    if not isinstance(text, str) or not text.strip():
        return text

    return GoogleTranslator(source='auto', target='en').translate(text)

# Option 2: Process the text directly in Portuguese



In [None]:
# Pipelines & Custom Transformers in scikit-learn
# Reference : https://towardsdatascience.com/pipelines-custom-transformers-in-scikit-learn-the-step-by-step-guide-with-python-code-4a7d9b068156
class ExperimentalTransformers(BaseEstimator, TransformerMixin):
    """Skeleton custom transformer that traces when each pipeline hook runs.

    The transformer performs no feature engineering yet: ``transform`` returns
    a copy of its input. It follows the scikit-learn estimator contract so it
    can be placed in a ``Pipeline``.

    Parameters
    ----------
    feature_name :
        Name of the feature this transformer is meant to operate on
        (stored, not used yet).
    language : str, default='en'
        Language code associated with the text (stored, not used yet).
    """

    # Constructor, called when pipeline is initialized
    def __init__(self, feature_name, language='en'):
        print('\n')
        # scikit-learn's get_params()/clone() read constructor arguments back
        # from attributes of the same name, so they must be stored verbatim.
        self.feature_name = feature_name
        self.language = language

    # Called when we fit the pipeline
    def fit(self, X, y=None):
        """Nothing is learned; returns ``self`` so calls can be chained."""
        print('\n fit() was called')
        # fit() must return the estimator: TransformerMixin.fit_transform and
        # Pipeline both call methods on the value returned here.
        return self

    # Called when we use fit or transform on the pipeline
    def transform(self, X, y=None):
        """Return a copy of ``X`` (placeholder for the real transformation)."""
        print('\n transform() was called')
        X_new = X.copy()
        return X_new

In [None]:
# Progress marker for the text-processing cells that follow.
print('Text Processing Pipeline started')

In [None]:
# Pre-processing and Bag-of-Words vectorization using CountVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
# Portuguese stop-word list from the NLTK stopwords corpus.
# The corpus reader method is `words()`; `word` does not exist and raised AttributeError.
stopwords_pt = stopwords.words("portuguese")

import re
def remove_stopwords(text,stopwords_pt):
    '''
        Lower-case a review and remove its stop words.

    :param text: review text (str)
    :param stopwords_pt: collection of lower-case stop words; only membership
        tests (``in``) are performed on it
    :return: the remaining lower-cased tokens joined by single spaces
        (empty string when every token is a stop word or the text is blank)
    '''
    # Lower-case once up front; split() with no argument also collapses
    # repeated whitespace and strips leading/trailing blanks.
    words = text.lower().split()
    filtered_words = [word for word in words if word not in stopwords_pt]

    return " ".join(filtered_words)

# Build the cleaned 'review' column from the raw 'comment' text. The frame has no
# 'review' column before this line, and remove_stopwords needs the stop-word
# collection forwarded explicitly as its second argument.
df_review['review'] = df_review['comment'].apply(remove_stopwords, args=(stopwords_pt,))

In [None]:
# Preview the frame after the stop-word-filtered 'review' column was added.
df_review.head()

### Feature Extraction

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
# Bag-of-words features: binary term indicators limited to 5000 vocabulary entries.
texts = df_review['review']
vectorizer = CountVectorizer(max_features=5000, binary=True)

# Learn the vocabulary and encode every review in one pass.
x_bow = vectorizer.fit_transform(texts)
print(x_bow.shape, type(x_bow))

#### TF-IDF

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
# TF-IDF features over the same cleaned reviews used for the bag-of-words matrix.
tfidf_vectorizer = TfidfVectorizer(max_features = 5000)
# `texts` is the cleaned review Series defined in the bag-of-words cell
# (the previous name `texto` was never defined and raised NameError).
x_tfidf = tfidf_vectorizer.fit_transform(texts)

## Models

In [None]:
from sklearn.model_selection import train_test_split
# No earlier cell creates a 'label' column, so derive a binary sentiment target
# from the star rating when it is missing (1 = positive, 0 = negative).
# NOTE(review): the 4-star cut-off is an assumption -- confirm the intended labelling.
POSITIVE_SCORE_MIN = 4
if 'label' not in df_review.columns:
    df_review['label'] = (df_review['score'] >= POSITIVE_SCORE_MIN).astype(int)

# Same test_size and random_state for both feature sets so the two splits
# contain the same rows and their results are comparable.
X1_train, X1_test, y1_train, y1_test = train_test_split(x_bow, df_review['label'],
                                                        test_size=0.3, random_state = 10)

X2_train, X2_test, y2_train, y2_test = train_test_split(x_tfidf, df_review['label'],
                                                        test_size=0.3, random_state = 10)


In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score, roc_auc_score
import matplotlib.pyplot as plt
import seaborn as sns
def matching_criteria(y_true, y_pred):
    """Print classification metrics and plot the confusion matrix.

    Prints accuracy, ROC AUC and the weighted F1 score of ``y_pred`` against
    ``y_true``, then shows the confusion matrix as an annotated heatmap.

    Parameters
    ----------
    y_true : array-like
        Ground-truth class labels.
    y_pred : array-like
        Predicted class labels.

    Returns
    -------
    None
    """
    print("Accuracy: ", accuracy_score(y_true, y_pred))
    # ROC AUC is computed from hard class predictions here, not from scores.
    # NOTE(review): as called, this presumably expects a binary target -- confirm.
    print("\nROC, AUC:", roc_auc_score(y_true, y_pred))
    print("\nF1-Score:", f1_score(y_true, y_pred, average='weighted'))
    print("\nIts Corresponding Confusion Matrix:")

    fig, ax = plt.subplots()
    # fmt='d' prints the integer counts in full; the default format switches to
    # scientific notation for large cells, which makes the matrix unreadable.
    sns.heatmap(confusion_matrix(y_true, y_pred), annot=True, fmt='d', ax=ax)
    # confusion_matrix puts true labels on rows and predictions on columns.
    ax.set(xlabel='Predicted label', ylabel='True label', title='Confusion matrix')
    plt.show()

In [None]:

from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score, StratifiedKFold, RandomizedSearchCV
from sklearn.metrics import accuracy_score, roc_auc_score, mean_squared_error

def model_outcome(X_train,y_train, X_test, y_test):
    """Benchmark a fixed set of classifiers on one train/test split.

    Every model is scored with 5-fold stratified cross-validation on the
    training data, then refit on the whole training set and scored once on the
    held-out test set.

    Parameters
    ----------
    X_train, X_test : array-like or scipy sparse matrix
        Feature matrices for training and testing.
    y_train, y_test : array-like
        Class labels matching the rows of ``X_train`` / ``X_test``.

    Returns
    -------
    pandas.DataFrame
        One row per model with the columns

        * ``'Model'`` -- display name of the classifier
        * ``'Accuracy for Training'`` -- mean cross-validated accuracy on the
          training data (not the accuracy of the refit model on that data)
        * ``'Accuracy for Testing'`` -- accuracy on ``X_test`` / ``y_test``
        * ``'Std'`` -- standard deviation of the cross-validation scores
    """
    # Imported locally because neither name is imported at file level.
    from scipy import sparse
    from sklearn.naive_bayes import MultinomialNB

    # GaussianNB only accepts dense arrays, while the vectorizers in this
    # notebook produce sparse matrices. Use the multinomial variant for sparse
    # input and keep GaussianNB for dense input.
    naive_bayes = MultinomialNB() if sparse.issparse(X_train) else GaussianNB()

    models = [
        ('Logistic Regression', LogisticRegression()),
        ('Decision Trees', DecisionTreeClassifier()),
        ('Naive Bayes', naive_bayes),
        ('SVM', SVC()),
        ('Random Forest', RandomForestClassifier()),
    ]

    # The splitter is stateless without shuffling, so one instance serves every model.
    kfold = StratifiedKFold(n_splits = 5)

    rows = []
    for name, estimator in models:
        cross_val_result = cross_val_score(estimator, X_train, y_train, cv = kfold)

        # Refit on the full training set before scoring on the held-out data.
        estimator.fit(X_train, y_train)
        predictions = estimator.predict(X_test)

        rows.append({
            'Model': name,
            'Accuracy for Training': cross_val_result.mean(),
            'Accuracy for Testing': accuracy_score(y_test, predictions),
            'Std': cross_val_result.std(),
        })

    # Build the frame once instead of growing it row by row.
    return pd.DataFrame(rows, columns = ['Model','Accuracy for Training','Accuracy for Testing','Std'])
# Evaluate the models on the bag-of-words split. The splits are named X1_*/y1_*
# (bag of words) and X2_*/y2_* (TF-IDF); plain X_train/y_train were never defined.
model_result = model_outcome(X1_train, y1_train, X1_test, y1_test)