In [23]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re
import numpy as np

import nltk
from nltk.stem import RSLPStemmer
# Fetch the 'rslp' NLTK package; RSLPStemmer (imported above) is the only NLTK stemmer used in this notebook.
nltk.download('rslp')
import string


from data_process import master


from deep_translator import GoogleTranslator

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.metrics import accuracy_score, roc_auc_score,confusion_matrix,f1_score


from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
# The stopword corpus is a separate NLTK resource from 'rslp'. Fetch it only
# when it is missing so a fresh environment does not fail with LookupError.
try:
    nltk.data.find('corpora/stopwords')
except LookupError:
    nltk.download('stopwords')

# STOPWORDS keeps the corpus as a list; stopwords_pt holds the same words as a
# set for fast membership tests. The corpus is read once and shared by both.
STOPWORDS = stopwords.words("portuguese")
stopwords_pt = set(STOPWORDS)


# reference: https://github.com/CaptainE/RNN-LSTM-in-numpy/blob/master/RNN_LSTM_from_scratch.ipynb


[nltk_data] Downloading package rslp to /Users/jamesli/nltk_data...
[nltk_data]   Package rslp is already up-to-date!


In [15]:
# Load the working DataFrame through the project helper data_process.master().
# NOTE(review): master() is defined outside this notebook; presumably it returns
# the merged orders/reviews table - confirm its schema and row granularity.
df = master()

In [16]:
# Keep only the rating and the free-text comment, discard rows without a
# comment, and give the two columns shorter names.
# NOTE(review): the row count printed below is in the millions; if master()
# joins several tables the same review may be repeated - confirm whether
# de-duplication is needed before modelling.
df_review = (
    df.loc[:, ['review_score', 'review_comment_message']]
    .dropna(subset=['review_comment_message'])
    .rename(columns={'review_score': 'score', 'review_comment_message': 'comment'})
)

print(f'Dataset shape: {df_review.shape}')

Dataset shape: (7277068, 2)


In [17]:
# Derive the binary target: reviews scored 3 or higher are labelled 1
# (positive), everything else 0 (negative). The raw score is dropped once the
# label exists, leaving the columns ['comment', 'label'].
df_review = (
    df_review
    .assign(label=lambda frame: np.where(frame['score'] >= 3, 1, 0))
    .drop(columns=['score'])
)

In [18]:
# Option 1: translate the Portuguese reviews into English first, then process the English text
def translate_text(text):
    """Translate ``text`` into English, letting GoogleTranslator detect the source language."""
    translator = GoogleTranslator(source='auto', target='en')
    return translator.translate(text)


In [19]:
# Pipelines & Custom Transformers in scikit-learn
# Reference : https://towardsdatascience.com/pipelines-custom-transformers-in-scikit-learn-the-step-by-step-guide-with-python-code-4a7d9b068156
def remove_stopwords(text,stopwords_pt):
    """
    Lowercase ``text`` and drop every whitespace-separated word found in ``stopwords_pt``.

    :param text: string to clean
    :param stopwords_pt: collection of stopwords; words are compared after
        lowercasing, so only lowercase entries can match
    :return: the remaining lowercase words joined by single spaces
    """
    kept = []
    for token in text.lower().split():
        if token in stopwords_pt:
            continue
        kept.append(token)
    return " ".join(kept)

class TextProcess(BaseEstimator, TransformerMixin):
    """Stateless transformer that normalises raw review text.

    Each text is lowercased, line breaks become spaces, the ``r$`` prefix
    (presumably the Real currency sign - confirm) is removed, numbers are
    replaced by the token ``numero`` and every remaining non-word character is
    turned into whitespace, which is finally collapsed to single spaces.
    """

    # Applied to already-lowercased text, hence the lowercase "r". The word
    # boundary keeps words that merely end in "r" before a "$" intact.
    _CURRENCY = re.compile(r'\br\$')
    # One token per number: plain integers, decimal / thousands groups
    # ("3.14", "1.000,50") and an optional exponent ("2e10").
    _NUMBER = re.compile(r'\d+(?:[.,]\d+)*(?:e[+-]?\d+)?')
    _NON_WORD = re.compile(r'\W')
    _SPACES = re.compile(r'\s+')

    def fit(self, X, y = None):
        """Nothing is learned; return ``self`` so the step can sit in a Pipeline."""
        return self

    def transform(self, X,y = None):
        """Return a list holding the cleaned version of every text in ``X``.

        :param X: iterable of strings
        :param y: ignored, accepted for scikit-learn API compatibility
        :return: list of cleaned strings, one per input text, in input order
        """
        texts = []
        for text in X:
            text = text.lower()
            text = text.replace('\n', ' ').replace('\r', ' ')

            # The currency marker is stripped before numbers are replaced and
            # while the "$" is still present, so "r$ 50" yields only "numero".
            text = self._CURRENCY.sub(' ', text)
            text = self._NUMBER.sub(' numero ', text)

            text = self._NON_WORD.sub(' ', text)
            text = self._SPACES.sub(' ', text)

            texts.append(text)

        return texts

class StopwordsRemoval(BaseEstimator, TransformerMixin):
    """Stateless transformer that removes stopwords from each text.

    Words are looked up in the module-level ``stopwords_pt`` set.
    """

    def fit(self, X, y = None):
        """Nothing is learned; return ``self`` so the step can sit in a Pipeline."""
        return self

    def remove_stopwords(self,text):
        """Return ``text`` without its stopwords, joined by single spaces.

        :param text: a string (split on whitespace) or an iterable of words
        :return: the surviving words joined by single spaces
        """
        # Iterating a str directly yields characters, which would delete
        # single-letter stopwords from inside every word; split it into words.
        # Pre-tokenised input (any other iterable) is used as given.
        words = text.split() if isinstance(text, str) else text
        filtered_words = [word for word in words if word.lower() not in stopwords_pt]
        return " ".join(filtered_words)

    def transform(self, X, y =None):
        """Return a list with the stopword-free version of every text in ``X``."""
        return [self.remove_stopwords(text) for text in X]

class Steam(BaseEstimator, TransformerMixin):
    """Stateless transformer that stems every word of each text with RSLPStemmer."""

    def fit(self, X, y = None):
        """Nothing is learned; return ``self`` so the step can sit in a Pipeline."""
        return self

    def steamming(self,text):
        """Return the list of stems for the whitespace-separated words of ``text``."""
        # Build the stemmer once per instance instead of once per text.
        stemmer = getattr(self, '_stemmer', None)
        if stemmer is None:
            stemmer = self._stemmer = RSLPStemmer()
        return [stemmer.stem(word) for word in text.split()]

    def transform(self, X, y = None):
        """Return a list with the stemmed version of every text in ``X``.

        The stems of a text are joined with single spaces so that word
        boundaries survive for the vectoriser downstream.
        """
        return [' '.join(self.steamming(text)) for text in X]
            
            

In [None]:
# Chain the text-cleaning steps so they always run in the same order:
# normalisation -> stopword removal -> stemming.
pipe_preprocessing = Pipeline(steps=[
    ('preprocess', TextProcess()),
    ('stopwords', StopwordsRemoval()),
    ('stemming', Steam()),
])

# None of the steps learns anything in fit(), so fit_transform simply applies
# them in sequence to the review comments.
text_processed = pipe_preprocessing.fit_transform(df_review['comment'])

### Vectorization

In [None]:
from sklearn.model_selection import train_test_split

print('Text Processing Pipeline started')
# CountVectorizer validates stop_words as 'english', a list or None, so the
# list form (STOPWORDS) is passed instead of the stopwords_pt set.
vectorizer = CountVectorizer(binary = True, max_features= 5000, stop_words= STOPWORDS)
# NOTE(review): .toarray() materialises a dense (n_reviews, <=5000) matrix.
# With millions of reviews that may not fit in memory; the sparse result could
# be kept if every downstream model accepts sparse input - confirm.
X = vectorizer.fit_transform(text_processed).toarray()

y = df_review['label'].values

# Hold out 20% of the rows for testing. The previous test_size of 0.8 left
# only 20% of the data for training.
X_train, X_test, y_train, y_test = train_test_split(X, y,test_size = 0.2, random_state =1)



## Models

In [None]:
def matching_criteria(y_true, y_pred):
    """Print accuracy, ROC AUC and weighted F1, then plot the confusion matrix.

    :param y_true: ground-truth labels
    :param y_pred: predicted labels, same length as ``y_true``
    :return: None; the metrics are printed and the heatmap is shown
    """
    print("Accuracy: ", accuracy_score(y_true, y_pred))
    print("\nROC, AUC:", roc_auc_score(y_true, y_pred))
    print("\nF1-Score:", f1_score(y_true, y_pred, average='weighted'))
    print("\nIts Corresponding Confusion Matrix:")
    # fmt='d' prints the integer counts in full; seaborn's default '.2g' would
    # abbreviate anything above two digits (e.g. 1234 -> 1.2e+03).
    ax = sns.heatmap(confusion_matrix(y_true, y_pred), annot=True, fmt='d')
    # confusion_matrix puts true classes on rows and predictions on columns.
    ax.set(xlabel='Predicted label', ylabel='True label')
    plt.show()

In [None]:

def model_outcome(X_train,y_train, X_test, y_test, random_state=1):
    """Fit five baseline classifiers and tabulate their accuracy.

    For every model the function runs 5-fold stratified cross-validation on
    the training data, then refits on the full training data and scores the
    test data.

    :param X_train: training feature matrix
    :param y_train: training labels
    :param X_test: test feature matrix
    :param y_test: test labels
    :param random_state: seed given to the decision tree and random forest so
        repeated runs produce the same table
    :return: DataFrame with one row per model and the columns
        'Model', 'Accuracy for Training' (mean cross-validation accuracy on
        the training data), 'Accuracy for Testing' and 'Std' (standard
        deviation of the cross-validation accuracies)
    """
    models = [
        ('Logistic Regression', LogisticRegression()),
        ('Decision Trees', DecisionTreeClassifier(random_state=random_state)),
        ('Naive Bayes', GaussianNB()),
        ('SVM', SVC()),
        ('Random Forest', RandomForestClassifier(random_state=random_state)),
    ]

    # The splitter does not depend on the model, so it is built once.
    kfold = StratifiedKFold(n_splits = 5)

    rows = []
    for name, estimator in models:
        cross_val_result = cross_val_score(estimator, X_train, y_train, cv = kfold)

        # cross_val_score works on clones, so the estimator itself still has
        # to be fitted before it can predict on the test data.
        estimator.fit(X_train, y_train)
        test_accuracy = accuracy_score(y_test, estimator.predict(X_test))

        rows.append({
            'Model': name,
            'Accuracy for Training': cross_val_result.mean(),
            'Accuracy for Testing': test_accuracy,
            'Std': cross_val_result.std(),
        })

    # Build the frame once from the collected records instead of growing it
    # row by row.
    return pd.DataFrame(rows, columns = ['Model','Accuracy for Training','Accuracy for Testing','Std'])


In [None]:
# Fit every baseline classifier and display the accuracy comparison table.
model_outcome(X_train,y_train, X_test, y_test)