# Sentiment Analysis
This model was developed for the Shopee Code League 2020.
The goal is to read a dataset of text reviews and predict the sentiment of each review as a rating from 1 to 5.

In [1]:
from matplotlib import pyplot as plt
import numpy as np
import sklearn
import pandas as pd
import tensorflow as tf
import re
import nltk
from tensorflow import keras

from nltk.tokenize import sent_tokenize, word_tokenize,wordpunct_tokenize, TweetTokenizer,RegexpTokenizer
from nltk.probability import FreqDist
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics #accuracy calculation
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

# Data upload

In [12]:
# Load the training and test review tables from the local datasets folder.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "datasets/sentiment analysis/train.csv",
        "datasets/sentiment analysis/test.csv",
    )
)

# Data processing

The processing below mainly focuses on cleaning the text data and handling the unbalanced dataset (by undersampling, oversampling, and a mix of both).

In [13]:
def _review_resources():
    """Return the (English stopword set, WordNet lemmatizer) pair, built on first use only."""
    if not hasattr(_review_resources, "cached"):
        _review_resources.cached = (
            frozenset(nltk.corpus.stopwords.words("english")),
            nltk.stem.WordNetLemmatizer(),
        )
    return _review_resources.cached


def cleanReview(review):
    """Normalise one raw review into a space-separated string of lemmatised tokens.

    Steps: replace every non-letter character with a space, lower-case the
    text, tokenise it with NLTK, drop English stopwords, and lemmatise the
    remaining tokens.

    Parameters
    ----------
    review : str
        Raw review text. Any non-string value (e.g. the NaN pandas uses for a
        missing cell, or None) is treated as an empty review.

    Returns
    -------
    str
        The cleaned tokens joined by single spaces; "" when nothing remains.
    """
    # Missing cells arrive as float NaN / None; re.sub would raise TypeError on them.
    if not isinstance(review, str):
        return ""
    letters = re.sub("[^a-zA-Z]", " ", review).lower()
    # The stopword set and lemmatizer are shared across calls instead of being
    # rebuilt for every review (and, for the lemmatizer, for every word).
    stops, lemmatizer = _review_resources()
    # An nltk.stem.SnowballStemmer('english') step was tried here previously;
    # lemmatisation is used instead.
    words = [lemmatizer.lemmatize(w) for w in nltk.word_tokenize(letters) if w not in stops]
    return " ".join(words)
    
# Clean the review text of both splits before any sampling is done.
train['review'] = train['review'].apply(cleanReview)
test['review'] = test['review'].apply(cleanReview)
# Per-rating row counts of the training set; column 0 is used below as "rows for this rating".
ratings_count = train.groupby(['rating']).count()
# De-duplicated reviews form the pool that the oversampling steps draw from.
unique_train = train.drop_duplicates(subset=['review'])

# Target sizes and seed used by the three balancing strategies.
UNDERSAMPLE_SIZE = 12705  # rows per rating in refined_train1 (size of the rating-2 class in the recorded run)
OVERSAMPLE_SIZE = 41865   # rows per rating in refined_train2 (size of the rating-4 class in the recorded run)
BALANCED_SIZE = 30000     # rows per rating in refined_train3
SAMPLE_SEED = 17


def _rows_with_rating(frame, rating):
    """Return a new frame holding the NaN-free rows of `frame` whose rating equals `rating`."""
    # Boolean filtering + dropna() replaces where(...).dropna(inplace=True): same rows,
    # but no SettingWithCopyWarning and no int -> float conversion of the columns.
    return frame[frame['rating'] == rating].dropna()


def _draw(frame, n_rows):
    """Sample `n_rows` rows from `frame` without replacement, using the fixed seed."""
    return frame.sample(n_rows, replace=False, random_state=SAMPLE_SEED)


#By undersampling
# Keep every rating-2 row and cut each other rating down to UNDERSAMPLE_SIZE rows.
undersampled_parts = [_rows_with_rating(train, 2)]
for i in range(1, 6):
    if i != 2:
        undersampled_parts.append(_draw(_rows_with_rating(train, i), UNDERSAMPLE_SIZE))
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
refined_train1 = pd.concat(undersampled_parts, ignore_index=True)
print("refined_train1")
print(refined_train1.groupby(['rating']).count())
print("*********************************************")

#By oversampling
# Start from a NaN-free copy of train (the original is left untouched) and top every
# rating except 4 up to OVERSAMPLE_SIZE rows with extra draws from the unique reviews.
oversampled_parts = [train.dropna()]
for i in range(1, 6):
    if i != 4:
        pool = _rows_with_rating(unique_train, i)
        if i == 1 or i == 2:
            # Ratings 1 and 2 need more extra rows than there are unique reviews,
            # so the pool is repeated four times before sampling without replacement.
            pool = pd.concat([pool] * 4, ignore_index=True)
        add = OVERSAMPLE_SIZE - ratings_count.iloc[i-1, 0]
        oversampled_parts.append(_draw(pool, add))
refined_train2 = pd.concat(oversampled_parts, ignore_index=True)
print("refined_train2")
print(refined_train2.groupby(['rating']).count())
print("*********************************************")

#By oversampling and undersampling (balanced dataset of 30000 of each)
# Ratings 1 and 2 keep all their rows and are topped up; ratings 3-5 are cut down.
balanced_parts = [train[train['rating'] < 3].dropna()]
for i in range(1, 6):
    if i == 1 or i == 2:
        pool = _rows_with_rating(unique_train, i)
        # Doubling the pool guarantees enough rows for a draw without replacement.
        pool = pd.concat([pool, pool], ignore_index=True)
        add = BALANCED_SIZE - ratings_count.iloc[i-1, 0]
        balanced_parts.append(_draw(pool, add))
    else:
        balanced_parts.append(_draw(_rows_with_rating(train, i), BALANCED_SIZE))
refined_train3 = pd.concat(balanced_parts, ignore_index=True)
print("refined_train3")
print(refined_train3.groupby(['rating']).count())
print("*********************************************")

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


refined_train1
        review_id  review
rating                   
1.0         12705   12705
2.0         12705   12705
3.0         12705   12705
4.0         12705   12705
5.0         12705   12705
*********************************************
refined_train2
        review_id  review
rating                   
1.0         41865   41865
2.0         41865   41865
3.0         41865   41865
4.0         41865   41865
5.0         41865   41865
*********************************************


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


refined_train3
        review_id  review
rating                   
1.0         30000   30000
2.0         30000   30000
3.0         30000   30000
4.0         30000   30000
5.0         30000   30000
*********************************************


## Model 1
TensorFlow/Keras neural network (Embedding + LSTM) on tokenized, padded review sequences

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.layers import SpatialDropout1D
from tensorflow.keras.layers import Embedding
from tensorflow.keras.callbacks import EarlyStopping

def Model1(selected_train, stop):
    """Train an Embedding + LSTM classifier and predict ratings for the global `test` frame.

    Parameters
    ----------
    selected_train : pandas.DataFrame
        Training data with a `review` (text) column and a `rating` column.
    stop : bool
        When truthy, an EarlyStopping callback monitoring `val_loss` is passed
        to `fit`. Note that its patience (10) exceeds the number of epochs (3),
        so with the current settings it cannot halt training early.

    Returns
    -------
    pandas.DataFrame
        A copy of `test` without the `review` column and with an integer
        `rating` column holding the predicted class index. The global `test`
        frame itself is not modified.
    """
    vocab_size = 9000
    max_len = 500
    embedding_vector_length = 64

    #Tokenization and embeddings
    train_reviews = selected_train.review.values
    tokenizer = Tokenizer(num_words=vocab_size, lower=True)
    tokenizer.fit_on_texts(train_reviews)
    padded_train = pad_sequences(tokenizer.texts_to_sequences(train_reviews), maxlen=max_len)
    labels = selected_train.rating.values

    # validation_split takes the *last* rows, and the balanced frames are stored
    # rating by rating, so shuffle once (fixed seed) to get a mixed validation set.
    order = np.random.RandomState(17).permutation(len(labels))
    padded_train = padded_train[order]
    labels = labels[order]

    #Model building
    early_stopping = EarlyStopping(
        monitor='val_loss',
        verbose=1,
        patience=10,
        mode='min',  # a loss improves when it decreases ('max' would keep the worst epoch)
        restore_best_weights=True)
    model = Sequential()
    model.add(Embedding(vocab_size, embedding_vector_length, input_length=max_len))
    model.add(SpatialDropout1D(0.25))
    model.add(LSTM(50, dropout=0.5, recurrent_dropout=0.5))
    model.add(Dropout(0.2))
    # Six units so that the class index equals the rating itself (index 0 stays unused
    # for ratings 1-5); softmax yields the probabilities that the loss below expects.
    model.add(Dense(6, activation='softmax'))
    model.compile(loss='sparse_categorical_crossentropy', optimizer='adam',
                  metrics=['accuracy'])
    model.summary()  # prints the summary itself; wrapping it in print() adds a stray "None"

    #Train the model
    model.fit(padded_train, labels,
              callbacks=[early_stopping] if stop else None,
              validation_split=0.1, epochs=3, batch_size=64)

    #Use model to predict on test data
    # The tokenizer is deliberately NOT re-fitted here: fitting on the test text would
    # change the word -> index mapping that the embedding layer was trained on.
    padded_test = pad_sequences(tokenizer.texts_to_sequences(test.review.values), maxlen=max_len)
    predicted = model.predict(padded_test)

    #getting classification results from predicted values
    # Build the result on a new frame so the shared `test` data is left untouched.
    results = test.drop(columns=['review'])
    results['rating'] = np.argmax(predicted, axis=1).astype(int)
    return results

## Model 2
scikit-learn `CountVectorizer` (bag of words, tokenized with NLTK's `RegexpTokenizer`), classified with `MultinomialNB`

In [None]:
#Build and train the model
def Model2(selected_train):
    """Fit a bag-of-words Multinomial Naive Bayes model and predict ratings for the global `test` frame.

    A 90/10 hold-out split of `selected_train` is used only to print an
    accuracy estimate; the classifier used for the final predictions is then
    re-fitted on all of `selected_train`.

    Parameters
    ----------
    selected_train : pandas.DataFrame
        Training data with a `review` (text) column and a `rating` column.

    Returns
    -------
    pandas.DataFrame
        A copy of `test` without the `review` column and with an integer
        `rating` column of predictions. The global `test` frame itself is not
        modified.
    """
    token = RegexpTokenizer(r'[a-zA-Z0-9]+')
    cv = CountVectorizer(lowercase=True, stop_words='english', ngram_range=(1, 1), tokenizer=token.tokenize)
    text_counts = cv.fit_transform(selected_train['review'])

    # Hold-out evaluation, for reporting only.
    X_train, X_test, y_train, y_test = train_test_split(text_counts, selected_train['rating'], test_size=0.1, random_state=17)
    clf = MultinomialNB().fit(X_train, y_train)
    predicted = clf.predict(X_test)
    print("MultinomialNB Accuracy:", metrics.accuracy_score(y_test, predicted))

    # Final model: trained on every available row, applied to the test reviews.
    clf = MultinomialNB().fit(text_counts, selected_train['rating'])
    predicted = clf.predict(cv.transform(test['review']))
    # drop() returns a new frame, so adding the rating column does not touch `test`.
    results = test.drop(columns=['review']).assign(rating=predicted)
    results['rating'] = results['rating'].astype(int)
    return results

## Model 3
Using a TF-IDF vectorizer, classified with `MultinomialNB`

In [None]:
def Model3(selected_train):
    """Fit a TF-IDF Multinomial Naive Bayes model and predict ratings for the global `test` frame.

    A 90/10 hold-out split of `selected_train` is used only to print an
    accuracy estimate; the classifier used for the final predictions is then
    re-fitted on all of `selected_train`.

    Parameters
    ----------
    selected_train : pandas.DataFrame
        Training data with a `review` (text) column and a `rating` column.

    Returns
    -------
    pandas.DataFrame
        A copy of `test` without the `review` column and with an integer
        `rating` column of predictions. The global `test` frame itself is not
        modified.
    """
    tfidf = TfidfVectorizer()
    text_tfidf = tfidf.fit_transform(selected_train['review'])

    # Hold-out evaluation, for reporting only.
    X_train, X_test, y_train, y_test = train_test_split(text_tfidf, selected_train['rating'], test_size=0.1, random_state=17)
    clf = MultinomialNB().fit(X_train, y_train)
    predicted = clf.predict(X_test)
    print("MultinomialNB Accuracy:", metrics.accuracy_score(y_test, predicted))

    # Final model: trained on every available row, applied to the test reviews.
    clf = MultinomialNB().fit(text_tfidf, selected_train['rating'])
    predicted = clf.predict(tfidf.transform(test['review']))
    # drop() returns a new frame, so adding the rating column does not touch `test`.
    results = test.drop(columns=['review']).assign(rating=predicted)
    results['rating'] = results['rating'].astype(int)
    return results

## Model 4
Combining a Count Vectorizer and a TF-IDF transformer in a pipeline with an SGD classifier, tuned with a grid search over hyperparameters

In [14]:
def Model4(selected_train):
    """Grid-search a CountVectorizer -> TF-IDF -> SGDClassifier pipeline and predict ratings for `test`.

    The pipeline is first fitted with its default settings on a 90% split to
    print a baseline accuracy, then tuned with a 5-fold GridSearchCV on the
    same split. The tuned search object produces the final predictions for
    the global `test` frame.

    Parameters
    ----------
    selected_train : pandas.DataFrame
        Training data with a `review` (text) column and a `rating` column.

    Returns
    -------
    pandas.DataFrame
        A copy of `test` without the `review` column and with an integer
        `rating` column of predictions. The global `test` frame itself is not
        modified.
    """
    import os  # local import so the function does not depend on an edit to the import cell

    # Plain-Python equivalent of the `%env JOBLIB_TEMP_FOLDER=/tmp` line magic, which
    # is IPython-only syntax and makes the function unusable outside a notebook.
    os.environ["JOBLIB_TEMP_FOLDER"] = "/tmp"

    text_clf = Pipeline([
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('clf', SGDClassifier(loss='hinge', penalty='l2',
                              alpha=1e-3, random_state=42,
                              max_iter=5, tol=None))
    ])
    parameters = {
        'vect__ngram_range': [(1, 1), (1, 2)],
        'tfidf__use_idf': (True, False),
        'clf__alpha': (1e-2, 1e-3),
        # Each loss is listed once; a repeated entry only makes the search refit
        # identical candidates.
        # NOTE(review): newer scikit-learn releases renamed 'log' to 'log_loss' and
        # 'squared_loss' to 'squared_error' -- adjust to the installed version.
        'clf__loss': ('hinge', 'modified_huber', 'log', 'squared_hinge', 'perceptron', 'squared_loss',
                      'huber', 'epsilon_insensitive', 'squared_epsilon_insensitive'),
        'clf__penalty': ('l1', 'l2', 'elasticnet')
    }
    X_train, X_test, y_train, y_test = train_test_split(selected_train['review'], selected_train['rating'], test_size=0.1, random_state=17)

    # Baseline: the untuned pipeline.
    text_clf.fit(X_train, y_train)
    predicted = text_clf.predict(X_test)
    print("Pipeline accuracy:", metrics.accuracy_score(y_test, predicted))

    # Tuned pipeline: exhaustive search over `parameters`, using all CPU cores.
    gs_clf = GridSearchCV(text_clf, parameters, cv=5, n_jobs=-1)
    gs_clf.fit(X_train, y_train)
    predicted = gs_clf.predict(X_test)
    print("GridSearchCV accuracy:", metrics.accuracy_score(y_test, predicted))

    predicted = gs_clf.predict(test.review)
    # drop() returns a new frame, so adding the rating column does not touch `test`.
    results = test.drop(columns=['review']).assign(rating=predicted)
    results['rating'] = results['rating'].astype(int)
    return results

## Apply various models to various datasets
The result shown is for the best-performing combination (Model 4 trained on `refined_train3`).

In [18]:
# Experiment grid: every model can be paired with every balanced training set.
# Only the Model4 / refined_train3 run is enabled; the rest are kept for reference.
#
# Model1 (LSTM):
#   results1 = Model1(refined_train1, False)
#   results2 = Model1(refined_train2, True)
#   results3 = Model1(refined_train3, False)
# Model2 (count vectors + MultinomialNB):
#   results4 = Model2(refined_train1)
#   results5 = Model2(refined_train2)
#   results6 = Model2(refined_train3)
# Model3 (TF-IDF + MultinomialNB):
#   results7 = Model3(refined_train1)
#   results8 = Model3(refined_train2)
#   results9 = Model3(refined_train3)
results10 = Model4(selected_train=refined_train3)

env: JOBLIB_TEMP_FOLDER=/tmp
Pipeline accuracy: 0.4623333333333333
GridSearchCV accuracy: 0.6063333333333333


In [19]:
# Save the predictions as a CSV file (header row included, index column omitted)
results10.to_csv(
    "datasets/sentiment analysis/results.csv",
    index=False,
    header=True,
)