# Mounts and Imports

In [1]:
# Mount workspace
from google.colab import drive
drive.mount('/content/drive')
!unzip -o -u "/content/drive/My Drive/ASU/Spring24/DataMining/DM-Project/product-review-sentiment-analysis.zip" -d "."

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Archive:  /content/drive/My Drive/ASU/Spring24/DataMining/DM-Project/product-review-sentiment-analysis.zip


In [5]:
# Imports
# external imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# internal imports
from preprocessors import loaders
from utils import constants
from visualization import visualization
from evaluators import evaluators
from models.RNN import lstm

# Amazon Dataset

## Load Dataset

In [3]:
amazon_features = loaders.load_all_features(constants.DatasetAmazon, binary_labels=True, max_rows=1000, root_path="/content/drive/My Drive/ASU/Spring24/DataMining/product-review-sentiment-analysis")
df_sizes = {"Train dataset":[], "Test dataset": []}
index = []
for feature in amazon_features:
  index.append(feature)
  df_sizes["Train dataset"].append(amazon_features[feature][0].shape)
  df_sizes["Test dataset"].append(amazon_features[feature][1].shape)
print(pd.DataFrame(df_sizes, index=amazon_features.keys()))
sample_train, sample_test = list(amazon_features.values())[0]
visualization.plot_label_distribution(sample_train, sample_test)

TypeError: load_all_features() got an unexpected keyword argument 'root_path'

In [6]:
train_df, test_df = loaders.load_feature(constants.DatasetAmazon, constants.FeatureTFIDF, binary_labels=True, max_rows=10000, root_path="/content/drive/My Drive/ASU/Spring24/DataMining/product-review-sentiment-analysis")

ParserError: Error tokenizing data. C error: Calling read(nbytes) on source failed. Try engine='python'.

In [12]:
train_df = pd.read_csv("/content/drive/My Drive/ASU/Spring24/DataMining/product-review-sentiment-analysis/Data/features/Amazon/CountVectorizer/train.csv")
#test_df = pd.read_csv("/content/drive/My Drive/ASU/Spring24/DataMining/product-review-sentiment-analysis/Data/features/TFIDF/amazon/test.csv

In [18]:
train_df[:10000].to_csv("/content/drive/My Drive/ASU/Spring24/DataMining/product-review-sentiment-analysis/Data/features/Amazon/CountVectorizer/train_new.csv", index=False)

In [None]:
lstm.Classifier(train_df[constants.ColumnLabel], learning_rate=0.003)

# Fine-tuning the model

In [None]:
import tensorflow as tf
from sklearn.preprocessing import LabelEncoder
import numpy as np

class Classifier:
    def __init__(self, labels, output_activation='softmax', optimizer='adam', learning_rate=0.001, loss='sparse_categorical_crossentropy', metrics=['accuracy']):
        self.label_encoder = LabelEncoder()
        self.label_encoder.fit(labels)
        self.model = tf.keras.Sequential([
            tf.keras.layers.LSTM(128, return_sequences=True),
            tf.keras.layers.Dropout(0.5),
            tf.keras.layers.LSTM(128),
            tf.keras.layers.Dense(len(self.label_encoder.classes_), activation=output_activation)])
        self.model.compile(optimizer=self.__get_optimizer(optimizer, learning_rate), loss=loss, metrics=metrics)

    def __get_optimizer(self, optimizer, learning_rate):
        return tf.keras.optimizers.Adam(learning_rate=learning_rate)

    def fit(self, train_df, time_steps=1, epochs=10):
        X = train_df.drop([constants.ColumnLabel], axis=1).values
        Y = self.label_encoder.transform(train_df[constants.ColumnLabel])

        # Reshape input data to be 3-dimensional (batch_size, time_steps, input_dim)
        X = X.reshape(X.shape[0], time_steps, -1)

        self.model.fit(X, Y, epochs=epochs)

    def predict(self, test_df, time_steps=1):
        if constants.ColumnLabel in test_df.columns:
            test_df = test_df.drop([constants.ColumnLabel], axis=1)

        X = test_df.values.astype(np.float32)  # Convert data to float32
        # Reshape input data to be 3-dimensional (batch_size, time_steps, input_dim)
        X = X.reshape(X.shape[0], time_steps, -1)

        y_pred = self.model.predict(X)
        y_pred = np.argmax(y_pred, axis=1)
        return self.label_encoder.inverse_transform(y_pred)







### Fine-tuning w.r.t Features

In [None]:
data_df = amazon_features[constants.FeatureTFIDF][0]
model_initializer = lambda df, param_name: (amazon_features[param_name][0], Classifier(df[constants.ColumnLabel]))
accuracies_df, ax = evaluators.kfold_parameter_tune(data_df, parameters = amazon_features.keys(), parameter_name = "Feature Methods", model_init = model_initializer, title="Accuracies w.r.t feature methods")

Fine-tuning w.r.t. parameters

In [None]:
data_df = amazon_features[constants.FeatureTFIDF][0]
model_initializer = lambda df, param: (df, Classifier(data_df[constants.ColumnLabel], learning_rate=param))
accuracies_df, ax = evaluators.kfold_parameter_tune(data_df, parameters = [0.0001, 0.0003, 0.001, 0.003, 0.01, 0.03, 0.1], parameter_name = "Learning rates", model_init = model_initializer, title="accuracies w.r.t learning rate")
plt.show()

## Evaluation

In [None]:
train_df, test_df = amazon_features[constants.FeatureTFIDF]
model = Classifier(train_df[constants.ColumnLabel], learning_rate=0.001)
amazonEvalResults = evaluators.evaluate(model, train_df, test_df, evaluator_name="Amazon")
print("Training and testing accuracies for amazon dataset are %f and %f respectively" %amazonEvalResults.get_accuracies())

In [None]:
amazonEvalResults.plot_confusion_matrices()

In [None]:
# Test dataset classification report
amazonEvalResults.display_classification_report(False)

# Yelp Dataset


## Load Dataset

In [None]:
yelp_features = loaders.load_all_features(constants.DatasetYelp)
df_sizes = {"Train dataset":[], "Test dataset": []}
index = []
for feature in yelp_features:
  index.append(feature)
  df_sizes["Train dataset"].append(yelp_features[feature][0].shape)
  df_sizes["Test dataset"].append(yelp_features[feature][1].shape)
print(pd.DataFrame(df_sizes, index=yelp_features.keys()))
sample_train, sample_test = list(yelp_features.values())[0]
visualization.plot_label_distribution(sample_train, sample_test)

## Fine-tuning the model

### Fine-tuning w.r.t. Features


In [None]:
data_df = yelp_features[constants.FeatureCountVectorizer][0]
model_initializer = lambda df, param_name: (yelp_features[param_name][0], NN.Classifier(df[constants.ColumnLabel]))
accuracies_df, ax = evaluators.kfold_parameter_tune(data_df, parameters = yelp_features.keys(), parameter_name = "Feature Methods", model_init = model_initializer, title="Accuracies w.r.t feature methods")

### Fine-tuning w.r.t. parameters

In [None]:
data_df = yelp_features[constants.FeatureCountVectorizer][0]
model_initializer = lambda df, param: (df, NN.Classifier(data_df[constants.ColumnLabel], learning_rate=param))
accuracies_df, ax = evaluators.kfold_parameter_tune(data_df, parameters = [0.0001, 0.0003, 0.001, 0.003, 0.01, 0.03, 0.1], parameter_name = "Learning rates", model_init = model_initializer, title="accuracies w.r.t learning rate")
plt.show()

## Evaluation

In [None]:
selected_feature = constants.FeatureCountVectorizer
train_df, test_df = yelp_features[selected_feature]
model = NN.Classifier(train_df[constants.ColumnLabel], learning_rate=0.03)
yelpEvalResults = evaluators.evaluate(model, train_df, test_df, evaluator_name="Yelp")
print("Training and testing accuracies for yelp dataset are %f and %f respectively" %yelpEvalResults.get_accuracies())

In [None]:
yelpEvalResults.plot_confusion_matrices()

In [None]:
# Test dataset classification report
yelpEvalResults.display_classification_report(False)

# Comparing performance of the final model

In [None]:
visualization.plot_accuracies_from_results([amazonEvalResults, yelpEvalRsults], "Ecommerce platform")

In [None]:
visualization.plot_pred_time_from_results([amazonEvalResults, yelpEvalResults], "Ecommerce platform")