# Setup

install

In [2]:
!pip install -q sentence-transformers;

[K     |████████████████████████████████| 79 kB 5.1 MB/s 
[K     |████████████████████████████████| 3.8 MB 16.0 MB/s 
[K     |████████████████████████████████| 1.2 MB 60.4 MB/s 
[K     |████████████████████████████████| 67 kB 4.8 MB/s 
[K     |████████████████████████████████| 6.5 MB 51.5 MB/s 
[K     |████████████████████████████████| 895 kB 10.2 MB/s 
[K     |████████████████████████████████| 596 kB 60.6 MB/s 
[?25h  Building wheel for sentence-transformers (setup.py) ... [?25l[?25hdone


import

In [3]:
from sklearn.metrics import classification_report
from sentence_transformers import SentenceTransformer
import tensorflow_hub as hub
import pandas as pd
import numpy as np
from sklearn.metrics import f1_score
from sklearn import svm, neighbors, ensemble, neural_network, linear_model

from os.path import exists
import os 
import pickle
import warnings
warnings.filterwarnings("ignore")

utility functions

In [4]:
def read_pickle(path):
  """Load and return the object pickled in the file at `path`.

  NOTE: unpickling can execute arbitrary code, so only use this on files
  written by this notebook or another trusted source.
  """
  with open(path, mode="rb") as pickle_file:
    payload = pickle.load(pickle_file)
  return payload

def write_pickle(path, object):
  """Pickle `object` into the file at `path`, overwriting any existing file.

  Returns whatever `pickle.dump` returns (None).
  """
  with open(path, mode="wb") as pickle_file:
    outcome = pickle.dump(object, pickle_file)
  return outcome

In [5]:
def get_embeddings(model_name, dataset_name, column_name, df):
  """Return sentence embeddings for the texts in `df[column_name]`, cached on disk.

  The cache file is `<column_name>_<dataset_name>_<model_name>.pkl` inside the
  module-level `embeddings_path` directory. When that file exists its content
  is returned as-is; otherwise the texts are encoded with
  `SentenceTransformer(model_name)`, the result is pickled to that file and
  then returned.

  NOTE: the cache is keyed by these names only, so an existing file is reused
  even if the contents of `df` have changed since it was written.
  """
  cache_file = os.path.join(
      embeddings_path, f'{column_name}_{dataset_name}_{model_name}.pkl')

  if not exists(cache_file):
    print('calculating...')
    encoder = SentenceTransformer(model_name)
    texts = list(df[column_name].values)
    embeddings = encoder.encode(texts)
    write_pickle(cache_file, embeddings)
    return embeddings

  print('reading from pickle...')
  return read_pickle(cache_file)

settings

In [6]:
# Name of the sentence-transformers model; it is passed to get_embeddings,
# where it also becomes part of the embedding cache file name.
model_name = 'nli-mpnet-base-v2'
# Base names (without the .csv extension) of the dataset files.
train_dataset = 'train_clean_with_emoticons'
# The validation file uses the same name with every 'train' replaced by 'validation'.
validation_dataset = train_dataset.replace('train', 'validation')

# use for local
#dataset_path = '../datasets'

# use for google colab
from google.colab import drive
drive.mount('/content/drive')
# NOTE(review): these paths are relative, so they resolve against the current
# working directory — presumably /content on Colab; confirm if run elsewhere.
dataset_path = 'drive/MyDrive/siap/datasets' 
embeddings_path = 'drive/MyDrive/siap/embeddings'
# Fail early if either directory is missing, before any data is read or written.
assert os.path.isdir(dataset_path)
assert os.path.isdir(embeddings_path)

Mounted at /content/drive


read dataset

In [7]:
# Load the train and validation splits from their CSV files under dataset_path.
train_csv = os.path.join(dataset_path, train_dataset + '.csv')
validation_csv = os.path.join(dataset_path, validation_dataset + '.csv')
df_train = pd.read_csv(train_csv)
df_validation = pd.read_csv(validation_csv)

In [9]:
# Print one training example: its 'Review Text' followed by its 'Augmented review text'.
row = df_train.iloc[18000]
for sample_column in ('Review Text', 'Augmented review text'):
    print(row[sample_column])

I purchased this dress in the berry color. it is beautiful and feminine. the length fit just like in the picture. i love all the detail and ease of this dress. i'm 5'2" 34d and purchased a size 2. it fits true to size.
The dress was purchased in a berry color. It's feminine and beautiful. The length matches the picture. The dress is easy to wear and I love it. I bought a size 2 because I am 5'2" 34d. 


# Embed

In [14]:
# Features are the embeddings of the 'Review Text' column (loaded from the
# on-disk cache when get_embeddings finds one); labels are the 'Rating' column.
review_column = 'Review Text'
X_train, y_train = (
    get_embeddings(model_name, train_dataset, review_column, df_train),
    df_train['Rating'],
)
X_validation, y_validation = (
    get_embeddings(model_name, validation_dataset, review_column, df_validation),
    df_validation['Rating'],
)

reading from pickle...
reading from pickle...


In [15]:
# Extend the training set with the embeddings of the augmented reviews. Each
# augmented text belongs to the same row as its original review, so the labels
# are simply repeated once.
#
# The cell overwrites X_train / y_train, so it must not run twice on its own
# output: the length check only lets it extend X_train while it still has one
# row per df_train row. Re-running the cell is then a no-op instead of stacking
# another copy of the data on every execution.
if 'Augmented review text' in df_train and len(X_train) == len(df_train):
    X_train_augmented = get_embeddings(model_name, train_dataset, 'Augmented review text', df_train)
    print(X_train.shape)
    print(X_train_augmented.shape)
    X_train = np.concatenate((X_train, X_train_augmented), axis=0)
    y_train = np.concatenate((y_train, y_train), axis=0)
    print(X_train.shape)
    print(y_train.shape)

reading from pickle...
(18113, 768)
(18113, 768)
(36226, 768)
(36226,)


# Train

In [32]:
# Classifier under evaluation: a linear-kernel SVM.
clf = svm.SVC(C=1, kernel='linear', random_state=1)
# Alternative classifiers, kept for reference; swap one in to compare.
#clf = svm.SVC(kernel='rbf', C=0.1, random_state=1)
#clf = neighbors.KNeighborsClassifier()
#clf = ensemble.RandomForestClassifier(n_estimators=300, max_depth=10)
#clf = neural_network.MLPClassifier(random_state=1, early_stopping=True, alpha=0.01, hidden_layer_sizes=[600, 600, 600])
#clf = linear_model.LogisticRegressionCV(multi_class='multinomial')

clf.fit(X_train, y_train)

# Predictions on both splits; the classification-report cells reuse these names.
y_train_pred = clf.predict(X_train)
y_validation_pred = clf.predict(X_validation)

# Micro-averaged F1 for each split.
train_f1 = f1_score(y_train, y_train_pred, average='micro')
print('Train >>>', train_f1)
validation_f1 = f1_score(y_validation, y_validation_pred, average='micro')
print('Validation >>>', validation_f1)
print('===========================================')

Train >>> 0.7009606360072875
Validation >>> 0.6724137931034483


# Metrics

F1 score averaging types (the `average` argument of `f1_score`):

1. micro -
Calculate metrics globally by counting the total true positives, false negatives and false positives.

2. macro -
Calculate metrics for each label, and find their unweighted mean. This does not take label imbalance into account.

3. weighted -
Calculate metrics for each label, and find their average weighted by support (the number of true instances for each label). This alters ‘macro’ to account for label imbalance; it can result in an F-score that is not between precision and recall.

In [None]:
# Per-class precision / recall / F1 on the training split.
# NOTE(review): target_names assumes exactly five rating classes (1-5) appear — confirm.
print('Training')
train_report = classification_report(
    y_train, y_train_pred, target_names=['1', '2', '3', '4', '5'])
print(train_report)

In [None]:
# Per-class precision / recall / F1 on the validation split.
# NOTE(review): target_names assumes exactly five rating classes (1-5) appear — confirm.
print('Validation')
validation_report = classification_report(
    y_validation, y_validation_pred, target_names=['1', '2', '3', '4', '5'])
print(validation_report)