# Setup

install

In [None]:
!pip install -q sentence-transformers



import

In [None]:
from sklearn.metrics import classification_report
from sentence_transformers import SentenceTransformer
import pandas as pd
import numpy as np
from sklearn.metrics import f1_score, confusion_matrix
from sklearn.metrics._plot.confusion_matrix import ConfusionMatrixDisplay
from sklearn import svm, neighbors, ensemble, neural_network, linear_model
from sklearn.linear_model import LinearRegression

import matplotlib.pyplot as plt

from os.path import exists
import os 
import pickle
import warnings
warnings.filterwarnings("ignore")

settings

In [None]:
# Name handed to SentenceTransformer(...) when embeddings have to be computed.
model_name = 'nli-mpnet-base-v2'
# Dataset base names (the '.csv' extension is appended when the files are read).
# The validation name is the training name with 'train' replaced by 'validation'.
train_dataset = 'train_clean_with_emoticons'
validation_dataset = train_dataset.replace('train', 'validation')

# Local run: enable these three paths and disable the Colab section below.
#dataset_path = '../datasets'
#embeddings_path = '../embeddings'
#model_path = '../model'

# Google Colab run: datasets, cached embeddings and models are read from Google Drive.
from google.colab import drive
drive.mount('/content/drive')
dataset_path = 'drive/MyDrive/Colab Notebooks/siap/datasets'
embeddings_path = 'drive/MyDrive/Colab Notebooks/siap/embeddings'
model_path = 'drive/MyDrive/Colab Notebooks/siap/model_augmented'

# Stop right here if any directory is missing. The paths are relative, so they are
# resolved against the current working directory.
assert all(os.path.isdir(required_dir) for required_dir in (dataset_path, embeddings_path, model_path))

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


utility functions

In [None]:
def read_pickle(path):
  """Unpickle and return the object stored in the file at `path`."""
  with open(path, mode="rb") as pickle_file:
    loaded = pickle.load(pickle_file)
  return loaded

def write_pickle(path, object):
  """Pickle `object` into the file at `path`, replacing any existing content.

  Returns None.
  """
  with open(path, mode="wb") as pickle_file:
    pickle.dump(object, pickle_file)

def get_indices(condition, array):
    """Return the positions of the elements of `array` for which `condition(elem)` is truthy."""
    matching_positions = []
    for position, elem in enumerate(array):
        if condition(elem):
            matching_positions.append(position)
    return matching_positions
    
def filter_by_indices(indices, to_filter):
    """Return the elements of `to_filter` whose position appears in `indices`.

    The elements keep the order they have in `to_filter`.

    `indices` is turned into a set once, so every membership test is O(1) instead
    of a linear scan per element. This also makes one-shot iterators (generators)
    work: testing membership directly on them consumes them and drops matches.
    """
    try:
        wanted = set(indices)
    except TypeError:
        # Unhashable entries (e.g. nested lists): keep the plain membership test.
        wanted = indices
    return [item for position, item in enumerate(to_filter) if position in wanted]

def show_confusion_matrix(conf_matrix, label_names, title=''):
  """Plot a confusion matrix on a 10x10-inch figure and show it.

  `conf_matrix` and `label_names` are handed to ConfusionMatrixDisplay as the
  matrix and its display labels. Cell values are drawn using the Blues colour
  map. When `title` is truthy it is set as the plot title.
  """
  figure, axes = plt.subplots(figsize=(10, 10))
  matrix_display = ConfusionMatrixDisplay(
    confusion_matrix=conf_matrix,
    display_labels=label_names,
  )
  matrix_display.plot(
    ax=axes,
    cmap=plt.cm.Blues,
    include_values=True,
    values_format=None,
  )
  if title:
    plt.title(title)
  plt.show()

def get_embeddings_df(model_name, dataset_name, column_name, df):
  """Return embeddings of the texts in `df[column_name]`, using an on-disk cache.

  The cache file is `{column_name}_{dataset_name}_{model_name}.pkl` inside the
  module-level `embeddings_path`. If that file exists it is unpickled and
  returned. Otherwise the column values are encoded with
  SentenceTransformer(model_name), the result is pickled to the cache file and
  then returned.
  """
  cache_path = os.path.join(embeddings_path, f'{column_name}_{dataset_name}_{model_name}.pkl')

  if exists(cache_path):
    print('reading from pickle...')
    return read_pickle(cache_path)

  print('calculating...')
  encoder = SentenceTransformer(model_name)
  texts = list(df[column_name].values)
  vectors = encoder.encode(texts)
  write_pickle(cache_path, vectors)
  return vectors

def get_embeddings(reviews, embeddings_file_path):
  """Return embeddings for `reviews`, cached in the pickle at `embeddings_file_path`.

  If the file exists its content is returned without looking at `reviews`.
  Otherwise `reviews` is encoded with SentenceTransformer using the module-level
  `model_name` (this function has no model parameter), and the result is pickled
  to `embeddings_file_path` and returned.
  """
  if not exists(embeddings_file_path):
    print('calculating...')
    encoder = SentenceTransformer(model_name)
    vectors = encoder.encode(reviews)
    write_pickle(embeddings_file_path, vectors)
    return vectors

  print('reading from pickle...')
  return read_pickle(embeddings_file_path)

read dataset

In [None]:
# Load the training and validation CSV files (in that order) from dataset_path.
df_train, df_validation = (
  pd.read_csv(os.path.join(dataset_path, f'{dataset_name}.csv'))
  for dataset_name in (train_dataset, validation_dataset)
)

# Embed

In [44]:
# Targets: the 'Rating' column of each split.
y_train = df_train['Rating']
y_validation = df_validation['Rating']

# Features: embeddings of the 'Review Text' column (read from the pickle cache when present).
X_train = get_embeddings_df(
  model_name=model_name,
  dataset_name=train_dataset,
  column_name='Review Text',
  df=df_train,
)
X_validation = get_embeddings_df(
  model_name=model_name,
  dataset_name=validation_dataset,
  column_name='Review Text',
  df=df_validation,
)

reading from pickle...
reading from pickle...


In [45]:
# `paraphrases` is used as a mapping: Review ID -> batch (iterable) of paraphrased review texts.
paraphrases = read_pickle(os.path.join(dataset_path, 'paraphrases_dict_final.pkl'))
# Flatten the batches in dictionary order; the labels below are built in the same order.
paraphrases_list = [review for batch in paraphrases.values() for review in batch]
# NOTE(review): this cache file name is relative to the working directory, whereas
# get_embeddings_df caches under embeddings_path. Confirm that this is intended.
paraphrases_embeddings = get_embeddings(paraphrases_list, 'paraphrases_no_5_ratings_times_5.pkl')
# get_embeddings returns an existing cache file without looking at its input, so a
# cache built from a different paraphrase set would otherwise go unnoticed.
assert len(paraphrases_embeddings) == len(paraphrases_list), \
  'cached paraphrase embeddings do not match the current paraphrases'

# Expected number of paraphrases per review. The labels follow each batch's real
# length so they stay aligned with paraphrases_list even if a batch has a different size.
num_paraphrases = 5
y_paraphrases = []
for review_id, batch in paraphrases.items():
  # Every paraphrase gets the rating of the training row that has the same Review ID.
  rating = df_train[df_train['Review ID'] == review_id]['Rating'].values[0]
  y_paraphrases.extend([rating] * len(batch))
assert len(y_paraphrases) == len(paraphrases_list), 'paraphrase labels and texts are misaligned'

reading from pickle...


In [46]:
paraphrases_embeddings_np = np.array(paraphrases_embeddings)
y_paraphrases_np = np.array(y_paraphrases)
assert len(paraphrases_embeddings_np) == len(y_paraphrases_np), \
  'paraphrase embeddings and labels differ in length'

# X_train / y_train are rebound to their augmented versions here. Cut them back to
# the rows that come from df_train first, so that re-running this cell does not
# append a second copy of the paraphrases.
n_original = len(df_train)
X_train = np.concatenate((X_train[:n_original], paraphrases_embeddings_np), axis=0)
y_train = np.concatenate((np.asarray(y_train)[:n_original], y_paraphrases_np), axis=0)

# Train

In [47]:
# Name used for the cached model / prediction files under model_path.
# NOTE: the cache is keyed by this name only. After changing the estimator or the
# training data, pick a new name (or delete the old files), otherwise the
# previously saved model is loaded instead of being refitted.
clf_model_name = 'svm_linear_c1_augmented'

# Estimator to train; the commented lines are the alternatives that were tried.
clf = svm.SVC(kernel='linear', random_state=1, C=1) # svm_linear_c1
#clf = svm.SVC(kernel='rbf', C=0.1, random_state=1)
#clf = neighbors.KNeighborsClassifier()
#clf = ensemble.RandomForestClassifier(n_estimators=300, max_depth=10)
#clf = neural_network.MLPClassifier(random_state=1, early_stopping=True, alpha=0.01, hidden_layer_sizes=[600, 600, 600])
#clf = linear_model.LogisticRegressionCV(multi_class='multinomial')
#clf = LinearRegression(n_jobs = -1) # linear_regression

model_file_path = os.path.join(model_path, clf_model_name + '.pkl')
y_train_pred_path = os.path.join(model_path, clf_model_name + '_y_train_pred' + '.pkl')
y_validation_pred_path = os.path.join(model_path, clf_model_name + '_y_validation_pred' + '.pkl')

# Reuse the saved model when there is one; otherwise fit and save it.
if os.path.exists(model_file_path):
  clf = read_pickle(model_file_path)
else:
  clf.fit(X_train, y_train)
  write_pickle(model_file_path, clf)

def _load_or_predict(pred_path, features):
  """Return cached predictions for `features`, recomputing them when the cache
  file is missing or holds a different number of predictions than `features` has rows."""
  if os.path.exists(pred_path):
    cached_predictions = read_pickle(pred_path)
    if len(cached_predictions) == len(features):
      return cached_predictions
    print(f'cached predictions in {pred_path} do not match the data, recomputing...')
  predictions = clf.predict(features)
  write_pickle(pred_path, predictions)
  return predictions

# The prediction caches are handled independently of the model cache, so a missing
# prediction file no longer crashes the cell when the model file exists.
y_train_pred = _load_or_predict(y_train_pred_path, X_train)
y_validation_pred = _load_or_predict(y_validation_pred_path, X_validation)

if clf_model_name == 'linear_regression':
  # Regression outputs are continuous; round them to integers before scoring.
  y_train_pred = [round(x) for x in y_train_pred]
  y_validation_pred = [round(x) for x in y_validation_pred]

print('Train >>>', f1_score(y_train, y_train_pred, average='micro'))
print('Validation >>>', f1_score(y_validation, y_validation_pred, average='micro'))
print('===========================================')

Train >>> 0.6674819971605973
Validation >>> 0.610079575596817


In [48]:
# Only drawn for the regression model: distribution of the predicted validation ratings.
if clf_model_name == 'linear_regression':
  fig, ax = plt.subplots()
  ax.hist(y_validation_pred)
  # Labels let the figure be told apart from the true-rating histogram.
  ax.set(title='Predicted ratings (validation)', xlabel='Rating', ylabel='Number of reviews')
  plt.show()

In [49]:
# Only drawn for the regression model: distribution of the true validation ratings.
if clf_model_name == 'linear_regression':
  fig, ax = plt.subplots()
  ax.hist(y_validation)
  # Labels let the figure be told apart from the predicted-rating histogram.
  ax.set(title='True ratings (validation)', xlabel='Rating', ylabel='Number of reviews')
  plt.show()

Reducing the dimension of the review vectors to 10, 15 or 20 was also tried, but it gives a validation F1 score of only 0.6.

# Analyze predictions

## Classification report

F1 score types: 

1. micro -
Calculate metrics globally by counting the total true positives, false negatives and false positives.

2. macro -
Calculate metrics for each label, and find their unweighted mean. This does not take label imbalance into account.

3. weighted -
Calculate metrics for each label, and find their average weighted by support (the number of true instances for each label). This alters ‘macro’ to account for label imbalance; it can result in an F-score that is not between precision and recall.

In [50]:
# Per-class precision / recall / F1 on the training set.
print(
  'Training',
  classification_report(
    y_train,
    y_train_pred,
    target_names=[str(rating) for rating in range(1, 6)],
  ),
  sep='\n',
)

Training
              precision    recall  f1-score   support

           1       0.64      0.48      0.55      3942
           2       0.57      0.39      0.46      7422
           3       0.54      0.58      0.56     13556
           4       0.71      0.80      0.75     23505
           5       0.80      0.76      0.78     10038

    accuracy                           0.67     58463
   macro avg       0.65      0.60      0.62     58463
weighted avg       0.66      0.67      0.66     58463



In [51]:
# Per-class precision / recall / F1 on the validation set.
print(
  'Validation',
  classification_report(
    y_validation,
    y_validation_pred,
    target_names=[str(rating) for rating in range(1, 6)],
  ),
  sep='\n',
)

Validation
              precision    recall  f1-score   support

           1       0.42      0.30      0.35        82
           2       0.33      0.22      0.26       157
           3       0.43      0.50      0.46       282
           4       0.40      0.55      0.46       495
           5       0.83      0.73      0.78      1246

    accuracy                           0.61      2262
   macro avg       0.48      0.46      0.46      2262
weighted avg       0.64      0.61      0.62      2262

