In [11]:
# DON'T CHANGE this part: import libraries
import numpy as np
import scipy
import json
from nltk.stem import PorterStemmer 
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize 
import re
import itertools

In [12]:
# DON'T CHANGE this part: read data path
# Reads a single line from stdin and splits it on whitespace into exactly
# three fields (the unpacking raises ValueError for any other count):
# the training-set path, the validation-set path, and a number kept as a
# string here that Rate converts with int() and uses as an index into the
# validation reviews.
train_set_path, valid_set_path, random_number = input().split()

In [13]:
# TODO:
# 1. preprocess: converting text to lowercase, converting numbers, tokenization, removing stopwords, stemming
# 2. embedding: histogram matrix
# 3. classifier using linear regression
# 4. accuracy (for metric)

In [14]:
# Class Rate init with input
class Rate:
  """Predict 1-5 review ratings from bag-of-words counts with least squares.

  Constructing an instance runs the whole pipeline: both JSON files are read,
  every 'reviewText' is preprocessed and turned into a word-count row, every
  'overall' rating becomes a one-hot row, a linear map is fitted on the
  training data and accuracy is measured on the validation data.

  Attributes set by __init__:
    train_vocab: sorted list of training stems, always containing 'unk'.
    valid_at_random_number: preprocessed tokens of the validation review at
      index random_number.
    train_A, valid_A: word-count matrices with a leading column of ones.
    train_B, valid_B: one-hot rating matrices (5 columns).
    x_hat: fitted weight matrix, pinv(train_A) @ train_B.
    m2_accuracy: fraction of validation reviews whose rating is predicted
      correctly.
  """

  # Number of rating classes (ratings 1..5 map to columns 0..4).
  _NUM_RATINGS = 5

  # NLTK helpers, created on first use by preprocess(). Building them once
  # avoids re-reading the stopword corpus for every single token.
  _stopword_set = None
  _stemmer = None

  def __init__(self, train_set_path, valid_set_path, random_number):
    """Load both datasets, fit the classifier and score it.

    Args:
      train_set_path: path of the training JSON file.
      valid_set_path: path of the validation JSON file.
      random_number: index (anything int() accepts) of the validation review
        whose preprocessed tokens are kept in valid_at_random_number.
    """
    # Store input
    self.train_set_path = train_set_path
    self.valid_set_path = valid_set_path
    self.random_number = int(random_number)
    self.valid_at_random_number = []
    self.train_vocab = []

    # Training must be processed first: it builds train_vocab, which the
    # validation pass needs in order to map unseen words to 'unk'.
    self.train_A, self.train_B = self.processing_input(is_training=True)
    self.valid_A, self.valid_B = self.processing_input(is_training=False)

    # Calculate x_hat (classifier using linear regression)
    self.x_hat = self.classifier()

    # Calculate M2 accuracy
    self.m2_accuracy = self.accuracy()

  # 1. Preprocess
  def preprocess(self, text, is_training):
    """Turn raw review text into a list of stemmed tokens.

    Steps: lowercase, replace every digit run with 'num', tokenize, drop
    stopwords, Porter-stem. When not training, every stem that is absent
    from train_vocab is replaced by 'unk'.

    Args:
      text: the raw review string.
      is_training: True for training data, False for validation data.

    Returns:
      List of token strings.
    """
    if self._stopword_set is None:
      # stopwords.words() with no argument returns the stopwords of every
      # language shipped with the corpus; that choice is kept on purpose so
      # the produced tokens do not change. A frozenset makes each membership
      # test O(1) instead of a scan over the whole list.
      self._stopword_set = frozenset(stopwords.words())
    if self._stemmer is None:
      self._stemmer = PorterStemmer()

    # Lowercase first, then collapse every number into the single word 'num'.
    text = re.sub(r'[0-9]+', 'num', text.lower())
    kept = [w for w in word_tokenize(text) if w not in self._stopword_set]
    stems = [self._stemmer.stem(w) for w in kept]

    if not is_training:
      # One set per document instead of one list scan per token.
      known = frozenset(self.train_vocab)
      stems = [w if w in known else 'unk' for w in stems]
    return stems

  # 2. Embedding
  def embedding(self, docs, is_training):
    """Build the word-count (histogram) matrix for tokenized documents.

    When training, the vocabulary is (re)built from docs, 'unk' is added to
    it, and it is stored sorted in train_vocab so the column order is the
    same on every run. When validating, the stored train_vocab is used and
    tokens outside it are ignored.

    Args:
      docs: list of documents, each a list of token strings.
      is_training: True to rebuild train_vocab from docs.

    Returns:
      Integer array of shape (len(docs), len(train_vocab)); entry [i, j] is
      how many times train_vocab[j] occurs in docs[i].
    """
    if is_training:
      words = {word for doc in docs for word in doc}
      # 'unk' needs a column so unseen validation words have somewhere to go.
      words.add('unk')
      self.train_vocab = sorted(words)
    vocab = self.train_vocab

    column_of = {word: col for col, word in enumerate(vocab)}
    histogram = np.zeros((len(docs), len(vocab)), dtype=int)
    for row, doc in enumerate(docs):
      for word in doc:
        col = column_of.get(word)
        if col is not None:
          histogram[row, col] += 1
    return histogram

  # 3. Classifier
  def classifier(self):
    """Return x_hat, the least-squares solution of train_A @ x = train_B."""
    return np.linalg.pinv(self.train_A) @ self.train_B

  # 4. Accuracy
  def accuracy(self):
    """Return the fraction of validation rows whose rating is predicted right.

    The rating of a row is the index of its largest entry plus 1 (column 0
    is rating 1). argmax is taken on the raw scores: softmax is strictly
    increasing, so it cannot change which column wins, and skipping it also
    removes the need for scipy.special to be loaded.
    """
    actual = np.argmax(self.valid_B, axis=1) + 1
    predicted = np.argmax(self.valid_A @ self.x_hat, axis=1) + 1
    return np.mean(actual == predicted)

  # Standardized lables to vector
  def standardized(self, lables):
    """Convert ratings to one-hot rows.

    Args:
      lables: iterable of ratings; each must become an integer in 1..5 when
        passed to int().

    Returns:
      Integer array of shape (number of ratings, 5) with a single 1 per row
      at column rating - 1.

    Raises:
      ValueError: if a rating is outside 1..5. Without this check a rating
        of 0 would index column -1 and be silently recorded as rating 5.
    """
    ratings = [int(label) for label in lables]
    for rating in ratings:
      if not 1 <= rating <= self._NUM_RATINGS:
        raise ValueError(
            'rating must be between 1 and {}, got {!r}'.format(
                self._NUM_RATINGS, rating))

    one_hot = np.zeros((len(ratings), self._NUM_RATINGS), dtype=int)
    columns = np.asarray([rating - 1 for rating in ratings], dtype=int)
    one_hot[np.arange(len(ratings)), columns] = 1
    return one_hot

  def processing_input(self, is_training=False):
    """Read one dataset and return its feature matrix A and label matrix B.

    Args:
      is_training: True reads train_set_path and rebuilds train_vocab;
        False reads valid_set_path and also stores the preprocessed review
        at index random_number in valid_at_random_number.

    Returns:
      Tuple (A, B): A is the word-count matrix with a column of ones inserted
      on the left, B is the one-hot matrix of the 'overall' ratings.

    Raises:
      IndexError: when validating and random_number is not a valid index
        into the validation reviews.
    """
    path = self.train_set_path if is_training else self.valid_set_path
    # JSON text is UTF-8; do not depend on the platform's locale encoding.
    # The file is closed as soon as it is parsed.
    with open(path, 'r', encoding='utf-8') as handle:
      records = json.load(handle)

    # 1. Preprocess json data at 'reviewText' field
    documents = [self.preprocess(record['reviewText'], is_training)
                 for record in records]

    if not is_training:
      try:
        self.valid_at_random_number = documents[self.random_number]
      except IndexError:
        raise IndexError(
            'random_number {} is out of range for {} validation reviews'.format(
                self.random_number, len(documents))) from None

    # 2. Embedding preprocessed data to histogram vectors
    histogram = self.embedding(documents, is_training)
    labels = self.standardized([record['overall'] for record in records])
    # The leading column of ones is the bias term of the linear model.
    return np.insert(histogram, 0, 1, axis=1), labels

In [15]:
# Run the full pipeline (load, fit, score), then report the preprocessed
# validation review selected by random_number followed by the accuracy line.
rate = Rate(train_set_path, valid_set_path, random_number)
print(rate.valid_at_random_number, 'M2 - {}'.format(rate.m2_accuracy), sep='\n')

['pen', 'never', 'get', 'old', '.', 'prefer', 'unk', 'point', '.', 'love', 'unk', 'color', '.']
M2 - 0.7
