## Train

In [2]:
import logging
import re
import string
import time
import ast
import pandas as pd
from google.colab import drive
from typing import Tuple, Union, List, Dict

import numpy as np
import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras.layers import TextVectorization

# Module-level logger; the root logger is configured to emit INFO and above.
logger = logging.getLogger(__name__)
level = logging.INFO
logging.basicConfig(level=level)


class TFModel(tf.Module):
    """Thin `tf.Module` wrapper that holds a Keras model as `self.model`."""

    def __init__(self, model: tf.keras.Model) -> None:
        """Store `model` on the module.

        Args:
            model: The Keras model to wrap.
        """
        # Initialise tf.Module's own state (name / name scope) before
        # attaching attributes; skipping this leaves `self.name` undefined.
        super().__init__()
        self.model = model

class ModelTrainer:
    """Train and save a multi-label text classifier for `fixedFee_tags`.

    The pipeline reads a CSV with `text` and `fixedFee_tags` columns,
    multi-hot encodes the tags, builds a TF-IDF (uni+bigram) -> dense network
    and saves the fitted model to Google Drive.
    """

    def __init__(self) -> None:
        # Annotation only: the attribute is assigned at the end of train().
        self.tf_model_wrapper: TFModel

        # Model Architecture parameters
        self.max_features = 50000
        self.epochs = 25
        self.batch_size = 64
        self.padding_token = "<pad>"
        self.auto = tf.data.AUTOTUNE

    def read_train(self, dir_train):
        """Load the training CSV into a DataFrame.

        Args:
            dir_train: Path (or buffer) accepted by `pd.read_csv`; the first
                column is used as the index.

        Returns:
            DataFrame whose `fixedFee_tags` cells have been parsed with
            `ast.literal_eval` and whose `text` cells are coerced to `str`.
        """
        train_df = pd.read_csv(dir_train, index_col=0)
        # literal_eval parses the stored literal without executing code.
        # Presumably each cell is the repr of a list of tag strings -- confirm.
        train_df['fixedFee_tags'] = train_df['fixedFee_tags'].apply(ast.literal_eval)
        train_df['text'] = train_df['text'].apply(str)
        return train_df

    def vocabulary_size(self, train_df):
        """Count distinct lower-cased, whitespace-separated tokens in `text`.

        Args:
            train_df: DataFrame with a `text` column.

        Returns:
            Number of unique tokens as an int.
        """
        vocabulary = set()
        for text in train_df["text"]:
            # str() keeps this robust if a non-string value slips through.
            vocabulary.update(str(text).lower().split())
        return len(vocabulary)

    def _label_lookup(self, train_df):
        """Build the ragged label tensor and a StringLookup adapted on it."""
        labels = tf.ragged.constant(train_df["fixedFee_tags"].values)
        lookup = tf.keras.layers.StringLookup(output_mode="multi_hot")
        lookup.adapt(labels)
        return labels, lookup

    def make_dataset(self, train_df, is_train=True):
        """Build a batched `tf.data.Dataset` of (text, multi-hot labels).

        The label lookup is adapted on `train_df` itself, so the encoding
        depends on the frame passed in.

        Args:
            train_df: DataFrame with `text` and `fixedFee_tags` columns.
            is_train: When true, the examples are shuffled before batching.

        Returns:
            Dataset batched with `self.batch_size`.
        """
        labels, lookup = self._label_lookup(train_df)
        label_binarized = lookup(labels).numpy()
        dataset = tf.data.Dataset.from_tensor_slices(
            (train_df["text"].values, label_binarized)
        )
        if is_train:
            # Use a buffer covering the whole frame so the shuffle is global;
            # a buffer of only one batch would merely reorder neighbours.
            dataset = dataset.shuffle(max(len(train_df), 1))
        return dataset.batch(self.batch_size)

    def dataset(self, train_df):
        """Return ONE shuffled batch of the training data as NumPy arrays.

        Only the first batch of the dataset is returned (at most
        `self.batch_size` rows), not the whole training set.

        Returns:
            Tuple `(text_batch, label_batch)`.
        """
        train_dataset = self.make_dataset(train_df, is_train=True)
        text_batch, label_batch = next(iter(train_dataset))
        return text_batch.numpy(), label_batch.numpy()

    def init_vectorize_layer(self, vocabulary_size, text_dataset: np.ndarray) -> TextVectorization:
        """Create a TF-IDF `TextVectorization` layer adapted on `text_dataset`.

        Args:
            vocabulary_size: Passed as `max_tokens` to the layer.
            text_dataset: Texts used to learn the vocabulary and IDF weights.

        Returns:
            The adapted layer (unigrams + bigrams, `tf_idf` output).
        """
        text_vectorizer = TextVectorization(max_tokens=vocabulary_size,
                                            ngrams=2,
                                            output_mode='tf_idf')
        # Adaptation is pinned to the CPU.
        with tf.device("/CPU:0"):
            text_vectorizer.adapt(text_dataset)
        return text_vectorizer

    def init_model(self, train_df, vocabulary_size, text_dataset: np.ndarray) -> tf.keras.Model:
        """Build and compile the classifier.

        Args:
            train_df: Training frame; its `fixedFee_tags` determine the number
                of output units.
            vocabulary_size: `max_tokens` for the text vectorizer.
            text_dataset: Texts the vectorizer is adapted on.

        Returns:
            Compiled Keras model mapping raw strings to per-label sigmoid
            probabilities.
        """
        # Adapt on the texts the caller supplied rather than on a separately
        # re-sampled batch, so the argument is actually honoured.
        vectorize_layer = self.init_vectorize_layer(text_dataset=text_dataset,
                                                    vocabulary_size=vocabulary_size)
        # Output width must equal the multi-hot label width produced by
        # make_dataset (the lookup vocabulary including its OOV slot).
        _, lookup = self._label_lookup(train_df)
        num_labels = lookup.vocabulary_size()

        raw_input = tf.keras.Input(shape=(1,), dtype=tf.string)
        x = vectorize_layer(raw_input)
        x = tf.keras.layers.Dense(512, activation='relu')(x)
        x = tf.keras.layers.Dense(256, activation='relu')(x)
        x = tf.keras.layers.Dense(128, activation='relu')(x)
        predictions = tf.keras.layers.Dense(num_labels,
                                            activation='sigmoid')(x)
        model = tf.keras.Model(raw_input, predictions)
        model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['binary_accuracy'])
        return model

    def train(self) -> None:
        """Mount Drive, train on the full training CSV and save the model.

        Side effects: mounts Google Drive, sets `self.tf_model_wrapper`, and
        writes the model under `budget_label_model/my_model` on the shared drive.
        """
        drive.mount('/content/drive')
        train_csv = '/content/drive/Shareddrives/Capstone Project/Product-based/ml-stuff/data/service_budget_train_df.csv'
        train_df = self.read_train(train_csv)
        vocabulary_size = self.vocabulary_size(train_df)

        # Fit on every batch of the dataset; fitting on the output of
        # self.dataset() would train on a single batch only.
        train_dataset = self.make_dataset(train_df, is_train=True)
        model = self.init_model(train_df, text_dataset=train_df["text"].values,
                                vocabulary_size=vocabulary_size)
        model.fit(train_dataset, epochs=self.epochs)

        self.tf_model_wrapper = TFModel(model)
        path = '/content/drive/Shareddrives/Capstone Project/Product-based/ml-stuff/model/services/'
        model_dir = path + 'budget_label_model/my_model'
        model.save(model_dir)
        logger.info('saved SavedModel to %s', model_dir)

# Entry point: build a trainer and run the whole training pipeline
# (mount Drive, read the CSV, fit, save). `model_trainer` stays available
# in the namespace afterwards.
if __name__ == '__main__':
    model_trainer = ModelTrainer()
    model_trainer.train()

Mounted at /content/drive
Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25




# Predict

### Load Model

In [3]:
# Reload the model that the training cell saved on the shared drive.
path = '/content/drive/Shareddrives/Capstone Project/Product-based/ml-stuff/model/services/'
service_budget_model = tf.keras.models.load_model(
    filepath=path + 'budget_label_model/my_model'
)

### Read the label vocabulary and make a prediction

In [4]:
# Load the label names, one per line.
# NOTE(review): assumes line k of this file names model output k (including
# any out-of-vocabulary slot of the label lookup) -- confirm against training.
vocab = []
with open(r'/content/drive/Shareddrives/Capstone Project/Product-based/ml-stuff/data/service_budget_vocab.txt', 'r') as fp:
    for line in fp:
        # Strip only the line terminator: slicing off the last character
        # would truncate the final label if the file has no trailing newline.
        vocab.append(line.rstrip('\n'))

# Score one user-supplied text and print its two most probable labels.
user_input = pd.Series(str(input('Text (S): ')))
predicted_probabilities = service_budget_model(user_input)
for i, text in enumerate(user_input):
    ranked = sorted(zip(predicted_probabilities[i], vocab),
                    key=lambda pair: pair[0],
                    reverse=True)
    prediction = [label for _, label in ranked[:2]]
    print(prediction)

Text (S): jasa service hp
['200 - 250 Ribu', '20 - 50 Ribu']


In [5]:
# Same prediction flow, but echo the text and print the two best labels
# joined into a single comma-separated string.
user_input = pd.Series(str(input('Text (S): ')))
predicted_probabilities = service_budget_model(user_input)
for i, text in enumerate(user_input):
    print(f"Text: {text}")
    # Pair every probability with its label, then order by descending score.
    prediction = list(zip(predicted_probabilities[i], vocab))
    prediction.sort(key=lambda scored: scored[0], reverse=True)
    top_1 = prediction[0][1]
    top_2 = prediction[1][1]
    output = [f"{top_1}, {top_2}"]
    print(output)

Text (S): jasa service hp
Text: jasa service hp
['200 - 250 Ribu, 20 - 50 Ribu']
