## Train

In [2]:
import logging
import re
import string
import time
import ast
import pandas as pd
from google.colab import drive
from typing import Tuple, Union, List, Dict

import numpy as np
import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras.layers import TextVectorization

# Log INFO-and-above records via the root logger configuration;
# `logger` is the logger named after this module.
level = logging.INFO
logger = logging.getLogger(__name__)
logging.basicConfig(level=level)


class TFModel(tf.Module):
    """Thin ``tf.Module`` wrapper that holds a Keras model as ``self.model``."""

    def __init__(self, model: tf.keras.Model) -> None:
        """Store ``model`` on the module.

        Args:
            model: The Keras model to wrap.
        """
        # Initialise the tf.Module base class first so its own state
        # (module name / name scope) exists before attributes are assigned.
        super().__init__()
        self.model = model

class ModelTrainer:
    """Builds, trains and saves a multi-label tag classifier for project texts.

    The model takes raw strings, turns them into TF-IDF features with a
    ``TextVectorization`` layer and predicts one sigmoid probability per tag.
    """

    def __init__(self) -> None:
        """Declare the wrapper attribute and set the training hyper-parameters."""
        self.tf_model_wrapper: TFModel

        # Model Architecture parameters
        self.max_features = 50000
        self.epochs = 50
        self.batch_size = 64
        self.padding_token = "<pad>"
        self.auto = tf.data.AUTOTUNE

    def read_train(self, dir_train):
        """Read the training CSV into a DataFrame.

        The first CSV column is used as the index. Each ``projects_tags`` cell
        is parsed with ``ast.literal_eval`` and each ``text`` cell is converted
        with ``str``.

        Args:
            dir_train: Path (or anything ``pd.read_csv`` accepts) of the CSV.

        Returns:
            The loaded DataFrame.
        """
        train_df = pd.read_csv(dir_train, index_col=0)
        train_df['projects_tags'] = train_df['projects_tags'].apply(ast.literal_eval)
        train_df['text'] = train_df['text'].apply(str)
        return train_df

    def vocabulary_size(self, train_df):
        """Count the distinct lower-cased, whitespace-separated tokens in ``text``.

        Args:
            train_df: DataFrame with a string ``text`` column.

        Returns:
            Number of unique tokens.
        """
        vocabulary = set()
        for tokens in train_df["text"].str.lower().str.split():
            vocabulary.update(tokens)
        return len(vocabulary)

    def _label_lookup(self, train_df):
        """Return the ragged tag tensor and a multi-hot ``StringLookup`` adapted to it."""
        labels = tf.ragged.constant(train_df["projects_tags"].values)
        lookup = tf.keras.layers.StringLookup(output_mode="multi_hot")
        lookup.adapt(labels)
        return labels, lookup

    def make_dataset(self, train_df, is_train=True):
        """Build a batched ``tf.data.Dataset`` of ``(text, multi-hot labels)``.

        Args:
            train_df: DataFrame with ``text`` and ``projects_tags`` columns.
            is_train: When true the examples are shuffled before batching.

        Returns:
            A dataset batched with ``self.batch_size``.
        """
        labels, lookup = self._label_lookup(train_df)
        label_binarized = lookup(labels).numpy()
        dataset = tf.data.Dataset.from_tensor_slices(
            (train_df["text"].values, label_binarized)
        )
        if is_train:
            # Use a buffer covering every row so the whole dataset is shuffled;
            # a buffer of only `batch_size` elements shuffles a small sliding
            # window and leaves the file order largely intact.
            dataset = dataset.shuffle(max(len(train_df), 1))
        return dataset.batch(self.batch_size)

    def dataset(self, train_df):
        """Return the first shuffled batch as NumPy arrays.

        Note that this is a single batch (at most ``self.batch_size`` rows),
        not the full training set.

        Args:
            train_df: DataFrame with ``text`` and ``projects_tags`` columns.

        Returns:
            Tuple ``(text_batch, label_batch)`` of NumPy arrays.
        """
        train_dataset = self.make_dataset(train_df, is_train=True)
        text_batch, label_batch = next(iter(train_dataset))
        return text_batch.numpy(), label_batch.numpy()

    def init_vectorize_layer(self, vocabulary_size, text_dataset: np.ndarray) -> TextVectorization:
        """Create a bigram TF-IDF ``TextVectorization`` layer adapted to ``text_dataset``.

        Args:
            vocabulary_size: Passed as ``max_tokens`` of the layer.
            text_dataset: Texts used to adapt the layer.

        Returns:
            The adapted layer.
        """
        text_vectorizer = TextVectorization(max_tokens=vocabulary_size,
                                            ngrams=2,
                                            output_mode='tf_idf')
        # Adaptation is pinned to the CPU, as in the original code.
        with tf.device("/CPU:0"):
            text_vectorizer.adapt(text_dataset)
        return text_vectorizer

    def init_model(self, train_df, vocabulary_size, text_dataset: np.ndarray) -> tf.keras.Model:
        """Build and compile the classifier.

        Args:
            train_df: Training DataFrame; its ``projects_tags`` column determines
                the number of output units.
            vocabulary_size: ``max_tokens`` for the vectorization layer.
            text_dataset: Texts used to adapt the vectorization layer. When
                ``None``, the ``text`` column of ``train_df`` is used.

        Returns:
            A compiled Keras model mapping raw strings to per-tag probabilities.
        """
        # Adapt on the texts the caller supplied. Previously this argument was
        # ignored and a different, freshly shuffled batch was drawn instead.
        if text_dataset is None:
            text_dataset = train_df["text"].values
        vectorize_layer = self.init_vectorize_layer(text_dataset=text_dataset,
                                                    vocabulary_size=vocabulary_size)

        # Size the output layer from the label vocabulary (which includes the
        # lookup layer's out-of-vocabulary slot) so it always matches the
        # multi-hot width produced by make_dataset, instead of a fixed 313.
        _, lookup = self._label_lookup(train_df)
        num_labels = lookup.vocabulary_size()

        raw_input = tf.keras.Input(shape=(1,), dtype=tf.string)
        x = vectorize_layer(raw_input)
        x = tf.keras.layers.Dense(512, activation='relu')(x)
        x = tf.keras.layers.Dense(256, activation='relu')(x)
        x = tf.keras.layers.Dense(128, activation='relu')(x)
        predictions = tf.keras.layers.Dense(num_labels,
                                            activation='sigmoid')(x)
        model = tf.keras.Model(raw_input, predictions)
        model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['binary_accuracy'])
        return model

    def train(self) -> None:
        """Mount Drive, train on the full training CSV and save the model.

        The trained model is also kept on ``self.tf_model_wrapper``.
        """
        drive.mount('/content/drive')
        train_path = '/content/drive/Shareddrives/Capstone Project/Product-based/ml-stuff/data/projects_train_df.csv'
        train_df = self.read_train(train_path)
        vocabulary_size = self.vocabulary_size(train_df)

        # Adapt the vectorizer on every training text and fit on the complete
        # batched dataset. Previously only the first batch was used for both,
        # so the model saw at most `batch_size` examples.
        model = self.init_model(train_df, text_dataset=train_df["text"].values,
                                vocabulary_size=vocabulary_size)
        train_dataset = self.make_dataset(train_df, is_train=True)
        model.fit(train_dataset, epochs=self.epochs)

        self.tf_model_wrapper = TFModel(model)
        path = '/content/drive/Shareddrives/Capstone Project/Product-based/ml-stuff/model/services/'
        model.save(path + 'worker/browse_projects_model/my_model')

# Entry point: build a trainer and run the full training pipeline
# (mounts Drive, fits the model and saves it) when executed as the main program.
if __name__ == '__main__':
    model_trainer = ModelTrainer()
    model_trainer.train()

Mounted at /content/drive
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50




## Predict

### Load Model

In [5]:
# Reload the model from the Drive location the training step saved it to.
path = '/content/drive/Shareddrives/Capstone Project/Product-based/ml-stuff/model/services/'
browse_project_model = tf.keras.models.load_model(
    path + 'worker/browse_projects_model/my_model'
)

### Read the tag vocabulary and make a prediction

In [6]:
# Load the tag vocabulary, one tag per line. The loop below pairs tags with
# model outputs by position, so the file order is assumed to match the order
# of the model's output units -- TODO confirm against how the file was written.
vocab = []
with open(r'/content/drive/Shareddrives/Capstone Project/Product-based/ml-stuff/data/project_tags_vocab.txt', 'r') as fp:
    for line in fp:
        # Strip only the line terminator. Slicing off the last character
        # unconditionally would truncate the final tag whenever the file does
        # not end with a newline.
        vocab.append(line.rstrip('\n'))

# Score one free-text query and print its three most probable tags.
user_input = pd.Series(str(input('Text (S): ')))
predicted_probabilities = browse_project_model(user_input)
for i, text in enumerate(user_input):
    # Convert the row to NumPy once so sorting compares plain floats rather
    # than evaluating a TensorFlow comparison per pair.
    scores = np.asarray(predicted_probabilities[i])
    ranked = sorted(zip(scores, vocab),
                    key=lambda pair: pair[0],
                    reverse=True)
    prediction = [tag for _, tag in ranked[:3]]
    print(prediction)

Text (S): jasa penbuatan landing page
['PHP', 'Web Programming', 'Website']


In [7]:
# Score one free-text query against every tag.
user_input = pd.Series(str(input('Text (S): ')))
predicted_probabilities = browse_project_model(user_input)

# Rank the tags by probability and print the best three as one comma-separated string.
for i, text in enumerate(user_input):
    prediction = sorted(
        zip(predicted_probabilities[i], vocab),
        key=lambda pair: pair[0],
        reverse=True,
    )
    top_1, top_2, top_3 = (prediction[rank][1] for rank in range(3))
    output = [", ".join((top_1, top_2, top_3))]
    print(output)

Text (S): jasa pembuatan landing page
['Web Programming, PHP, Website']
