In [None]:
!pip install bert-for-tf2

Collecting bert-for-tf2
  Downloading bert-for-tf2-0.14.9.tar.gz (41 kB)
[?25l[K     |████████                        | 10 kB 29.9 MB/s eta 0:00:01[K     |████████████████                | 20 kB 16.6 MB/s eta 0:00:01[K     |███████████████████████▉        | 30 kB 13.2 MB/s eta 0:00:01[K     |███████████████████████████████▉| 40 kB 12.8 MB/s eta 0:00:01[K     |████████████████████████████████| 41 kB 184 kB/s 
[?25hCollecting py-params>=0.9.6
  Downloading py-params-0.10.2.tar.gz (7.4 kB)
Collecting params-flow>=0.8.0
  Downloading params-flow-0.8.2.tar.gz (22 kB)
Building wheels for collected packages: bert-for-tf2, params-flow, py-params
  Building wheel for bert-for-tf2 (setup.py) ... [?25l[?25hdone
  Created wheel for bert-for-tf2: filename=bert_for_tf2-0.14.9-py3-none-any.whl size=30535 sha256=1fd83378bec880d903070efa6580ae1bd20ed48510b51d16479af79d1ef185c2
  Stored in directory: /root/.cache/pip/wheels/47/b6/e5/8c76ec779f54bc5c2f1b57d2200bb9c77616da83873e8acb53
  Buil

In [None]:
# Mount Google Drive inside the Colab runtime at /content/drive/.
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


In [None]:
# Switch the working directory to the project folder on the mounted Drive so
# that relative paths used afterwards resolve against it.
import os
os.chdir("/content/drive/MyDrive/app") 

In [None]:
import tensorflow as tf
import tensorflow_hub as hub
import seaborn as sns
from matplotlib import pyplot as plt
from pandas import DataFrame
from tensorflow.keras import layers
import bert
import pandas as pd
import numpy as np
import re
import random
import math
import os
import joblib

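# Label encoding on each axis: 0 = I / N / F / J, 1 = E / S / T / P
# (so INFJ maps to 0,0,0,0 and ESTP maps to 1,1,1,1).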

class TEXT_MODEL(tf.keras.Model):
    """Multi-kernel 1-D CNN classifier over sequences of token ids.

    Token ids are embedded, run through three parallel ``Conv1D`` branches
    (kernel sizes 2, 3 and 4), each branch is global-max-pooled, and the pooled
    features are concatenated and passed through a dense layer, dropout and
    the output layer.
    """

    def __init__(self,
                 vocabulary_size,
                 embedding_dimensions=128,
                 cnn_filters=50,
                 dnn_units=512,
                 model_output_classes=2,
                 dropout_rate=0.1,
                 training=False,
                 name="text_model"):
        """Create the layers of the network.

        Args:
            vocabulary_size: Number of rows in the embedding table.
            embedding_dimensions: Size of each embedding vector.
            cnn_filters: Number of filters in each of the three Conv1D branches.
            dnn_units: Width of the hidden dense layer.
            model_output_classes: ``2`` builds a single sigmoid unit; any other
                value builds a softmax layer with that many units.
            dropout_rate: Rate handed to the Dropout layer.
            training: Unused by the constructor; retained only so existing
                callers that pass it keep working.
            name: Keras model name.
        """
        super().__init__(name=name)

        self.embedding = layers.Embedding(vocabulary_size,
                                          embedding_dimensions)
        self.cnn_layer1 = layers.Conv1D(filters=cnn_filters,
                                        kernel_size=2,
                                        padding="valid",
                                        activation="relu")
        self.cnn_layer2 = layers.Conv1D(filters=cnn_filters,
                                        kernel_size=3,
                                        padding="valid",
                                        activation="relu")
        self.cnn_layer3 = layers.Conv1D(filters=cnn_filters,
                                        kernel_size=4,
                                        padding="valid",
                                        activation="relu")
        # A single pooling layer is shared by all three branches.
        self.pool = layers.GlobalMaxPool1D()

        self.dense_1 = layers.Dense(units=dnn_units, activation="relu")
        self.dropout = layers.Dropout(rate=dropout_rate)
        if model_output_classes == 2:
            self.last_dense = layers.Dense(units=1,
                                           activation="sigmoid")
        else:
            self.last_dense = layers.Dense(units=model_output_classes,
                                           activation="softmax")

    def call(self, inputs, training=None):
        """Run the forward pass.

        Args:
            inputs: Batch of token-id sequences fed to the embedding layer.
            training: Forwarded to the Dropout layer. Defaults to ``None`` so
                the model can also be invoked without the argument, in which
                case Keras decides the mode.

        Returns:
            The output of ``last_dense`` for the batch.
        """
        embedded = self.embedding(inputs)

        # Same embedded input through each kernel width, pooled over time.
        l_1 = self.pool(self.cnn_layer1(embedded))
        l_2 = self.pool(self.cnn_layer2(embedded))
        l_3 = self.pool(self.cnn_layer3(embedded))

        concatenated = tf.concat([l_1, l_2, l_3], axis=-1)  # (batch_size, 3 * cnn_filters)
        concatenated = self.dense_1(concatenated)
        concatenated = self.dropout(concatenated, training=training)
        model_output = self.last_dense(concatenated)

        return model_output


def tokenize_text(text_input):
    """Tokenize ``text_input`` and map the resulting tokens to their ids.

    Uses the module-level ``tokenizer`` object, which therefore has to be
    created before this function is called.
    """
    word_pieces = tokenizer.tokenize(text_input)
    token_ids = tokenizer.convert_tokens_to_ids(word_pieces)
    return token_ids


if __name__ == '__main__':
    # Trains one binary classifier per MBTI axis (I/E, N/S, F/T, J/P) on the
    # posts in mbti.csv and stores each trained model as "<axis>.pkl".

    # hyper parameters
    BATCH_SIZE = 128
    EMB_DIM = 300
    CNN_FILTERS = 100
    DNN_UNITS = 256
    # NOTE(review): the labels built below are binary (0/1) but the model is
    # created with a 10-way softmax head. Left unchanged so the saved models
    # keep their output shape; confirm with the consumers of the .pkl files
    # before switching this to 2.
    OUTPUT_CLASSES = 10
    DROPOUT_RATE = 0.5
    NB_EPOCHS = 20
    max_len = 2000  # sequences are truncated / padded to this many token ids

    # raw data
    data_set = pd.read_csv("mbti.csv")
    y_4axis = [[], [], [], []]
    text = []
    personality_type = ['IE', 'NS', 'FT', 'JP']

    # A URL ends at the first whitespace or at '|' (the '|||' post separator).
    # The previous pattern ended in the character class [\s+] (whitespace or a
    # literal '+'), so it missed URLs at the end of the text and, for a URL
    # placed right before '|||', consumed the separator and the first word of
    # the following post.
    url_pattern = re.compile(r'https?://[^\s|]+')

    for raw_posts, mbti_type in zip(data_set["posts"], data_set["type"]):
        _text = raw_posts[1:-1]  # drop the first and last character of the field
        _text = url_pattern.sub(' ', _text)
        _text = _text.replace('...|||', ' ')
        _text = _text.replace('|||', ' ')
        text.append(_text)
        # 0 when the letter equals the first letter of the axis pair, else 1.
        for axis, letters in enumerate(personality_type):
            y_4axis[axis].append(0 if mbti_type[axis] == letters[0] else 1)

    # Creating a BERT Tokenizer
    BertTokenizer = bert.bert_tokenization.FullTokenizer

    bert_layer = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/1", trainable=False)
    vocabulary_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()
    to_lower_case = bert_layer.resolved_object.do_lower_case.numpy()
    tokenizer = BertTokenizer(vocabulary_file, to_lower_case)

    # Tokenize all the text
    tokenized_text = [tokenize_text(i) for i in text]

    # Loop-invariant: the vocabulary does not change between axes.
    VOCAB_LENGTH = len(tokenizer.vocab)

    for _i in range(4):
        # Preparing data for training: (truncated token ids, label) pairs for
        # this axis in random order. The name is historical; nothing is sorted.
        sorted_text_labels = [(tokens[:max_len], label)
                              for tokens, label in zip(tokenized_text, y_4axis[_i])]
        random.shuffle(sorted_text_labels)

        processed_dataset = tf.data.Dataset.from_generator(lambda: sorted_text_labels, output_types=(tf.int32, tf.int32))
        batched_dataset = processed_dataset.padded_batch(BATCH_SIZE, padded_shapes=((max_len,), ()))

        TOTAL_BATCHES = math.ceil(len(sorted_text_labels) / BATCH_SIZE)
        TEST_BATCHES = TOTAL_BATCHES // 20

        # Split first, then shuffle only the training batches. Dataset.shuffle
        # returns a new dataset, so the earlier bare
        # `batched_dataset.shuffle(...)` call had no effect; shuffling before
        # take/skip would also move test batches into training whenever the
        # data is reshuffled for a new epoch.
        test_data = batched_dataset.take(TEST_BATCHES)
        train_data = batched_dataset.skip(TEST_BATCHES).shuffle(TOTAL_BATCHES)

        text_model = TEXT_MODEL(vocabulary_size=VOCAB_LENGTH,
                                embedding_dimensions=EMB_DIM,
                                cnn_filters=CNN_FILTERS,
                                dnn_units=DNN_UNITS,
                                model_output_classes=OUTPUT_CLASSES,
                                dropout_rate=DROPOUT_RATE)

        if OUTPUT_CLASSES == 2:
            text_model.compile(loss="binary_crossentropy",
                               optimizer="adam",
                               metrics=["accuracy"])
        else:
            text_model.compile(loss="sparse_categorical_crossentropy",
                               optimizer="adam",
                               metrics=["sparse_categorical_accuracy"])

        text_model.fit(train_data, epochs=NB_EPOCHS)

        # Evaluate on the held-out batches; results[1] is the compiled metric.
        results = text_model.evaluate(test_data)
        print(f'{personality_type[_i][0]}/{personality_type[_i][1]} Trained Successfully!\n Accuracy: {results[1] * 100}%')

        # NOTE(review): joblib.dump pickles the Keras model. Pickle files must
        # only be loaded from trusted sources, and Keras' own save format may
        # be more robust across versions; kept so existing loaders of the
        # .pkl files keep working.
        joblib.dump(text_model, f'{personality_type[_i][0]}{personality_type[_i][1]}.pkl')
        print(f'model {personality_type[_i][0]}/{personality_type[_i][1]} saved.')


Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
I/E Trained Successfully!
 Accuracy: 89.58333134651184%
INFO:tensorflow:Assets written to: ram://daad526a-dff3-4930-a673-f2a9f68423b0/assets


INFO:tensorflow:Assets written to: ram://daad526a-dff3-4930-a673-f2a9f68423b0/assets


model I/E saved.
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
N/S Trained Successfully!
 Accuracy: 91.66666865348816%
INFO:tensorflow:Assets written to: ram://b5e1ba11-bb99-4588-b66d-1dcda08dae3e/assets


INFO:tensorflow:Assets written to: ram://b5e1ba11-bb99-4588-b66d-1dcda08dae3e/assets


model N/S saved.
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
F/T Trained Successfully!
 Accuracy: 90.36458134651184%
INFO:tensorflow:Assets written to: ram://e2279a2f-efe8-41fe-8e73-b6592bd6e7ea/assets


INFO:tensorflow:Assets written to: ram://e2279a2f-efe8-41fe-8e73-b6592bd6e7ea/assets


model F/T saved.
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20




J/P Trained Successfully!
 Accuracy: 82.29166865348816%
INFO:tensorflow:Assets written to: ram://1710afe2-d780-4712-86d9-c001d6c0d4bf/assets


INFO:tensorflow:Assets written to: ram://1710afe2-d780-4712-86d9-c001d6c0d4bf/assets


model J/P saved.


# 新段落