In [1]:
import numpy as np
import tensorflow as tf

from typing import List, Container
from tensorflow.data import Dataset
from tensorflow.keras import Model
from tensorflow.keras.layers import Input, Conv1D, MaxPool1D, Concatenate, Dense, Flatten, Embedding
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import SparseCategoricalCrossentropy, CategoricalCrossentropy
from tensorflow.keras.metrics import Mean

class TextCNN(Model):
    """Two-branch 1-D convolutional text classifier.

    The same input is fed to two parallel branches: a width-1 convolution and
    a width-2 convolution (128 ReLU filters each). Each branch is max-pooled
    (pool size 2, stride 1), the pooled feature maps are concatenated along
    axis 1, flattened, and passed to a softmax ``Dense`` layer with
    ``class_num`` units.

    Args:
        class_num: Number of output classes (units of the final softmax layer).
    """

    def __init__(self, class_num: int):
        super(TextCNN, self).__init__()
        self.class_num: int = class_num

        # Layers are created once here rather than in ``build()``: a second
        # ``build()`` call (e.g. re-running a notebook cell) must not replace
        # the layers, and their weights, with freshly initialised ones.
        self.conv_1: Conv1D = Conv1D(filters=128, kernel_size=1, activation="relu", name="conv_1")
        self.pool_1: MaxPool1D = MaxPool1D(pool_size=2, strides=1, name="pool_1")
        self.conv_2: Conv1D = Conv1D(filters=128, kernel_size=2, activation="relu", name="conv_2")
        self.pool_2: MaxPool1D = MaxPool1D(pool_size=2, strides=1, name="pool_2")
        self.concatenate: Concatenate = Concatenate(axis=1)
        self.flatten: Flatten = Flatten()
        self.dense: Dense = Dense(self.class_num, activation="softmax")

    def call(self, inputs: tf.Tensor, training=None, mask=None) -> tf.Tensor:
        """Compute class probabilities for a batch of inputs.

        Args:
            inputs: Batch tensor fed to both convolution branches.
            training: Accepted for Keras API compatibility; not used here.
            mask: Accepted for Keras API compatibility; not used here.

        Returns:
            Softmax output of the final dense layer, one row per sample.
        """
        conv_outputs: List[tf.Tensor] = [self.conv_1(inputs), self.conv_2(inputs)]
        pooled: List[tf.Tensor] = [self.pool_1(conv_outputs[0]), self.pool_2(conv_outputs[1])]
        x = self.concatenate(pooled)
        x = self.flatten(x)
        x = self.dense(x)
        return x

    def summary(self, *args, input_shape=(3, 100), **kwargs):
        """Print a layer-by-layer summary of the network.

        The layers are wired onto a symbolic ``Input`` and wrapped in a
        functional ``Model`` named "TextCNN", whose summary is printed.

        Args:
            *args: Forwarded unchanged to ``Model.summary``.
            input_shape: Per-sample input shape (without the batch dimension)
                used for the symbolic input. Defaults to ``(3, 100)``. If the
                model is already built, it must be compatible with the shape
                the layers were built for.
            **kwargs: Forwarded unchanged to ``Model.summary``.
        """
        symbolic_input = Input(shape=input_shape, name="Input")
        output = self.call(symbolic_input)
        model = Model(inputs=symbolic_input, outputs=output, name="TextCNN")
        model.summary(*args, **kwargs)


In [2]:
import os

from user_profile_prediction.etl.preprocess_train_data import PreprocessTrainingData
# NOTE: this project-local Embedding rebinds the name `Embedding` that the
# first cell imported from tensorflow.keras.layers.
from user_profile_prediction.etl.embedding import Embedding

# Location of the training CSV. The original machine-local path is kept as the
# default; set the TRAIN_CSV_PATH environment variable to run elsewhere.
TRAIN_CSV_PATH = os.environ.get(
    "TRAIN_CSV_PATH",
    "/Volumes/Samsung_T5/Files/Document/小象学院/GroupProject/project_data/data/train.csv",
)

p = PreprocessTrainingData(TRAIN_CSV_PATH)
p.split_sentence()

# NOTE(review): the meaning of the two positional arguments (100, 5) is defined
# by the project's Embedding class and is not visible here — confirm.
e = Embedding(100, 5)
m = e.load_embedding_model()

  df: DataFrame = pd.read_csv(file_path, sep="###__###", header=None)
0it [00:00, ?it/s]Building prefix dict from the default dictionary ...
Loading model from cache /var/folders/zl/nw7xtsq52579dn6nkhtf8njh0000gn/T/jieba.cache
Loading model cost 0.707 seconds.
Prefix dict has been built successfully.
1000it [01:06, 15.01it/s]


In [3]:
# Materialise the (features, label) pairs yielded by the preprocessor, then
# split them into two parallel Python lists.
samples = list(p.age_data_iter(e))
train_x = [features for features, _ in samples]
train_y = [label for _, label in samples]

# NOTE(review): later cells call `train_x.shape` and `train_y.numpy()`, which
# plain lists do not support, so a conversion like the one below was presumably
# run at some point — confirm before relying on Restart & Run All.
# train_x, train_y = np.array(train_x).astype(np.float32), np.array(train_y)
# train_y = tf.one_hot(train_y, depth=tf.unique(train_y).y.shape[0])

In [8]:
from imblearn.over_sampling import RandomOverSampler

# Stack the samples and flatten each one to a single row, since the resampler
# takes a 2-D feature matrix. The row count comes from the stacked array rather
# than a hard-coded 300 (= 3 * 100), so a different sample shape cannot
# silently misalign rows and labels.
stacked_x = np.stack(train_x, axis=0)
flat_x = stacked_x.reshape(stacked_x.shape[0], -1)

# Fixed random_state keeps the resampling reproducible.
ros = RandomOverSampler(random_state=0)
X_resampled, y_resampled = ros.fit_resample(flat_x, np.array(train_y))


In [9]:
from collections import Counter

# Per-class sample counts after oversampling.
label_counts = Counter(y_resampled.tolist())
label_counts

Counter({1: 60079, 2: 60079, 4: 60079, 3: 60079, 6: 60079, 5: 60079, 0: 60079})

In [4]:
# Tunable settings for the model and optimizer.
NUM_CLASSES = 7
LEARNING_RATE = 1e-3

text_cnn = TextCNN(NUM_CLASSES)

optimizer: Adam = Adam(learning_rate=LEARNING_RATE)
# Categorical cross-entropy with default settings; it is applied to the
# model's softmax output in train_step and in compile().
losses: CategoricalCrossentropy = CategoricalCrossentropy()

# Running mean of the batch losses; updated inside train_step.
train_loss = Mean(name="TrainLoss")

In [5]:
# Build the model for batches of (3, 100) samples (batch dimension left
# unspecified), then print its layer table.
INPUT_SHAPE = (None, 3, 100)
text_cnn.build(input_shape=INPUT_SHAPE)
text_cnn.summary()



Model: "TextCNN"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
Input (InputLayer)              [(None, 3, 100)]     0                                            
__________________________________________________________________________________________________
conv_1 (Conv1D)                 (None, 3, 128)       12928       Input[0][0]                      
__________________________________________________________________________________________________
conv_2 (Conv1D)                 (None, 2, 128)       25728       Input[0][0]                      
__________________________________________________________________________________________________
pool_1 (MaxPooling1D)           (None, 2, 128)       0           conv_1[0][0]                     
____________________________________________________________________________________________

In [6]:
# Smoke test: run the model on the first two samples and display the softmax
# output (one row of class probabilities per sample).
text_cnn(train_x[:2])

<tf.Tensor: shape=(2, 7), dtype=float32, numpy=
array([[0.13064921, 0.14779529, 0.14841455, 0.13912404, 0.14560468,
        0.14522344, 0.14318885],
       [0.08071594, 0.22378945, 0.17926335, 0.18222085, 0.10867348,
        0.09454481, 0.13079214]], dtype=float32)>

In [7]:
def train_step(model, features, labels):
    """Run one optimisation step on a single batch and return the gradients.

    Relies on the notebook-level ``losses`` (loss function), ``optimizer``
    and ``train_loss`` (running-mean metric) objects.

    Args:
        model: Keras model to train; its trainable variables are updated
            in place by the optimizer.
        features: Batch of model inputs.
        labels: Targets for the batch, in the format ``losses`` expects.

    Returns:
        The gradients of the batch loss with respect to
        ``model.trainable_variables``, in the same order.
    """
    with tf.GradientTape() as tape:
        # training=True makes layers that behave differently during training
        # (e.g. dropout, batch normalisation) run in training mode.
        prediction = model(features, training=True)
        loss = losses(labels, prediction)

    variables = model.trainable_variables
    gradient = tape.gradient(loss, variables)
    optimizer.apply_gradients(zip(gradient, variables))

    # Fold this batch's loss into the running mean.
    train_loss.update_state(loss)

    return gradient

In [8]:
# Run a single training step on the first ten samples and keep the returned
# gradients for inspection.
g = train_step(text_cnn, train_x[:10], train_y[:10])


In [11]:
# Display the gradients returned by train_step (a list with one tensor per
# trainable variable).
g


[<tf.Tensor: shape=(1, 100, 128), dtype=float32, numpy=
 array([[[ 8.6977112e-04,  2.1640025e-02,  1.0555669e-02, ...,
          -1.7586630e-02,  2.1944260e-02,  3.0472321e-02],
         [ 6.9280512e-05, -1.0500901e-02, -1.0705659e-03, ...,
           6.6391178e-03, -2.2276356e-03, -4.5966431e-03],
         [-7.0063793e-04, -2.2082136e-03, -1.9682543e-03, ...,
           1.2968357e-03, -2.6399344e-03, -5.4369904e-03],
         ...,
         [-4.7491817e-03, -2.7957445e-02, -1.4121589e-02, ...,
           2.2757819e-02, -2.8696684e-02, -4.1018646e-02],
         [ 5.6875343e-03,  3.7241682e-02,  1.6573045e-02, ...,
          -2.9230673e-02,  3.4768738e-02,  4.9513269e-02],
         [ 1.6227366e-03,  1.2945966e-02,  5.5428855e-03, ...,
          -9.6534993e-03,  1.0797372e-02,  1.6256157e-02]]], dtype=float32)>,
 <tf.Tensor: shape=(128,), dtype=float32, numpy=
 array([-1.68123525e-02, -8.24588239e-02, -3.44683565e-02, -1.09222680e-01,
         5.27538359e-03,  2.52067461e-03,  1.57354195e

In [55]:
# Training schedule for the built-in Keras loop.
EPOCHS = 5
BATCH_SIZE = 100

text_cnn.compile(loss=losses, optimizer=optimizer)
history = text_cnn.fit(train_x, train_y, batch_size=BATCH_SIZE, epochs=EPOCHS)


In [15]:
# One-hot depth is taken as the number of distinct label values.
# NOTE(review): this equals the required depth only if the labels are exactly
# 0 .. depth-1 with no class missing — confirm for the data at hand.
label_depth = tf.unique(train_y).y.shape[0]
tf.one_hot(train_y, depth=label_depth)

<tf.Tensor: shape=(147726, 7), dtype=float32, numpy=
array([[0., 1., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 1., 0.]], dtype=float32)>

In [19]:
from sklearn.model_selection import train_test_split

# Convert the label tensor to NumPy once and reuse it below.
labels_np = train_y.numpy()

# Print the feature and label shapes, one per line.
print(train_x.shape, labels_np.shape, sep="\n")

# Pair each feature array with its label row in a tf.data pipeline.
ds = tf.data.Dataset.from_tensor_slices((train_x, labels_np))

(147726, 3, 100)
(147726, 7)


In [36]:
# Split the label array along axis 0 into its first 4 rows and the remaining
# 147722 rows (the alternative `num=4` argument was left disabled).
split_sizes = [4, 147722]
tf.split(train_y.numpy(), num_or_size_splits=split_sizes, axis=0)


[<tf.Tensor: shape=(4, 7), dtype=float32, numpy=
 array([[0., 1., 0., 0., 0., 0., 0.],
        [0., 1., 0., 0., 0., 0., 0.],
        [0., 1., 0., 0., 0., 0., 0.],
        [0., 1., 0., 0., 0., 0., 0.]], dtype=float32)>,
 <tf.Tensor: shape=(147722, 7), dtype=float32, numpy=
 array([[0., 1., 0., ..., 0., 0., 0.],
        [0., 1., 0., ..., 0., 0., 0.],
        [0., 1., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 1., 0.],
        [0., 0., 0., ..., 0., 1., 0.],
        [0., 0., 0., ..., 0., 1., 0.]], dtype=float32)>]

In [12]:
# Two small 2x2 integer tensors; scratch values that no other visible cell uses.
a = tf.constant([[1, 2], [3, 4]])
b = tf.constant([[3,4], [2,6]])

In [16]:
# Peek at rows 2 through 9 of the label tensor.
train_y[2: 10]



<tf.Tensor: shape=(8, 7), dtype=float32, numpy=
array([[0., 1., 0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0., 0.]], dtype=float32)>