<a href="https://colab.research.google.com/github/JasonDepblu/potential-spork/blob/new_embeddings_model/new_embeddings_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 导入模块及库

In [7]:
pip install transformers



In [8]:
pip install tqdm



In [1]:
# 导入模块
import os
import pandas as pd
import numpy as np
import random
from matplotlib import pyplot as plt
import tensorflow as tf
from tensorflow import keras
from keras import layers, models, optimizers
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, auc
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.decomposition import PCA
import copy
import scipy
from collections import UserList
from tabulate import tabulate
from tensorflow.keras.callbacks import ModelCheckpoint
import re
from transformers import BertTokenizer, TFBertModel
from transformers import AdamW
from transformers import AdamW, get_linear_schedule_with_warmup
from tqdm import tqdm

# 导入数据及预处理

In [2]:
#%% # Load the dataset
file_path_car = '/content/car_dict_e8_emb.csv'
car_data = pd.read_csv(file_path_car)

file_path_user = '/content/dsc_up_e8.csv'
user_data = pd.read_csv(file_path_user)

# Display the first few rows of the dataframe
car_data.head()
user_data.head()

"""# `预处理data`"""

# Remove columns with only one unique value
columns_to_drop = car_data.columns[car_data.nunique() <= 1]
car_data_reduced = car_data.drop(columns=columns_to_drop)

car_data_reduced.head(10)

# Define the new function to handle multiple ● symbols
def replace_with_detailed_feature_description(cell, feature_name):
    if pd.isna(cell):
        return cell  # Keep NaN as is
    elif isinstance(cell, str):
        # Replace specific symbols with words
        cell = cell.replace('●', '有').replace('○', '选配').replace('-', '不具有').replace('NULL', '未知')
        # Find all occurrences of "有" which indicates a feature is present
        features = cell.split()
        feature_descriptions = []
        for feature in features:
            if feature.startswith('有'):
                feature_description = feature[1:]  # Remove the '有' prefix
                feature_descriptions.append(feature_description)
        if feature_descriptions:
            return f"具备{feature_name}功能，" + "、".join(feature_descriptions)
        else:
            # If there are no '有', but the cell is not empty or NaN, it means
            # it's either '选配', '不具有' or '未知'
            return cell + feature_name + "功能"
    else:
        return cell  # Keep the original value if it doesn't match the above

# Apply the new function to each non-numeric cell
for column in car_data_reduced.columns:
    if car_data_reduced[column].dtype == 'object':  # Apply only to non-numeric columns
        feature_name = column.strip()  # Clean up the column name if necessary
        car_data_reduced[column] = car_data_reduced[column].apply(lambda cell:
                               replace_with_detailed_feature_description(cell, feature_name))

car_data_reduced.head(10)

# Define the path for the cleaned CSV file
cleaned_file_path = '/content/cleaned_car_data.csv'

# Save the cleaned dataframe to a CSV file
car_data_reduced.to_csv(cleaned_file_path, index=False)  # Set index to False to avoid saving the index



dsc_up = user_data.values
car_dict = car_data_reduced.values

print(dsc_up.shape)
print(car_dict.shape)

# column of last feature from 0 and +1
col_last_feature_car = 238
# 80
col_last_num_feature_car = 38

col_last_feature_user = 9

col_last_num_feature_user = 5

# 提取特征列表
dsc_up_features = user_data.columns.tolist()
car_dict_features = car_data_reduced.columns.tolist()

dsc_up_features = user_data.columns[1:col_last_feature_user].tolist() # 提取第1至28个特征
car_dict_features = car_data_reduced.columns[1:col_last_feature_car].tolist() # 提取第1至155个特征
car_dict_num_features = car_data_reduced.columns[1:col_last_num_feature_car].tolist() # 提取第151至155个特征
dsc_up_num_features = user_data.columns[1:col_last_num_feature_user].tolist()
#

print(dsc_up_features)
print(car_dict_features)
print(dsc_up_features[-1])
print(car_dict_features[-1])
print(car_dict_num_features[-1])
print(dsc_up_num_features[-1])

num_cars = len(car_dict)
print(f'The No. of cars in fundamental model dataset : {num_cars}')

# balance dsc_up samples
dsc_up_bal = np.empty(shape=(1, dsc_up.shape[1]))

for i in range(num_cars):
  if len(dsc_up[user_data.no_in_car_dict == i]) > 300:
    ran_row_bal = np.random.choice(np.arange(len(dsc_up[user_data.no_in_car_dict == i])), size=300, replace=False)
    dsc_up_bali = dsc_up[user_data.no_in_car_dict == i][ran_row_bal]
  else:
    dsc_up_bali = dsc_up[user_data.no_in_car_dict == i]
  dsc_up_bal = np.concatenate((dsc_up_bal, dsc_up_bali), axis=0)
dsc_up = dsc_up_bal[1:]
print(dsc_up.shape)

# load the label
print(f'The shape of cars dict of the fundamental model dataset : {car_dict.shape}')
print(f'The shape of users list of the fundamental model dataset : {dsc_up.shape}')

ys_up = dsc_up[:, 12]
print(f'The label of which user choosed which car in fundamental dataset : {ys_up}')

# transfer the label of fundamental dataset to onehot_code
ys = copy.copy(ys_up.reshape(len(ys_up), 1))
ys -= 1
onehot_encoder = OneHotEncoder(sparse_output=False)
onehot_encoded = onehot_encoder.fit_transform(ys)
ys = onehot_encoded
print(f'The shape of the label of fundamental dataset to onehot_code : {ys.shape}')

car_unscaled = car_dict[:, 1:col_last_feature_car]
user_unscaled = dsc_up[:, 1:col_last_feature_user]


# generate the gaussian random no. to budget feature
mu_budget = 0.5
sigma_budget = 2

for i in range(len(user_unscaled)):
    user_unscaled[i, 4] += random.gauss(mu_budget, sigma_budget)

car = car_unscaled.copy()
user = user_unscaled.copy()

# scaling the data
scalerCar = StandardScaler()
scalerCar.fit(car_unscaled[:, 0:col_last_num_feature_car-1])
car_nume = scalerCar.transform(car_unscaled[:, 0:col_last_num_feature_car-1])
car[:, 0:col_last_num_feature_car-1] = car_nume

scalerUser = StandardScaler()
scalerUser.fit(user_unscaled[:, (col_last_num_feature_user-1):(col_last_feature_user-1)])
user_nume = scalerUser.transform(user_unscaled[:, (col_last_num_feature_user-1):(col_last_feature_user-1)])
user[:, col_last_num_feature_user-1:col_last_feature_user-1] = user_nume

print(np.allclose(car_unscaled[:, 0:col_last_num_feature_car-1].astype(float), scalerCar.inverse_transform(car_nume)))
print(np.allclose(user_unscaled[:, (col_last_num_feature_user-1):(col_last_feature_user-1)].astype(float), scalerUser.inverse_transform(user_nume)))


  car_data = pd.read_csv(file_path_car)


(5520, 14)
(142, 239)
['comments', 'purposes', 'province', 'city', 'budget', 'drivenkiloms', 'driving _months', 'km/d']
['price', 'quick_charge_time(80%)', 'slow_charge_time', 'power_car', '最大净功率', 'torque_car', 'gears', 'max_speed', 'time_for_100km_h', 'pure_electric_endurance_mileage', 'endurance_mileage', '电动机总功率', '电动机总扭矩', '前电动机最大功率', '前电动机最大扭矩', '后电动机最大功率', '后电动机最大扭矩', '驱动电机数', 'length', 'width', 'height', 'wheel_base', 'front_gauge', 'track_rear', 'curb_weight', 'seats', '扬声器数量', '中控屏幕尺寸', '后备厢容积min', '后备厢容积max', '超声波雷达数量', '毫米波雷达数量', '快充功率', '怠速车内噪声', '速度60车内噪声', '速度80车内噪声', '速度120车内噪声', '厂商', '级别', '能源类型', '发动机', '变速箱', '整车质保', '电机类型', '驱动电机数.1', '电机布局', '电池类型', '电池组质保', '变速箱类型', '驱动方式', '后悬架类型', '后制动器类型', '驻车制动类型', '前轮胎规格', '后轮胎规格', '备胎规格', '主_副驾驶座安全气囊', '前_后排侧气囊', '前_后排头部气囊', '膝部气囊', '胎压监测功能', '安全带未系提醒', 'ISOFIX儿童座椅接口', 'ABS防抱死', '制动力分配', '刹车辅助', '牵引力控制', '车身稳定控制', '并线辅助', '车道偏离预警系统', '车道保持辅助系统', '道路交通标识识别', '主动刹车_主动安全系统', '疲劳驾驶提示', '前_后驻车雷达', '驾驶辅助影像', '倒车车侧预警系统', '巡航系统', '

In [3]:
# 样本匹配
# func for generating num_items users of different cars
def gen_user_vecs(user_vec, num_items):
    """ given a user vector return:
        user predict matrix to match the size of item_vecs """
    user_vecs = np.tile(user_vec, (1, num_items))
    return user_vecs

def gen_car_vecs(car_vec, num_users):
    car_vecs = np.tile(car_vec, (num_users, 1))
    return car_vecs

# generate num_items users for different cars
user_vecs = gen_user_vecs(user, len(car_dict))
user_vecs = user_vecs.reshape(-1, user.shape[1])
car_vecs = gen_car_vecs(car, len(dsc_up))
ys = ys.reshape(-1, 1)
print(f'The shape of user matrix in fundamental dataset : {user_vecs.shape}')
print(f'The shape of car matrix in fundamental dataset : {car_vecs.shape}')
print(f'The shape of label matrix in fundamental dataset : {ys.shape}')

print(ys[ys == 0].shape)

randnum = random.randint(0, 100)
print(randnum)
random.seed(randnum)

# case 3
# Random choose the users which label is 0 in fundamental dataset

user_vecs_one = user_vecs[(ys == 1)[:, 0]]
user_vecs_zero = user_vecs[(ys == 0)[:, 0]]
num_users = user_vecs_one.shape[0]
num_zeros = user_vecs_zero.shape[0]
random.seed(randnum)

sample_row = np.random.choice(np.arange(num_zeros), size=np.ceil(num_users * 1.2).astype(int), replace=False)
user_vecs_zero = user_vecs_zero[sample_row]
user_vecs = np.concatenate((user_vecs_zero, user_vecs_one), axis=0)

# random.seed(44)
shuffle_row = np.random.choice(np.arange(0,len(user_vecs)), len(user_vecs), replace=False)
user_vecs = user_vecs[shuffle_row]
print(f'The shape of the users data in fundatmental dataset after balancing : {user_vecs.shape}')

# case3
# Random choose the cars which label is 0 in fundamental dataset

car_vecs_one = car_vecs[(ys == 1)[:, 0]]
car_vecs_zero = car_vecs[(ys == 0)[:, 0]][sample_row]
car_vecs = np.concatenate((car_vecs_zero, car_vecs_one), axis=0)

car_vecs = car_vecs[shuffle_row]
print(f'The shape of the cars data in fundatmental dataset after balancing : {car_vecs.shape}')

# case 3
# Random choose the label which is 0 in fundamental dataset
ys_one = ys[(ys == 1)[:, 0]]
ys_zero = ys[(ys == 0)[:, 0]][sample_row]
ys = np.concatenate((ys_zero, ys_one), axis=0)

ys = ys[shuffle_row]
print(f'The shape of label data in fundatmental dataset after balancing : {ys.shape}')

user_vecs_nnum = user_vecs[:, 0:col_last_num_feature_user-1]
user_vecs_num = user_vecs[:, (col_last_num_feature_user-1):(col_last_feature_user-1)]

car_vecs_nnum = car_vecs[:, col_last_num_feature_car-1:col_last_feature_car-1]
car_vecs_num = car_vecs[:, 0:col_last_num_feature_car-1]

print(user_vecs_nnum.shape)
print(user_vecs_num.shape)
print(car_vecs_nnum.shape)
print(car_vecs_num.shape)

The shape of user matrix in fundamental dataset : (702190, 8)
The shape of car matrix in fundamental dataset : (702190, 237)
The shape of label matrix in fundamental dataset : (702190, 1)
(697245,)
34
The shape of the users data in fundatmental dataset after balancing : (10879, 8)
The shape of the cars data in fundatmental dataset after balancing : (10879, 237)
The shape of label data in fundatmental dataset after balancing : (10879, 1)
(10879, 4)
(10879, 4)
(10879, 200)
(10879, 37)


In [7]:
# 初始化分词器和模型
tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')
model =  TFBertModel.from_pretrained('bert-base-chinese')

tokenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/110k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/269k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/624 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/412M [00:00<?, ?B/s]

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions w

In [4]:
#%%
# preprocessing the text
def text_clean(text):
    """
    - Remove entity mentions (eg. '@united')
    - Correct errors (eg. '&amp;' to '&')
    @param    text (str): a string to be processed.
    @return   text (Str): the processed string.
    """
    # Convert non-string text to a string (if it's NaN, it becomes the string 'nan')
    text = str(text) if text is not None else ""

    # Remove '@name'
    text = re.sub(r'(@.*?)[\s]', ' ', text)

    # Replace '&amp;' with '&'
    text = re.sub(r'&amp;', '&', text)

    # Remove trailing whitespace
    text = re.sub(r'\s+', '[SEP]', text).strip()
    # text =  "[CLS] " + text + "[SEP]"

    return text


#%%
def text_join(ar_features):
    features = []
    for r in range(ar_features.shape[0]):
        feature_row = []
        for c in range(ar_features.shape[1]):
            # Convert the element to string and append to the feature_row list
            feature_row.append(str(ar_features[r, c]))
        # Join the string representations of the features using a separator
        features.append(' '.join(feature_row))

    # Convert the list of strings to a NumPy array
    features = np.array(features, dtype=object)
    return features

In [5]:
user_nnum = text_join(user_vecs_nnum)
print(f'joined features of users shape is {user_nnum.shape}')

car_nnum = text_join(car_vecs_nnum)
print(f'joined features of cars shape is {user_nnum.shape}')

joined features of users shape is (10879,)
joined features of cars shape is (10879,)


In [10]:
## test join the features as inputs(features)
def embeddings_for_features(features, max_length_comments, batch_size_user=4):
    features_emb_list = []
    # Iterate over each user's features
    for batch_start_index in tqdm(range(0, len(features), batch_size_user), desc="Processing users"):
        batch_features = features[batch_start_index:batch_start_index+batch_size_user]
        batch_input_ids = []
        batch_attention_masks = []
        for feature_text in batch_features:
            clean_text = text_clean(feature_text if feature_text else "")
            # Tokenize and encode the text
            tokenized_inputs = tokenizer(
                text=clean_text,
                add_special_tokens=True,
                max_length=max_length_comments,
                padding='max_length',
                return_tensors="tf",
                truncation=True
            )
            batch_input_ids.append(tokenized_inputs["input_ids"][0])
            batch_attention_masks.append(tokenized_inputs["attention_mask"][0])
        batch_input_ids = tf.stack(batch_input_ids, axis=0)
        batch_attention_masks = tf.stack(batch_attention_masks, axis=0)
        outputs = model(batch_input_ids, attention_mask=batch_attention_masks)
        cls_embeddings = outputs.last_hidden_state.numpy()
        # Store the results in the list
        for user_features_embeddings in cls_embeddings:
            features_emb_list.append(user_features_embeddings)
            # Convert the list of embeddings to a tensor
    # features_emb_tensor = tf.convert_to_tensor(features_emb_list, dtype=tf.float32)
    return features_emb_list


In [11]:
def batch_convert_to_tensor(data_list, batch_size):
    """将大型列表分批次转换为Tensor"""
    for i in tqdm(range(0, len(data_list), batch_size), desc="Converting the list to tensor"):
        yield tf.convert_to_tensor(data_list[i:i + batch_size], dtype=tf.float32)

# 定义一个生成器函数来逐个加载.npy文件
def load_batches(directory, file_pattern, num_batches):
    for i in range(num_batches):
        file_path = os.path.join(directory, file_pattern+f'{i}')
        batch_data = np.load(file_path)
        yield batch_data

In [6]:
# 对所有数据进行拆分以获取训练集和测试集
(user_train_nnum, user_test_nnum,
 user_train_num, user_test_num,
 car_train_nnum, car_test_nnum,
 car_train_num, car_test_num,
 ys_train, ys_test) = train_test_split(user_nnum, user_vecs_num, car_nnum, car_vecs_num, ys, train_size=0.8, shuffle=True, stratify=ys, random_state=2023)

# 对测试集进行进一步拆分以获得验证集和最终的测试集
(user_val_nnum, user_test_nnum,
 user_val_num, user_test_num,
 car_val_nnum, car_test_nnum,
 car_val_num, car_test_num,
 ys_val, ys_test) = train_test_split(user_test_nnum, user_test_num, car_test_nnum, car_test_num, ys_test, train_size=0.5, shuffle=True, stratify=ys_test, random_state=2023)

# 分割文本为适应模型最大长度的多个部分
max_length_comments = 2048  # 模型的最大长度限制
max_length = 2700
# stride = 512  # 可以重叠的token数量
batch_size_user = 4
batch_size = 4

In [7]:
#%% # transfer the numpy features to tensors
user_train_num_tensor = tf.convert_to_tensor(user_train_num, dtype=tf.float32)
car_train_num_tensor = tf.convert_to_tensor(car_train_num, dtype=tf.float32)
ys_train_tensor = tf.convert_to_tensor(ys_train, dtype=tf.float32)

user_val_num_tensor = tf.convert_to_tensor(user_val_num, dtype=tf.float32)
car_val_num_tensor = tf.convert_to_tensor(car_val_num, dtype=tf.float32)
ys_val_tensor = tf.convert_to_tensor(ys_val, dtype=tf.float32)

#%%
print(user_train_num_tensor.shape)
print(car_train_num_tensor.shape)
print(ys_train_tensor.shape)

print(user_val_num_tensor.shape)
print(car_val_num_tensor.shape)
print(ys_val_tensor.shape)

(8703, 4)
(8703, 37)
(8703, 1)
(1088, 4)
(1088, 37)
(1088, 1)


In [14]:
#%% case 2 分别不同的特征单独保存
def serialize_example(feature, feature_name, chunk_size=1024):
    # 分割特征为多个块
    chunks = [feature[i:i + chunk_size] for i in range(0, len(feature), chunk_size)]
    features = {
        f"{feature_name}_chunk_{i}": tf.train.Feature(
            float_list=tf.train.FloatList(value=chunk.flatten())
        )
        for i, chunk in enumerate(chunks)
    }
    example_proto = tf.train.Example(features=tf.train.Features(feature=features))
    return example_proto.SerializeToString()

def save_features_to_tfrecord(emb_list, directory, file_pattern, feature_name, chunk_size=1024):
    tfrecord_file = os.path.join(directory, f"{file_pattern}.tfrecord")
    with tf.io.TFRecordWriter(tfrecord_file) as writer:
        for feature in tqdm(emb_list, desc="Saving features to TFRecord"):
            serialized_feature = serialize_example(feature, feature_name, chunk_size)
            writer.write(serialized_feature)


In [8]:
directory = '/content/'

In [16]:
user_train_emb_list = embeddings_for_features(user_train_nnum, max_length_comments=2048, batch_size_user=8)


Processing users: 100%|██████████| 1088/1088 [08:51<00:00,  2.05it/s]


In [17]:
# save_features_to_tfrecord(user_train_emb_list, directory, file_pattern='user_nn_train_feature')
save_features_to_tfrecord(user_train_emb_list,
                          directory,
                          file_pattern='user_nn_train_feature',
                          feature_name='user_nn_feature',
                          chunk_size=256
                          )



Saving features to TFRecord: 100%|██████████| 8703/8703 [49:04<00:00,  2.96it/s]


In [19]:
user_train_emb_list = []

In [20]:
car_train_emb_list = embeddings_for_features(car_train_nnum, max_length_comments=2700, batch_size_user=4)

Processing users: 100%|██████████| 2176/2176 [14:43<00:00,  2.46it/s]


In [21]:
save_features_to_tfrecord(car_train_emb_list,
                          directory,
                          file_pattern='car_nn_train_feature',
                          feature_name='car_nn_feature',
                          chunk_size=256
                          )

Saving features to TFRecord: 100%|██████████| 8703/8703 [1:04:54<00:00,  2.23it/s]


In [22]:
car_train_emb_list = []

# **user_val_emb_list & save to tfrecord ** **bold text**

In [15]:
user_val_emb_list = embeddings_for_features(user_val_nnum, max_length_comments=2048, batch_size_user=4)

Processing users: 100%|██████████| 272/272 [01:09<00:00,  3.90it/s]


In [16]:
# %% case 2
save_features_to_tfrecord(user_val_emb_list,
                          directory,
                          file_pattern='user_nn_val_feature',
                          feature_name='user_nn_feature',
                          chunk_size=256
                          )



Saving features to TFRecord: 100%|██████████| 1088/1088 [05:59<00:00,  3.02it/s]


In [34]:
user_val_emb_list = []

# **car_val_emb_list & save to tfrecord**

In [17]:
car_val_emb_list = embeddings_for_features(car_val_nnum, max_length, batch_size_user=4)


Processing users: 100%|██████████| 272/272 [01:49<00:00,  2.48it/s]


In [18]:
save_features_to_tfrecord(car_val_emb_list,
                          directory,
                          file_pattern='car_nn_val_feature',
                          feature_name='car_nn_feature',
                          chunk_size=256
                          )

Saving features to TFRecord: 100%|██████████| 1088/1088 [07:56<00:00,  2.28it/s]


In [33]:
car_val_emb_list = []

# **tfrecord解释及pipeline**

In [9]:
def parse_nn_features(example_proto, feature_name='user_nn_feature', num_chunks=256, nnshape=None):
    feature_description = {
        f"{feature_name}_chunk_{i}": tf.io.VarLenFeature(dtype=tf.float32)
        for i in range(num_chunks)
    }
    # 解析样本
    example = tf.io.parse_single_example(example_proto, feature_description)
    # 重新组合特征块
    feature_chunks = [tf.sparse.to_dense(example[f"{feature_name}_chunk_{i}"])
                      for i in range(num_chunks)]
    feature = tf.concat(feature_chunks, axis=0)
    # 根据需要重设形状
    feature = tf.reshape(feature, nnshape)
    return feature

# 解析其他 tensor 的函数
def parse_tensor_features(user_num_tensor, car_num_tensor, ys_train_tensor):
    return user_num_tensor, car_num_tensor, ys_train_tensor


In [10]:
#%% user_nn_val_dataset读取及解析
# 创建数据集
tfrecord_file = '/content/' \
                'user_nn_train_feature.tfrecord'
feature_name = 'user_nn_feature'
num_chunks = 256  # 您需要计算或记录这个值，它应该与序列化时分割的块数相匹配
nnshape=[2048, 768]

user_nn_train_dataset = tf.data.TFRecordDataset(tfrecord_file)
# 应用解析函数
user_nn_train_dataset = user_nn_train_dataset.map(lambda x: parse_nn_features(x, feature_name, num_chunks, nnshape))


#%%#%% car_nn_val_dataset读取及解析
# 创建数据集
tfrecord_file = '/content/' \
                'car_nn_train_feature.tfrecord'
feature_name = 'car_nn_feature'
num_chunks = 256  # 您需要计算或记录这个值，它应该与序列化时分割的块数相匹配
nnshape=[2700, 768]

car_nn_train_dataset = tf.data.TFRecordDataset(tfrecord_file)
# 应用解析函数
car_nn_train_dataset = car_nn_train_dataset.map(lambda x: parse_nn_features(x, feature_name, num_chunks, nnshape))


In [37]:
ys_train_tensor.shape

TensorShape([8703, 1])

In [38]:
user_train_num_tensor.shape

TensorShape([8703, 4])

In [11]:
other_tensors_train_dataset = tf.data.Dataset.from_tensor_slices((user_train_num_tensor, car_train_num_tensor, ys_train_tensor))
other_tensors_train_dataset = other_tensors_train_dataset.map(parse_tensor_features)

In [12]:
dataset = tf.data.Dataset.zip((user_nn_train_dataset, car_nn_train_dataset, other_tensors_train_dataset))
dataset = dataset.map(lambda user_feature, car_feature, other:((user_feature, car_feature) + other[:-1], other[-1]))


In [13]:
batch_size_data = 4  # 例如，根据您的内存情况调整这个值
dataset = dataset.batch(batch_size_data)
dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE)  # 使用 AUTOTUNE 来让 TensorFlow 决定最优的预取数量
dataset = dataset.cache()  # 缓存数据以供快速访问


# **building the model**

In [14]:
"""# building model"""
from tensorflow.keras.layers import Input, MultiHeadAttention, Dense, Concatenate, Flatten
from tensorflow.keras import Model

In [15]:
class MyCustomModel(Model):
    def __init__(self, num_heads, key_dim, ffn_units, dnn_units, **kwargs):
        super(MyCustomModel, self).__init__(**kwargs)

        # Multi-Head Attention layers
        self.multi_head_1 = MultiHeadAttention(num_heads=num_heads, key_dim=key_dim)
        self.multi_head_2 = MultiHeadAttention(num_heads=num_heads, key_dim=key_dim)

        # Front Feed layers
        self.ffn_1 = Dense(ffn_units, activation='relu')
        self.ffn_2 = Dense(ffn_units, activation='relu')

        # DNN layers
        self.dnn = [Dense(unit, activation='relu') for unit in dnn_units]

        # Output layer
        self.output_layer = Dense(1, activation='sigmoid')

    def call(self, inputs):
        text_feature1, text_feature2, numerical_feature1, numerical_feature2 = inputs

        # Attention for each text feature
        attn_out1 = self.multi_head_1(text_feature1, text_feature1, text_feature1)
        attn_out2 = self.multi_head_2(text_feature2, text_feature2, text_feature2)

        # Front Feed Network
        ffn_out1 = self.ffn_1(attn_out1)
        ffn_out2 = self.ffn_2(attn_out2)

        # Flatten features for concatenation
        flat_1 = Flatten()(ffn_out1)
        flat_2 = Flatten()(ffn_out2)

        # Concatenate all features
        concatenated = Concatenate()([flat_1, flat_2, numerical_feature1, numerical_feature2])

        # Pass through DNN
        x = concatenated
        for dnn_layer in self.dnn:
            x = dnn_layer(x)

        # Output
        return self.output_layer(x)

In [1]:
# Define the model parameters
NUM_HEADS = 2
KEY_DIM = 256
FFN_UNITS = 256
DNN_UNITS = [512, 256, 128]

# Inputs shapes
text_feature1_shape = (None, 2048, 768)
text_feature2_shape = (None, 2700, 768)
numerical_feature1_shape = (None, 4)
numerical_feature2_shape = (None, 37)

# 定义输入
text_feature1_input = Input(shape=(2048, 768))
text_feature2_input = Input(shape=(2700, 768))  # Adjusted to match text_feature1 length
numerical_feature1_input = Input(shape=(4,))
numerical_feature2_input = Input(shape=(37,))


# 创建模型实例
my_custom_model = MyCustomModel(num_heads=NUM_HEADS, key_dim=KEY_DIM, ffn_units=FFN_UNITS, dnn_units=DNN_UNITS)

# 通过模型传递输入，构建函数式模型
outputs = my_custom_model([text_feature1_input, text_feature2_input, numerical_feature1_input, numerical_feature2_input])
model = Model(inputs=[text_feature1_input, text_feature2_input, numerical_feature1_input, numerical_feature2_input], outputs=outputs)

opt = keras.optimizers.Adam(learning_rate=0.08)

# 编译模型
model.compile(optimizer=opt, loss='mean_squared_error')

# 模型摘要
model.summary()

NameError: ignored

In [None]:
# 超参
early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss',
                                                  patience=60, # number of epochs to wait
                                                  restore_best_weights=True)

reduce_lr_callback = tf.keras.callbacks.ReduceLROnPlateau(
    monitor='val_loss', # monitor the validation loss
    factor=0.5,         # reduce the learning rate to 10% of its current value
    patience=30,         # reduce the learning rate if the metric does not improve for 5 consecutive epochs
    min_lr=1e-7,        # set the minimum learning rate
    verbose=1           # print messages about learning rate reduction
)

# 创建ModelCheckpoint回调
checkpoint_filepath = '/content/best_model2.h5'
model_checkpoint_callback = ModelCheckpoint(
    filepath=checkpoint_filepath,
    save_best_only=True,
    monitor='val_loss',
    mode='min',
    verbose=1,
    save_weights_only=False # 保存整个模型。如果只想保存权重，设置为True。
)


In [None]:
history = model.fit(
        dataset,  # 提供训练数据和标签
        epochs=3,  # 总的迭代周期

        verbose=1  # 打印详细训练日志
)



Epoch 1/3
   1372/Unknown - 298s 210ms/step - loss: 0.4472