In [1]:
!pip install transformers 

Defaulting to user installation because normal site-packages is not writeable


In [2]:
# print fine-tuning progress
from transformers import TrainerCallback

class ProgressCallback(TrainerCallback):
    def on_epoch_end(self, args, state, control, **kwargs):
        print(f"Epoch {state.epoch} ended. Loss: {state.log_history[-1]['loss']:.4f}")

    def on_log(self, args, state, control, logs=None, **kwargs):
        if 'loss' in logs:
            print(f"Step {state.global_step}: Loss: {logs['loss']:.4f}")

In [10]:
import pandas as pd

# 加载数据
similarity_data = pd.read_csv('product_similarity.csv', index_col=0)
amazon_data = pd.read_csv('amazon.csv')

# 转换相似度矩阵为三元组形式
similarity_triplets = []
for i in similarity_data.index:
    for j in similarity_data.columns:
        similarity_triplets.append((i, j, similarity_data.at[i, j]))

similarity_triplets = pd.DataFrame(similarity_triplets, columns=['product_id_1', 'product_id_2', 'similarity'])

In [11]:
# 尝试将 similarity 列转换为浮点数，无法转换的将被过滤掉
similarity_triplets = similarity_triplets[pd.to_numeric(similarity_triplets['similarity'], errors='coerce').notnull()]

# 将所有值转换为适当的类型
similarity_triplets['product_id_1'] = similarity_triplets['product_id_1'].astype(str)
similarity_triplets['product_id_2'] = similarity_triplets['product_id_2'].astype(str)
similarity_triplets['similarity'] = similarity_triplets['similarity'].astype(float)

In [12]:
# 合并数据
merged_data = similarity_triplets.merge(amazon_data, left_on='product_id_1', right_on='product_id', how='left')
merged_data = merged_data.drop(columns=['product_id'])
merged_data = merged_data.merge(amazon_data, left_on='product_id_2', right_on='product_id', how='left', suffixes=('_1', '_2'))

# 选择需要的列
merged_data = merged_data[['product_name_1', 'about_product_1', 'product_name_2', 'about_product_2', 'similarity']]

In [31]:
from datasets import Dataset

# 确保数据类型正确后再转换为Dataset对象
dataset = Dataset.from_pandas(merged_data)

from transformers import AutoTokenizer

# 加载tokenizer
tokenizer = AutoTokenizer.from_pretrained("tokenizer.json")
# from transformers import AutoTokenizer

# tokenizer = AutoTokenizer.from_pretrained("stablelm-2-zephyr-1_6b", token="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")

# 定义预处理函数
def preprocess_function(examples):
    texts = [f"Product 1: {name1} {desc1} Product 2: {name2} {desc2}" for name1, desc1, name2, desc2 in zip(examples['product_name_1'], examples['about_product_1'], examples['product_name_2'], examples['about_product_2'])]
    inputs = tokenizer(texts, max_length=512, truncation=True, padding='max_length')
    inputs["labels"] = examples["similarity"]
    return inputs

# 应用预处理函数
encoded_dataset = dataset.map(preprocess_function, batched=True)

# 划分训练和验证集
encoded_dataset = encoded_dataset.train_test_split(test_size=0.1)
train_dataset = encoded_dataset['train']
eval_dataset = encoded_dataset['test']

OSError: tokenizer.json is not a local folder and is not a valid model identifier listed on 'https://huggingface.co/models'
If this is a private repository, make sure to pass a token having permission to this repo either by logging in with `huggingface-cli login` or by passing `token=<your_token>`

In [26]:
import os

# 列出当前工作目录内容
print(os.listdir('/home/uc3f57f00092eb0bd6e94396bf95c473/stablelm-2-zephyr-1_6b/tokenizer.json'))

['stablelm-2-zephyr-1_6b-Q4_1.gguf', 'stablelm-2-zephyr-1_6b-OpenVINO-4bit.xml', 'stablelm-2-zephyr-1_6b-Q4_0.gguf', 'stablelm-2-zephyr-1_6b.gguf', 'stablelm-2-zephyr-1_6b-OpenVINO-4bit.bin', 'merges.txt', 'tokenizer_config.json', 'modeling_stablelm.py', 'special_tokens_map.json', 'tokenizer.json', 'configuration_stablelm.py', 'config.json', '.gitattributes', 'model.safetensors', 'README.md', 'LICENSE', '.git', 'stablelm-2-zephyr-1_6b-Q8_0.gguf', 'generation_config.json', 'vocab.json', 'stablelm-2-zephyr-1_6b-Q5_K_M.gguf']


In [None]:
import torch
from transformers import AutoTokenizer, Trainer, TrainingArguments
from transformers import DataCollatorWithPadding

# 加载配置和模型
from configuration_stablelm import StableLMConfig
from modeling_stablelm import StableLMEpochForCausalLM

config = StableLMConfig.from_pretrained("config.json")
model = StableLMEpochForCausalLM.from_pretrained("model.safetensors", config=config)
tokenizer = AutoTokenizer.from_pretrained("tokenizer.json")

# 使用数据整理器进行数据整理
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# 设置训练参数
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    num_train_epochs=3,
    weight_decay=0.01,
    save_steps=10_000,
    save_total_limit=2,
    fp16=True,  # 使用混合精度训练以加速训练
    logging_dir='./logs',
    logging_steps=500
)

# 自定义回调函数
from transformers import TrainerCallback

class ProgressCallback(TrainerCallback):
    def on_epoch_end(self, args, state, control, **kwargs):
        print(f"Epoch {state.epoch} ended. Loss: {state.log_history[-1]['loss']:.4f}")

    def on_log(self, args, state, control, logs=None, **kwargs):
        if 'loss' in logs:
            print(f"Step {state.global_step}: Loss: {logs['loss']:.4f}")

# 创建Trainer对象并添加自定义回调
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    data_collator=data_collator,
    callbacks=[ProgressCallback]
)

# 训练模型
trainer.train()

In [None]:
# 定义推理函数
def predict_similarity(product_name_1, about_product_1, product_name_2, about_product_2):
    text = f"Product 1: {product_name_1} {about_product_1} Product 2: {product_name_2} {about_product_2}"
    inputs = tokenizer(text, return_tensors="pt", max_length=512, truncation=True, padding='max_length')
    outputs = model(**inputs)
    similarity = outputs.logits.squeeze().item()
    return similarity

# 测试推理
product_name_1 = "Product A"
about_product_1 = "This is the description of product A."
product_name_2 = "Product B"
about_product_2 = "This is the description of product B."
similarity_score = predict_similarity(product_name_1, about_product_1, product_name_2, about_product_2)
print(f"Similarity score: {similarity_score}")

In [None]:
# 训练结束后保存模型和tokenizer
model.save_pretrained('./trained_model')
tokenizer.save_pretrained('./trained_model')

In [None]:
!pip install shutil

In [None]:
import shutil

# 压缩模型文件夹
shutil.make_archive('trained_model', 'zip', './trained_model')

# 下载压缩文件（使用Jupyter Notebook中的文件浏览器下载）

In [None]:
# Follow Tutorial below

In [1]:
conda create -n llm python=3.9
conda activate llm
pip install --pre --upgrade bigdl-llm[xpu] -f https://developer.intel.com/ipex-whl-stable-xpu


SyntaxError: invalid syntax (2633847937.py, line 1)