In [None]:
# 准备环境
!pip install transformers
!pip install datasets
!pip install tensorflow
!pip install nltk==3.5
!pip install evaluate

In [None]:
# Fetch the uploaded dataset by cloning its GitHub repository.
!git clone https://github.com/Kh-Chin/transformer-data.git

In [None]:
# Merge the per-domain CSV files into a single DataFrame.
import numpy as np
import pandas as pd
import glob
import os


path = '/content/transformer-data'
# glob returns files in arbitrary order; sorting keeps the row order (and so the
# integer index of raw_df) identical from run to run.
all_files = sorted(glob.glob(os.path.join(path , "*.csv")))
if not all_files:
    # Fail with a clear message instead of pd.concat's "No objects to concatenate".
    raise FileNotFoundError(f"No CSV files found in {path}")

li = []

for filename in all_files:
    # The files are pipe-separated; their first column is used as the index.
    file = pd.read_csv(filename, index_col=0, sep="|")
    print(f'file {filename} read!')
    li.append(file)

# ignore_index=True discards the per-file indices and renumbers rows from 0.
raw_df = pd.concat(li, axis=0, ignore_index=True)
print("Read successful!")


In [None]:
# Preview the first rows of the merged frame.
raw_df.head()

In [None]:
# df = df.dropna()
# df

In [None]:
# Drop exact duplicate rows and rows that are empty in every column. dropna()
# returns a new frame, so its result has to be assigned to take effect.
df = raw_df.drop_duplicates().dropna(how='all')
# Keep only rows that have a job description. The explicit copy makes df an
# independent frame, so later column assignments do not operate on a slice of
# raw_df (which would trigger SettingWithCopyWarning).
df = df[~df['Job desc'].isna()].copy()

In [None]:
# Number of rows and columns left after cleaning.
df.shape

In [None]:
# Pre-create the derived columns, filled with None, in the order they are built.
for derived_column in ('cleaner_Job_desc', 'Input', 'text', 'cleaner_text'):
    df[derived_column] = None

In [None]:
# Data preprocessing and model-input construction.
import re
def preprocessing(df):
  """Build the prompt and training-text columns for every job posting.

  Reads the 'Job title', 'Skills' and 'Job desc' columns and fills:
    * 'cleaner_Job_desc' - 'Job desc' with the leading "About us:" section removed,
    * 'Input'            - the prompt: <bos>, a header line naming the title and
                           skills, then <desc>,
    * 'text'             - 'Input' followed by 'Job desc' and a closing <eos> line,
    * 'cleaner_text'     - 'Input' followed by 'cleaner_Job_desc' and a closing
                           <eos> line.

  Args:
    df: DataFrame with string-valued 'Job title', 'Skills' and 'Job desc' columns.

  Returns:
    The same DataFrame object; the four columns above are written in place.
  """
  # An "About us:" section runs from a line starting with that heading up to the
  # next recognised section heading (or the end of the text).
  pat = r"^About us:.*?\n(?=(Responsibilities:|Education, Qualifications & Experience:|Skills & Abilities:|Other:|About us:|\Z))"
  about_us = re.compile(pat, flags=re.I|re.M|re.S)

  cleaner_descs, inputs, texts, cleaner_texts = [], [], [], []
  for title, skills, desc in zip(df['Job title'], df['Skills'], df['Job desc']):
    # Work with local values: a row yielded by iterrows() is a snapshot and does
    # not reflect columns written during the same iteration.
    cleaner_desc = about_us.sub("", desc)
    prompt = f"<bos>\nJob Description for {title} which uses {skills} skill:\n<desc>\n"
    cleaner_descs.append(cleaner_desc)
    inputs.append(prompt)
    texts.append(prompt + desc + "\n<eos>")
    cleaner_texts.append(prompt + cleaner_desc + "\n<eos>")

  # Assign whole columns at once (positional, so any index works) instead of
  # writing one cell at a time with .loc.
  df['cleaner_Job_desc'] = cleaner_descs
  df['Input'] = inputs
  df['text'] = texts
  df['cleaner_text'] = cleaner_texts
  return df

In [None]:
# Fill in the derived text columns on df.
df = preprocessing(df)

In [None]:
# List the columns now present on df.
df.columns

In [None]:
# Inspect one example (index label 106): full text, cleaned text, then the prompt alone.
sample_row = df.loc[106]
print(sample_row['text'])
print(sample_row['cleaner_text'])
print(sample_row['Input'])

In [None]:
from datasets import Dataset
# Only the prompt and the full training text are carried into the dataset.
clean_df = df[['Input','text']]

# After de-duplication and filtering the frame's index may have gaps, and
# from_pandas stores any non-range index as an extra "__index_level_0__" column.
# preserve_index=False keeps the dataset to the two intended columns.
clean_df = Dataset.from_pandas(clean_df, preserve_index=False)
print(clean_df)

In [None]:
# Build the tokenizer and the model.
import tensorflow as tf
from transformers import BertTokenizer, TFBertLMHeadModel, GPT2Tokenizer, TFGPT2LMHeadModel

# Special tokens to register with the tokenizer below.
SPECIAL_TOKENS_MAPPING = {
    'bos_token': '<bos>',
    'eos_token': '<eos>',
    'pad_token': '<pad>',
    'additional_special_tokens': ['<desc>']
}

# Pretrained GPT-2 tokenizer (padding on the right) and TensorFlow LM-head model.
tokenizer = GPT2Tokenizer.from_pretrained('gpt2', padding_side='right')
model = TFGPT2LMHeadModel.from_pretrained('gpt2')

# Vocabulary size before the special tokens are added; must be read first.
orig_num_tokens = len(tokenizer.get_vocab())
# add_special_tokens returns how many tokens were added to the vocabulary.
num_special_tokens = tokenizer.add_special_tokens(SPECIAL_TOKENS_MAPPING)

# Resize the model's token embeddings to the enlarged vocabulary.
model.resize_token_embeddings(new_num_tokens=orig_num_tokens + num_special_tokens)

In [None]:
# 进行输入编码
import psutil

def tokenize_dataset(data):
    """Tokenize examples for causal-LM fine-tuning.

    Args:
        data: mapping with a 'text' entry holding either one string or a list of
            strings (the latter when called through ``Dataset.map(batched=True)``).

    Returns:
        dict with 'input_ids' and 'attention_mask' as produced by the tokenizer
        (truncated / padded to 1024 tokens) and 'labels', a copy of 'input_ids'
        in which every padding position is replaced by -100.
    """
    def mask_padding(ids, mask):
        # -100 is the label value the Transformers LM loss ignores; without it the
        # loss would mostly measure how well the model predicts padding.
        return [token_id if keep else -100 for token_id, keep in zip(ids, mask)]

    # Keys of the returned dictionary will be added to the dataset as columns
    input_tokens = tokenizer(data['text'], truncation=True, max_length=1024, padding='max_length')
    input_ids = input_tokens['input_ids']
    attention_mask = input_tokens['attention_mask']
    if isinstance(data['text'], str):
        # Single example: the tokenizer returns flat lists.
        labels = mask_padding(input_ids, attention_mask)
    else:
        # Batch: one list of ids / mask values per example.
        labels = [mask_padding(ids, mask) for ids, mask in zip(input_ids, attention_mask)]
    return {'input_ids': input_ids,
            'labels': labels,
            'attention_mask': attention_mask
    }
# CPU count reported by psutil, used as the number of worker processes for map().
CPU_COUNT = psutil.cpu_count()
# batched=True hands tokenize_dataset whole batches of rows at a time.
tokenized_df = clean_df.map(tokenize_dataset, batched=True, num_proc=CPU_COUNT)
# tf_dataset = model.prepare_tf_dataset(dataset, batch_size=16, shuffle=True, tokenizer=tokenizer)
print(tokenized_df)

In [None]:
# Sanity check: length of the last example's labels, then the first example's token ids.
print(len(tokenized_df['labels'][-1]))
print(tokenized_df['input_ids'][0])

In [None]:
# Split the dataset into train and test parts (20% held out for testing).
# A fixed seed makes the shuffle, and therefore the split, reproducible across runs.
split_df = tokenized_df.train_test_split(test_size=0.2, seed=42)
print(split_df)

In [None]:
# Install the Hugging Face Hub client and log in from the command line.
!pip install huggingface_hub
!huggingface-cli login

In [None]:
# Upload the train/test splits to the Hub repository 'keehuachin/clean'.
split_df.push_to_hub('keehuachin/clean')

In [None]:
# Show the text of the first test example.
split_df['test']['text'][0]

In [None]:
import datasets
# Reload the dataset from the Hub repository 'keehuachin/clean'.
split_df = datasets.load_dataset("keehuachin/clean")

In [None]:
# Convert both splits into tf.data datasets.
# Settings shared by the train and test conversions.
tf_dataset_options = dict(
    columns=['input_ids', 'attention_mask'],
    label_cols=['labels'],
    batch_size=8,
    drop_remainder=True,
)

# Only the training split is shuffled.
train_df = split_df['train'].to_tf_dataset(shuffle=True, **tf_dataset_options)

test_df = split_df['test'].to_tf_dataset(shuffle=False, **tf_dataset_options)

In [None]:
# Print the two converted datasets.
print(train_df)
print(test_df)

In [None]:
# Install extra packages.
# NOTE(review): rouge_score presumably backs the ROUGE metric loaded later, and no
# use of deepspeed is visible in this notebook - confirm both are needed.
!pip install rouge_score
!pip install deepspeed

In [None]:
# Train the model.
optimizer=tf.keras.optimizers.Adam(3e-5)
# No loss is passed to compile(): Transformers TF models then fall back to their
# built-in loss computed from the 'labels' supplied by the dataset.
model.compile(
	optimizer=optimizer)

# train_df / test_df are already batched (batch_size=8 in to_tf_dataset), and Keras
# asks that batch_size not be given to fit() for dataset inputs.
model.fit(train_df, validation_data=test_df, epochs=5)

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive.
drive.mount('/content/drive')

In [None]:
%cd /content/drive/My Drive/Colab Notebooks/
model.save_pretrained("keehua-gpt2-final-clean", from_pt=True)

In [None]:
# 将模型进行存取和读取
import tensorflow as tf
from transformers import BertTokenizer, TFBertLMHeadModel, GPT2Tokenizer, TFGPT2LMHeadModel

from google.colab import drive
drive.mount('/content/drive')
%cd /content/drive/My Drive/Colab Notebooks/

tokenizer = GPT2Tokenizer.from_pretrained('gpt2', padding_side='left')
t_model = TFGPT2LMHeadModel.from_pretrained('/content/drive/My Drive/Colab Notebooks/keehua-gpt2-all-final-1', pad_token_id=tokenizer.eos_token_id)
SPECIAL_TOKENS_MAPPING = {
    'bos_token': '<bos>',
    'eos_token': '<eos>',
    'pad_token': '<pad>',
    'additional_special_tokens': ['<desc>']
}

orig_num_tokens = len(tokenizer.get_vocab())
num_special_tokens = tokenizer.add_special_tokens(SPECIAL_TOKENS_MAPPING)

In [None]:
# Install the rouge_score package.
!pip install rouge_score

In [None]:
import datasets
# Load the dataset from the Hub repository 'keehuachin/clean' and display it.
split_df = datasets.load_dataset("keehuachin/clean")
split_df

In [None]:
from transformers import TFGPT2LMHeadModel, GPT2Tokenizer
import datasets
from tqdm import tqdm
import evaluate

path = '/content/drive/My Drive/Colab Notebooks/keehua-gpt2-final-clean'
SPECIAL_TOKENS_MAPPING = {
    'bos_token': '<bos>',
    'eos_token': '<eos>',
    'pad_token': '<pad>',
    'additional_special_tokens': ['<desc>']
}

tokenizer = GPT2Tokenizer.from_pretrained('gpt2', padding_side='left')
orig_num_tokens = len(tokenizer.get_vocab())
# Adding the special tokens defines the pad / eos tokens used from here on.
num_special_tokens = tokenizer.add_special_tokens(SPECIAL_TOKENS_MAPPING)
model = TFGPT2LMHeadModel.from_pretrained(path, pad_token_id=tokenizer.eos_token_id)

split_df = datasets.load_dataset("keehuachin/clean")

# Generate a prediction for every test prompt and collect the references
# for the evaluation metrics.
pred_list = []
ref_list = []
df = split_df['test']
meteor = evaluate.load("meteor")
bleu = evaluate.load("bleu")
rouge = evaluate.load("rouge")
# Iterate over rows directly (instead of re-reading whole columns by position on
# every step) and report progress with tqdm.
for example in tqdm(df, desc="generating"):
    # One prompt at a time, so no padding is needed; prompts longer than
    # 128 tokens are truncated.
    tokens = tokenizer(example['Input'], truncation=True, max_length=128, return_tensors='tf')
    pred = model.generate(tokens['input_ids'],
                        # Pass the mask explicitly so generate() does not have to infer it.
                        attention_mask = tokens['attention_mask'],
                        max_length = 1024,
                        no_repeat_ngram_size = 3,
                        early_stopping = True,
                        # Stop at the '<eos>' token the model was fine-tuned with.
                        eos_token_id = tokenizer.eos_token_id,
                        pad_token_id = tokenizer.eos_token_id
                       )
    # NOTE(review): special tokens are stripped from the prediction, while the
    # reference 'text' still contains the literal <bos>/<desc>/<eos> markers -
    # confirm this asymmetry is intended for the metrics.
    pred_list.append(tokenizer.decode(pred[0], skip_special_tokens=True))
    ref_list.append(example['text'])




In [None]:
# Score the generated texts against the references.
# BLEU is given each reference wrapped in its own single-element list.
bleu_references = [[reference] for reference in ref_list]
meteor_result = meteor.compute(predictions=pred_list, references=ref_list)
meteor_score = meteor_result['meteor']
bleu_result = bleu.compute(predictions=pred_list, references=bleu_references)
bleu_score = bleu_result['bleu']
rouge_score = rouge.compute(predictions=pred_list, references=ref_list)

print(f"""
meteor_score: {meteor_score}
bleu_score: {bleu_score}
rouge_score: {rouge_score}
""")

In [None]:
import pandas as pd
# One row per test example: column 0 holds the prediction, column 1 the reference.
output = pd.DataFrame([pred_list, ref_list]).transpose()
# Show at most five examples; bounding by len(output) avoids a KeyError when
# fewer than five predictions exist.
for i in range(min(5, len(output))):

  print(f"{i} Pred:")
  print(output.loc[i,0])
  print(f"{i} True:")
  print(output.loc[i,1])

  print("Done!")

In [None]:
# Save the predictions and references to a CSV file on Google Drive.
file_path = "/content/drive/MyDrive/Colab Notebooks/clean_pred.csv"
output.to_csv(file_path)