In [None]:
!pip install transformers
!pip install accelerate -U
!pip install datasets

In [None]:
import pandas as pd
from tqdm import tqdm

In [None]:
# Log in to the Hugging Face Hub (the training cell uploads the fine-tuned model with push_to_hub).
from huggingface_hub import notebook_login
notebook_login()

In [None]:
# Load the training split; the columns used later are `x_input` (review text) and `y_output` (target string).
train_df = pd.read_csv("/kaggle/input/ds200-res16/res16/big_train.csv")
# Preview the first rows.
train_df.head()

Unnamed: 0,x_input,y_output
0,We have gone for dinner only a few times but t...,"service, service general, positive, great; din..."
1,"Its dark , and cozy . . there is always jazz m...","NULL, ambience general, positive, cozy"
2,This place has great indian chinese food .,"indian chinese food, food quality, positive, g..."
3,Not what I would expect for the price and pres...,"location, restaurant prices, neutral, expect; ..."
4,Finally a reliable Chinese restaurant !,"Chinese restaurant, restaurant general, positi..."


In [None]:
from datasets import Dataset, DatasetDict

# Convert the training frame to a Hugging Face dataset and register it as the "train" split.
tds = Dataset.from_pandas(train_df)
dataset_dict = DatasetDict({"train": tds})
print(dataset_dict)

DatasetDict({
    train: Dataset({
        features: ['x_input', 'y_output'],
        num_rows: 1580
    })
})


In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from torch.utils.data import DataLoader
from transformers import default_data_collator, get_linear_schedule_with_warmup
from transformers import AutoModelForSeq2SeqLM

# Base checkpoint to fine-tune.
# NOTE(review): VietAI/vit5-large appears to be an alternative checkpoint the author considered — confirm.
tokenizer_T5 = AutoTokenizer.from_pretrained('T5-base')
model_T5 = AutoModelForSeq2SeqLM.from_pretrained('T5-base', device_map="auto")

2024-06-13 16:38:15.479708: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-06-13 16:38:15.479758: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-06-13 16:38:15.481294: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [None]:
# Tokenize every target and input string once, only to measure token lengths.
# No max_length is given, so the tokenizer warns and falls back to no truncation (see the warning in the output).
output_encodings = tokenizer_T5(train_df["y_output"].tolist(), truncation=True, padding=True)
input_encodings = tokenizer_T5(train_df["x_input"].tolist(), truncation=True, padding=True)

def find_max_list(list):
    """Print the length of the longest sequence in ``list``.

    Args:
        list: An iterable of sized sequences (for example lists of token ids).
            The parameter name shadows the builtin and is kept unchanged so
            existing keyword callers keep working.

    Prints ``0`` for an empty iterable instead of raising ``ValueError``.
    Returns ``None``; the result is only printed.
    """
    # `default=0` covers the empty-input case that made bare max() raise.
    print(max((len(seq) for seq in list), default=0))

# Print the longest tokenized length, first for the inputs and then for the targets.
find_max_list(input_encodings["input_ids"])
find_max_list(output_encodings["input_ids"])

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


113
91


In [None]:
# Fixed token lengths used for padding/truncation of inputs and targets.
# The longest sequences measured above were 113 (inputs) and 91 (targets) tokens, so 256 leaves headroom.
max_input_length = 256
max_output_length = 256
# Names of the dataset columns holding the source text and the target text.
text_column = "x_input"
label_column = "y_output"

def preprocess_function(sample,padding="max_length"):
    """Tokenize a batch of examples into model inputs and labels.

    Args:
        sample: A batch mapping column names to lists of strings; the
            ``text_column`` and ``label_column`` entries are read.
        padding: Padding strategy forwarded to the tokenizer. With the
            default ``"max_length"`` the padded label positions are replaced
            by ``-100``.

    Returns:
        The tokenizer output for the source text with an added ``"labels"``
        entry holding the target token ids.

    Reads the module-level ``tokenizer_T5``, ``max_input_length`` and
    ``max_output_length``.
    """
    # Source text goes through the regular `text` argument; `text_target` is
    # reserved for the labels below.
    model_inputs = tokenizer_T5(sample[text_column], max_length=max_input_length, padding=padding, truncation=True)
    labels = tokenizer_T5(text_target=sample[label_column], max_length=max_output_length, padding=padding, truncation=True)
    if padding == "max_length":
        # Replace pad token ids in the labels with -100 (the label pad value
        # used throughout this notebook).
        pad_id = tokenizer_T5.pad_token_id
        labels["input_ids"] = [
            [(token if token != pad_id else -100) for token in label] for label in labels["input_ids"]
        ]

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# Tokenize every split in batches and drop the raw text columns so only model features remain.
tokenized_dataset = dataset_dict.map(preprocess_function, batched=True, remove_columns=["x_input", "y_output"])
# Show which feature columns the tokenized training split contains.
print(f"Keys of tokenized dataset: {list(tokenized_dataset['train'].features)}")

Map:   0%|          | 0/1580 [00:00<?, ? examples/s]

Keys of tokenized dataset: ['input_ids', 'attention_mask', 'labels']


In [None]:
from transformers import DataCollatorForSeq2Seq

# Label value for padded positions, so tokenizer padding is ignored in the loss.
label_pad_token_id = -100
# Batch collator for seq2seq training.
data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer_T5,
    model=model_T5,
    label_pad_token_id=label_pad_token_id,
)

In [None]:
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments, Trainer, TrainingArguments



# Output directory name; the captured commit URL shows it is also the Hub repository name.
model_name = "DS200-big_train-res16-T5"
batch_size = 16
# Whole batches per epoch; only consulted when logging_strategy is "steps".
logging_steps = len(tokenized_dataset["train"]) // batch_size
epochs = 30

# Define training args
training_args = TrainingArguments(
    output_dir=model_name,
    #auto_find_batch_size= True,
    per_device_train_batch_size=batch_size,
    learning_rate=3e-4,  # higher learning rate
    num_train_epochs=epochs,  # previously a hard-coded 30 that ignored `epochs`
    logging_strategy="epoch",
    logging_steps=logging_steps,
    disable_tqdm=False,
    push_to_hub=True,
)

# Create Trainer instance
trainer_T5 = Trainer(
    model=model_T5,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_dataset["train"],
    tokenizer=tokenizer_T5,
)
#model_T5.config.use_cache = False  # silence the warnings. Please re-enable for inference!

# Train the model, then upload the final model to the Hub.
trainer_T5.train()
trainer_T5.push_to_hub()

[34m[1mwandb[0m: Currently logged in as: [33m21521514[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss
99,0.8843
198,0.2886
297,0.1821
396,0.1262
495,0.0909
594,0.0734
693,0.0624
792,0.0453
891,0.0357
990,0.0319


events.out.tfevents.1718296703.4230cb7e35c2.469.0:   0%|          | 0.00/12.4k [00:00<?, ?B/s]

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

model.safetensors:   0%|          | 0.00/892M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/ThuyNT03/DS200-big_train-res16-T5/commit/a5273205a6e31cb08c51baf2f2ef7434f9c6b9a9', commit_message='End of training', commit_description='', oid='a5273205a6e31cb08c51baf2f2ef7434f9c6b9a9', pr_url=None, pr_revision=None, pr_num=None)

### Inference

In [None]:
def get_prediction_seqtrainer(review,tokenizer_T5,model_T5,do_sample=True):
    """Generate the model's output string for a single review.

    Args:
        review: Input text passed to the tokenizer.
        tokenizer_T5: Tokenizer used to encode ``review`` and decode the
            generated ids.
        model_T5: Seq2seq model exposing ``generate`` and ``device``.
        do_sample: Forwarded to ``generate``. Defaults to ``True`` (the
            original behaviour, so repeated calls may return different
            text); pass ``False`` for reproducible greedy decoding.

    Returns:
        The first decoded sequence with special tokens removed.

    Reads the module-level ``max_input_length`` and ``max_output_length``.
    """
    encoded = tokenizer_T5(review, max_length=max_input_length, return_tensors="pt", padding="max_length", truncation=True)
    # Follow the model's own device instead of hard-coding .cuda(), so the
    # helper also works when the model lives on the CPU.
    device = model_T5.device
    outputs = model_T5.generate(
        input_ids=encoded.input_ids.to(device),
        # The input is padded to max_length, so pass the mask explicitly
        # rather than relying on generate() to infer it.
        attention_mask=encoded.attention_mask.to(device),
        max_new_tokens=max_output_length,
        do_sample=do_sample,
    )
    pred = tokenizer_T5.batch_decode(outputs.detach().cpu().numpy(), skip_special_tokens=True)[0]
    return pred

In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from torch.utils.data import DataLoader
from transformers import default_data_collator, get_linear_schedule_with_warmup
from transformers import AutoModelForSeq2SeqLM
# Token length limits; get_prediction_seqtrainer reads these module-level names.
max_input_length = 256
max_output_length = 256
# Reload the fine-tuned checkpoint from the Hub and move the model to the GPU.
model_id = 'ThuyNT03/DS200-big_train-res16-T5'
tokenizer_T5 = AutoTokenizer.from_pretrained(model_id)
model_T5 = AutoModelForSeq2SeqLM.from_pretrained(model_id).cuda()
# Smoke test on a single review.
print(get_prediction_seqtrainer("I waited for 10-15 minutes for service ordered a beer & was never served again .", tokenizer_T5,model_T5))


model.safetensors:   0%|          | 0.00/892M [00:00<?, ?B/s]

NULL, service general, negative, waited for 10-15 minutes


In [None]:
# Spot-check the model on a second review.
print(get_prediction_seqtrainer("The ambiance was a peaceful and relaxing break amongst all the kids running around in Downtown Disney .", tokenizer_T5,model_T5))


ambiance, ambience general, positive, peaceful; ambiance, ambience general, positive, relaxing


## Test set

In [None]:
# Read the test split and pull the review texts and reference outputs into plain lists.
test_df = pd.read_csv("/kaggle/input/ds200-res16/res16/test.csv")
test_input = test_df['x_input'].tolist()
test_output = test_df['y_output'].tolist()
# Peek at the first five inputs and references, then preview the frame.
print(test_input[:5])
print(test_output[:5])
test_df.head()

['I waited for 10-15 minutes for service ordered a beer & was never served again .', 'The ambiance was a peaceful and relaxing break amongst all the kids running around in Downtown Disney .', 'hidden little jem', 'I went there with a friend from out of town ... and we were both very impressed !', 'It has great sushi and even better service .']
['service, service general, negative, never served again', 'ambiance, ambience general, positive, peaceful', 'NULL, restaurant general, positive, hidden', 'NULL, restaurant general, positive, impressed', 'sushi, food quality, positive, great; service, service general, positive, better']


Unnamed: 0,x_input,y_output
0,I waited for 10-15 minutes for service ordered...,"service, service general, negative, never serv..."
1,The ambiance was a peaceful and relaxing break...,"ambiance, ambience general, positive, peaceful"
2,hidden little jem,"NULL, restaurant general, positive, hidden"
3,I went there with a friend from out of town .....,"NULL, restaurant general, positive, impressed"
4,It has great sushi and even better service .,"sushi, food quality, positive, great; service,..."


In [None]:
# Generate one prediction per test sentence; tqdm reports progress.
test_pred = [
    get_prediction_seqtrainer(sen, tokenizer_T5, model_T5)
    for sen in tqdm(test_input)
]
# Store the predictions next to the references and write the results to disk.
test_df['predict'] = test_pred
test_df.to_csv('/kaggle/working/res16_test_result.csv')


100%|██████████| 544/544 [02:39<00:00,  3.40it/s]


In [None]:
# Write the results file again (same path as in the previous cell) and preview the frame with its `predict` column.
test_df.to_csv('/kaggle/working/res16_test_result.csv')
test_df.head()

Unnamed: 0,x_input,y_output,predict
0,I waited for 10-15 minutes for service ordered...,"service, service general, negative, never serv...","NULL, service general, negative, waited for 10..."
1,The ambiance was a peaceful and relaxing break...,"ambiance, ambience general, positive, peaceful","ambiance, ambience general, positive, peaceful..."
2,hidden little jem,"NULL, restaurant general, positive, hidden","jem, food style_options, negative, hidden"
3,I went there with a friend from out of town .....,"NULL, restaurant general, positive, impressed","NULL, restaurant general, positive, impressed"
4,It has great sushi and even better service .,"sushi, food quality, positive, great; service,...","sushi, food quality, positive, great; service,..."


# PySpark

In [None]:
pip install transformers torch pyspark

Collecting pyspark
  Downloading pyspark-3.5.1.tar.gz (317.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.0/317.0 MB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25ldone
[?25h  Created wheel for pyspark: filename=pyspark-3.5.1-py2.py3-none-any.whl size=317488493 sha256=28fd8d7b5d1640cb0f18af95b736d36eba5c5e2d8f9d12d8c6f6c29e119f916f
  Stored in directory: /root/.cache/pip/wheels/80/1d/60/2c256ed38dddce2fdd93be545214a63e02fbd8d74fb0b7f3a6
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.1
Note: you may need to restart the kernel to use updated packages.


In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType

# Initialize the Spark session (an existing session is reused if there is one).
spark = (
    SparkSession.builder
    .appName("HuggingFaceInference")
    .getOrCreate()
)

# Define the inference function that is wrapped as a Spark UDF below.
# Cache of (tokenizer, model) pairs keyed by model id, so the checkpoint is
# loaded once per function instance instead of once per row.
_T5_UDF_CACHE = {}


def get_prediction_seqtrainer(review):
    """Generate the fine-tuned model's output string for one review.

    NOTE: this single-argument definition shadows the earlier three-argument
    helper of the same name defined in this notebook.

    Args:
        review: Input text passed to the tokenizer.

    Returns:
        The first decoded sequence with special tokens removed. Generation
        uses ``do_sample=True``, so repeated calls may return different text.

    The tokenizer and model are loaded on first use and kept in
    ``_T5_UDF_CACHE``.
    NOTE(review): how long that cache survives on Spark executors depends on
    how Spark deserializes and reuses the UDF — confirm for your deployment.
    Calling this function on the driver before the UDF runs may cause the
    loaded model to be serialized along with the UDF — confirm.
    """
    # Imported locally so the function is self-contained where it runs.
    from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

    model_id = 'ThuyNT03/DS200-big_train-res16-T5'
    max_input_length = 256
    max_output_length = 256

    # Load the checkpoint only on the first call.
    if model_id not in _T5_UDF_CACHE:
        _T5_UDF_CACHE[model_id] = (
            AutoTokenizer.from_pretrained(model_id),
            AutoModelForSeq2SeqLM.from_pretrained(model_id),
        )
    tokenizer_T5, model_T5 = _T5_UDF_CACHE[model_id]

    encoded = tokenizer_T5(review, max_length=max_input_length, return_tensors="pt", padding="max_length", truncation=True)
    outputs = model_T5.generate(
        input_ids=encoded.input_ids,
        # The input is padded to max_length, so pass the mask explicitly.
        attention_mask=encoded.attention_mask,
        max_new_tokens=max_output_length,
        do_sample=True,
    )
    pred = tokenizer_T5.batch_decode(outputs.detach().cpu().numpy(), skip_special_tokens=True)[0]

    return pred

# Register the inference function as a UDF returning a string.
# The function samples during generation, so the UDF is marked nondeterministic;
# Spark otherwise assumes UDFs are deterministic and may drop or repeat calls.
get_prediction_udf = udf(get_prediction_seqtrainer, StringType()).asNondeterministic()

# Create a one-row sample DataFrame.
data = [("I waited for 10-15 minutes for service ordered a beer & was never served again .",)]
columns = ["review"]
df = spark.createDataFrame(data, columns)

# Apply the UDF to the `review` column.
df_with_predictions = df.withColumn("prediction", get_prediction_udf(df["review"]))

# Show the results without truncating long strings.
df_with_predictions.show(truncate=False)


                                                                                

+--------------------------------------------------------------------------------+---------------------------------------------------------+
|review                                                                          |prediction                                               |
+--------------------------------------------------------------------------------+---------------------------------------------------------+
|I waited for 10-15 minutes for service ordered a beer & was never served again .|NULL, service general, negative, waited for 10-15 minutes|
+--------------------------------------------------------------------------------+---------------------------------------------------------+



In [None]:
# def get_prediction_seqtrainer(review,tokenizer_T5,model_T5):
#     input_ids = tokenizer_T5(review, max_length=max_input_length, return_tensors="pt", padding="max_length", truncation=True).input_ids.cuda()
#     outputs = model_T5.generate(input_ids=input_ids, max_new_tokens=max_output_length, do_sample=True)
#     pred = tokenizer_T5.batch_decode(outputs.detach().cpu().numpy(), skip_special_tokens=True)[0]
#     return pred
# from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
# from torch.utils.data import DataLoader
# from transformers import default_data_collator, get_linear_schedule_with_warmup
# from transformers import AutoModelForSeq2SeqLM
# max_input_length = 256
# max_output_length = 256
# model_id = 'ThuyNT03/DS200-big_train-res16-T5'
# tokenizer_T5 = AutoTokenizer.from_pretrained(model_id)
# model_T5 = AutoModelForSeq2SeqLM.from_pretrained(model_id).cuda()
# print(get_prediction_seqtrainer("I waited for 10-15 minutes for service ordered a beer & was never served again .", tokenizer_T5,model_T5))
