Transformer model: fine-tuning T5 to generate product descriptions from titles and keywords

In [None]:
# Install necessary libraries (uncomment if needed)
!pip install transformers datasets torch scikit-learn
!pip install datasets

Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.1.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m13.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m10.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl 

In [None]:
import tensorflow as tf
from transformers import TFAutoModelForSeq2SeqLM, AutoTokenizer
from datasets import Dataset
import pandas as pd
from sklearn.model_selection import train_test_split

class ProductDescriptionGenerator:
    """
    Fine-tune a seq2seq transformer (T5 by default) to turn a product title and
    keywords into a description, and generate descriptions with the result.

    The same prompt template is used for training and inference (see
    ``_build_prompt``) so the model is always queried in the format it was
    fine-tuned on.
    """

    # Task prefix placed in front of every prompt.
    TASK_PREFIX = "generate description: "

    def __init__(self, model_name="t5-base", max_input_length=60, max_target_length=100):
        """
        Load the tokenizer and the TensorFlow seq2seq model.

        Args:
            model_name: Checkpoint name or path passed to ``from_pretrained``.
            max_input_length: Token length prompts are padded/truncated to.
            max_target_length: Token length target descriptions are
                padded/truncated to during training.
        """
        self.model_name = model_name
        self.max_input_length = max_input_length
        self.max_target_length = max_target_length
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = TFAutoModelForSeq2SeqLM.from_pretrained(model_name)

    @classmethod
    def _build_prompt(cls, title, keywords):
        """Return the prompt text shared by training and inference."""
        # The template is kept exactly as the model was originally trained on it.
        return f"{cls.TASK_PREFIX} Title: {title} Keywords: {keywords}"

    def preprocess_data(self, title, keywords, description):
        """
        Tokenize one training example.

        Args:
            title: Product title.
            keywords: Product keywords (free text).
            description: Target description the model should learn to produce.

        Returns:
            dict with ``input_ids`` and ``attention_mask`` for the prompt and
            ``labels`` for the target, where padding positions are set to -100
            so that they are masked out of the loss.
        """
        inputs = self.tokenizer(
            self._build_prompt(title, keywords),
            return_tensors="np",
            padding="max_length",
            truncation=True,
            max_length=self.max_input_length,
        )
        targets = self.tokenizer(
            description,
            return_tensors="np",
            padding="max_length",
            truncation=True,
            max_length=self.max_target_length,
        )

        labels = targets.input_ids[0]
        labels[labels == self.tokenizer.pad_token_id] = -100  # Mask padding tokens

        return {
            "input_ids": inputs.input_ids[0],
            "attention_mask": inputs.attention_mask[0],
            "labels": labels,
        }

    def _to_tf_dataset(self, frame, description_column, batch_size, shuffle):
        """Tokenize the rows of ``frame`` and wrap them in a batched tf.data dataset."""
        examples = Dataset.from_dict(
            {
                "title": frame["title"].tolist(),
                "keywords": frame["keyword"].tolist(),
                "description": frame[description_column].tolist(),
            }
        )
        examples = examples.map(
            lambda row: self.preprocess_data(row["title"], row["keywords"], row["description"])
        )
        return examples.to_tf_dataset(
            columns=["input_ids", "attention_mask"],
            label_cols="labels",
            batch_size=batch_size,
            shuffle=shuffle,
        )

    def prepare_datasets(
        self,
        dataset_path,
        description_column="description_new",
        test_size=0.2,
        batch_size=8,
        random_state=42,
    ):
        """
        Read the CSV, split it, and build training/validation tf.data datasets.

        Args:
            dataset_path: Path to a CSV file with ``title``, ``keyword`` and
                ``description_column`` columns.
            description_column: Column holding the target descriptions.
            test_size: Fraction of rows held out for validation.
            batch_size: Batch size of both returned datasets.
            random_state: Seed for the train/validation split.

        Returns:
            Tuple ``(train_tf_dataset, val_tf_dataset)``; only the training
            dataset is shuffled.

        Raises:
            ValueError: If any required column is missing from the CSV.
        """
        # Specify encoding to handle non-UTF-8 characters
        dataset = pd.read_csv(dataset_path, encoding="ISO-8859-1")  # Try 'latin1' or 'cp1252' if needed

        # Validate exactly the columns that are read below, so a bad file fails
        # here with a clear message instead of with a KeyError later on.
        required_columns = ["title", "keyword", description_column]
        missing_columns = [name for name in required_columns if name not in dataset.columns]
        if missing_columns:
            raise ValueError(
                f"Dataset is missing required column(s) {missing_columns}; "
                f"expected columns: {required_columns}."
            )

        # The tokenizer only accepts strings: drop rows with missing values and
        # coerce the remaining cells to str.
        dataset = dataset.dropna(subset=required_columns)
        dataset = dataset.astype({name: str for name in required_columns})

        # Split data into training and validation sets
        train_df, val_df = train_test_split(dataset, test_size=test_size, random_state=random_state)

        train_tf_dataset = self._to_tf_dataset(train_df, description_column, batch_size, shuffle=True)
        val_tf_dataset = self._to_tf_dataset(val_df, description_column, batch_size, shuffle=False)

        return train_tf_dataset, val_tf_dataset

    def train_model(self, train_dataset, val_dataset, epochs=5, learning_rate=5e-5):
        """
        Fine-tune the model on the prepared datasets.

        Args:
            train_dataset: Training dataset from ``prepare_datasets``.
            val_dataset: Validation dataset from ``prepare_datasets``.
            epochs: Number of passes over the training data.
            learning_rate: Learning rate for the Adam optimizer.
        """
        # No loss is passed to compile(); the model is compiled with the optimizer only.
        self.model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate))
        self.model.fit(train_dataset, validation_data=val_dataset, epochs=epochs)
        print("Training complete!")

    def save_model(self, save_path="./fine_tuned_t5_model"):
        """
        Save the fine-tuned model and tokenizer to ``save_path``.
        """
        self.model.save_pretrained(save_path)
        self.tokenizer.save_pretrained(save_path)
        print(f"Model and tokenizer saved to {save_path}")

    def generate_description(self, title, keywords, max_length=200, num_beams=7):
        """
        Generate a description from a title and keywords using beam search.

        Args:
            title: Product title.
            keywords: Product keywords (free text).
            max_length: Maximum length of the generated sequence, in tokens.
            num_beams: Beam width.

        Returns:
            The decoded description with special tokens removed.
        """
        # Tokenize input text the same way as during training.
        inputs = self.tokenizer(
            self._build_prompt(title, keywords),
            return_tensors="tf",
            padding="max_length",
            truncation=True,
            max_length=self.max_input_length,
        )

        # The prompt is padded to a fixed length, so the attention mask must be
        # passed along; without it the encoder attends to the padding tokens.
        # Sampling arguments (top_k, top_p, temperature) are intentionally not
        # passed: they only take effect with do_sample=True and this method
        # decodes with beam search.
        outputs = self.model.generate(
            inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
            max_length=max_length,
            num_beams=num_beams,
            num_return_sequences=1,
            early_stopping=True,
        )

        # Decode and return the generated description
        return self.tokenizer.decode(outputs[0], skip_special_tokens=True)


if __name__ == "__main__":
    # Inputs for the run: training CSV and a sample product for the final check.
    dataset_path = "/content/dataset.csv"
    test_title = "Apple MacBook Pro"
    test_keywords = "laptop, high performance, sleek design"

    # Build the generator with its default checkpoint.
    generator = ProductDescriptionGenerator()

    # Load the CSV and turn it into training / validation datasets.
    train_dataset, val_dataset = generator.prepare_datasets(dataset_path=dataset_path)

    # Fine-tune, then persist the model and tokenizer.
    generator.train_model(train_dataset=train_dataset, val_dataset=val_dataset, epochs=5)
    generator.save_model(save_path="./fine_tuned_t5_model1")

    # Try the fine-tuned model on the sample product and show the result.
    generated_description = generator.generate_description(
        title=test_title,
        keywords=test_keywords,
    )

    print("Input Title:", test_title)
    print("Input Keywords:", test_keywords)
    print("Generated Description:", generated_description)

All PyTorch model weights were used when initializing TFT5ForConditionalGeneration.

All the weights of TFT5ForConditionalGeneration were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFT5ForConditionalGeneration for predictions without further training.


Map:   0%|          | 0/676 [00:00<?, ? examples/s]

Map:   0%|          | 0/170 [00:00<?, ? examples/s]

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Training complete!
Model and tokenizer saved to ./fine_tuned_t5_model1




Input Title: Apple MacBook Pro
Input Keywords: laptop, high performance, sleek design
Generated Description: The Apple MacBook Pro is a high-performance laptop that combines high performance with a sleek design. With a sleek design and powerful processors, its perfect for both work and entertainment.


NameError: name 'model' is not defined

In [None]:
# There is no notebook-level `model`; the trained Keras model is `generator.model`.
# Only the weights are written here: a whole-model HDF5 save is not available for
# subclassed Keras models, which is what the transformers TF classes are.
generator.model.save_weights('AI_PRODUCT-GENERATOR.h5')
print("Model weights saved as AI_PRODUCT-GENERATOR.h5")

In [None]:
# Spot-check the trained generator on a headphones listing.
test_title = "Sony WH-1000XM5 Wireless Headphones"
test_keywords = "Noise-canceling, Bluetooth, over-ear, long battery life, touch controls, microphone, adaptive sound, comfortable, headphones, audio quality"

generated_description = generator.generate_description(
    title=test_title,
    keywords=test_keywords,
)

print("Input Title:", test_title)
print("Input Keywords:", test_keywords)
print("Generated Description:", generated_description)


Input Title: Sony WH-1000XM5 Wireless Headphones
Input Keywords: Noise-canceling, Bluetooth, over-ear, long battery life, touch controls, microphone, adaptive sound, comfortable, headphones, audio quality
Generated Description: The Sony WH-1000XM5 Wireless Headphones offer superior sound quality with adaptive noise cancellation and a comfortable fit. With a comfortable fit, long battery life, and a comfortable fit, they provide a comfortable listening experience for both work and play.


In [None]:
# Spot-check the trained generator on a lower-cased apparel listing.
test_title = "adidas classic cap"
test_keywords = "hat,baseball cap,adjustable,sportswear,casual,unisex,breathable,logo,everyday wear,outdoor"

generated_description = generator.generate_description(
    title=test_title,
    keywords=test_keywords,
)

print("Input Title:", test_title)
print("Input Keywords:", test_keywords)
print("Generated Description:", generated_description)

Input Title: Adidas Classic Cap
Input Keywords: hat, baseball cap, adjustable, sportswear, casual, unisex, breathable, logo, everyday wear, outdoor
Generated Description: The Adidas Classic Cap is a versatile sportswear accessory that combines comfort and style for everyday wear. Its adjustable strap ensures a comfortable fit and comfortable fit, making it ideal for both casual and active wear.
