## data creating

In [1]:
import json
import pandas as pd

# Define the path to your JSONL file
jsonl_file_path = 'data.jsonl'
# Define the path to the output CSV file
csv_file_path = 'output.csv'

# Read the JSONL file and store the data in a list
data = []
with open(jsonl_file_path, 'r') as file:
    for line in file:
        data.append(json.loads(line))

# Convert the list of dictionaries to a pandas DataFrame
df = pd.DataFrame(data)

# Rename columns to match the required CSV format
df.columns = ['input_text', 'response_text']

# Save the DataFrame to a CSV file
df.to_csv(csv_file_path, index=False)

print(f"Data successfully converted to {csv_file_path}")


Data successfully converted to output.csv


In [2]:
import pandas as pd
from datasets import load_dataset, Dataset

# Load the CSV file into a pandas DataFrame
df = pd.read_csv('output.csv')

# Display the DataFrame to check if the data is loaded correctly
print(df.head())

# Convert the DataFrame to a Dataset
dataset = Dataset.from_pandas(df)

# Display the Dataset to verify the conversion
print(dataset)

# You can now use this dataset to train your model.
# For example, to save the dataset to a file:
dataset.to_csv('prepared_customer_seller_data.csv', index=False)

# To load the dataset later:
loaded_dataset = load_dataset('csv', data_files='prepared_customer_seller_data.csv')

# Display the loaded dataset to verify
print(loaded_dataset)


  from .autonotebook import tqdm as notebook_tqdm


                                     input_text  \
0                  What is the price of the TV?   
1                  Do you have any gas cookers?   
2                              I need a fridge.   
3  Can you tell me about your Samsung products?   
4     Do you have any discounts on electronics?   

                                       response_text  
0       Sure, let me check the latest price for you.  
1  Sure, let me see what gas cookers we have in s...  
2        Let me check the available fridges for you.  
3  I can provide you with the latest details on o...  
4  Let me check if we have any discounts on elect...  
Dataset({
    features: ['input_text', 'response_text'],
    num_rows: 71
})


Creating CSV from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 60.72ba/s]
Generating train split: 71 examples [00:00, 5065.15 examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_text', 'response_text'],
        num_rows: 71
    })
})





## training model

In [15]:
import pandas as pd
from datasets import Dataset
from transformers import Trainer, TrainingArguments, GPT2LMHeadModel, GPT2Tokenizer

# Use the correct file path
file_path = 'prepared_customer_seller_data.csv'

try:
    # Load the CSV file into a pandas DataFrame
    df = pd.read_csv(file_path)
    
    # Combine input and response text for causal language modeling
    df['text'] = df['input_text'] + " " + df['response_text']
    
    # Convert the DataFrame to a Dataset
    dataset = Dataset.from_pandas(df[['text']])
    
    # Initialize the tokenizer and model
    model_name = 'gpt2'  # Change to 'gpt2-small' if needed
    tokenizer = GPT2Tokenizer.from_pretrained(model_name)
    model = GPT2LMHeadModel.from_pretrained(model_name)
    
    # Add a padding token if not already present
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token  # Use the end-of-sequence token as padding token

    # Tokenize the combined text
    def tokenize_function(example):
        return tokenizer(example["text"], padding="max_length", truncation=True, max_length=512)
    
    tokenized_datasets = dataset.map(tokenize_function, batched=True, remove_columns=["text"])
    
    # Prepare training arguments
    training_args = TrainingArguments(
        output_dir='./results',
        per_device_train_batch_size=1,
        num_train_epochs=3,  # Adjust the number of epochs as needed
        logging_dir='./logs',
        logging_steps=10,
        save_steps=500,
        save_total_limit=2,
    )
    
    # Initialize the Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_datasets,
    )
    
    # Train the model
    trainer.train()
    
    # Save the model
    model.save_pretrained('custom_seller_bot_model')
    tokenizer.save_pretrained('custom_seller_bot_model')
    
    print("Training complete and model saved.")
    
except FileNotFoundError:
    print(f"The file at {file_path} was not found. Please check the path and try again.")
except Exception as e:
    print(f"An error occurred: {e}")


Map: 100%|██████████| 71/71 [00:00<00:00, 2268.49 examples/s]

An error occurred: Using the `Trainer` with `PyTorch` requires `accelerate>=0.21.0`: Please run `pip install transformers[torch]` or `pip install accelerate -U`





In [6]:
import pandas as pd
from datasets import Dataset
from transformers import Trainer, TrainingArguments, GPT2LMHeadModel, GPT2Tokenizer

# Use the correct file path
file_path = 'prepared_customer_seller_data.csv'

try:
    # Load the CSV file into a pandas DataFrame
    df = pd.read_csv(file_path)
    
    # Combine input and response text for causal language modeling
    df['text'] = df['input_text'] + " " + df['response_text']
    
    # Convert the DataFrame to a Dataset
    dataset = Dataset.from_pandas(df[['text']])
    
    # Initialize the tokenizer and model
    model_name = 'gpt2'  # or 'gpt2-small', 'gpt2-medium', 'gpt2-large', etc.
    tokenizer = GPT2Tokenizer.from_pretrained(model_name)
    model = GPT2LMHeadModel.from_pretrained(model_name)
    
    # Add a padding token if not already present
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token  # Use the end-of-sequence token as padding token

    # Tokenize the combined text
    def tokenize_function(example):
        encodings = tokenizer(example["text"], padding="max_length", truncation=True, max_length=512)
        encodings["labels"] = encodings["input_ids"].copy()  # Use input_ids as labels
        return encodings
    
    tokenized_datasets = dataset.map(tokenize_function, batched=True, remove_columns=["text"])
    
    # Prepare training arguments
    training_args = TrainingArguments(
        output_dir='./results',
        per_device_train_batch_size=1,
        num_train_epochs=3,
        logging_dir='./logs',
        logging_steps=10,
        save_steps=500,
        save_total_limit=2,
        report_to=None,  # Ensure you're not using any unsupported reporting options
    )
    
    # Initialize the Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_datasets,
    )
    
    # Train the model
    trainer.train()
    
    # Save the model
    model.save_pretrained('custom_seller_bot_model')
    tokenizer.save_pretrained('custom_seller_bot_model')
    
    print("Training complete and model saved.")
    
except FileNotFoundError:
    print(f"The file at {file_path} was not found. Please check the path and try again.")
except Exception as e:
    print(f"An error occurred: {e}")


Map: 100%|██████████| 71/71 [00:00<00:00, 557.97 examples/s]
  0%|          | 0/213 [03:29<?, ?it/s]
  5%|▍         | 10/213 [03:08<36:24, 10.76s/it] 

{'loss': 2.2613, 'grad_norm': 2.626744270324707, 'learning_rate': 4.765258215962441e-05, 'epoch': 0.14}


  9%|▉         | 20/213 [04:33<27:38,  8.59s/it]

{'loss': 0.0983, 'grad_norm': 1.6586312055587769, 'learning_rate': 4.530516431924883e-05, 'epoch': 0.28}


 14%|█▍        | 30/213 [06:07<29:19,  9.61s/it]

{'loss': 0.0612, 'grad_norm': 1.7735347747802734, 'learning_rate': 4.295774647887324e-05, 'epoch': 0.42}


 19%|█▉        | 40/213 [07:35<24:59,  8.67s/it]

{'loss': 0.0396, 'grad_norm': 1.2576494216918945, 'learning_rate': 4.0610328638497654e-05, 'epoch': 0.56}


 23%|██▎       | 50/213 [09:03<23:25,  8.62s/it]

{'loss': 0.0312, 'grad_norm': 1.0658440589904785, 'learning_rate': 3.826291079812207e-05, 'epoch': 0.7}


 28%|██▊       | 60/213 [10:30<22:19,  8.76s/it]

{'loss': 0.0268, 'grad_norm': 0.5717741250991821, 'learning_rate': 3.5915492957746486e-05, 'epoch': 0.85}


 33%|███▎      | 70/213 [11:55<20:10,  8.47s/it]

{'loss': 0.0302, 'grad_norm': 1.3320704698562622, 'learning_rate': 3.3568075117370895e-05, 'epoch': 0.99}


 38%|███▊      | 80/213 [13:20<18:53,  8.52s/it]

{'loss': 0.0155, 'grad_norm': 0.47770926356315613, 'learning_rate': 3.1220657276995305e-05, 'epoch': 1.13}


 42%|████▏     | 90/213 [14:44<17:25,  8.50s/it]

{'loss': 0.0162, 'grad_norm': 1.3124406337738037, 'learning_rate': 2.887323943661972e-05, 'epoch': 1.27}


 47%|████▋     | 100/213 [16:09<16:05,  8.54s/it]

{'loss': 0.0245, 'grad_norm': 0.5552031397819519, 'learning_rate': 2.6525821596244134e-05, 'epoch': 1.41}


 52%|█████▏    | 110/213 [17:33<15:34,  9.08s/it]

{'loss': 0.0171, 'grad_norm': 0.6141884326934814, 'learning_rate': 2.4178403755868547e-05, 'epoch': 1.55}


 56%|█████▋    | 120/213 [18:57<12:53,  8.32s/it]

{'loss': 0.0151, 'grad_norm': 0.5859781503677368, 'learning_rate': 2.1830985915492956e-05, 'epoch': 1.69}


 61%|██████    | 130/213 [20:21<11:30,  8.32s/it]

{'loss': 0.0184, 'grad_norm': 0.5368008017539978, 'learning_rate': 1.9483568075117372e-05, 'epoch': 1.83}


 66%|██████▌   | 140/213 [21:46<10:32,  8.66s/it]

{'loss': 0.0149, 'grad_norm': 0.4845516085624695, 'learning_rate': 1.7136150234741785e-05, 'epoch': 1.97}


 70%|███████   | 150/213 [25:51<19:27, 18.53s/it]

{'loss': 0.0164, 'grad_norm': 0.4224061071872711, 'learning_rate': 1.4788732394366198e-05, 'epoch': 2.11}


 75%|███████▌  | 160/213 [27:14<07:40,  8.70s/it]

{'loss': 0.0129, 'grad_norm': 1.1524507999420166, 'learning_rate': 1.2441314553990612e-05, 'epoch': 2.25}


 80%|███████▉  | 170/213 [28:38<05:49,  8.12s/it]

{'loss': 0.0108, 'grad_norm': 0.6417281627655029, 'learning_rate': 1.0093896713615023e-05, 'epoch': 2.39}


 85%|████████▍ | 180/213 [30:00<04:29,  8.18s/it]

{'loss': 0.016, 'grad_norm': 0.4205755293369293, 'learning_rate': 7.746478873239436e-06, 'epoch': 2.54}


 89%|████████▉ | 190/213 [31:24<03:14,  8.46s/it]

{'loss': 0.0103, 'grad_norm': 0.519544243812561, 'learning_rate': 5.3990610328638506e-06, 'epoch': 2.68}


 94%|█████████▍| 200/213 [32:47<01:47,  8.30s/it]

{'loss': 0.013, 'grad_norm': 0.6004200577735901, 'learning_rate': 3.051643192488263e-06, 'epoch': 2.82}


 99%|█████████▊| 210/213 [34:09<00:24,  8.12s/it]

{'loss': 0.0192, 'grad_norm': 0.5160269737243652, 'learning_rate': 7.042253521126761e-07, 'epoch': 2.96}


100%|██████████| 213/213 [35:02<00:00,  9.87s/it]


{'train_runtime': 2102.3276, 'train_samples_per_second': 0.101, 'train_steps_per_second': 0.101, 'train_loss': 0.13021979012259854, 'epoch': 3.0}
Training complete and model saved.


In [13]:
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer

# Check PyTorch version
print(f"PyTorch version: {torch.__version__}")

# Initialize model and tokenizer
model_name = 'gpt2'
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)

print("Model and tokenizer loaded successfully.")


PyTorch version: 2.3.1+cpu
Model and tokenizer loaded successfully.


In [16]:
import accelerate
import transformers

print(f"Accelerate version: {accelerate.__version__}")
print(f"Transformers version: {transformers.__version__}")


Accelerate version: 0.33.0
Transformers version: 4.43.4


In [5]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer

model_name = 'gpt2'  # or 'gpt2-small'
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)


## testing model

In [8]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer
import torch

# Load the trained model and tokenizer
model_name = 'custom_seller_bot_model'  # Directory where your trained model is saved
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)

# Ensure the model is in evaluation mode
model.eval()

def generate_response(prompt, max_length=100):
    # Tokenize the input prompt with padding
    inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True, max_length=max_length)
    
    # Extract input_ids and attention_mask
    input_ids = inputs["input_ids"]
    attention_mask = inputs["attention_mask"]
    
    # Generate text from the model
    with torch.no_grad():
        output_ids = model.generate(
            input_ids,
            attention_mask=attention_mask,
            max_length=max_length,
            num_return_sequences=1,
            do_sample=True,
            temperature=0.7,  # Adjust temperature for creativity vs. coherence
            top_p=0.9,  # Adjust top-p for diverse sampling
            pad_token_id=tokenizer.pad_token_id,
        )

    # Decode the generated text
    generated_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
    return generated_text

# Example inputs to test the model
inputs = [
    "What is the price of the TV?",
    "Do you have any gas cookers?",
    "I need a fridge.",
    "Can you tell me about your Samsung products?",
    "Do you have any discounts on electronics?"
]

# Generate and print responses for each input
for input_text in inputs:
    response = generate_response(input_text)
    print(f"Input: {input_text}")
    print(f"Response: {response}")
    print("-" * 50)


Input: What is the price of the TV?
Response: What is the price of the TV? Sure, let me check the latest price for you.
--------------------------------------------------
Input: Do you have any gas cookers?
Response: Do you have any gas cookers? Let me check if we have any gas cookers.
--------------------------------------------------
Input: I need a fridge.
Response: I need a fridge. Let me check the available fridges for you.
--------------------------------------------------
Input: Can you tell me about your Samsung products?
Response: Can you tell me about your Samsung products? I can provide you with the latest details on our Samsung products.
--------------------------------------------------
Input: Do you have any discounts on electronics?
Response: Do you have any discounts on electronics? Let me check if we have any discounts on electronics.
--------------------------------------------------


## manual input giving

In [9]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer
import torch

# Load the trained model and tokenizer
model_name = 'custom_seller_bot_model'  # Directory where your trained model is saved
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)

# Ensure the model is in evaluation mode
model.eval()

def generate_response(prompt, max_length=100):
    # Tokenize the input prompt with padding
    inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True, max_length=max_length)
    
    # Extract input_ids and attention_mask
    input_ids = inputs["input_ids"]
    attention_mask = inputs["attention_mask"]
    
    # Generate text from the model
    with torch.no_grad():
        output_ids = model.generate(
            input_ids,
            attention_mask=attention_mask,
            max_length=max_length,
            num_return_sequences=1,
            do_sample=True,
            temperature=0.7,  # Adjust temperature for creativity vs. coherence
            top_p=0.9,  # Adjust top-p for diverse sampling
            pad_token_id=tokenizer.pad_token_id,
        )

    # Decode the generated text
    generated_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
    return generated_text

# Interactive loop to get user input
print("Interactive Model Testing")
print("Type 'exit' to end the session.")
while True:
    user_input = input("You: ")
    if user_input.lower() == 'exit':
        print("Exiting...")
        break
    response = generate_response(user_input)
    print(f"Model: {response}")
    print("-" * 50)


Interactive Model Testing
Type 'exit' to end the session.
Model: do you have tv? Sure, let me see what tv we have in stock.
--------------------------------------------------
Model: i want buy a tv. Let me check the latest price for you.
--------------------------------------------------
Model: hello, let me check the latest price for you.
--------------------------------------------------
Model: do you have rice cookers? Sure, let me see what rice cookers we have in stock.
--------------------------------------------------


RuntimeError: cannot reshape tensor of 0 elements into shape [-1, 0] because the unspecified dimension size -1 can be any value and is ambiguous