In [42]:
from google.colab import drive

drive.mount('/content/drive',force_remount=True)


Mounted at /content/drive


In [43]:
!pip install transformers
!pip install --upgrade transformers
!pip install datasets



In [44]:
import pandas as pd

# Load the dataset
file_path = '/content/drive/MyDrive/Colab Notebooks/Langchain/gm_fashions_dataset.csv'
data = pd.read_csv(file_path)

pd.set_option('display.max_rows', None)
data


Unnamed: 0,Question,Answer,Type
0,What is GM Fashions?,GM Fashions is an online clothing store offeri...,train
1,Where is GM Fashions located?,"GM Fashions is primarily located in Chennai, I...",train
2,How can I contact GM Fashions?,You can contact GM Fashions via email at suppo...,train
3,What categories of clothing does GM Fashions o...,GM Fashions offers men's clothing categories s...,train
4,What special sections are available at GM Fash...,GM Fashions has a special Offer Zone that feat...,train
5,What is the primary location of GM Fashions?,The primary location of GM Fashions is Chennai...,train
6,What are the showroom locations of GM Fashions?,GM Fashions has multiple showroom locations ac...,test
7,What is the head office address of GM Fashions?,The head office address of GM Fashions is in C...,test


In [45]:
from transformers import AutoTokenizer, AutoModelForCausalLM

model_path = '/content/drive/MyDrive/Colab Notebooks/Langchain/Distilgpt'

# Load the tokenizer and model using AutoTokenizer and AutoModelForCausalLM
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForCausalLM.from_pretrained(model_path)

print(model.__class__.__name__)
print(tokenizer.__class__.__name__)
print(model.config)

GPT2LMHeadModel
GPT2TokenizerFast
GPT2Config {
  "_name_or_path": "/content/drive/MyDrive/Colab Notebooks/Langchain/Distilgpt",
  "_num_labels": 1,
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "id2label": {
    "0": "LABEL_0"
  },
  "initializer_range": 0.02,
  "label2id": {
    "LABEL_0": 0
  },
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 6,
  "n_positions": 1024,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "task_specific_params": {
    "text-generation": {
      "do_sample": true,
      "max_length": 50
    }
  },
  "tra

In [46]:
input_text = "What special sections are available at GM Fash...	"
inputs = tokenizer(input_text, return_tensors='pt')
outputs = model.generate(**inputs)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


What special sections are available at GM Fash...	.com.








In [47]:
from datasets import Dataset

# Rename columns for clarity
data = data.rename(columns={"Question": "input_text", "Answer": "response_text"})

# Convert the dataset to Hugging Face Dataset
dataset = Dataset.from_pandas(data)

# Print to confirm dataset structure
print(dataset)


Dataset({
    features: ['input_text', 'response_text', 'Type'],
    num_rows: 8
})


In [48]:
# Add a padding token to the tokenizer
tokenizer.pad_token = tokenizer.eos_token

def preprocess_function(examples):
    # Tokenize the input and output text
    inputs = tokenizer(examples['input_text'], truncation=True, padding='max_length', max_length=512)
    labels = tokenizer(examples['response_text'], truncation=True, padding='max_length', max_length=512)
    inputs['labels'] = labels['input_ids']
    return inputs

# Apply the preprocessing function to the dataset
tokenized_dataset = dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/8 [00:00<?, ? examples/s]

In [50]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir='./results',
    eval_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=200,
    weight_decay=0.01,
)


In [51]:
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    eval_dataset=tokenized_dataset
)

trainer.train()

Epoch,Training Loss,Validation Loss
1,No log,7.321436
2,No log,5.224422
3,No log,3.961826
4,No log,2.691456
5,No log,1.542867
6,No log,0.884219
7,No log,0.629269
8,No log,0.586772
9,No log,0.561269
10,No log,0.531303


TrainOutput(global_step=400, training_loss=0.3082928848266602, metrics={'train_runtime': 209.1421, 'train_samples_per_second': 7.65, 'train_steps_per_second': 1.913, 'total_flos': 209037400473600.0, 'train_loss': 0.3082928848266602, 'epoch': 200.0})

In [52]:
def get_response(input_text, temperature=0.1, max_length=600): # Adjust max_length here
    # You need to define or import the 'chatbot' function here.
    # For example, if you are using a pipeline from transformers:
    from transformers import pipeline
    chatbot = pipeline("text-generation", model=model, tokenizer=tokenizer)

    response = chatbot(input_text, max_length=max_length, num_return_sequences=1, temperature=temperature)
    return response[0]["generated_text"]



In [53]:
# Save the potentially improved model
model.save_pretrained("/content/drive/MyDrive/Colab Notebooks/Langchain/NewFt")
tokenizer.save_pretrained("/content/drive/MyDrive/Colab Notebooks/Langchain/NewFt")

('/content/drive/MyDrive/Colab Notebooks/Langchain/NewFt/tokenizer_config.json',
 '/content/drive/MyDrive/Colab Notebooks/Langchain/NewFt/special_tokens_map.json',
 '/content/drive/MyDrive/Colab Notebooks/Langchain/NewFt/vocab.json',
 '/content/drive/MyDrive/Colab Notebooks/Langchain/NewFt/merges.txt',
 '/content/drive/MyDrive/Colab Notebooks/Langchain/NewFt/added_tokens.json',
 '/content/drive/MyDrive/Colab Notebooks/Langchain/NewFt/tokenizer.json')

In [54]:
# Check if GPU is available
import torch
if torch.cuda.is_available():
  device = torch.device("cuda")
  print("GPU is available and being used.")
else:
  device = torch.device("cpu")
  print("GPU is not available, using CPU.")

# Move model to GPU if available
model.to(device)

GPU is available and being used.


GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-5): 6 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2SdpaAttention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

In [55]:
# Reload the model and tokenizer
from transformers import AutoTokenizer, AutoModelForCausalLM
# Import the pipeline function
from transformers import pipeline

# Load the tokenizer and model
tokenizer = AutoTokenizer.from_pretrained('/content/drive/MyDrive/Colab Notebooks/Langchain/NewFt')
model = AutoModelForCausalLM.from_pretrained('/content/drive/MyDrive/Colab Notebooks/Langchain/NewFt')


def get_response(input_text, temperature=0.1, max_length=600):
    # Pass the device to the pipeline
    chatbot = pipeline("text-generation", model=model, tokenizer=tokenizer, device=0 if torch.cuda.is_available() else -1)

    response = chatbot(input_text, max_length=max_length, num_return_sequences=1, temperature=temperature)
    return response[0]["generated_text"]

In [56]:
user_input = "What are the showroom locations of GM Fashions?"
response = get_response(user_input)
print(response)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


What are the showroom locations of GM Fashions?.
