In [2]:
import pandas as pd
import os 
import json
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

In [None]:
import torch

# Use the first CUDA GPU when torch reports one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"
print(device)

cuda:0


In [8]:
# Pin execution to the CPU. Running this cell rebinds the module-level `device`
# that the later `.to(device)` calls read, replacing whatever value it held before.
device = torch.device("cpu")

In [4]:
def _flatten_row(row_entry):
    """Return the "key : value" strings for one row of the "data" list."""
    if isinstance(row_entry, dict):
        # Wrapped format: the turns live under "conversations". An empty object
        # carries no turns and yields an empty list, like an empty list row.
        turns = row_entry["conversations"] if row_entry else []
    else:
        # Bare format: the row itself is the list of turns.
        turns = row_entry

    statements = []
    for turn in turns:
        if not isinstance(turn, dict):
            # Fail with a message that names the real problem instead of a
            # secondary error raised while guessing at the other row format.
            raise TypeError(
                "each conversation turn must be a dict, got "
                + type(turn).__name__
            )
        for key, value in turn.items():
            statements.append(str(key) + " : " + str(value))
    return statements


def get_data_in_list(path):
    """Load conversations from a JSON file and flatten them into strings.

    The file's top-level object must contain a "data" list. Each row of that
    list is either a list of dicts, or a dict whose "conversations" value is
    such a list. Every key/value pair of every dict becomes one
    "<key> : <value>" string, kept in file order.

    Args:
        path: Path of the JSON file to read.

    Returns:
        A list with one entry per row of "data"; each entry is the list of
        "key : value" strings built from that row.

    Raises:
        KeyError: If "data" is missing, or a non-empty dict row has no
            "conversations" key.
        TypeError: If a row is not iterable or one of its turns is not a dict.
    """
    # JSON is UTF-8 by specification; do not depend on the platform's locale encoding.
    with open(path, 'r', encoding='utf-8') as file:
        data = json.load(file)

    data_entries = []
    for row_entry in data['data']:
        data_entries.append(_flatten_row(row_entry))
    return data_entries

In [5]:
from tqdm import tqdm
def get_summary_list(data, tokenizer, model, limit=550):
    """Summarise the leading conversations of ``data`` with beam search.

    Args:
        data: Sequence of conversations; each conversation is a list of
            strings that are joined with single spaces before tokenisation.
        tokenizer: Callable tokenizer; its output is truncated to 1024 tokens
            and must expose ``input_ids`` / ``attention_mask`` and ``.to()``.
        model: Model whose ``generate`` produces the summary token ids. The
            encoded inputs are moved to ``model.device`` so they always sit on
            the same device as the weights, independent of notebook globals.
        limit: Maximum number of conversations (from the start of ``data``)
            to summarise. Defaults to 550, the count that was previously
            hard-coded; pass ``None`` to process everything.

    Returns:
        A list of decoded summaries, one per processed conversation, in order.
    """
    # Clamp to what is actually available so short datasets no longer raise
    # IndexError part-way through a long run.
    if limit is None:
        total = len(data)
    else:
        total = max(0, min(limit, len(data)))

    summary_entries = []
    for index in tqdm(range(total), "Completed"):
        conversations = data[index]
        input_text = " ".join(conversations)
        inputs = tokenizer(
            input_text, return_tensors="pt", truncation=True, max_length=1024
        ).to(model.device)
        summary_ids = model.generate(
            inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
            max_length=350,  # Upper bound on the generated summary length
            min_length=40,   # Lower bound on the generated summary length
            num_beams=4,     # Number of beams for beam search
            length_penalty=2.0
        )
        # Only the first returned sequence is decoded.
        summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
        summary_entries.append(summary)
    print(len(summary_entries))
    return summary_entries

In [6]:
# Read the conversations file and flatten every row into "key : value" strings.
data_entries = get_data_in_list("sales_conversations.json")

In [6]:
# Tokenizer and model are loaded from the same checkpoint id.
bart_checkpoint = "kabita-choudhary/finetuned-bart-for-conversation-summary"
tokenizer_1 = AutoTokenizer.from_pretrained(bart_checkpoint)
model_1 = AutoModelForSeq2SeqLM.from_pretrained(bart_checkpoint)
model_1 = model_1.to(device)
print(model_1.config)

BartConfig {
  "_attn_implementation_autoset": true,
  "_name_or_path": "kabita-choudhary/finetuned-bart-for-conversation-summary",
  "_num_labels": 3,
  "activation_dropout": 0.0,
  "activation_function": "gelu",
  "add_bias_logits": false,
  "add_final_layer_norm": false,
  "architectures": [
    "BartForConditionalGeneration"
  ],
  "attention_dropout": 0.0,
  "bos_token_id": 0,
  "classif_dropout": 0.0,
  "classifier_dropout": 0.0,
  "d_model": 1024,
  "decoder_attention_heads": 16,
  "decoder_ffn_dim": 4096,
  "decoder_layerdrop": 0.0,
  "decoder_layers": 12,
  "decoder_start_token_id": 2,
  "dropout": 0.1,
  "early_stopping": true,
  "encoder_attention_heads": 16,
  "encoder_ffn_dim": 4096,
  "encoder_layerdrop": 0.0,
  "encoder_layers": 12,
  "eos_token_id": 2,
  "eos_token_ids": [
    2
  ],
  "forced_eos_token_id": 2,
  "gradient_checkpointing": false,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2"
  },
  "init_std": 0.02,
  "is_encoder_decoder": tr

In [7]:
# Summarise the loaded conversations with the first tokenizer/model pair.
finetuned_bart_summary_list = get_summary_list(
    data=data_entries,
    tokenizer=tokenizer_1,
    model=model_1.to(device),
)

Completed: 100%|██████████| 550/550 [04:53<00:00,  1.87it/s]

550





In [8]:
# Tokenizer and model are loaded from the same checkpoint id.
t5_checkpoint = "utrobinmv/t5_summary_en_ru_zh_base_2048"
tokenizer_2 = AutoTokenizer.from_pretrained(t5_checkpoint)
model_2 = AutoModelForSeq2SeqLM.from_pretrained(t5_checkpoint)
model_2 = model_2.to(device)
print(model_2.config)

T5Config {
  "_attn_implementation_autoset": true,
  "_name_or_path": "utrobinmv/t5_summary_en_ru_zh_base_2048",
  "architectures": [
    "T5ForConditionalGeneration"
  ],
  "classifier_dropout": 0.0,
  "d_ff": 2048,
  "d_kv": 64,
  "d_model": 768,
  "decoder_start_token_id": 0,
  "dense_act_fn": "gelu_new",
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "gated-gelu",
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "is_gated_act": true,
  "layer_norm_epsilon": 1e-06,
  "model_type": "t5",
  "num_decoder_layers": 12,
  "num_heads": 12,
  "num_layers": 12,
  "output_past": true,
  "pad_token_id": 0,
  "relative_attention_max_distance": 128,
  "relative_attention_num_buckets": 32,
  "tie_word_embeddings": false,
  "torch_dtype": "float32",
  "transformers_version": "4.46.3",
  "use_cache": true,
  "vocab_size": 65100
}



In [9]:
# Summarise the same conversations with the second tokenizer/model pair.
t5_summary_list = get_summary_list(
    data=data_entries,
    tokenizer=tokenizer_2,
    model=model_2.to(device),
)

Completed: 100%|██████████| 550/550 [05:08<00:00,  1.78it/s]

550





In [1]:
# from transformers import AutoTokenizer, AutoModelForCausalLM
# tokenizer_3 = AutoTokenizer.from_pretrained("Nexusflow/Athene-V2-Chat")
# model_3 = AutoModelForCausalLM.from_pretrained("Nexusflow/Athene-V2-Chat").to(device)
# print(model_3.config)

In [2]:
# v2_summary_list = get_summary_list(data = data_entries, tokenizer = tokenizer_3, model = model_3.to(device))

In [10]:
# One column per model; row i holds both summaries of conversation i.
df = pd.DataFrame(
    dict(
        finetuned_bart=finetuned_bart_summary_list,
        T5=t5_summary_list,
    )
)

In [11]:
# Write both summary columns to disk; the row index is not written.
csv_file = "output_summary_models.csv"
df.to_csv(path_or_buf=csv_file, index=False)
print(f"DataFrame saved to {csv_file}")

DataFrame saved to output_summary_models.csv


In [16]:
# Show the first row's summary from the "finetuned_bart" column.
print(df['finetuned_bart'].iloc[0])

Customer is interested in buying a new smartphone, a laptop and a gaming console. Salesman will help him to choose the right one. Customer is considering upgrading his company's server. Customer wants to buy a new smartwatch and a new printer for his home office.


In [5]:
import pandas as pd

# Load the labels, show the available columns, then keep only rows without any
# missing value and pull the "label" column out as a plain Python list.
df = pd.read_csv("labels.csv")
print(df.columns)
df = df.dropna()
ls = df["label"].tolist()
len(ls)

Index(['Index', 'label'], dtype='object')


550

In [6]:
# Re-read the saved summaries, attach the labels positionally as a new "label"
# column, and write the combined table to a second CSV (index not written).
csv_file = "chat_summary_labels.csv"
df_1 = pd.read_csv("output_summary_models.csv").assign(label=ls)
df_1.to_csv(path_or_buf=csv_file, index=False)
print(f"DataFrame saved to {csv_file}")

DataFrame saved to chat_summary_labels.csv
