In [1]:
import pandas as pd
from io import StringIO

In [2]:
from google.colab import files
uploaded = files.upload()
uploaded

Saving Call_Logs.csv to Call_Logs.csv


{'Call_Logs.csv': b',Logs\n0,"Date: 17/04/2024\nTime: 15:45:37\n\nAgent: Good afternoon, thank you for calling Fresh Fare Meal Kits customer service. My name is Sarah, how can I assist you today?\n\nClient: Hi Sarah, I\'m calling because I received my meal kit delivery yesterday, but unfortunately, one of the ingredients was missing.\n\nAgent: I\'m sorry to hear that! Let me take a look at your account. Can I have your name and order number, please?\n\nClient: Sure, my name is Emma Smith, and my order number is 123456789.\n\nAgent: Thank you, Emma. I see your account here. Could you please specify which ingredient was missing from your delivery?\n\nClient: It was the fresh basil for the pasta dish.\n\nAgent: I apologize for the inconvenience, Emma. We\'ll make sure to get that sorted out for you right away. Would you prefer to have the missing basil sent in your next delivery, or would you like us to reimburse you for the missing ingredient?\n\nClient: It would be great if you could in

In [3]:
#converting byte string to noraml string andcreating a dataframe

decoded_data = uploaded['Call_Logs.csv'].decode('utf-8')

data_io = StringIO(decoded_data)

In [4]:
data = pd.read_csv(data_io)
data.head()

Unnamed: 0.1,Unnamed: 0,Logs
0,0,Date: 17/04/2024\nTime: 15:45:37\n\nAgent: Goo...
1,1,Date: 17/04/2024\nTime: 16:25:45\n\nAgent: Goo...
2,2,Date: 18/04/2024\nTime: 09:15:26\n\nAgent: Goo...
3,3,Date: 18/04/2024\nTime: 12:15:30\n\nAgent: Goo...
4,4,Date: 07/04/2024\nTime: 10:45:52\n\nAgent: Goo...


In [5]:
#Extracting date,time, and coversations from each log
def extract_info(df):
  lines = df['Logs'].split('\n')
  date = lines[0].split(': ')[1]
  time = lines[1].split(': ')[1]
  conv = '\n'.join([line for line in lines[3:] if line!=''])

  return date,time,conv

# apply the funtion to each row of the DataFrame

data[['Date','Time','Conversation']] = data.apply(extract_info,axis=1,result_type='expand')
data.drop(['Logs','Unnamed: 0'],axis=1,inplace=True)
data.head()


Unnamed: 0,Date,Time,Conversation
0,17/04/2024,15:45:37,"Agent: Good afternoon, thank you for calling F..."
1,17/04/2024,16:25:45,"Agent: Good afternoon, thank you for reaching ..."
2,18/04/2024,09:15:26,"Agent: Good morning, you're speaking with Jess..."
3,18/04/2024,12:15:30,"Agent: Good afternoon, and thank you for calli..."
4,07/04/2024,10:45:52,"Agent: Good morning, and thank you for contact..."


In [6]:
!pip install transformers



In [7]:
# Create a pipeline
# we will first summerize and then classify either there is cancellation or not.

from transformers import pipeline



In [8]:
summarizer = pipeline('summarization',
                      model = 'facebook/bart-large-cnn')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/1.58k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [9]:
output_list = []

for conv in data['Conversation']:
  summary = summarizer(conv)[0]['summary_text']
  output_list.append(summary)

In [10]:
data['summary'] = output_list
data.head()

Unnamed: 0,Date,Time,Conversation,summary
0,17/04/2024,15:45:37,"Agent: Good afternoon, thank you for calling F...","Client: Hi Sarah, I'm calling because I receiv..."
1,17/04/2024,16:25:45,"Agent: Good afternoon, thank you for reaching ...",Client: I've just received my delivery and the...
2,18/04/2024,09:15:26,"Agent: Good morning, you're speaking with Jess...","Client: Hi Jessica, I'm calling because I've b..."
3,18/04/2024,12:15:30,"Agent: Good afternoon, and thank you for calli...","Client: I received my meal kit delivery today,..."
4,07/04/2024,10:45:52,"Agent: Good morning, and thank you for contact...","Client: I received my delivery yesterday, but ..."


In [11]:
# Setting up the classification pipeline

classifier = pipeline("zero-shot-classification",
                      model = 'facebook/bart-large-mnli')

config.json:   0%|          | 0.00/1.15k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [12]:
# Setting the labels
labels = ["cancellation",'other']

In [16]:
#create a list for the outcomes

classification = []

for conv in data['Conversation']:
  classification.append(classifier(conv,labels)['labels'][0])

In [18]:
# creating column for dataset
data["Cancellation"] = [True if cls == 'cancellation' else False for cls in classification]

In [19]:
data.head()

Unnamed: 0,Date,Time,Conversation,summary,Cancellation
0,17/04/2024,15:45:37,"Agent: Good afternoon, thank you for calling F...","Client: Hi Sarah, I'm calling because I receiv...",False
1,17/04/2024,16:25:45,"Agent: Good afternoon, thank you for reaching ...",Client: I've just received my delivery and the...,False
2,18/04/2024,09:15:26,"Agent: Good morning, you're speaking with Jess...","Client: Hi Jessica, I'm calling because I've b...",True
3,18/04/2024,12:15:30,"Agent: Good afternoon, and thank you for calli...","Client: I received my meal kit delivery today,...",False
4,07/04/2024,10:45:52,"Agent: Good morning, and thank you for contact...","Client: I received my delivery yesterday, but ...",False


In [17]:
# lets know the reasons for cancellation by inferencing FLan T5

from transformers import T5Tokenizer, T5ForConditionalGeneration

In [21]:
# Setting up the mode

model = 'google/flan-t5-base'
tokenizer = T5Tokenizer.from_pretrained(model)
model = T5ForConditionalGeneration.from_pretrained(model)

tokenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


config.json:   0%|          | 0.00/1.40k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [23]:
# Build a function to infer the cancellation reasons

def cancellation_reason(df):
  if df['Cancellation']==False:
    return 'None'

  else:
    #prompt the model

    prompt = f"""
    {df['Conversation']}
    What are the issues that led the client to cancel their subscription?
    """

    # Convert the prompts into the tokens, so that we can feed into the model
    input = tokenizer(prompt,tensors='pt').input_ids
    output = model.generate(input,max_new_tokens=50,min_length=20)
    return tokenizer.decode(output[0],skip_special_tokens=True)

# Apply the function

data['Cancellation_reason'] = data.apply(cancellation_reason,axis=1)


Keyword arguments {'tensors': 'pt'} not recognized.


AttributeError: 'list' object has no attribute 'shape'

In [24]:
#Build a function to infer the cancellation reasons
def cancellation_reason(df):
  if df['Cancellation'] == False:
    return 'None'
  else:
    # prompt the model
    prompt = f"""
    {df['Conversation']}

    What are the issues that led the client to cancel their subscription?

    """

    # Convert the prompt into tokens, feed it to the model
    input = tokenizer(prompt, return_tensors = "pt").input_ids
    output = model.generate(input, max_new_tokens = 50, min_length = 20)
    return tokenizer.decode(output[0], skip_special_tokens = True)

In [25]:
data['Cancellation_reason'] = data.apply(cancellation_reason,axis=1)

In [26]:
data.head()

Unnamed: 0,Date,Time,Conversation,summary,Cancellation,Cancellation_reason
0,17/04/2024,15:45:37,"Agent: Good afternoon, thank you for calling F...","Client: Hi Sarah, I'm calling because I receiv...",False,
1,17/04/2024,16:25:45,"Agent: Good afternoon, thank you for reaching ...",Client: I've just received my delivery and the...,False,
2,18/04/2024,09:15:26,"Agent: Good morning, you're speaking with Jess...","Client: Hi Jessica, I'm calling because I've b...",True,The client's last three deliveries have been l...
3,18/04/2024,12:15:30,"Agent: Good afternoon, and thank you for calli...","Client: I received my meal kit delivery today,...",False,
4,07/04/2024,10:45:52,"Agent: Good morning, and thank you for contact...","Client: I received my delivery yesterday, but ...",False,
