In [12]:
import pandas as pd
from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM
import torch

In [32]:
from tqdm import tqdm
import logging
import time

# Configure the root logger: INFO level and above, each record prefixed with
# its timestamp and level name.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

In [45]:
# Read the three input tables from the working directory, in this order:
# the call records, the table whose `call_id` column is used to pick the
# calls to classify, and the per-call sentiment statistics.
calls_df, test_df, sentiment_df = (
    pd.read_csv(csv_name)
    for csv_name in ('calls.csv', 'test.csv', 'sentiment_statistics.csv')
)

In [46]:
# Attach the sentiment columns to the call records. The left join keeps every
# row of calls_df. Both frames carry an `agent_id` column, so pandas suffixes
# them as agent_id_x / agent_id_y.
#
# The guard makes this cell safe to re-run: merging a second time would make
# the sentiment columns overlap and get suffixed too (customer_tone_x, ...),
# which breaks every later lookup of the un-suffixed names.
if 'average_sentiment' not in calls_df.columns:
    calls_df = pd.merge(calls_df, sentiment_df, on='call_id', how='left')

In [47]:
# Preview the first rows of calls_df to check the columns produced by the merge.
calls_df.head()

Unnamed: 0,call_id,customer_id,agent_id_x,call_start_datetime,agent_assigned_datetime,call_end_datetime,call_transcript,agent_id_y,agent_tone,customer_tone,average_sentiment,silence_percent_average
0,4667960400,2033123310,963118,7/31/2024 23:56,8/1/2024 0:03,8/1/2024 0:34,\n\nAgent: Thank you for calling United Airlin...,963118,neutral,angry,-0.04,0.39
1,1122072124,8186702651,519057,8/1/2024 0:03,8/1/2024 0:06,8/1/2024 0:18,\n\nAgent: Thank you for calling United Airlin...,519057,calm,neutral,0.02,0.35
2,6834291559,2416856629,158319,7/31/2024 23:59,8/1/2024 0:07,8/1/2024 0:26,\n\nAgent: Thank you for calling United Airlin...,158319,neutral,polite,-0.13,0.32
3,2266439882,1154544516,488324,8/1/2024 0:05,8/1/2024 0:10,8/1/2024 0:17,\n\nAgent: Thank you for calling United Airlin...,488324,neutral,frustrated,-0.2,0.2
4,1211603231,5214456437,721730,8/1/2024 0:04,8/1/2024 0:14,8/1/2024 0:23,\n\nAgent: Thank you for calling United Airlin...,721730,neutral,polite,-0.05,0.35


In [52]:
# Category labels offered to the model. A few reasons are listed in more than
# one spelling ('Post flight' / 'Post-flight', 'Check in' / 'Check-in',
# 'Products & services' / 'Products and services'); every variant is kept.
call_reasons = [
    'Voluntary cancel', 'Booking', 'Irrops', 'Upgrade', 'Seating',
    'Mileage plus', 'Checkout', 'Voluntary change', 'Post flight',
    'Check in', 'Other topics', 'Communications', 'Schedule change',
    'Products & services', 'Digital support', 'Disability',
    'Unaccompanied minor', 'Baggage', 'Traveler updates', 'Etc',
    'Post-flight', 'Check-in', 'Products and services'
]
reason_set = set(call_reasons)

# How many calls this run classifies. They are taken from the END of test_df,
# last row first (the column is reversed before slicing).
NUM_CALLS_TO_PREDICT = 2189
call_ids = test_df['call_id'].iloc[::-1].iloc[:NUM_CALLS_TO_PREDICT].tolist()

# Number of call ids handled per iteration of the prediction loop.
batch_size = 64

In [39]:
from datasets import Dataset
from huggingface_hub import HfApi

# Hugging Face repository id, passed as `repo_id` when files are uploaded.
# NOTE(review): empty here — presumably has to be set to "<user>/<repo>"
# before any upload can succeed; confirm.
REPO_ID = ""
# Base name of the CSV files that hold the predictions.
DATASET_NAME = "call_predictions"
# Accumulates one {'call_id', 'primary_call_reason'} record per classified call.
predictions = []

In [63]:
from huggingface_hub import HfApi
import os

# Hugging Face Hub client; upload_to_hf uses it to push files to the repo.
api = HfApi()

def upload_to_hf(predictions, dataset_name):
    """Dump the predictions to a scratch CSV and upload it to the Hugging Face dataset repo.

    Args:
        predictions: Records to store; converted with ``pd.DataFrame``.
        dataset_name: Base name of the file. The scratch file written to the
            working directory is ``<dataset_name>.csv``; the file created in
            the repo ``REPO_ID`` is ``RR<dataset_name>.csv``.

    Raises:
        Any exception raised by ``api.upload_file``. The scratch file is
        removed in that case as well.
    """
    local_path = dataset_name + '.csv'
    file_path = 'RR' + dataset_name + '.csv'  # name of the file inside the repo

    pd.DataFrame(predictions).to_csv(local_path, index=False)
    try:
        api.upload_file(
            path_in_repo=file_path,
            path_or_fileobj=local_path,
            repo_id=REPO_ID,
            repo_type="dataset"
        )
    finally:
        # Always clean up the scratch file, including when the upload fails,
        # so failed attempts do not litter the working directory.
        os.remove(local_path)

    # Report success only once the upload has actually gone through.
    print(f"Uploaded {len(predictions)} predictions to Hugging Face dataset: {REPO_ID}/{dataset_name}")
    print(f"File {file_path} uploaded to huggingface")


In [40]:
# Load the model and tokenizer from the local checkpoint directory.
model_name = '/kaggle/input/llama-3.2/transformers/3b-instruct/1'
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Weights are loaded in bfloat16; device placement is left to device_map="auto".
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    torch_dtype=torch.bfloat16,
    trust_remote_code=True,
)

# Text-generation pipeline around the loaded model; every call generates at
# most 50 new tokens.
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=50,
    batch_size=5,
    trust_remote_code=True,
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [50]:
def predict_call_reason(call_id):
    """Classify the primary reason for one call with the text-generation pipeline.

    The call is looked up in ``calls_df``. Its tone and sentiment fields plus
    the first 300 whitespace-separated words of its transcript are sent to
    ``pipe`` as a chat conversation, and the first line of the reply is mapped
    onto a category by case-insensitive substring matching.

    Args:
        call_id: Value to look up in the ``call_id`` column of ``calls_df``.

    Returns:
        An entry of ``call_reasons``, or ``"Unknown"`` when ``calls_df`` has no
        row for ``call_id``, when the transcript is missing (not a string), or
        when the reply names no known category.
    """
    rows = calls_df[calls_df["call_id"] == call_id]
    if rows.empty:
        # No record for this id: there is nothing to show the model.
        return "Unknown"
    call_data = rows.iloc[0]

    transcript = call_data['call_transcript']
    if not isinstance(transcript, str):
        # A missing transcript is read by pandas as NaN (a float), which has
        # no .split(); skip the model call instead of crashing.
        return "Unknown"
    # Only the first 300 words are sent, which bounds the prompt length.
    transcript_excerpt = ' '.join(transcript.split()[:300])

    prompt = f"""
    I will give you a call transcript with some details and you determine the primary reason for the call from the given list of categories
    Categories:
    {', '.join(call_reasons)}
    Answer just ONE category from the list provided and NOTHING else. If you cannot determine the reason, respond with 'Unknown'.
    """

    # The instructions and the call details are sent as separate user turns,
    # with a canned assistant acknowledgement in between.
    messages = [
        {"role": "user", "content": prompt},
        {"role": "assistant", "content": "Got it please provide the details"},
        {
            "role": "user",
            "content": f"""Customer Tone: {call_data['customer_tone']}
Agent Tone: {call_data['agent_tone']}
Average Sentiment: {call_data['average_sentiment']}
Transcript:
{transcript_excerpt}""",
        },
    ]
    decoded = pipe(messages, pad_token_id=pipe.tokenizer.eos_token_id)
    # The pipeline returns the conversation with the new assistant turn last.
    response = decoded[0]["generated_text"][-1]["content"]
    first_line = response.strip().split("\n")[0].lower()

    # When several labels occur in the line, the one latest in the list wins,
    # and an explicit "Unknown" overrides them all.
    predicted_reason = "Unknown"
    for category in call_reasons + ["Unknown"]:
        if category.lower() in first_line:
            predicted_reason = category

    return predicted_reason

In [42]:
def predict_batch(call_ids, batch_size=32):
    """Predict the call reason for every id in ``call_ids``, one call at a time.

    ``batch_size`` is accepted but not used by this function.

    Returns:
        List of predicted reasons, in the same order as ``call_ids``.
    """
    reasons = []
    for current_id in call_ids:
        reasons.append(predict_call_reason(current_id))
    return reasons

In [64]:
# Start of the prediction run; the summary cell reads `start_time` to report
# the elapsed time.
start_time = time.time()

# Skip calls that already have a record in `predictions`, so that re-running
# this cell after an interruption resumes the run instead of predicting
# everything again and appending duplicate records. Reset `predictions = []`
# first to force a full re-run.
already_predicted = {record['call_id'] for record in predictions}
pending_ids = [cid for cid in call_ids if cid not in already_predicted]

with tqdm(total=len(call_ids), initial=len(call_ids) - len(pending_ids), desc="Predicting") as pbar:
    for i in range(0, len(pending_ids), batch_size):
        batch = pending_ids[i:i+batch_size]
        batch_predictions = predict_batch(batch, batch_size)
        new_predictions = [{'call_id': cid, 'primary_call_reason': reason}
                           for cid, reason in zip(batch, batch_predictions)]
        predictions.extend(new_predictions)
        pbar.update(len(batch))

        # Checkpoint: push everything predicted so far, under a file name that
        # carries the running record count.
        upload_to_hf(predictions, f"{DATASET_NAME}_{len(predictions)}")

# Final upload to Hugging Face
upload_to_hf(predictions, DATASET_NAME)


Predicting:   0%|          | 0/5157 [00:00<?, ?it/s]

CATEGORY: Booking Booking
CATEGORY: Products & services Products & services
CATEGORY: Other topics Other topics
CATEGORY: Booking Booking
CATEGORY: Booking Booking
CATEGORY: Products & services Products & services
CATEGORY: Booking Booking
CATEGORY: Products & services Products & services
CATEGORY: Products & services Products & services


Predicting:   0%|          | 10/5157 [00:15<2:09:10,  1.51s/it]

CATEGORY: Products & services Products & services
Uploaded 80 predictions to Hugging Face dataset: korigamik/call_predictions/call_predictions_80
File RRcall_predictions_80.csv uploaded to huggingface
CATEGORY: Products & services Products & services
CATEGORY: Products & services Products & services


Predicting:   0%|          | 10/5157 [00:19<2:50:55,  1.99s/it]

KeyboardInterrupt



In [None]:
end_time = time.time()
# `start_time` is not guaranteed to exist in the kernel namespace. Report the
# duration only when it does, rather than stopping here with a NameError.
if 'start_time' in globals():
    logging.info(f"Predictions completed in {end_time - start_time:.2f} seconds")
else:
    logging.info("Predictions completed (start time was not recorded, duration unknown)")

# Create a DataFrame with the predictions.
# Naming the columns explicitly keeps the frame well-formed when `predictions`
# is empty; otherwise the column lookup below would raise a KeyError.
predictions_df = pd.DataFrame(predictions, columns=['call_id', 'primary_call_reason'])

# Save the predictions to a CSV file
output_file = 'test_predictions.csv'
predictions_df.to_csv(output_file, index=False)
logging.info(f"Predictions saved to '{output_file}'")

# Log how many calls (and what share of the total) fell into each predicted reason
reason_counts = predictions_df['primary_call_reason'].value_counts()
logging.info("Prediction distribution:")
for reason, count in reason_counts.items():
    logging.info(f"{reason}: {count} ({count/len(predictions_df)*100:.2f}%)")

logging.info("Process completed successfully.")