In [None]:
!pip install spacy-sentence-bert
!pip install scikit-learn

In [17]:
import pandas as pd
import numpy as np
import spacy
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, confusion_matrix

In [18]:
# Shared spaCy pipeline: used for the NER-based name masking and for the
# Doc.similarity scoring in the cells below.
# NOTE(review): the similarity cell's output shows a warning raised from
# `doc1.similarity(doc2)`, and the notebook's own TODO list proposes a medium
# model with word vectors — confirm the similarity scores are meaningful
# with this small model before relying on them.
nlp = spacy.load("en_core_web_sm")

#### Steps to add to last manipulation
1. Governance and Data Management
- Governance: mask user names in the data
  
2. Data Overview and Preprocessing
- Use the label value distribution to check whether the classes are imbalanced; if they are, decide how to handle it (random oversampling?)
- Add a default class, via dataset augmentation or by predicting the default class based on a confidence threshold
- Consistent labelling: one class for all loan types, one class for credit/debit cards
- Rename some labels
- Use a scatter plot to show the data distribution before and after SMOTE
  
3. Model Evaluation
- Model evaluation: add a confusion matrix
- Increase the number of model types (to 4) to cross-evaluate performance; display all their confusion matrices inline in a single plot
- Add a comparison of model performance on the original and augmented datasets?

4. Retry clean
- Use the medium English model (`en_core_web_md`): it has word vectors for semantic similarity

In [6]:
# Relative path to the raw customer-service conversation CSV.
datapath = "./datasets/banking_csr_conversation.csv"
# Raw frame as read from disk; its columns are renamed in the next cell.
df = pd.read_csv(datapath)

In [10]:
# Rename the loaded columns positionally (the frame must have exactly three
# columns here), then add an empty column that the anonymization cell fills in.
# NOTE(review): this cell is not re-runnable on its own — once the fourth
# column exists, the three-name assignment no longer matches the column
# count. Re-run the load cell first.
df.columns = ["rowid", "interaction", "label"]
df["anonymized_interaction"] = ''

In [43]:
# df.shape
# df = df.drop(['questions', 'anonymized'], axis=1)
# df["anonymized_interaction"] = ''
# df.head(2)

Unnamed: 0,rowid,interaction,label,anonymized_interaction
0,5c69b568-4dd6-41c6-903a-90ab47256acf,"Hi Sarah, my name is Alice. What are the inter...",Loan,
1,a79cec64-9bd7-49a0-9657-81c80fd5ea4c,"Hi Sarah, my name is David, How long does it t...",Loan,


In [35]:
# Sanity check: list every interaction that mentions "Samsung" (any casing),
# treating missing text as "no match".
samsung_mask = df['interaction'].str.contains('Samsung', case=False, na=False)
samsung_lines = df.loc[samsung_mask]
for line in samsung_lines.loc[:, 'interaction']:
    print(line)

Hi Sarah, my name is Alice. Can I link my debit card to a digital wallet, such as Samsung Pay or PayPal? Is there a chance I could obtain some particulars from you?
Hi Sarah, my name is Alice. Hi, I want to know if I can link my debit card to a digital wallet like Samsung Pay or Garmin Pay. Can you provide me with some information?


In [33]:
# Quick check of which entity label the pipeline assigns to a brand name.
testdoc = nlp("This is a Samsung.")
processed = []
for entity in testdoc.ents:
    processed.append((entity.text, entity.label_))
print(processed)

[('Samsung', 'ORG')]


### Helper methods

In [82]:
def anonymize_names_v2(doc):
    """Mask every PERSON entity of a parsed document with a numbered placeholder.

    Args:
        doc: Parsed document exposing ``text`` and ``ents``; each entity must
            provide ``label_``, ``start_char`` and ``end_char``.

    Returns:
        A ``(new_text, replacements)`` tuple. ``new_text`` is ``doc.text`` with
        each PERSON span replaced by ``[NAME_<n>]`` (numbered from 1 in the
        order the entities appear in ``doc.ents``). ``replacements`` maps each
        replaced ``(start_char, end_char)`` span of the original text to its
        placeholder.
    """
    person_spans = [
        (entity.start_char, entity.end_char)
        for entity in doc.ents
        if entity.label_ == "PERSON"
    ]
    replacements = {
        span: f"[NAME_{number}]"
        for number, span in enumerate(person_spans, start=1)
    }

    # Substitute right-to-left so earlier character offsets stay valid while
    # the text changes length.
    new_text = doc.text
    for span in sorted(replacements, reverse=True):
        start, end = span
        new_text = "".join((new_text[:start], replacements[span], new_text[end:]))

    return new_text, replacements

def deanonymize_text(text, mapping, original_text=None):
    """Put the original names back in place of their placeholders.

    Two mapping layouts are supported:

    * ``{original_name: placeholder}`` — names are given directly.
    * ``{(start_char, end_char): placeholder}`` — the layout returned by
      ``anonymize_names_v2``. The spans index into the text as it was *before*
      masking, so ``original_text`` must be supplied to look the names up.

    Args:
        text: Text containing placeholders such as ``[NAME_1]``.
        mapping: Placeholder mapping in one of the layouts above. ``None`` or
            an empty mapping leaves ``text`` unchanged.
        original_text: Unmasked source text; required only when ``mapping`` is
            keyed by character spans.

    Returns:
        ``text`` with every known placeholder replaced by its original name.

    Raises:
        TypeError: If ``mapping`` is keyed by spans and ``original_text`` is
            not provided.

    Example (restoring a DataFrame column):
        df['deanonymized'] = df.apply(
            lambda row: deanonymize_text(
                row['anonymized_interaction'], row['name_mapping'], row['interaction']
            ),
            axis=1,
        )
    """
    if not mapping:
        return text

    placeholder_to_original = {}
    for original, placeholder in mapping.items():
        if isinstance(original, tuple):
            # Span key: the name itself was never stored, only its position
            # in the unmasked text.
            if original_text is None:
                raise TypeError(
                    "mapping is keyed by (start, end) spans; pass original_text "
                    "so the original names can be recovered"
                )
            start, end = original
            original = original_text[start:end]
        placeholder_to_original[placeholder] = original

    for placeholder, original in placeholder_to_original.items():
        text = text.replace(placeholder, original)
    return text

def text_topic_similarity(text, topic):
    """Return the spaCy similarity between an interaction text and a topic.

    Each argument may be a raw string or an already-parsed document. Strings
    are parsed with the notebook-level ``nlp`` pipeline. Objects that already
    expose a ``similarity`` method (such as a pre-computed label ``Doc``) are
    used as they are, so they are not pushed through the pipeline again on
    every call.

    Args:
        text: Interaction text (``str``) or parsed document.
        topic: Topic label (``str``) or parsed document.

    Returns:
        The result of calling ``similarity`` on the text document with the
        topic document.
    """
    def _as_doc(value):
        # Plain strings have no ``similarity`` attribute, so they get parsed.
        return value if hasattr(value, "similarity") else nlp(value)

    doc1 = _as_doc(text)
    doc2 = _as_doc(topic)
    return doc1.similarity(doc2)

In [81]:
# texts = df['interaction'].tolist()
# docs = list(nlp.pipe(texts, batch_size=50))
# df["anonymized_interaction"] = [anonymize_names(doc) for doc in nlp.pipe(df['interaction'], batch_size=50)]
# df["anonymized_interaction"] = [anonymize_names_v2(doc) for doc in nlp.pipe(df['interaction'], batch_size=50)]

# Stream every interaction through the spaCy pipeline in batches and mask the
# person names, keeping each row's span -> placeholder mapping.
anonymization_outputs = [
    anonymize_names_v2(parsed_doc)
    for parsed_doc in nlp.pipe(df['interaction'], batch_size=50)
]
anonymized_results = [masked_text for masked_text, _ in anonymization_outputs]
replacement_maps = [span_map for _, span_map in anonymization_outputs]

# Attach the masked text and its mapping to the frame
df['name_mapping'] = replacement_maps
df['anonymized_interaction'] = anonymized_results

In [52]:
# Preview the first two rows with the masked text and its mapping
df.iloc[:2]

Unnamed: 0,rowid,interaction,label,anonymized_interaction,name_mapping
0,5c69b568-4dd6-41c6-903a-90ab47256acf,"Hi Sarah, my name is Alice. What are the inter...",Loan,"Hi [NAME_1], my name is [NAME_2]. What are the...","{(3, 8): '[NAME_1]', (21, 26): '[NAME_2]'}"
1,a79cec64-9bd7-49a0-9657-81c80fd5ea4c,"Hi Sarah, my name is David, How long does it t...",Loan,"Hi [NAME_1], my name is [NAME_2], How long doe...","{(3, 8): '[NAME_1]', (21, 26): '[NAME_2]'}"
2,a8041599-b4bd-4adf-b490-cdf9b0498cb5,"Hi Sarah, my name is Emily, What documents do...",Loan,"Hi [NAME_1], my name is Emily, What documents...","{(3, 8): '[NAME_1]'}"
3,41408428-a2f5-4516-8272-f99cdb5225d8,"Hi Sarah, my name is Michael, Can I get a loa...",Loan,"Hi [NAME_1], my name is [NAME_2], Can I get a...","{(3, 8): '[NAME_1]', (21, 28): '[NAME_2]'}"
4,b457f4b4-9060-4f05-b34f-61f41d003a9f,"Hi Sarah, my name is Alice. Are there any upfr...",Loan,"Hi [NAME_1], my name is [NAME_2]. Are there an...","{(3, 8): '[NAME_1]', (21, 26): '[NAME_2]'}"


### Value distribution and label processing

In [55]:
# Work on an independent copy of the masked columns: later cells add columns
# to this frame, and writing to a bare column subset of `df` can trigger
# pandas' SettingWithCopyWarning.
df_anonymized = df[["rowid", "anonymized_interaction", "label"]].copy()
# df_anonymized.head(3)

In [66]:
# Rows whose label is exactly "Sales"
overview = df_anonymized.loc[df_anonymized['label'].eq("Sales")]

In [71]:
#overview

In [69]:
# Keep the first five rows and print each masked interaction in full
overview = overview.iloc[:5]
for line in overview.loc[:, "anonymized_interaction"]:
    print(line)

Hi [NAME_1], my name is [NAME_2], Hello, I want to apply for a credit card with rewards for beauty and wellness purchases. What are my choices? Might you be able to supply me with the necessary information?
Hi [NAME_1], my name is [NAME_2], I'm looking for a credit card with no annual fee. Are there any cards on sale that match my requirement? I'm interested in obtaining some data. Could you assist me?
Hi [NAME_1], my name is Emily,  I'm interested in a credit card that offers bonus points. Are there any special promotions on bonus rewards? Is there a chance I could obtain some particulars from you?
Hi [NAME_1], my name is [NAME_2],  I want a credit card with exclusive discounts. Are there any sales or deals on cards with discount benefits? I'm interested in obtaining some data. Could you assist me?
Hi [NAME_1], my name is [NAME_2]. I want a credit card that offers rewards on grocery purchases. Are there any sales or promotions on cards with grocery rewards? Would it be possible for yo

In [75]:
## proper label setting
# Canonical topic names; every remapped label below must be one of these.
topics = ['Account Details', 'New Account', 'Account Inquiries', 'Product Inquiries',
          'Debit/Credit Card Issues', 'Transactions & Payments',  'Loan and Credit Services',
          'Fraud and Security Concerns','Fees and Charges','General Financial Advice',
          'Appointments', 'Technical Support', 'Escalations', 'Complaints', 'Other']

# Raw dataset label -> canonical topic name.
label_renames = {
    "AccountDetails": "Account Details",
    "Loan": "Loan and Credit Services",
    "HomeLoan": "Loan and Credit Services",
    "AutoLoan": "Loan and Credit Services",
    # Card labels must use the exact topic name, otherwise they can never be
    # compared with a label predicted from `topics`.
    "DebitCard": "Debit/Credit Card Issues",
    "CreditCard": "Debit/Credit Card Issues",
    "Offers": "Product Inquiries",
    "Grievances": "Complaints",
    "TechSupport": "Technical Support",
}

# Guard against remap targets drifting away from the topic list.
unknown_targets = set(label_renames.values()) - set(topics)
assert not unknown_targets, f"remap targets missing from topics: {sorted(unknown_targets)}"

df_anonymized = df_anonymized.replace({'label': label_renames})

In [78]:
#df_anonymized["label"].value_counts()
# Placeholder columns; the scoring cells further down overwrite them.
for column_name, initial_value in (
    ("original_label_confidence", 0),
    ("new_label", ""),
    ("new_label_confidence", 0),
):
    df_anonymized[column_name] = initial_value

In [83]:
# Preview the first two rows of the working frame
df_anonymized.iloc[:2]

Unnamed: 0,rowid,anonymized_interaction,label,original_label_confidence,new_label,new_label_confidence
0,5c69b568-4dd6-41c6-903a-90ab47256acf,"Hi [NAME_1], my name is [NAME_2]. What are the...",Loan and Credit Services,0,,0
1,a79cec64-9bd7-49a0-9657-81c80fd5ea4c,"Hi [NAME_1], my name is [NAME_2], How long doe...",Loan and Credit Services,0,,0


In [88]:
# Parse each row's label with spaCy, then score the masked interaction
# against its own label (similarity scaled by 100).
df_anonymized["label_doc"] = df_anonymized["label"].apply(nlp)


def _label_confidence(row):
    """Similarity between a row's masked interaction and its label Doc, times 100."""
    score = text_topic_similarity(row["anonymized_interaction"], row["label_doc"])
    return score * 100


df_anonymized["original_label_confidence"] = df_anonymized.apply(_label_confidence, axis=1)

  return doc1.similarity(doc2)


In [87]:
# Show each label, its parsed Doc, and the similarity score
df_anonymized.loc[:, ["label", "label_doc", "original_label_confidence"]]

Unnamed: 0,label,label_doc,original_label_confidence
0,Loan and Credit Services,"(Loan, and, Credit, Services)",0.078143
1,Loan and Credit Services,"(Loan, and, Credit, Services)",0.025535
2,Loan and Credit Services,"(Loan, and, Credit, Services)",0.004194
3,Loan and Credit Services,"(Loan, and, Credit, Services)",-0.050527
4,Loan and Credit Services,"(Loan, and, Credit, Services)",0.024130
...,...,...,...
1583,Complaints,(Complaints),0.243543
1584,Complaints,(Complaints),0.135744
1585,Complaints,(Complaints),0.106983
1586,Complaints,(Complaints),0.079432


In [None]:
!pip install -U sentence-transformers

In [None]:
from sentence_transformers import SentenceTransformer, util
from tqdm import tqdm
# Sentence-embedding model used below to encode both the topic names and the
# masked interactions.
model = SentenceTransformer("all-MiniLM-L6-v2")  

In [None]:
# List of class labels
topics = ['Account Details', 'New Account', 'Account Inquiries', 'Product Inquiries',
          'Debit/Credit Card Issues', 'Transactions & Payments',  'Loan and Credit Services',
          'Fraud and Security Concerns','Fees and Charges','General Financial Advice',
          'Appointments', 'Technical Support', 'Escalations', 'Complaints', 'Other']

# Precompute embeddings
topics_embeddings = model.encode(topics, convert_to_tensor=True)
text_inputs = df_anonymized["anonymized_interaction"].tolist()
text_embeddings = model.encode(text_inputs, convert_to_tensor=True)

# Score every interaction against every topic in a single call instead of a
# Python loop per row: similarity_matrix[i][j] is the cosine similarity
# between interaction i and topic j.
if text_inputs:
    similarity_matrix = util.cos_sim(text_embeddings, topics_embeddings)
    # Row-wise maximum: best score and the index of the matching topic.
    best_scores, best_indices = similarity_matrix.max(dim=1)
    predicted_labels = [topics[topic_idx] for topic_idx in best_indices.tolist()]
    confidence_scores = best_scores.tolist()
else:
    # Nothing to classify; keep the result columns empty.
    predicted_labels, confidence_scores = [], []

# Add results to the dataframe
df_anonymized["new_label"] = predicted_labels
df_anonymized["new_label_confidence"] = confidence_scores