In [None]:
# installing the main libraries
!pip install pandas tqdm kaggle transformers torch --quiet

In [None]:
# Import all libraries
import os
import pandas as pd
from tqdm import tqdm
from transformers import pipeline
import re
import nltk
nltk.download('punkt')
nltk.download('punkt_tab')

from nltk.tokenize import sent_tokenize
from transformers import pipeline

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


In [None]:
# move kaggle.json to the correct folder
# Expects kaggle.json in the current working directory (it is copied from a
# relative path and is not created here) - presumably uploaded by hand first.
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
# Restrict the credential file to read/write for its owner only.
!chmod 600 ~/.kaggle/kaggle.json

In [None]:
# Download the bank customer complaint dataset
# Uses the Kaggle CLI; per the recorded output the archive is saved as
# bank-customer-complaint-analysis.zip in the working directory.
!kaggle datasets download -d adhamelkomy/bank-customer-complaint-analysis

Dataset URL: https://www.kaggle.com/datasets/adhamelkomy/bank-customer-complaint-analysis
License(s): CC0-1.0
Downloading bank-customer-complaint-analysis.zip to /content
  0% 0.00/20.0M [00:00<?, ?B/s]
100% 20.0M/20.0M [00:00<00:00, 286MB/s]


In [None]:
#unzipping the dataset
!unzip bank-customer-complaint-analysis.zip

Archive:  bank-customer-complaint-analysis.zip
  inflating: Bank Customer Complaint Analysis for Efficient Dispute Resolution.ipynb  
  inflating: complaints.csv          
  inflating: complaints_report_20240226_183305.txt  
  inflating: final_dataframe (1).csv  


In [None]:
# Load and clean data

# The archive is extracted into the working directory, so read the CSV relative
# to it instead of assuming the Colab-specific /content folder.
df = pd.read_csv('complaints.csv')
print(df.columns)

# Rename the free-text column and drop rows without a complaint in one chained,
# non-mutating step (no inplace=True).
df = (
    df
    .rename(columns={'narrative': 'complaint_text'})
    .dropna(subset=['complaint_text'])
)
print("Total complaints:", len(df))

df.head(3)


Index(['Unnamed: 0', 'product', 'narrative'], dtype='object')
Total complaints: 162411


Unnamed: 0.1,Unnamed: 0,product,complaint_text
0,0,credit_card,purchase order day shipping amount receive pro...
1,1,credit_card,forwarded message date tue subject please inve...
2,2,retail_banking,forwarded message cc sent friday pdt subject f...


In [None]:
# Clean and shorten complaint text before sending it to the summarizer

def clean_and_shorten(text, max_sentences=4):
    """Normalize a complaint and keep only its leading sentences.

    The text is lower-cased, URLs are removed, every character other than
    letters, digits, whitespace and ``. , ! ?`` is replaced by a space,
    whitespace is collapsed, and immediately repeated words
    ("fraud fraud fraud") are reduced to a single occurrence.

    Args:
        text: The complaint; any value is accepted and converted with ``str``.
        max_sentences: Number of leading sentences to keep (default 4).

    Returns:
        The cleaned text limited to the first ``max_sentences`` sentences as
        split by ``sent_tokenize``, joined with single spaces.
    """
    text = str(text).lower()
    text = re.sub(r"http\S+", "", text)
    # NOTE: apostrophes are replaced as well, so "haven't" becomes "haven t".
    text = re.sub(r"[^a-zA-Z0-9\s.,!?]", " ", text)
    # Collapse whitespace *before* removing repeated words: the pattern below
    # only recognises repeats separated by exactly one space, and the
    # substitution above can leave runs of several spaces between them.
    text = re.sub(r"\s+", " ", text).strip()
    text = re.sub(r"\b(\w+)( \1\b)+", r"\1", text)

    # split into sentences and keep only the first few
    sentences = sent_tokenize(text)
    return " ".join(sentences[:max_sentences])

In [None]:
# Load the summarization model from Hugging Face

# Builds a "summarization" pipeline from the philschmid/bart-large-cnn-samsum
# checkpoint. device_map="auto" leaves device placement to the library; the
# recorded run reports "Device set to use cpu".
summarizer = pipeline("summarization", model="philschmid/bart-large-cnn-samsum", device_map="auto")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/300 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

Device set to use cpu


In [None]:
# Generate concise summaries from complaint text using the trained summarization model

def generate_summary(text):
    """Summarize a complaint with the notebook-level ``summarizer`` pipeline.

    Only the first 800 characters of ``text`` are handed to the model, which is
    called with ``min_length=8``, ``max_length=30`` and sampling disabled.

    Args:
        text: Complaint text to summarize.

    Returns:
        The whitespace-stripped summary text, or the string
        ``"Error: <message>"`` if any step raises; exceptions never propagate.
    """
    try:
        # Character-level cut to bound the amount of text sent to the model.
        snippet = text[:800]
        outputs = summarizer(
            snippet,
            do_sample=False,
            early_stopping=True,
            min_length=8,
            max_length=30,
        )
        first_result = outputs[0]
        return first_result["summary_text"].strip()
    except Exception as err:
        return f"Error: {err}"


In [None]:
# Detect basic sentiment (Positive, Negative, Neutral) using simple keyword matching

# Whole-word patterns for the sentiment keywords. Common inflections are listed
# explicitly so that "delayed" or "thanks" still count, while words that merely
# contain a keyword ("dissatisfied", "unhappy", "goods", "badge") do not.
_NEGATIVE_WORDS = re.compile(
    r"\b(?:angry|upset|bad|refund(?:s|ed|ing)?|problems?|issues?|delay(?:s|ed|ing)?)\b"
)
_POSITIVE_WORDS = re.compile(
    r"\b(?:happy|satisfied|thank(?:s|ed|ing|ful)?|good|appreciate[sd]?)\b"
)


def detect_sentiment(text):
    """Classify text as "Negative", "Positive" or "Neutral" by keyword lookup.

    Matching is case-insensitive and on whole words. Negative keywords take
    precedence: text containing both negative and positive keywords is
    labelled "Negative".

    Args:
        text: Text to classify; converted with ``str`` before matching.

    Returns:
        One of the strings "Negative", "Positive" or "Neutral".
    """
    text = str(text).lower()
    if _NEGATIVE_WORDS.search(text):
        return "Negative"
    if _POSITIVE_WORDS.search(text):
        return "Positive"
    return "Neutral"


In [None]:
# Suggest a follow-up action based on the detected sentiment

def suggest_action(sentiment):
    """Map a sentiment label to the follow-up action to recommend.

    Args:
        sentiment: Sentiment label; "Negative" and "Positive" have dedicated
            actions, any other value gets the generic review action.

    Returns:
        The recommended action as a sentence.
    """
    specific_actions = (
        ("Negative", "Apologize and offer resolution or refund."),
        ("Positive", "Acknowledge and thank the customer."),
    )
    for label, action in specific_actions:
        if sentiment == label:
            return action
    # Anything that is not an exact label match falls through to here.
    return "Review and respond as appropriate."

In [74]:
# Run summarization and sentiment detection

# Register tqdm's progress_apply on pandas objects.
tqdm.pandas()

# Draw a fixed (seeded) sample of 10 complaints and renumber its rows from 0.
sample_df = df.sample(n=10, random_state=42)
sample_df = sample_df.reset_index(drop=True)

# Summarize every sampled complaint, with a progress bar while the model runs.
sample_df["Summary"] = sample_df["complaint_text"].progress_apply(generate_summary)

# Polish summaries for readability
def polish_summary(summary):
    """Tidy a generated summary for display.

    Strips surrounding whitespace, upper-cases only the first character and
    makes sure the text ends with sentence punctuation.

    Unlike ``str.capitalize()``, the rest of the text keeps its casing, so
    names and acronyms ("Citibank", "FTC") are not lower-cased. A summary that
    already ends in ``.``, ``!`` or ``?`` gets no extra period, and an empty
    summary is returned as an empty string instead of a lone ".".

    Args:
        summary: Summary text (a string).

    Returns:
        The polished summary string.
    """
    summary = summary.strip()
    if not summary:
        return summary
    summary = summary[0].upper() + summary[1:]
    if not summary.endswith(('.', '!', '?')):
        summary += '.'
    return summary

# Apply the readability polish to every generated summary.
sample_df["Summary"] = sample_df["Summary"].apply(polish_summary)

# Detect sentiment
# Note: sentiment is computed from the complaint text, not from the summary.
sample_df["Sentiment"] = sample_df["complaint_text"].progress_apply(detect_sentiment)

# Suggest actions
# The suggested action is derived from the sentiment label alone.
sample_df["Suggested Action"] = sample_df["Sentiment"].progress_apply(suggest_action)


 40%|████      | 4/10 [00:54<01:13, 12.33s/it]Your max_length is set to 30, but your input_length is only 17. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=8)
 80%|████████  | 8/10 [01:18<00:14,  7.07s/it]Your max_length is set to 30, but your input_length is only 21. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=10)
100%|██████████| 10/10 [01:30<00:00,  6.39s/it]Your max_length is set to 30, but your input_length is only 23. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=11)
100%|██████████| 10/10 [01:35<00:00,  9.58s/it]
100%|██████████| 10/10 [00:00<00:00, 14174.73it/s]
100%|██████████| 10

In [75]:
# Display final output

# Show only the complaint, its summary, the sentiment label and the suggested action.
sample_df[["complaint_text", "Summary", "Sentiment", "Suggested Action"]]


Unnamed: 0,complaint_text,Summary,Sentiment,Suggested Action
0,called acima ask bill hold payment told date p...,Bank stopped paying acima because she owes money.,Neutral,Review and respond as appropriate.
1,sent letter regarding inaccurate unknown thing...,The person who sent a letter regarding inaccur...,Neutral,Review and respond as appropriate.
2,purchased new vehicle alabama purchase price n...,Alama bought a new vehicle and shopped loan se...,Neutral,Review and respond as appropriate.
3,contacted many time failed provide necessary d...,The failure to provide documents to prove acco...,Neutral,Review and respond as appropriate.
4,cancellation trip emailed stating charge done ...,Citibank refused to give credit file claim to ...,Negative,Apologize and offer resolution or refund.
5,received bill immediately called number statem...,"According to the bill, the account was never d...",Positive,Acknowledge and thank the customer.
6,one late payment payment car lease reason paym...,There is a late payment charge for a car lease...,Neutral,Review and respond as appropriate.
7,reliacard money since wont release verified ac...,Reliacard money since they won't release verif...,Positive,Acknowledge and thank the customer.
8,writing delete following information file item...,The federal trade commission asks you to delet...,Neutral,Review and respond as appropriate.
9,never activity account another account consist...,There is a customer agreement called ask get t...,Neutral,Review and respond as appropriate.


In [76]:
# Print the first three sampled complaints next to their generated summaries
for number, position in enumerate(range(3), start=1):
    complaint = sample_df['complaint_text'][position]
    summary = sample_df['Summary'][position]
    print(f"\nComplaint {number}: {complaint}")
    print(f"Summary {number}: {summary}")



Complaint 1: called acima ask bill hold payment told date put block account needed call restart payment send mail sent eft bank amount told would plus bank fee kept going follows ask bank stop acima said could called asima told didnt attempt get money time owe bank fee collection fee nonsufficient fund fraud acima fraud maintenance fee acima fraud nonsufficient fund fraud acima fraud acima fraud nonsufficient fund fraud overdraft fee fraud acima fraud also another collection fee bank
Summary 1: Bank stopped paying acima because she owes money.

Complaint 2: sent letter regarding inaccurate unknown thing credit report day day later received response yet feel like taken advantage ignored dispute section plainly stated failure investigate item within day give reason immediately remove item credit report day deleted promptly demand account deleted immediately file litigation due stress caused information also impacted data breach may got hand wrong person
Summary 2: The person who sent a 

In [77]:
# Save results
# Writes the sampled complaints with their summaries, sentiments and suggested
# actions to a CSV in the working directory; index=False leaves out the row index.
sample_df.to_csv("complaint_summaries.csv", index=False)

In [78]:
# Read the results back from the same relative path they were written to,
# rather than assuming the working directory is Colab's /content.
data = pd.read_csv('complaint_summaries.csv')

In [79]:
# Preview the first rows of the results reloaded from the CSV.
data.head()

Unnamed: 0.1,Unnamed: 0,product,complaint_text,Summary,Sentiment,Suggested Action
0,114592,mortgages_and_loans,called acima ask bill hold payment told date p...,Bank stopped paying acima because she owes money.,Neutral,Review and respond as appropriate.
1,108967,credit_reporting,sent letter regarding inaccurate unknown thing...,The person who sent a letter regarding inaccur...,Neutral,Review and respond as appropriate.
2,104282,mortgages_and_loans,purchased new vehicle alabama purchase price n...,Alama bought a new vehicle and shopped loan se...,Neutral,Review and respond as appropriate.
3,53752,debt_collection,contacted many time failed provide necessary d...,The failure to provide documents to prove acco...,Neutral,Review and respond as appropriate.
4,44725,credit_card,cancellation trip emailed stating charge done ...,Citibank refused to give credit file claim to ...,Negative,Apologize and offer resolution or refund.


In [80]:
# This function takes any new complaint and returns the summary, sentiment and the suggested action required

def predict_complaint_analysis(text):
    """Run the full analysis on a single new complaint.

    The complaint is cleaned and shortened once; the cleaned text is then both
    summarized and classified, and the action is derived from the sentiment.

    Args:
        text: Raw complaint text.

    Returns:
        A dict with the keys "Summary", "Sentiment" and "Suggested Action".
    """
    prepared = clean_and_shorten(text)
    analysis = {"Summary": generate_summary(prepared)}
    analysis["Sentiment"] = detect_sentiment(prepared)
    analysis["Suggested Action"] = suggest_action(analysis["Sentiment"])
    return analysis

In [81]:
# Let's summarise a new customer complaint

# Example complaint passed through the full pipeline; the printed dict holds the
# summary, the sentiment label and the suggested action.
new_text = "I recently applied for a personal loan, and while the application process seemed fine at first, I noticed that my account was charged twice for the same payment. I contacted customer support immediately, but it’s been several days and I still haven’t received any refund or proper response. This double charge has caused unnecessary stress and inconvenience, and I’d really like this issue resolved as soon as possible"
print(predict_complaint_analysis(new_text))


{'Summary': "My account was charged twice for the same payment. I haven't received any refund or proper response from the customer support.", 'Sentiment': 'Negative', 'Suggested Action': 'Apologize and offer resolution or refund.'}


# Thank you