In [None]:
import pandas as pd
import numpy as np
import re
import string

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

# ✅ Load dataset safely (ignore badly formatted rows)
file_path = '/content/consumer_complaints.csv'

df = pd.read_csv(
    file_path,
    sep=",",
    quotechar='"',
    on_bad_lines='skip',   # skips malformed lines
    engine="python",       # more tolerant parser
    encoding="utf-8"       # ensure encoding
)

print("Dataset shape:", df.shape)
print("Columns:", df.columns)

# ✅ Pick the complaint text and product (target label)
df = df[['consumer_complaint_narrative', 'product']].dropna()

# ✅ Clean text function
def clean_text(text):
    text = text.lower()
    text = re.sub(r"http\S+", "", text)  # remove links
    text = re.sub(r"\d+", "", text)      # remove numbers
    text = text.translate(str.maketrans("", "", string.punctuation))  # remove punctuation
    text = text.strip()
    return text

df['clean_text'] = df['consumer_complaint_narrative'].apply(clean_text)

# ✅ Train/test split
X_train, X_test, y_train, y_test = train_test_split(
    df['clean_text'], df['product'], test_size=0.2, random_state=42, stratify=df['product']
)

# ✅ Vectorization
vectorizer = TfidfVectorizer(stop_words="english", max_features=5000)
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# ✅ Train model (Logistic Regression for classification)
model = LogisticRegression(max_iter=300)
model.fit(X_train_vec, y_train)

# ✅ Predictions
y_pred = model.predict(X_test_vec)

# ✅ Evaluation
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))


Dataset shape: (346632, 18)
Columns: Index(['date_received', 'product', 'sub_product', 'issue', 'sub_issue',
       'consumer_complaint_narrative', 'company_public_response', 'company',
       'state', 'zipcode', 'tags', 'consumer_consent_provided',
       'submitted_via', 'date_sent_to_company', 'company_response_to_consumer',
       'timely_response', 'consumer_disputed?', 'complaint_id'],
      dtype='object')
Accuracy: 0.8436581161918281

Classification Report:
                          precision    recall  f1-score   support

Bank account or service       0.79      0.79      0.79       813
          Consumer Loan       0.77      0.60      0.67       530
            Credit card       0.80      0.81      0.81      1130
       Credit reporting       0.85      0.85      0.85      1853
        Debt collection       0.82      0.89      0.85      2686
        Money transfers       0.76      0.54      0.63       100
               Mortgage       0.93      0.95      0.94      2227
Other fi

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.svm import LinearSVC
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Download NLTK resources
nltk.download('stopwords')
nltk.download('wordnet')

# ============== LOAD DATA SAFELY ===================
file_path = "/content/consumer_complaints.csv"

df = pd.read_csv(
    file_path,
    on_bad_lines='skip',
    low_memory=False
)

# ============== BASIC CLEANING ===================
# We assume 'consumer_complaint_narrative' is text and 'product' is label
df = df[['consumer_complaint_narrative', 'product']].dropna()

X = df['consumer_complaint_narrative'].astype(str)
y = df['product']

# ============== TEXT PREPROCESSING ===================
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def clean_text(text):
    text = re.sub(r'[^a-zA-Z]', ' ', text)   # keep only letters
    text = text.lower().split()
    text = [lemmatizer.lemmatize(word) for word in text if word not in stop_words]
    return " ".join(text)

X = X.apply(clean_text)

# ============== TF-IDF VECTORIZATION ===================
tfidf = TfidfVectorizer(max_features=5000, ngram_range=(1,2))
X_tfidf = tfidf.fit_transform(X)

# ============== FEATURE SELECTION ===================
selector = SelectKBest(chi2, k=3000)
X_selected = selector.fit_transform(X_tfidf, y)

# ============== TRAIN / TEST SPLIT ===================
X_train, X_test, y_train, y_test = train_test_split(
    X_selected, y, test_size=0.2, random_state=42, stratify=y
)

# ============== MODEL OPTIMIZATION ===================
svm = LinearSVC()

param_grid = {
    'C': [0.1, 1, 10],
    'max_iter': [2000, 4000]
}

grid = GridSearchCV(svm, param_grid, cv=3, n_jobs=-1, verbose=2, scoring='f1_weighted')
grid.fit(X_train, y_train)

best_model = grid.best_estimator_

# ============== EVALUATION ===================
y_pred = best_model.predict(X_test)

print("Best Params:", grid.best_params_)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Fitting 3 folds for each of 6 candidates, totalling 18 fits
Best Params: {'C': 1, 'max_iter': 2000}
Accuracy: 0.844672006488898

Classification Report:
                          precision    recall  f1-score   support

Bank account or service       0.81      0.78      0.79       813
          Consumer Loan       0.75      0.58      0.65       530
            Credit card       0.79      0.82      0.81      1130
       Credit reporting       0.85      0.86      0.86      1853
        Debt collection       0.83      0.88      0.85      2686
        Money transfers       0.61      0.56      0.58       100
               Mortgage       0.93      0.95      0.94      2227
Other financial service       0.00      0.00      0.00        16
            Payday loan       0.51      0.33      0.40       109
           Prepaid card       0.80      0.47      0.59        68
           Student loan       0.86      0.81      0.84       331

               accuracy                           0.84      9863


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
# ==============================
# Optimized Ticket Classification
# ==============================
import pandas as pd
import re
import string
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import joblib
from google.colab import files

# ==============================
# 1. Load dataset safely
# ==============================
file_path = '/content/consumer_complaints.csv'
df = pd.read_csv(file_path, on_bad_lines='skip', low_memory=False, encoding='utf-8')

# Keep only relevant columns
df = df[['consumer_complaint_narrative', 'product']].dropna()

# ==============================
# 2. Preprocess text
# ==============================
def preprocess(text):
    text = str(text).lower()                       # lowercase
    text = re.sub(r'\d+', '', text)                # remove numbers
    text = text.translate(str.maketrans('', '', string.punctuation))  # remove punctuation
    text = re.sub(r'\s+', ' ', text).strip()      # remove extra spaces
    return text

df['text'] = df['consumer_complaint_narrative'].apply(preprocess)
X = df['text']
y = df['product']

# ==============================
# 3. Train-Test Split
# ==============================
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# ==============================
# 4. TF-IDF Vectorization
# ==============================
vectorizer = TfidfVectorizer(
    max_features=20000,     # optimized features
    ngram_range=(1,2),      # unigrams + bigrams
    stop_words='english'
)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# ==============================
# 5. Train Optimized Classifier
# ==============================
model = LinearSVC(
    class_weight='balanced',   # handle imbalance
    C=1.0,
    max_iter=3000               # ensure convergence
)
model.fit(X_train_tfidf, y_train)

# ==============================
# 6. Evaluate
# ==============================
y_pred = model.predict(X_test_tfidf)

print("✅ Accuracy:", accuracy_score(y_test, y_pred))
print("\n📊 Classification Report:\n", classification_report(y_test, y_pred))
print("\n🔹 Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

# ==============================
# 7. Save & Download Model
# ==============================
joblib.dump((model, vectorizer), "optimized_complaint_classifier.joblib")
files.download("optimized_complaint_classifier.joblib")
print("✅ Model & Vectorizer saved and ready to download!")

✅ Accuracy: 0.8494372908851262

📊 Classification Report:
                          precision    recall  f1-score   support

Bank account or service       0.79      0.78      0.78       813
          Consumer Loan       0.69      0.65      0.67       530
            Credit card       0.80      0.83      0.81      1130
       Credit reporting       0.86      0.87      0.87      1853
        Debt collection       0.86      0.86      0.86      2686
        Money transfers       0.62      0.65      0.64       100
               Mortgage       0.94      0.94      0.94      2227
Other financial service       0.75      0.19      0.30        16
            Payday loan       0.46      0.44      0.45       109
           Prepaid card       0.72      0.65      0.68        68
           Student loan       0.85      0.86      0.85       331

               accuracy                           0.85      9863
              macro avg       0.76      0.70      0.71      9863
           weighted avg       

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

✅ Model & Vectorizer saved and ready to download!


In [None]:
# ==============================
# Optimized Ticket Classification (High Accuracy)
# ==============================
import pandas as pd
import re
import string
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import joblib
from google.colab import files

# ==============================
# 1. Load dataset safely
# ==============================
file_path = '/content/consumer_complaints.csv'
df = pd.read_csv(file_path, on_bad_lines='skip', low_memory=False, encoding='utf-8')

# Keep only relevant columns
df = df[['consumer_complaint_narrative', 'product']].dropna()

# ==============================
# 2. Merge rare classes
# ==============================
class_counts = df['product'].value_counts()
rare_classes = class_counts[class_counts < 50].index
df['product'] = df['product'].replace(rare_classes, 'Other financial service')

# ==============================
# 3. Preprocess text
# ==============================
def preprocess(text):
    text = str(text).lower()                       # lowercase
    text = re.sub(r'\d+', '', text)                # remove numbers
    text = text.translate(str.maketrans('', '', string.punctuation))  # remove punctuation
    text = re.sub(r'\s+', ' ', text).strip()      # remove extra spaces
    return text

df['text'] = df['consumer_complaint_narrative'].apply(preprocess)
X = df['text']
y = df['product']

# ==============================
# 4. Train-Test Split
# ==============================
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# ==============================
# 5. TF-IDF Vectorization (trigrams)
# ==============================
vectorizer = TfidfVectorizer(
    max_features=30000,    # increase features for accuracy
    ngram_range=(1,3),     # unigrams + bigrams + trigrams
    stop_words='english'
)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# ==============================
# 6. Train LinearSVC
# ==============================
model = LinearSVC(
    class_weight='balanced',  # handles imbalanced classes
    C=1.0,
    max_iter=4000             # ensure convergence
)
model.fit(X_train_tfidf, y_train)

# ==============================
# 7. Evaluate
# ==============================
y_pred = model.predict(X_test_tfidf)

print("✅ Accuracy:", accuracy_score(y_test, y_pred))
print("\n📊 Classification Report:\n", classification_report(y_test, y_pred))
print("\n🔹 Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

# ==============================
# 8. Save & Download Model
# ==============================
joblib.dump((model, vectorizer), "high_accuracy_complaint_classifier.joblib")
files.download("high_accuracy_complaint_classifier.joblib")
print("✅ Model & Vectorizer saved and ready to download!")

✅ Accuracy: 0.8535942411031127

📊 Classification Report:
                          precision    recall  f1-score   support

Bank account or service       0.79      0.79      0.79       813
          Consumer Loan       0.71      0.65      0.68       530
            Credit card       0.80      0.84      0.82      1130
       Credit reporting       0.86      0.87      0.87      1853
        Debt collection       0.86      0.86      0.86      2686
        Money transfers       0.63      0.64      0.64       100
               Mortgage       0.94      0.95      0.95      2227
Other financial service       0.50      0.12      0.20        16
            Payday loan       0.48      0.44      0.46       109
           Prepaid card       0.71      0.59      0.65        68
           Student loan       0.86      0.86      0.86       331

               accuracy                           0.85      9863
              macro avg       0.74      0.69      0.71      9863
           weighted avg       

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

✅ Model & Vectorizer saved and ready to download!


Final_ model using support vector machine

In [19]:

import pandas as pd
import re
import string
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import joblib
from google.colab import files


file_path = '/content/consumer_complaints.csv'
df = pd.read_csv(file_path, on_bad_lines='skip', low_memory=False, encoding='utf-8')

# Keep only relevant columns
df = df[['consumer_complaint_narrative', 'product']].dropna()

class_counts = df['product'].value_counts()
rare_classes = class_counts[class_counts < 50].index
df['product'] = df['product'].replace(rare_classes, 'Other financial service')

def preprocess(text):
    text = str(text).lower()                       # lowercase
    text = re.sub(r'\d+', '', text)                # remove numbers
    text = text.translate(str.maketrans('', '', string.punctuation))  # remove punctuation
    text = re.sub(r'\s+', ' ', text).strip()      # remove extra spaces
    return text

df['text'] = df['consumer_complaint_narrative'].apply(preprocess)
X = df['text']
y = df['product']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)


vectorizer = TfidfVectorizer(
    max_features=30000,    # increase features for accuracy
    ngram_range=(1,3),     # unigrams + bigrams + trigrams
    stop_words='english'
)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)


model = LinearSVC(
    class_weight='balanced',  # handles imbalanced classes
    C=1.0,
    max_iter=4000             # ensure convergence
)
model.fit(X_train_tfidf, y_train)


y_pred = model.predict(X_test_tfidf)

print("✅ Accuracy:", accuracy_score(y_test, y_pred))
print("\n📊 Classification Report:\n", classification_report(y_test, y_pred))
print("\n🔹 Confusion Matrix:\n", confusion_matrix(y_test, y_pred))


joblib.dump((model, vectorizer), "final_accuracy_complaint_classifier.joblib")
files.download("high_accuracy_complaint_classifier.joblib")
print("✅ Model & Vectorizer saved and ready to download!")

✅ Accuracy: 0.8535942411031127

📊 Classification Report:
                          precision    recall  f1-score   support

Bank account or service       0.79      0.79      0.79       813
          Consumer Loan       0.71      0.65      0.68       530
            Credit card       0.80      0.84      0.82      1130
       Credit reporting       0.86      0.87      0.87      1853
        Debt collection       0.86      0.86      0.86      2686
        Money transfers       0.63      0.64      0.64       100
               Mortgage       0.94      0.95      0.95      2227
Other financial service       0.50      0.12      0.20        16
            Payday loan       0.48      0.44      0.46       109
           Prepaid card       0.71      0.59      0.65        68
           Student loan       0.86      0.86      0.86       331

               accuracy                           0.85      9863
              macro avg       0.74      0.69      0.71      9863
           weighted avg       

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

✅ Model & Vectorizer saved and ready to download!


In [21]:
import joblib

# Load the trained model and vectorizer
model, vectorizer = joblib.load("high_accuracy_complaint_classifier.joblib")

# Example complaints to test
sample_complaints = [
    "I was charged extra fees on my credit card without notice.",
    "My mortgage application has been delayed for months.",
    "The loan interest rate is much higher than promised.",
    "Unauthorized transactions happened on my bank account."
]

# Vectorize the sample complaints
sample_complaints_tfidf = vectorizer.transform(sample_complaints)

# Predict categories
predictions = model.predict(sample_complaints_tfidf)

# Show results
for text, label in zip(sample_complaints, predictions):
    print(f"Complaint: {text}")
    print(f"Predicted Category: {label}")
    print("-" * 50)

Complaint: I was charged extra fees on my credit card without notice.
Predicted Category: Credit card
--------------------------------------------------
Complaint: My mortgage application has been delayed for months.
Predicted Category: Mortgage
--------------------------------------------------
Complaint: The loan interest rate is much higher than promised.
Predicted Category: Mortgage
--------------------------------------------------
Complaint: Unauthorized transactions happened on my bank account.
Predicted Category: Bank account or service
--------------------------------------------------


In [25]:
import joblib

# Load your trained model
model, vectorizer = joblib.load("/content/final_accuracy_complaint_classifier.joblib")

# Tricky test complaints
test_samples = [
    "The lender keeps calling me about late payments on a loan that I never took, and now it is showing up on my credit report.",
    "I tried to close my savings account, but the bank continued charging me monthly service fees and overdraft charges.",
    "I was approved for a credit card, but the company suddenly lowered my credit limit without explanation, which hurt my credit score.",
    "My mortgage servicer applied my payments to the wrong account, and now I’m being reported as delinquent.",
    "The debt collector is threatening legal action even though I have proof that my loan was already paid off."
]

sample_complaints_tfidf = vectorizer.transform(test_samples)

# Make predictions
predictions = model.predict(sample_complaints_tfidf)

# Display results
for complaint, label in zip(test_samples, predictions):
    print(f"Complaint: {complaint}\nPredicted Category: {label}\n")

Complaint: The lender keeps calling me about late payments on a loan that I never took, and now it is showing up on my credit report.
Predicted Category: Consumer Loan

Complaint: I tried to close my savings account, but the bank continued charging me monthly service fees and overdraft charges.
Predicted Category: Bank account or service

Complaint: I was approved for a credit card, but the company suddenly lowered my credit limit without explanation, which hurt my credit score.
Predicted Category: Credit card

Complaint: My mortgage servicer applied my payments to the wrong account, and now I’m being reported as delinquent.
Predicted Category: Mortgage

Complaint: The debt collector is threatening legal action even though I have proof that my loan was already paid off.
Predicted Category: Debt collection

