In [1]:
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [2]:
import pandas as pd

file_path = '/content/drive/My Drive/complaints.csv'

# Load only required columns
columns = ['Product', 'Consumer complaint narrative']
df = pd.read_csv(file_path, usecols=columns, low_memory=False)

# Drop missing values
df = df.dropna()

# Filter 4 target categories
categories = [
    'Credit reporting or other personal consumer reports',
    'Debt collection',
    'Consumer Loan',
    'Mortgage'
]
df = df[df['Product'].isin(categories)]

# Sample 100,000 rows for speed
df = df.sample(n=100000, random_state=42).reset_index(drop=True)

# Preview
df.head()


Unnamed: 0,Product,Consumer complaint narrative
0,Debt collection,This complaint involves unlawful third-party c...
1,Debt collection,Phoenix Financial sent a letter on a debt this...
2,Debt collection,"While XXXX from XXXX XXXX XX/XX/year>, my iden..."
3,Mortgage,XX/XX/XXXX received letter from Huntington Ban...
4,Credit reporting or other personal consumer re...,Im submitting a complaint to you today to info...


In [9]:
df.columns = ['Category', 'Text', 'Label']



In [10]:
df.columns = ['Category', 'Text', 'Label']
df.head()


Unnamed: 0,Category,Text,Label
0,Debt collection,This complaint involves unlawful third-party c...,1
1,Debt collection,Phoenix Financial sent a letter on a debt this...,1
2,Debt collection,"While XXXX from XXXX XXXX XX/XX/year>, my iden...",1
3,Mortgage,XX/XX/XXXX received letter from Huntington Ban...,3
4,Credit reporting or other personal consumer re...,Im submitting a complaint to you today to info...,0


In [12]:
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

# Load stopwords just once
stop_words = set(stopwords.words('english'))

def preprocess(text):
    text = str(text).lower()
    text = re.sub(r'[^a-z\s]', '', text)
    tokens = text.split()
    tokens = [word for word in tokens if word not in stop_words]
    return ' '.join(tokens)

# ✅ Apply the optimized function
df['Clean_Text'] = df['Text'].apply(preprocess)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [13]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

X = df['Clean_Text']
y = df['Label']

# Split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# TF-IDF vectorization
vectorizer = TfidfVectorizer(max_features=5000)
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)


In [14]:
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# Logistic Regression
lr = LogisticRegression()
lr.fit(X_train_vec, y_train)
pred_lr = lr.predict(X_test_vec)
print("Logistic Regression:\n", classification_report(y_test, pred_lr))

# Naive Bayes
nb = MultinomialNB()
nb.fit(X_train_vec, y_train)
pred_nb = nb.predict(X_test_vec)
print("Naive Bayes:\n", classification_report(y_test, pred_nb))

# Random Forest
rf = RandomForestClassifier()
rf.fit(X_train_vec, y_train)
pred_rf = rf.predict(X_test_vec)
print("Random Forest:\n", classification_report(y_test, pred_rf))


Logistic Regression:
               precision    recall  f1-score   support

           0       0.94      0.97      0.95     13752
           1       0.87      0.81      0.84      4363
           2       0.75      0.31      0.44       124
           3       0.93      0.93      0.93      1761

    accuracy                           0.92     20000
   macro avg       0.87      0.75      0.79     20000
weighted avg       0.92      0.92      0.92     20000

Naive Bayes:
               precision    recall  f1-score   support

           0       0.94      0.94      0.94     13752
           1       0.80      0.77      0.79      4363
           2       0.75      0.02      0.05       124
           3       0.85      0.95      0.90      1761

    accuracy                           0.90     20000
   macro avg       0.83      0.67      0.67     20000
weighted avg       0.90      0.90      0.90     20000

Random Forest:
               precision    recall  f1-score   support

           0       0.95

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [16]:
# Map label numbers back to category names
reverse_map = {
    0: 'Credit reporting or other personal consumer reports',
    1: 'Debt collection',
    2: 'Consumer Loan',
    3: 'Mortgage'
}

def predict_category(text):
    clean = preprocess(text)
    vec = vectorizer.transform([clean])
    pred = lr.predict(vec)[0]
    return reverse_map[pred]

#  Now try prediction
predict_category("My mortgage company is charging late fees I never agreed to.")


'Mortgage'