In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.utils import shuffle  # Importing shuffle to randomize the dataset

# Assuming 'https://www.consumerfinance.gov/data-research/consumer-complaints/' is the correct dataset URL
dataset_url = 'https://files.consumerfinance.gov/ccdb/complaints.csv.zip'

df = pd.read_csv(dataset_url, compression='zip')  # Load the dataset

print(df.head())
print(df.info())
print(df.describe())

df.isnull().sum()
df.dropna(subset=['product', 'complaint_narrative'], inplace=True)  # Drop rows with missing values in relevant columns
df = shuffle(df)  # Shuffle the dataset for randomness

sns.histplot(df["product"])
sns.countplot(x="product", data=df)

X = df["complaint_narrative"]

def clean_text(text):
    text = re.sub(r"[^\w\s]", "", text)
    text = text.lower()
    stop_words = set(stopwords.words("english"))
    text = " ".join([word for word in text.split() if word not in stop_words])

    return text

X = X.apply(clean_text)

vectorizer = TfidfVectorizer()
X_vec = vectorizer.fit_transform(X)

y = df["product"]
X_train, X_test, y_train, y_test = train_test_split(X_vec, y, test_size=0.2, random_state=42)

model = MultinomialNB()
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

new_complaint = "I'm having trouble with my mortgage lender..."
new_complaint_vec = vectorizer.transform([clean_text(new_complaint)])  # Clean new text
predicted_category = model.predict(new_complaint_vec)[0]
print("Predicted category:", predicted_category)

  df = pd.read_csv(dataset_url, compression='zip')  # Load the dataset


  Date received                                            Product  \
0    2023-12-13  Credit reporting or other personal consumer re...   
1    2023-12-12  Credit reporting or other personal consumer re...   
2    2023-11-15  Credit reporting or other personal consumer re...   
3    2023-11-15  Credit reporting or other personal consumer re...   
4    2023-12-12  Credit reporting or other personal consumer re...   

        Sub-product                                 Issue  \
0  Credit reporting           Improper use of your report   
1  Credit reporting  Incorrect information on your report   
2  Credit reporting           Improper use of your report   
3  Credit reporting           Improper use of your report   
4  Credit reporting  Incorrect information on your report   

                                           Sub-issue  \
0  Credit inquiries on your report that you don't...   
1                      Account information incorrect   
2  Credit inquiries on your report that you 

KeyError: ['product', 'complaint_narrative']

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive
