Датасет: US Consumer Finance Complaints
https://www.kaggle.com/datasets/kaggle/us-consumer-finance-complaints

In [24]:
!pip install pandas matplotlib seaborn

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from google.colab import files
# Ask the user to upload the CSV through the Colab file picker.
uploaded = files.upload()

# Only the first uploaded file is used.
filename = list(uploaded)[0]

chunk_size = 10  # sampling step: keep one line out of every 10


def skip_rows(x):
    """Return True for every line index that is not a multiple of chunk_size."""
    return x % chunk_size != 0


# Line index 0 (the header) is a multiple of chunk_size, so it is never skipped.
df = pd.read_csv(filename, skiprows=skip_rows)




Saving consumer_complaints.csv to consumer_complaints (6).csv


In [25]:
# Column that holds the product category; used as the filter key below.
correct_column_name = 'product'

# Product categories kept for the analysis.
selected_products = [
    'Credit reporting',
    'Debt collection',
    'Mortgage',
    'Credit card',
    'Bank account or service',
]

# Boolean mask of the rows whose product is one of the selected categories.
is_selected = df[correct_column_name].isin(selected_products)
# Copy so that later edits do not touch df.
filtered_df = df.loc[is_selected].copy()
print(f"Отфильтровано строк: {len(filtered_df)} из {len(df)}")

Отфильтровано строк: 50850 из 55595


In [26]:
# List the column labels of the filtered frame.
filtered_df.columns

Index(['date_received', 'product', 'sub_product', 'issue', 'sub_issue',
       'consumer_complaint_narrative', 'company_public_response', 'company',
       'state', 'zipcode', 'tags', 'consumer_consent_provided',
       'submitted_via', 'date_sent_to_company', 'company_response_to_consumer',
       'timely_response', 'consumer_disputed?', 'complaint_id'],
      dtype='object')

**Задача**: Многоклассовая классификация текстовых жалоб потребителей на финансовые продукты/услуги

**Цель**: Разработать модель машинного обучения, которая автоматически определяет категорию финансового продукта на основе текстового описания жалобы потребителя.

In [9]:
# (rows, columns) of the filtered frame.
filtered_df.shape

(50850, 18)

In [10]:
# Display the filtered frame; the rendered output abbreviates the middle rows.
filtered_df

Unnamed: 0,date_received,product,sub_product,issue,sub_issue,consumer_complaint_narrative,company_public_response,company,state,zipcode,tags,consumer_consent_provided,submitted_via,date_sent_to_company,company_response_to_consumer,timely_response,consumer_disputed?,complaint_id
0,09/17/2013,Mortgage,Conventional adjustable mortgage (ARM),"Loan modification,collection,foreclosure",,,,"SunTrust Banks, Inc.",CA,94551,,,Web,09/18/2013,Closed with explanation,Yes,Yes,530602
1,09/17/2013,Credit reporting,,Incorrect information on credit report,Information is not mine,,,Equifax,RI,02921,,,Postal mail,10/10/2013,Closed with explanation,Yes,No,531346
2,08/30/2013,Mortgage,Conventional fixed mortgage,"Loan modification,collection,foreclosure",,,,Nationstar Mortgage,FL,34684,Older American,,Phone,09/04/2013,Closed with explanation,Yes,Yes,510410
3,09/10/2013,Mortgage,Conventional fixed mortgage,"Loan servicing, payments, escrow account",,,,U.S. Bancorp,MO,63138,,,Web,09/10/2013,Closed with explanation,Yes,No,521374
4,09/17/2013,Credit card,,Customer service / Customer relations,,,,U.S. Bancorp,OH,45387,,,Web,09/17/2013,Closed with explanation,Yes,No,531857
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
55589,10/02/2014,Credit reporting,,Incorrect information on credit report,Account status,,,Equifax,,,,,Referral,10/02/2014,Closed with non-monetary relief,Yes,Yes,1054982
55590,08/28/2015,Mortgage,Other mortgage,"Loan modification,collection,foreclosure",,,Company chooses not to provide a public response,Wells Fargo & Company,,,,,Referral,09/14/2015,Closed with explanation,Yes,No,1541810
55591,05/06/2014,Debt collection,"Other (i.e. phone, health club, etc.)",Disclosure verification of debt,Not given enough info to verify debt,,,Revenue Assistance Corporation dba Revenue Group,,,,,Referral,05/09/2014,Closed with explanation,Yes,No,840618
55592,08/05/2015,Bank account or service,Other bank product/service,"Account opening, closing, or management",,,,Capital One,,,,,Referral,08/10/2015,Closed with explanation,Yes,No,1504319


Пропущенные значения:

In [11]:
# Element-wise boolean frame: True where a value is missing.
filtered_df.isnull()

Unnamed: 0,date_received,product,sub_product,issue,sub_issue,consumer_complaint_narrative,company_public_response,company,state,zipcode,tags,consumer_consent_provided,submitted_via,date_sent_to_company,company_response_to_consumer,timely_response,consumer_disputed?,complaint_id
0,False,False,False,False,True,True,True,False,False,False,True,True,False,False,False,False,False,False
1,False,False,True,False,False,True,True,False,False,False,True,True,False,False,False,False,False,False
2,False,False,False,False,True,True,True,False,False,False,False,True,False,False,False,False,False,False
3,False,False,False,False,True,True,True,False,False,False,True,True,False,False,False,False,False,False
4,False,False,True,False,True,True,True,False,False,False,True,True,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
55589,False,False,True,False,False,True,True,False,True,True,True,True,False,False,False,False,False,False
55590,False,False,False,False,True,True,False,False,True,True,True,True,False,False,False,False,False,False
55591,False,False,False,False,False,True,True,False,True,True,True,True,False,False,False,False,False,False
55592,False,False,False,False,True,True,True,False,True,True,True,True,False,False,False,False,False,False


In [12]:
# Number of missing values in each column.
filtered_df.isnull().sum()

Unnamed: 0,0
date_received,0
product,0
sub_product,15759
issue,0
sub_issue,31599
consumer_complaint_narrative,44967
company_public_response,42971
company,0
state,446
zipcode,416


In [13]:
# Feature frame: every column except the target ('product') and 'date_received'.
# NOTE(review): the notebook text says complaint_id will not be used either,
# but it is not dropped here and stays in X — confirm whether it should be.
X = filtered_df.drop(columns=['product', 'date_received'])
# Target: the product category of each complaint.
y = filtered_df.loc[:, 'product']

0: '**date_received**' - Дата получения жалобы

1: '**product**' - Продукт/Услуга

2: '**sub_product**' - Подкатегория продукта

3: '**issue**' - Проблема/Вопрос

4: '**sub_issue**' - Подкатегория проблемы

5: '**consumer_complaint_narrative**' - Описание жалобы потребителя

6: '**company_public_response**' - Публичный ответ компании

7: '**company**' - Компания

8: '**state**' - Штат

9: '**zipcode**' - Почтовый индекс

10: '**tags**' - Метки/Теги

11: '**consumer_consent_provided**' - Согласие потребителя предоставлено

12: '**submitted_via**' - Способ подачи жалобы

13: '**date_sent_to_company**' - Дата отправки компании

14: '**company_response_to_consumer**' - Ответ компании потребителю

15: '**timely_response**' - Своевременный ответ

16: '**consumer_disputed?**' - Оспорено потребителем?

17: '**complaint_id**' - ID жалобы

Не будем использовать date_received, complaint_id

In [14]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the rows for testing; the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

# Number of rows in each partition.
N_train, N_test = len(X_train), len(X_test)

N_train, N_test

(38137, 12713)

In [199]:
# Number of missing values in each column of the training features.
# NOTE(review): this cell's execution count (199) is out of sequence and its saved
# output lists fewer columns than the X_test cell that follows — the output looks
# stale; re-run after a kernel restart to confirm.
X_train.isna().sum()

Unnamed: 0,0
issue,0
consumer_complaint_narrative,33746
company,0
state,334
tags,32824
consumer_consent_provided,29876
submitted_via,0
company_response_to_consumer,0
timely_response,0
consumer_disputed?,0


In [15]:
# Number of missing values in each column of the test features.
X_test.isna().sum()

Unnamed: 0,0
sub_product,3972
issue,0
sub_issue,7848
consumer_complaint_narrative,11221
company_public_response,10718
company,0
state,112
zipcode,106
tags,10919
consumer_consent_provided,9989


In [16]:
import numpy as np
from sklearn.impute import SimpleImputer

# Imputer that replaces NaN with the most frequent value of each column.
imp_cat = SimpleImputer(strategy='most_frequent', missing_values=np.nan)
# Learn the fill values from the training split.
imp_cat.fit(
    X_train[
        [
            'sub_product',
            'sub_issue',
            'state',
            'zipcode',
            'tags',
            'consumer_consent_provided',
        ]
    ]
)

In [17]:
# Columns handled by imp_cat, in the same order it was fitted on.
imputed_cols = [
    'sub_product',
    'sub_issue',
    'state',
    'zipcode',
    'tags',
    'consumer_consent_provided',
]
# Overwrite those columns of the training frame with their imputed values.
X_train[imputed_cols] = imp_cat.transform(X_train[imputed_cols])

In [18]:
# Re-check missing values per column after imputation.
X_train.isna().sum()

Unnamed: 0,0
sub_product,0
issue,0
sub_issue,0
consumer_complaint_narrative,33746
company_public_response,32253
company,0
state,0
zipcode,0
tags,0
consumer_consent_provided,0


# Бинаризация:

In [19]:
from sklearn.preprocessing import OneHotEncoder

# One-hot encoder for the categorical features.
#   drop='if_binary'        - a feature with exactly two categories gets a single
#                             indicator column instead of two.
#   sparse_output=False     - transform() returns a dense array.
#   handle_unknown='ignore' - a category that was not present in the data passed to
#                             fit() is encoded as all zeros instead of raising an
#                             error, so the fitted encoder can be applied to data
#                             other than the training split.
enc = OneHotEncoder(drop='if_binary', sparse_output=False, handle_unknown='ignore')
# Learn the category lists from the training split.
enc.fit(X_train[['issue', 'state', 'company', 'tags', 'submitted_via', 'timely_response']])

In [20]:
# Encode the categorical columns of the training split with the fitted encoder.
encoded = enc.transform(
    X_train[['issue', 'state', 'company', 'tags', 'submitted_via', 'timely_response']]
)
# Wrap the result in a frame that reuses X_train's index, so rows line up when
# the two frames are combined.
dummies = pd.DataFrame(
    encoded,
    index=X_train.index,
    columns=enc.get_feature_names_out(),
)
dummies.head()

Unnamed: 0,issue_APR or interest rate,"issue_Account opening, closing, or management",issue_Advertising and marketing,issue_Application processing delay,"issue_Application, originator, mortgage broker",issue_Arbitration,issue_Balance transfer,issue_Balance transfer fee,issue_Bankruptcy,issue_Billing disputes,...,tags_Older American,"tags_Older American, Servicemember",tags_Servicemember,submitted_via_Email,submitted_via_Fax,submitted_via_Phone,submitted_via_Postal mail,submitted_via_Referral,submitted_via_Web,timely_response_Yes
11785,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
38170,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
28163,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
29658,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
16827,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0


In [21]:
# Append the indicator columns, then remove the raw categorical columns they replace.
X_train = pd.concat([X_train, dummies], axis=1).drop(
    columns=['issue', 'state', 'company', 'tags', 'submitted_via', 'timely_response']
)

In [22]:
# Preview the first rows of the training frame after encoding.
X_train.head()

Unnamed: 0,sub_product,sub_issue,consumer_complaint_narrative,company_public_response,zipcode,consumer_consent_provided,date_sent_to_company,company_response_to_consumer,consumer_disputed?,complaint_id,...,tags_Older American,"tags_Older American, Servicemember",tags_Servicemember,submitted_via_Email,submitted_via_Fax,submitted_via_Phone,submitted_via_Postal mail,submitted_via_Referral,submitted_via_Web,timely_response_Yes
11785,I do not know,Debt is not mine,,,45122,Consent provided,08/19/2014,Closed with explanation,No,990815,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
38170,Other mortgage,Account status,,,37205,Consent provided,08/07/2013,Closed with explanation,No,474764,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
28163,Payday loan,Threatened to take legal action,"On XXXX XXXX, 2015 @ XXXX I was contacted by s...",Company disputes the facts presented in the co...,392XX,Consent provided,10/19/2015,Closed with explanation,Yes,1567946,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
29658,Medical,Right to dispute notice not received,after being denied a home loan i pulled a cred...,Company can't verify or dispute the facts in t...,457XX,Consent provided,09/21/2015,Closed with explanation,No,1573575,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
16827,Other mortgage,Account status,,,04027,Consent provided,01/09/2015,Closed with explanation,No,1182086,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0


In [210]:
# Preview the first rows of the training frame.
# NOTE(review): this cell's execution count (210) is out of sequence and its saved
# output shows a different column set than the In [22] preview — the output looks
# like it comes from another run; re-run after a kernel restart to confirm.
X_train.head()

Unnamed: 0,consumer_consent_provided,company_response_to_consumer,consumer_disputed?,issue_APR or interest rate,"issue_Account opening, closing, or management",issue_Advertising and marketing,issue_Application processing delay,"issue_Application, originator, mortgage broker",issue_Arbitration,issue_Balance transfer,...,submitted_via_Phone,submitted_via_Postal mail,submitted_via_Referral,submitted_via_Web,timely_response_Yes,Credit reporting,Debt collection,Mortgage,Credit card,Bank account or service
11785,,Closed with explanation,No,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,1.0,0,0,0,0,0
38170,,Closed with explanation,No,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,1.0,0,0,0,0,0
28163,Consent provided,Closed with explanation,Yes,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,1.0,0,0,0,0,0
29658,Consent provided,Closed with explanation,No,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,1.0,0,0,0,0,0
16827,,Closed with explanation,No,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,1.0,0,0,0,0,0
