In [1]:
!wget https://files.consumerfinance.gov/ccdb/complaints.csv.zip

--2023-06-13 09:57:03--  https://files.consumerfinance.gov/ccdb/complaints.csv.zip
Resolving files.consumerfinance.gov (files.consumerfinance.gov)... 52.84.45.28, 52.84.45.27, 52.84.45.65, ...
Connecting to files.consumerfinance.gov (files.consumerfinance.gov)|52.84.45.28|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 565215793 (539M) [binary/octet-stream]
Saving to: ‘complaints.csv.zip’


2023-06-13 09:57:41 (14.6 MB/s) - ‘complaints.csv.zip’ saved [565215793/565215793]



In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split,cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier

In [3]:
!unzip complaints.csv.zip

chunk = pd.read_csv('complaints.csv',chunksize=1000000, dtype=object)
df = pd.concat(chunk)

Archive:  complaints.csv.zip
  inflating: complaints.csv          


In [4]:
# Narrow the frame to the complaint text and its product label, then preview it.
df = df.loc[:, ['Consumer complaint narrative', 'Product']]
print(df.head())

                        Consumer complaint narrative  \
0                                                NaN   
1  Please help ASAP, Ive found the following acco...   
2                                                NaN   
3                                                NaN   
4                                                NaN   

                                             Product  
0          Payday loan, title loan, or personal loan  
1  Credit reporting, credit repair services, or o...  
2  Credit reporting, credit repair services, or o...  
3  Credit reporting, credit repair services, or o...  
4  Credit reporting, credit repair services, or o...  


In [5]:
# Product labels kept for classification; rows with any other product are filtered out.
classification_categories = [
    'Credit reporting, credit repair services, or other personal consumer reports',
    'Debt collection',
    'Consumer Loan',
    'Mortgage',
]

keep_mask = df['Product'].isin(classification_categories)
df = df[keep_mask]
print(df.head())

                        Consumer complaint narrative  \
1  Please help ASAP, Ive found the following acco...   
2                                                NaN   
3                                                NaN   
4                                                NaN   
5  Please consider this as my formal complaint to...   

                                             Product  
1  Credit reporting, credit repair services, or o...  
2  Credit reporting, credit repair services, or o...  
3  Credit reporting, credit repair services, or o...  
4  Credit reporting, credit repair services, or o...  
5  Credit reporting, credit repair services, or o...  


In [6]:
# Report null counts and row totals before and after removing rows with missing values.
print('\nBefore dropping the nulls')
print('Null count', df.isna().sum())
print('Total rows of data', len(df))
# Rebind rather than dropna(inplace=True): df is the result of a boolean filter, and
# mutating such a frame in place can trigger pandas' SettingWithCopyWarning.
df = df.dropna()
print('='*80)
print('After dropping the nulls')
print('Null count', df.isna().sum())
print('Total rows of data', len(df))

print("\n")
print(df.head())
print('\nShape of data',df.shape)


Before dropping the nulls
Null count Consumer complaint narrative    1827170
Product                               0
dtype: int64
Total rows of data 2834189
After dropping the nulls
Null count Consumer complaint narrative    0
Product                         0
dtype: int64
Total rows of data 1007019


                          Consumer complaint narrative  \
1    Please help ASAP, Ive found the following acco...   
5    Please consider this as my formal complaint to...   
8    XXXX deleted this debt from my credit report b...   
21   I never received a bill or heard anything from...   
110  Seriously, it's been months since I investigat...   

                                               Product  
1    Credit reporting, credit repair services, or o...  
5    Credit reporting, credit repair services, or o...  
8                                      Debt collection  
21                                     Debt collection  
110  Credit reporting, credit repair services, or o...  

Shap

In [7]:
print("\n\nTrain/validation/test split\n")
# First hold out 15% of the rows as the test set, stratified by product label.
X_train, X_test, y_train, y_test = train_test_split(
    df['Consumer complaint narrative'], df['Product'],
    test_size=0.15, random_state=0, stratify=df['Product'])

# Then carve 25% of the remaining 85% off as the validation set, which leaves roughly
# 63.75% train / 21.25% validation / 15% test of the original rows.
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train,
    test_size=0.25, random_state=0, stratify=y_train)

# The three splits must partition the data exactly.
assert len(X_train) + len(X_val) + len(X_test) == len(df)

print(f'Training utterances: {len(X_train)} of shape {X_train.shape}')
print(f'Validation utterances: {len(X_val)} of shape {X_val.shape}')
print(f'Testing utterances: {len(X_test)} of shape {X_test.shape}\n')



Test reain split

Training utterances: 641974 of shape (641974,)
Validation utterances: 213992 of shape (213992,)
Testing utterances: 151053 of shape (151053,)



In [8]:
# Fit the TF-IDF vectorizer (English stop words removed) on the training text only, then
# reuse the fitted vectorizer to transform the validation and test text.
vectorizer = TfidfVectorizer(stop_words='english')
X_train_vec = vectorizer.fit_transform(X_train)
X_val_vec, X_test_vec = (vectorizer.transform(texts) for texts in (X_val, X_test))

In [9]:
# Number of TF-IDF features to keep.
n_features = 100

# Score features with chi-squared against the training labels, keep the top n_features,
# and apply the same fitted selection to the validation and test matrices.
ch2 = SelectKBest(score_func=chi2, k=n_features)
X_train_sp = ch2.fit_transform(X_train_vec, y_train)
X_val_sp, X_test_sp = (ch2.transform(matrix) for matrix in (X_val_vec, X_test_vec))


In [10]:
# Integer column labels 0 .. n_features-1 for the selected features.
columns = list(range(n_features))


def _to_dense_frame(selected):
    """Convert a selected-feature matrix to a dense DataFrame labelled with `columns`."""
    return pd.DataFrame(data=selected.toarray(), columns=columns)


X_train_dense = _to_dense_frame(X_train_sp)
X_val_dense = _to_dense_frame(X_val_sp)
X_test_dense = _to_dense_frame(X_test_sp)

# Sanity output: feature-frame shapes, label shapes, and the label series themselves.
print(X_train_dense.shape, X_test_dense.shape)
print(y_train.shape, y_test.shape)
print(type(y_train.shape), type(y_test.shape))
print(y_train, y_test)

(641974, 100) (151053, 100)
(641974,) (151053,)
<class 'tuple'> <class 'tuple'>
1490824    Credit reporting, credit repair services, or o...
961723     Credit reporting, credit repair services, or o...
848300     Credit reporting, credit repair services, or o...
2283605    Credit reporting, credit repair services, or o...
1271272    Credit reporting, credit repair services, or o...
                                 ...                        
711624     Credit reporting, credit repair services, or o...
2181025    Credit reporting, credit repair services, or o...
2377714    Credit reporting, credit repair services, or o...
1858639    Credit reporting, credit repair services, or o...
537096                                       Debt collection
Name: Product, Length: 641974, dtype: object 1298002    Credit reporting, credit repair services, or o...
3278297    Credit reporting, credit repair services, or o...
712695     Credit reporting, credit repair services, or o...
677505               

In [11]:
# Fit the label encoder on the training labels, then encode the other two splits with it.
encoder = LabelEncoder()
y_train_num = encoder.fit_transform(y_train)
y_val_num, y_test_num = (encoder.transform(labels) for labels in (y_val, y_test))

print(type(y_train_num), y_train_num)

# Class name -> integer code, in the order the encoder stores its classes.
mapping = dict(zip(encoder.classes_, range(len(encoder.classes_))))
print(mapping)

<class 'numpy.ndarray'> [1 1 1 ... 1 1 2]
{'Consumer Loan': 0, 'Credit reporting, credit repair services, or other personal consumer reports': 1, 'Debt collection': 2, 'Mortgage': 3}


In [12]:
# Random forest with 200 trees, a fixed seed, and all available cores.
rf_model = RandomForestClassifier(n_estimators=200, random_state=39, n_jobs=-1)
print(y_train_num)

# Optional 5-fold cross-validation, left disabled. It is kept as comments rather than a
# bare string literal, which the notebook would echo back as this cell's output.
# scores = cross_val_score(rf_model,
#                          X_train_dense,
#                          y_train_num,
#                          cv=5,
#                          n_jobs=-1,
#                          scoring='accuracy')
# scores.mean()


[1 1 1 ... 1 1 2]


"scores = cross_val_score(rf_model,\n                         X_train_dense,\n                         y_train_num,\n                         cv=5,\n                         n_jobs = -1,\n                         scoring = 'accuracy')\nscores.mean()\n"

In [15]:
# Fit the forest on the training split, then predict the validation split.
rf_model.fit(X_train_dense, y_train_num)
preds = rf_model.predict(X_val_dense)
print('validation accuracy')

# One row per validation complaint. The frame takes its index from the X_val series; the
# encoded labels and predictions are arrays in the same row order as X_val.
report = pd.DataFrame({
    'Complaint': X_val,
    'Actual Product': y_val_num,
    'Prediction': preds,
})
report['Correct'] = (report['Actual Product'] == report['Prediction']).astype('int')
print(report)
print(f'Accuracy: {100*report.Correct.sum()/report.Correct.count()} %')

validation accuract
                                                 Complaint  Actual Product  \
2574822  To whom this may concern at the Consumer Finan...               1   
1141608  This is not a duplicate nor is this complaint ...               1   
1564589  I received a copy of my credit report and I ha...               1   
349905   In accordance with the Fair Credit Reporting A...               1   
2844631  Escallate LLC is demanding that I pay {$450.00...               2   
...                                                    ...             ...   
1034111  The mortgage was paid in full on XX/XX/2021 an...               3   
3365275  I have no idea who this caller is but I've blo...               2   
3254123  Yesterday I learned that my Experian report sa...               1   
3701981  I have an unverified account from CMI. I had p...               2   
2005486  My XX/XX/2020 payment shows I was over 30 days...               1   

         Prediction  Correct  
2574822     

In [16]:

# Predict the held-out test split with rf_model (fitted in the preceding cell).
preds = rf_model.predict(X_test_dense)
print('test accuracy')

# One row per test complaint. The frame takes its index from the X_test series; the
# encoded labels and predictions are arrays in the same row order as X_test.
report = pd.DataFrame({
    'Complaint': X_test,
    'Actual Product': y_test_num,
    'Prediction': preds,
})
report['Correct'] = (report['Actual Product'] == report['Prediction']).astype('int')
print(report)
print(f'Accuracy: {100*report.Correct.sum()/report.Correct.count()} %')

validation accuract
                                                 Complaint  Actual Product  \
1298002  I've filed a dispute about my credit report an...               1   
3278297  In accordance with the Fair Credit Reporting A...               1   
712695   I mailed Transunion & XXXX  a letter on XX/XX/...               1   
677505   On XXXX, I paid UNITED WHOLESALE MORTGAGE {$25...               3   
3284273  Seized my car. Debt was XXXX. Continue to thre...               2   
...                                                    ...             ...   
3284676  According to the Fair Credit Reporting Act, Se...               1   
2713386  transunion is tryiing to commit an error sayin...               1   
2361836  I have a closed account on my credit report th...               1   
1682306  I closed on my house XX/XX/XXXX. On XX/XX/XXXX...               3   
960293   I applied for XXXXXXXX XXXX credit card with X...               1   

         Prediction  Correct  
1298002     