## Imports

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
import pandas as pd
from datasets import Dataset
from sklearn.metrics import classification_report
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


All imports successful ✅


## Data loading and cleaning

In [None]:
## Load dataset
# Read the raw complaints export and list its columns so the exact
# column names used by the later cells can be checked by eye.
df = pd.read_csv(filepath_or_buffer='data/complaints.csv')
print(df.columns)

Index(['Date received', 'Product', 'Sub-product', 'Issue', 'Sub-issue',
       'Consumer complaint narrative', 'Company public response', 'Company',
       'State', 'ZIP code', 'Tags', 'Consumer consent provided?',
       'Submitted via', 'Date sent to company', 'Company response to consumer',
       'Timely response?', 'Consumer disputed?', 'Complaint ID'],
      dtype='object')


In [6]:
# Keep only complaints that actually carry a narrative (exact column name required)
df = df.dropna(subset=['Consumer complaint narrative'])

# Narrow to the model input/target columns and give them short names:
#   'Consumer complaint narrative' -> 'text', 'Product' -> 'category'
dataset_df = (
    df[['Consumer complaint narrative', 'Product']]
    .rename(columns={'Consumer complaint narrative': 'text', 'Product': 'category'})
)

print(dataset_df.head())

                                                 text  \
2   usc section 1681 states that there must be wri...   
4   PNC Bank will not allow me to link my accounts...   
7   Ally Financial has not reported to the credit ...   
12  In my initial letter dated XX/XX/2019, to Hyun...   
13  I am the consumer natural person making this r...   

                                             category  
2   Credit reporting, credit repair services, or o...  
4                         Checking or savings account  
7   Credit reporting, credit repair services, or o...  
12                                    Debt collection  
13                                    Debt collection  


In [7]:
## Encode labels
# Distinct category names; ids are assigned by position in this array,
# so the mapping depends on the order in which categories occur in the data.
labels = dataset_df['category'].unique()
label2id = dict(zip(labels, range(len(labels))))
id2label = dict(enumerate(labels))

# Integer target column derived from the category name
dataset_df['label'] = dataset_df['category'].map(label2id)

print(label2id)

{'Credit reporting, credit repair services, or other personal consumer reports': 0, 'Checking or savings account': 1, 'Debt collection': 2, 'Mortgage': 3, 'Payday loan, title loan, or personal loan': 4, 'Credit reporting or other personal consumer reports': 5, 'Credit card or prepaid card': 6, 'Credit card': 7, 'Student loan': 8, 'Money transfer, virtual currency, or money service': 9, 'Vehicle loan or lease': 10, 'Bank account or service': 11, 'Credit reporting': 12, 'Prepaid card': 13, 'Payday loan, title loan, personal loan, or advance loan': 14, 'Debt or credit management': 15, 'Consumer Loan': 16, 'Money transfers': 17, 'Payday loan': 18, 'Other financial service': 19, 'Virtual currency': 20}


In [8]:
## Convert to Dataset
from datasets import Dataset

# dataset_df keeps the original row labels of the rows that survived the
# null-narrative filter (e.g. 2, 4, 7, 12, 13), so its index is no longer a
# plain RangeIndex. By default from_pandas would store such an index as an
# extra '__index_level_0__' column that then travels through tokenization
# and into the dataset saved on disk. preserve_index=False drops it so the
# Dataset contains only the 'text' and 'label' columns.
dataset = Dataset.from_pandas(dataset_df[['text', 'label']], preserve_index=False)

In [10]:
# Persist the cleaned frame without writing the pandas index as a column.
dataset_df.to_csv(path_or_buf='data/complaints_cleaned.csv', index=False)


In [None]:
## Tokenization

from transformers import AutoTokenizer

# Tokenizer matching the 'bert-base-uncased' checkpoint
tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path='bert-base-uncased'
)

# Define tokenization function
def tokenize_function(batch):
    """Tokenize the ``text`` field of ``batch`` with the notebook-level ``tokenizer``.

    Args:
        batch: Mapping with a ``'text'`` entry holding the strings to encode.

    Returns:
        Whatever ``tokenizer`` returns for those texts, called with
        ``padding='max_length'`` and ``truncation=True``.
    """
    # NOTE(review): padding='max_length' pads every example up front; dynamic
    # padding in a data collator would likely be cheaper -- confirm how the
    # training step collates batches before changing this.
    texts = batch['text']
    encoded = tokenizer(texts, truncation=True, padding='max_length')
    return encoded

# Run the tokenizer over the whole dataset, one batch of rows per call
dataset = dataset.map(function=tokenize_function, batched=True)

# Return torch tensors and expose only the columns listed here
dataset.set_format(
    columns=['input_ids', 'attention_mask', 'label'],
    type='torch',
)

# Sanity check: inspect the first encoded example
print(dataset[0])

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Map: 100%|██████████| 3021026/3021026 [14:00<00:00, 3596.33 examples/s] 


{'label': tensor(0), 'input_ids': tensor([  101, 15529,  2930, 16923,  2487,  2163,  2008,  2045,  2442,  2022,
         2517, 20104,  2013,  1996,  2640,  4402,  2005,  2151, 27050,  1998,
         2036,  2930,  3438,  2683,  1997,  1996,  4429,  2527,  2163,  2045,
         2442,  2022, 16442,  1999,  2344,  2005,  2023,  2000,  2022,  3423,
         2122,  2024,  6206,  6078,  2011,  2122,  3316,  1998,  2065,  2025,
        11915,  2098,  2085,  1045,  2097,  2202,  3423,  2895, 10047,  4168,
        25205,  9834,  2100,  2122, 27050,  2024,  4242,  1998, 24641,   102,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,  

In [12]:
## Save tokenized dataset in HuggingFace format
# Written to disk so later sessions can reload it instead of re-tokenizing.
dataset.save_to_disk(dataset_path='data/tokenized_complaints')

Saving the dataset (25/25 shards): 100%|██████████| 3021026/3021026 [00:40<00:00, 73684.41 examples/s] 


In [1]:
from datasets import load_from_disk

# Reload the tokenized dataset previously written with save_to_disk
dataset = load_from_disk(dataset_path='data/tokenized_complaints')

  from .autonotebook import tqdm as notebook_tqdm
