In [1]:
import pandas as pd
import numpy as np

from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import confusion_matrix, classification_report

from sklearn.metrics.pairwise import cosine_similarity
from collections import Counter 
from scipy.spatial.distance import pdist


In [2]:
import accuracy

In [3]:
from sklearn import metrics

def compute_metrics(p):
    preds = np.argmax(p.predictions, axis=1)
    return metrics.accuracy_score(y_true=p.label_ids, y_pred=preds)


In [4]:
import torch
import torch.nn.functional as F
import torch.nn as nn
from torch.autograd import Variable

from datasets import Dataset
from accelerate import Accelerator
import tqdm as notebook_tqdm


In [None]:

def focal_loss(logits, labels, gamma=1, alpha=0.125, num_items_in_batch=None):
    # Calculate standard cross-entropy loss first.
    ce_loss = F.cross_entropy(logits, labels, reduction='none')
    
    # Get softmax probabilities.
    pt = torch.exp(-ce_loss)
    
    # Compute focal loss.
    focal_loss = alpha * (1 - pt) ** gamma * ce_loss

    num_items_in_batch=None

    return focal_loss.mean()

In [6]:
from bs4 import BeautifulSoup
import re
from sklearn import preprocessing

In [7]:
#  from gensim.models import word2vec

In [8]:
from transformers import Pipeline, RobertaTokenizer, RobertaModel, AutoTokenizer, DistilBertModel, AutoModelForSequenceClassification, Trainer, TrainingArguments, DataCollatorWithPadding, DistilBertConfig, DistilBertTokenizer, DistilBertTokenizerFast, DistilBertPreTrainedModel, DistilBertForTokenClassification, DistilBertForSequenceClassification
import evaluate

In [9]:
from sklearn import metrics

In [10]:
class CustomLossTrainer(Trainer):
    def __init__(self, *args, loss_fn=None, num_items_in_batch=None, **kwargs):
        super().__init__(*args, **kwargs)
        # Store your custom loss function.
        # This should take (logits, labels) as arguments.
        self.loss_fn = loss_fn

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        # Assume your inputs include "labels" and your model returns logits.
        labels = inputs.get("labels")
        outputs = model(**inputs)
        logits = outputs.get("logits")
        
        # Compute the custom loss using your loss function.
        loss = self.loss_fn(logits, labels)
        
        num_items_in_batch=None
        
        return (loss, outputs) if return_outputs else loss

In [11]:
metric = evaluate.load("accuracy")

def compute_metrics(p):
    preds = np.argmax(p.predictions, axis=1)
    return metric.compute(predictions=preds, references=p.label_ids)

In [12]:
complaints = pd.read_csv("../data/complaints_full.csv")

In [13]:
complaints  

Unnamed: 0,Consumer complaint narrative,Issue
0,In accordance with the fair credit reporting a...,Improper use of your report
1,Credit inquiries on my account thats not mine,Improper use of your report
2,My name is XXXX XXXX this complaint is not mad...,Incorrect information on your report
3,I searched on XXXX for XXXXXXXX XXXX and was ...,Fraud or scam
4,Now I originally signed up for Albert over 3 y...,Managing an account
...,...,...
911567,REINSTATEMENT PROBLEMS- DENIAL OF RIGHT TO CUR...,Trouble during payment process
911568,This is a very brief summary of a problem that...,Applying for a mortgage or refinancing an exis...
911569,I have tried to contact cash app about a fraud...,Fraud or scam
911570,"On XX/XX/XXXX Tuesday, after I switched my cho...",Trouble during payment process


In [14]:
# complaints.to_csv('../data/complaints_1000.csv', index=False)

In [15]:
class TextCleaner():
    def __init__(self):
        pass
    
    def clean_text(self, text):
        text = text.lower()
        text = re.sub(r'<.*?>', '', text)
        text = re.sub(r'http\S+', '', text)
        text = re.sub(r"[^a-zA-Z0-9\s]", "", text)
        text = re.sub(r"\s+", " ", text).strip()
        return text

cleaner = TextCleaner()
complaints['cleaned_text'] = complaints['Consumer complaint narrative'].apply(cleaner.clean_text)

In [16]:
le = preprocessing.LabelEncoder()
complaints['labels'] = le.fit_transform(complaints['Issue'].tolist())

In [17]:
complaints = complaints.drop(['Consumer complaint narrative', 'Issue'], axis=1)

In [18]:
train_df, test_df = train_test_split(complaints, test_size=0.2, stratify=complaints['labels'], random_state=321, shuffle=True)

In [19]:
# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained("roberta-base")

# Convert datasets to tokenized format
train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)

def tokenize_data(examples):
    return tokenizer(examples["cleaned_text"], truncation=True)

tokenized_train = train_dataset.map(tokenize_data, batched=True)
tokenized_test = test_dataset.map(tokenize_data, batched=True)

Map:   0%|          | 0/729257 [00:00<?, ? examples/s]

Map:   0%|          | 0/182315 [00:00<?, ? examples/s]

In [20]:
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return accuracy.compute(predictions=predictions, references=labels)

In [21]:
# Load pre-trained DistilBERT model (or another model) for sequence classification
model = AutoModelForSequenceClassification.from_pretrained("roberta-base", num_labels=len(le.classes_)) #, id2label=id2label, label2id=label2id)

# Prepare data collator for padding sequences
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",
    warmup_steps=500,
    learning_rate=1e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True
    # push_to_hub=True
)

# Define Trainer object for training the model
trainer = CustomLossTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    processing_class=tokenizer,
    data_collator=data_collator,    
    loss_fn=focal_loss   # pass your custom loss function here, e.g. focal_loss
)

# Train the model
trainer.train()

# Save the trained model
trainer.save_model('model')

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,0.0845,0.083673
2,0.0742,0.07884
3,0.0685,0.076803




In [22]:
trainer.predict(tokenized_test)



PredictionOutput(predictions=array([[-1.8539884 ,  1.2869017 , -2.820937  , ..., -2.6207426 ,
         2.0979576 ,  1.5272754 ],
       [-2.9342282 ,  1.2239447 , -3.18478   , ..., -2.3848622 ,
        -0.5082115 ,  0.9714488 ],
       [-1.0187677 , -0.26679042, -2.6889002 , ..., -1.6065754 ,
         3.2760453 ,  0.9937186 ],
       ...,
       [-1.3515413 ,  0.86862063, -3.1072886 , ..., -2.2121375 ,
        -1.5847299 ,  0.4295195 ],
       [-0.83141345, -0.74413687, -2.4304998 , ..., -1.18501   ,
         4.3630624 , -1.1467575 ],
       [-1.9060173 ,  0.18451488, -3.2678254 , ..., -1.7830613 ,
         2.3865976 , -0.96159744]], shape=(182315, 25), dtype=float32), label_ids=array([18, 11, 18, ...,  9, 11, 11], shape=(182315,)), metrics={'test_loss': 0.07680337131023407, 'test_runtime': 1572.746, 'test_samples_per_second': 115.921, 'test_steps_per_second': 14.491})

In [23]:
np.argmax(trainer.predict(tokenized_test).predictions, axis=1)

array([18, 18, 18, ...,  9, 11, 11], shape=(182315,))

In [24]:
print(classification_report(
    y_true=tokenized_test['labels'],
    y_pred=np.argmax(trainer.predict(tokenized_test).predictions, axis=1),
    target_names=le.classes_
))

                                                                                  precision    recall  f1-score   support

                     Applying for a mortgage or refinancing an existing mortgage       0.82      0.69      0.75      2092
                                               Attempts to collect debt not owed       0.59      0.64      0.62     14633
                                                              Closing an account       0.72      0.73      0.73      1905
                                                           Communication tactics       0.72      0.71      0.72      4249
                                           Cont'd attempts collect debt not owed       0.58      0.53      0.55      3366
                                            Dealing with your lender or servicer       0.85      0.88      0.86      3255
                                              False statements or representation       0.51      0.32      0.39      4081
                       

In [25]:
tokenized_train

Dataset({
    features: ['cleaned_text', 'labels', '__index_level_0__', 'input_ids', 'attention_mask'],
    num_rows: 729257
})

In [26]:
test_df = test_df.rename(columns={'cleaned_text': 'Consumer complaint narrative', 'labels': 'Issue'})
test_df

Unnamed: 0,Consumer complaint narrative,Issue
684406,i am attaching a copy of the letter i mailed t...,18
652421,xxxx xxxx xxxx credit card company closed our ...,11
14724,i have no clue whats going on so i have no cho...,18
295539,over the last 21 months i have on time payment...,6
140132,transunion is not correcting my name nor incor...,11
...,...,...
812163,i submitted a letter to the xxxx credit bureau...,18
38424,i value your help to removed a portion of the ...,11
575155,xxxx xxxx xxxx xxxx xxxx xxxx reporting late p...,9
512484,i opened a line of credit with upgrade on xxxx...,11


In [None]:
test_df.to_csv('../data/Xtest_Ypred_df_ComplaintsFullCSV_exported_Roberta_3iter_FocalLoss_Gamma1_AlphaPt125.csv', index=False)