<a href="https://colab.research.google.com/github/Jefffish09/MachineLearning/blob/dev/Classification/multiclass/albert_multiclass_simpletransformers.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

References:

* https://towardsdatascience.com/multi-label-multi-class-text-classification-with-bert-transformer-and-keras-c6355eccb63a
* https://github.com/ThilinaRajapakse/simpletransformers
* https://simpletransformers.ai/

Data Source:

* https://www.kaggle.com/cfpb/us-consumer-finance-complaints


In [2]:
!pip install -U simpletransformers

Collecting simpletransformers
[?25l  Downloading https://files.pythonhosted.org/packages/6d/ee/99e2809fb311841376fe01f3524a912b1907d7b45d445f16ad27b4422c9f/simpletransformers-0.60.9-py3-none-any.whl (206kB)
[K     |█▋                              | 10kB 23.8MB/s eta 0:00:01[K     |███▏                            | 20kB 14.3MB/s eta 0:00:01[K     |████▊                           | 30kB 8.9MB/s eta 0:00:01[K     |██████▍                         | 40kB 7.5MB/s eta 0:00:01[K     |████████                        | 51kB 5.2MB/s eta 0:00:01[K     |█████████▌                      | 61kB 5.7MB/s eta 0:00:01[K     |███████████                     | 71kB 6.1MB/s eta 0:00:01[K     |████████████▊                   | 81kB 6.4MB/s eta 0:00:01[K     |██████████████▎                 | 92kB 6.3MB/s eta 0:00:01[K     |███████████████▉                | 102kB 6.5MB/s eta 0:00:01[K     |█████████████████▍              | 112kB 6.5MB/s eta 0:00:01[K     |███████████████████           

In [1]:
import pandas as pd
import numpy as np
import logging
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, f1_score, accuracy_score
from simpletransformers.classification import ClassificationModel, ClassificationArgs


In [2]:
seed = 2021
max_len = 128
learning_rate = 4e-5
batch_size = 64
epochs = 100
earlystopping_patience = 3

In [3]:
logging.basicConfig(level=logging.INFO)
transformers_logger = logging.getLogger("transformers")
transformers_logger.setLevel(logging.WARNING)

In [4]:
# Import data from csv
df = pd.read_csv("/content/drive/MyDrive/consumer_complaints.csv", 
                usecols=("product", "consumer_complaint_narrative"),
                dtype={"consumer_complaint_narrative": object})

# Only interested in data with consumer complaints
df = df[df["consumer_complaint_narrative"].notnull()]
df = df[df["product"].notnull()]
df.reset_index(drop=True, inplace=True)
df = df.dropna()

df.head()

INFO:numexpr.utils:NumExpr defaulting to 2 threads.


Unnamed: 0,product,consumer_complaint_narrative
0,Debt collection,XXXX has claimed I owe them {$27.00} for XXXX ...
1,Consumer Loan,Due to inconsistencies in the amount owed that...
2,Mortgage,In XX/XX/XXXX my wages that I earned at my job...
3,Mortgage,I have an open and current mortgage with Chase...
4,Mortgage,XXXX was submitted XX/XX/XXXX. At the time I s...


In [5]:
# Set your model output as categorical and save in new label col
df["product_label"] = pd.Categorical(df["product"])
df["product"] = df["product_label"].cat.codes
df.head()

Unnamed: 0,product,consumer_complaint_narrative,product_label
0,4,XXXX has claimed I owe them {$27.00} for XXXX ...,Debt collection
1,1,Due to inconsistencies in the amount owed that...,Consumer Loan
2,6,In XX/XX/XXXX my wages that I earned at my job...,Mortgage
3,6,I have an open and current mortgage with Chase...,Mortgage
4,6,XXXX was submitted XX/XX/XXXX. At the time I s...,Mortgage


In [6]:
labels_names_dict = dict(enumerate(df["product_label"].cat.categories))
labels = []
label_names = []
for i in labels_names_dict:
  labels.append(i)
  label_names.append(labels_names_dict[i])
labels_list = list(labels_names_dict.keys())
target_names = [labels_names_dict[k] for k in labels_list]

print(labels_names_dict)

{0: 'Bank account or service', 1: 'Consumer Loan', 2: 'Credit card', 3: 'Credit reporting', 4: 'Debt collection', 5: 'Money transfers', 6: 'Mortgage', 7: 'Other financial service', 8: 'Payday loan', 9: 'Prepaid card', 10: 'Student loan'}


In [7]:
# Split into train and test - stratify over Issue
df_train, df_valid = train_test_split(df, test_size=0.2, random_state=seed, stratify=df["product_label"])

df_train = df_train[["consumer_complaint_narrative", "product"]]
df_train.columns = ["text", "labels"]
df_train.reset_index(drop=True, inplace=True)
df_train.head()

df_valid = df_valid[["consumer_complaint_narrative", "product"]]
df_valid.columns = ["text", "labels"]
df_valid.reset_index(drop=True, inplace=True)
df_valid.head()

Unnamed: 0,text,labels
0,"On XXXX XXXX, 2014 I ordered medical products ...",2
1,"On or about,XX/XX/XXXX, My husband and I, boug...",6
2,Experian has been replacing old information ov...,3
3,"I wrote in all of my personal information, and...",3
4,"In XX/XX/XXXX2015, I refinanced my mortgage w/...",6


In [8]:
def plot_perf(history):
  # Create a plot of accuracy and loss over time
  history_dict = history.history
  history_dict.keys()

  acc = history_dict["accuracy"]
  val_acc = history_dict["val_accuracy"]
  loss = history_dict["loss"]
  val_loss = history_dict["val_loss"]

  epochs = range(1, len(acc) + 1)

  # Use plot styling from seaborn.
  sns.set(style="darkgrid")
  # Increase the plot size and font size.
  sns.set(font_scale=1.5)
  plt.rcParams["figure.figsize"] = (12, 6)

  # Training and validation loss
  plt.plot(epochs, loss, "r", label="Training loss")
  # b is for "solid blue line"
  plt.plot(epochs, val_loss, "b", label="Validation loss")
  plt.title("Training and validation loss")
  plt.xlabel("Epochs")
  plt.ylabel("Loss")
  plt.legend()
  plt.show()

  # Training and validation accuracy
  plt.plot(epochs, acc, "r", label="Training acc")
  plt.plot(epochs, val_acc, "b", label="Validation acc")
  plt.title("Training and validation accuracy")
  plt.xlabel("Epochs")
  plt.ylabel("Accuracy")
  plt.legend(loc="lower right")
  plt.show()

In [9]:
# Optional model configuration
model_args = ClassificationArgs(
    # https://simpletransformers.ai/docs/usage/
    output_dir="outputs/",
    best_model_dir="outputs/best_model",
    num_train_epochs=epochs,
    train_batch_size=batch_size,
    learning_rate=learning_rate,
    max_seq_length=max_len,
    max_grad_norm=1.0,
    gradient_accumulation_steps=1,
    fp16=True,
    optimizer="AdamW",
    labels_list=labels_list,
    do_lower_case=True,
    evaluate_during_training=True,
    evaluate_during_training_verbose=True,
    evaluate_during_training_steps=1000,
    eval_batch_size=128,
    warmup_steps=0,
    warmup_ratio=0.06,
    weight_decay=0,
    use_early_stopping=True,
    early_stopping_patience=earlystopping_patience,
    early_stopping_metric="eval_loss",
    early_stopping_delta=0.001,
    early_stopping_consider_epochs=True,
    early_stopping_metric_minimize=True,
    no_save=True,
    save_model_every_epoch=False,
    save_eval_checkpoints=False,
    overwrite_output_dir=True,
    use_multiprocessing=True,
    use_multiprocessing_for_evaluation=False,
    no_cache=True
    )

# Create a ClassificationModel
model = ClassificationModel(
    # https://huggingface.co/transformers/pretrained_models.html
    "albert",
    "albert-base-v1",
    num_labels=len(labels_list),
    args=model_args
) 

# Train the model
model.train_model(train_df=df_train, eval_df=df_valid, show_running_loss=True, verbose=True)


INFO:filelock:Lock 140336865819280 acquired on /root/.cache/huggingface/transformers/93ddc3e8df1e0ef845f777cce9adb8292ea7649aded41347e1868666fe3495fa.026c3e14139b744b36c015e2f6517310a5e31ea4c089e595d05f27d651b155c8.lock


Downloading:   0%|          | 0.00/684 [00:00<?, ?B/s]

INFO:filelock:Lock 140336865819280 released on /root/.cache/huggingface/transformers/93ddc3e8df1e0ef845f777cce9adb8292ea7649aded41347e1868666fe3495fa.026c3e14139b744b36c015e2f6517310a5e31ea4c089e595d05f27d651b155c8.lock
INFO:filelock:Lock 140336847211344 acquired on /root/.cache/huggingface/transformers/0b4420ac89d14bc19d46458a159f451a619bd17b515786e4151f5d9acd49a24b.3577b33a08c2cbb5c867544c04066de85471a62b95dc3704a1399228b4463ca0.lock


Downloading:   0%|          | 0.00/47.4M [00:00<?, ?B/s]

INFO:filelock:Lock 140336847211344 released on /root/.cache/huggingface/transformers/0b4420ac89d14bc19d46458a159f451a619bd17b515786e4151f5d9acd49a24b.3577b33a08c2cbb5c867544c04066de85471a62b95dc3704a1399228b4463ca0.lock
Some weights of the model checkpoint at albert-base-v1 were not used when initializing AlbertForSequenceClassification: ['predictions.bias', 'predictions.LayerNorm.weight', 'predictions.LayerNorm.bias', 'predictions.dense.weight', 'predictions.dense.bias', 'predictions.decoder.weight', 'predictions.decoder.bias']
- This IS expected if you are initializing AlbertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing AlbertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertFo

Downloading:   0%|          | 0.00/760k [00:00<?, ?B/s]

INFO:filelock:Lock 140336846837520 released on /root/.cache/huggingface/transformers/bae19a6bb15f98e0fdde25179eb6585fe7a7beeb5c382ce585de1977df33fe6a.d6110e25022b713452eb83d5bfa8ae64530995a93d8e694fe52e05aa85dd3a7d.lock
INFO:simpletransformers.classification.classification_utils: Converting to features started. Cache is not used.


  0%|          | 0/2 [00:00<?, ?it/s]

Epoch:   0%|          | 0/100 [00:00<?, ?it/s]

Running Epoch 0 of 100:   0%|          | 0/836 [00:00<?, ?it/s]

INFO:simpletransformers.classification.classification_utils: Converting to features started. Cache is not used.
INFO:simpletransformers.classification.classification_model:{'mcc': 0.7255464346616539, 'eval_loss': 0.7658656716346741}
INFO:simpletransformers.classification.classification_model: No improvement in eval_loss
INFO:simpletransformers.classification.classification_model: Current step: 1
INFO:simpletransformers.classification.classification_model: Early stopping patience: 3


Running Epoch 1 of 100:   0%|          | 0/836 [00:00<?, ?it/s]

INFO:simpletransformers.classification.classification_utils: Converting to features started. Cache is not used.
INFO:simpletransformers.classification.classification_model:{'mcc': 0.7375899518388772, 'eval_loss': 0.7278527889932905}
INFO:simpletransformers.classification.classification_utils: Converting to features started. Cache is not used.
INFO:simpletransformers.classification.classification_model:{'mcc': 0.7979805559263636, 'eval_loss': 0.5399632226853144}


Running Epoch 2 of 100:   0%|          | 0/836 [00:00<?, ?it/s]

INFO:simpletransformers.classification.classification_utils: Converting to features started. Cache is not used.
INFO:simpletransformers.classification.classification_model:{'mcc': 0.8018421681333674, 'eval_loss': 0.5330974774701255}
INFO:simpletransformers.classification.classification_utils: Converting to features started. Cache is not used.
INFO:simpletransformers.classification.classification_model:{'mcc': 0.8179757105503779, 'eval_loss': 0.4884428183237712}


Running Epoch 3 of 100:   0%|          | 0/836 [00:00<?, ?it/s]

INFO:simpletransformers.classification.classification_utils: Converting to features started. Cache is not used.
INFO:simpletransformers.classification.classification_model:{'mcc': 0.8235309583563206, 'eval_loss': 0.4740326151961372}
INFO:simpletransformers.classification.classification_utils: Converting to features started. Cache is not used.
INFO:simpletransformers.classification.classification_model:{'mcc': 0.8163726118530508, 'eval_loss': 0.5019544391405015}
INFO:simpletransformers.classification.classification_model: No improvement in eval_loss
INFO:simpletransformers.classification.classification_model: Current step: 1
INFO:simpletransformers.classification.classification_model: Early stopping patience: 3


Running Epoch 4 of 100:   0%|          | 0/836 [00:00<?, ?it/s]

INFO:simpletransformers.classification.classification_utils: Converting to features started. Cache is not used.
INFO:simpletransformers.classification.classification_model:{'mcc': 0.8261780852783012, 'eval_loss': 0.4768528742449624}
INFO:simpletransformers.classification.classification_model: No improvement in eval_loss
INFO:simpletransformers.classification.classification_model: Current step: 2
INFO:simpletransformers.classification.classification_model: Early stopping patience: 3
INFO:simpletransformers.classification.classification_utils: Converting to features started. Cache is not used.
INFO:simpletransformers.classification.classification_model:{'mcc': 0.8319365306673637, 'eval_loss': 0.4472910083475567}


Running Epoch 5 of 100:   0%|          | 0/836 [00:00<?, ?it/s]

INFO:simpletransformers.classification.classification_utils: Converting to features started. Cache is not used.
INFO:simpletransformers.classification.classification_model:{'mcc': 0.8335446352195797, 'eval_loss': 0.4413352705183483}
INFO:simpletransformers.classification.classification_utils: Converting to features started. Cache is not used.
INFO:simpletransformers.classification.classification_model:{'mcc': 0.8342016171405536, 'eval_loss': 0.4587001335053217}
INFO:simpletransformers.classification.classification_model: No improvement in eval_loss
INFO:simpletransformers.classification.classification_model: Current step: 1
INFO:simpletransformers.classification.classification_model: Early stopping patience: 3


Running Epoch 6 of 100:   0%|          | 0/836 [00:00<?, ?it/s]

INFO:simpletransformers.classification.classification_utils: Converting to features started. Cache is not used.
INFO:simpletransformers.classification.classification_model:{'mcc': 0.8314261884468004, 'eval_loss': 0.45703090741520835}
INFO:simpletransformers.classification.classification_model: No improvement in eval_loss
INFO:simpletransformers.classification.classification_model: Current step: 2
INFO:simpletransformers.classification.classification_model: Early stopping patience: 3


Running Epoch 7 of 100:   0%|          | 0/836 [00:00<?, ?it/s]

INFO:simpletransformers.classification.classification_utils: Converting to features started. Cache is not used.
INFO:simpletransformers.classification.classification_model:{'mcc': 0.8292747895144332, 'eval_loss': 0.5015386893635705}
INFO:simpletransformers.classification.classification_model: No improvement in eval_loss
INFO:simpletransformers.classification.classification_model: Current step: 3
INFO:simpletransformers.classification.classification_model: Early stopping patience: 3
INFO:simpletransformers.classification.classification_utils: Converting to features started. Cache is not used.
INFO:simpletransformers.classification.classification_model:{'mcc': 0.8252998079670061, 'eval_loss': 0.4947415801740828}
INFO:simpletransformers.classification.classification_model: Patience of 3 steps reached
INFO:simpletransformers.classification.classification_model: Training terminated.
INFO:simpletransformers.classification.classification_model: Training of albert model complete. Saved to outp

(6688,
 {'eval_loss': [0.7658656716346741,
   0.7278527889932905,
   0.5399632226853144,
   0.5330974774701255,
   0.4884428183237712,
   0.4740326151961372,
   0.5019544391405015,
   0.4768528742449624,
   0.4472910083475567,
   0.4413352705183483,
   0.4587001335053217,
   0.45703090741520835,
   0.5015386893635705,
   0.4947415801740828],
  'global_step': [836,
   1000,
   1672,
   2000,
   2508,
   3000,
   3344,
   4000,
   4180,
   5000,
   5016,
   5852,
   6000,
   6688],
  'mcc': [0.7255464346616539,
   0.7375899518388772,
   0.7979805559263636,
   0.8018421681333674,
   0.8179757105503779,
   0.8235309583563206,
   0.8163726118530508,
   0.8261780852783012,
   0.8319365306673637,
   0.8335446352195797,
   0.8342016171405536,
   0.8314261884468004,
   0.8292747895144332,
   0.8252998079670061],
  'train_loss': [1.309936285018921,
   0.9439917802810669,
   0.39618971943855286,
   0.6590621471405029,
   0.32222259044647217,
   0.5722545981407166,
   0.1733809858560562,
   0.5054

In [10]:
# Evaluate the model
result, model_outputs, wrong_predictions = model.eval_model(
    df_valid,
    # acc=classification_report
    )

texts = df_valid["text"].tolist()
predictions, raw_outputs = model.predict(texts)
true_list = df_valid["labels"].tolist()

report = classification_report(y_true=true_list, y_pred=predictions, labels=labels_list, target_names=target_names, digits=3, output_dict=False)
print(report)
print("")
print("Accuracy: {}".format(accuracy_score(y_true=true_list, y_pred=predictions)))
print("Weighted f1-score: {}".format(f1_score(y_true=true_list, y_pred=predictions, average="weighted")))
print("Macro f1-score: {}".format(f1_score(y_true=true_list, y_pred=predictions, average="macro")))
print("Micro f1-score: {}".format(f1_score(y_true=true_list, y_pred=predictions, average="micro")))

INFO:simpletransformers.classification.classification_utils: Converting to features started. Cache is not used.


Running Evaluation:   0%|          | 0/105 [00:00<?, ?it/s]

INFO:simpletransformers.classification.classification_model:{'mcc': 0.8252998079670061, 'eval_loss': 0.4947415801740828}
INFO:simpletransformers.classification.classification_utils: Converting to features started. Cache is not used.


  0%|          | 0/105 [00:00<?, ?it/s]

                         precision    recall  f1-score   support

Bank account or service      0.799     0.802     0.801      1142
          Consumer Loan      0.788     0.655     0.715       736
            Credit card      0.866     0.761     0.810      1586
       Credit reporting      0.843     0.907     0.874      2505
        Debt collection      0.843     0.879     0.860      3511
        Money transfers      0.727     0.662     0.693       133
               Mortgage      0.935     0.954     0.945      2984
Other financial service      0.000     0.000     0.000        22
            Payday loan      0.670     0.407     0.506       145
           Prepaid card      0.750     0.733     0.741       172
           Student loan      0.857     0.871     0.864       426

               accuracy                          0.857     13362
              macro avg      0.734     0.694     0.710     13362
           weighted avg      0.854     0.857     0.854     13362


Accuracy: 0.857207004

  _warn_prf(average, modifier, msg_start, len(result))
