<a href="https://colab.research.google.com/github/LeonGoergen/DocumentClassification/blob/main/dataPrep/Text_Preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Imports

In [None]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.27.4-py3-none-any.whl (6.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.8/6.8 MB[0m [31m46.3 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.13.3-py3-none-any.whl (199 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.8/199.8 KB[0m [31m17.7 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m58.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.13.3 tokenizers-0.13.3 transformers-4.27.4


In [None]:
import string
import re
import pandas as pd
import unicodedata
import nltk
import json
from nltk.tokenize import word_tokenize
nltk.download('stopwords')
nltk.download('punkt')
from tqdm import tqdm

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [None]:
# Mount Google Drive at /content/drive; the dataset files read and written
# below all live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Text Preprocessing

This notebook preprocesses the text of the datasets created in the [dataset preparation notebook](https://colab.research.google.com/drive/10vfHyaNdtTaYq7TEIpW1b_kTgN2coIip?usp=sharing).

In [None]:
# Load the not-yet-preprocessed train and test splits (tab-separated, first row is the header).
train, test = (
    pd.read_csv(csv_path, sep="\t", header=0)
    for csv_path in (
        '/content/drive/MyDrive/Bachelor Thesis/Datasets/train_nopreprocess.csv',
        '/content/drive/MyDrive/Bachelor Thesis/Datasets/test_nopreprocess.csv',
    )
)

In [None]:
from transformers import GPT2Tokenizer

tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

total_tokens = 0
total_chars = 0
all_tokens = []

# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0; pd.concat
# produces the same stacked frame (the original row indexes are kept).
all_df = pd.concat([train, test])
# iterate through each document in the "Consumer complaint narrative" column
# (iterating the column directly avoids building a Series per row, as iterrows does)
for narrative in tqdm(all_df["Consumer complaint narrative"], total=all_df.shape[0]):
    # tokenize the document; anything beyond 2048 tokens is truncated, so the
    # token count is capped per document while the character count below is not
    tokens = tokenizer.encode(narrative, max_length=2048, truncation=True)
    # add tokens to list
    all_tokens.extend(tokens)
    # add the number of tokens to the total
    total_tokens += len(tokens)
    # add the number of characters to the total
    total_chars += len(narrative)

# calculate the average number of tokens and characters per document
avg_tokens = total_tokens / len(all_df)
avg_chars = total_chars / len(all_df)

print("\nAverage tokens per document:", avg_tokens)
print("Average characters per document:", avg_chars)

100%|██████████| 94260/94260 [03:01<00:00, 518.83it/s]


Average tokens per document: 276.6344260555909
Average characters per document: 1224.7463505198386





In [None]:
# Number of distinct token ids collected in all_tokens, i.e. before any text
# preprocessing. `unique_tokens` is reused at the end of the notebook as the
# baseline for the vocabulary-reduction figure.
unique_tokens = len(set(all_tokens))
print("Number of unique tokens in dataset:", unique_tokens)

Number of unique tokens in dataset: 34016


In [None]:
import math

# Count how many blocks of x characters a document occupies
def count_blocks(document, x):
    """Return the number of x-character blocks needed to cover `document`.

    A trailing partial block counts as a whole block, and an empty document
    needs zero blocks.
    """
    document_length = len(document)
    blocks_needed = math.ceil(document_length / x)
    return blocks_needed

# Average number of x-character blocks per document, for each block size of interest.
# A single loop keeps the size used in the computation and the size shown in
# the message in sync (they were previously hard-coded separately per copy).
for block_size in (100, 1000):
    num_blocks = all_df['Consumer complaint narrative'].apply(count_blocks, args=(block_size,)).sum()
    print(f"Ratio for {block_size}-character-blocks: {num_blocks/all_df.shape[0]}")

Ratio for 100-character-blocks: 12.741438574156588
Ratio for 1000-character-blocks: 1.7471461913855293


In [None]:
# Display the narrative at position 18 of the training split; the same row is
# shown again after each preprocessing step below to make the effect visible.
train['Consumer complaint narrative'].values[18]

"I initially obtained a loan in XX/XX/XXXX for a car lease for a XXXX XXXX XXXX, I obtained a loan from XXXX XXXX XXXX XXXX XXXX, as of XXXX it was changed to XXXX. I am in Predatory loan and need help out ; I am XXXX upside down in this loan from XXXX they were going to report me to the credit agency for {$27.00} yes XXXX dollars. When you ask for extension, they put so much interest on the deferred payment plan. I don't think I will ever be done paying for this loan, Please Help."

In [None]:
# Product label of the training row at position 18.
train['Product'].values[18]

'Vehicle loan or lease'

In [None]:
# Complaint ID of the training row at position 18.
train['Complaint ID'].values[18]

5937822

In [None]:
#defining the function to remove punctuation
def remove_punctuation(text):
    """Delete ASCII punctuation from `text` and collapse separators to single spaces.

    Punctuation characters are dropped without inserting a space, so
    "XX/XX/XXXX" becomes "XXXXXXXX" and "{$27.00}" becomes "2700". Afterwards
    every run of non-word characters (whitespace included) is replaced by one
    space.
    """
    strip_table = str.maketrans('', '', string.punctuation)
    stripped = text.translate(strip_table)
    return re.sub(r'[\W\n\t]+', ' ', stripped)

# Replace the narratives of both splits with their punctuation-free version
# (the function is passed to apply directly), then show the example row.
train['Consumer complaint narrative'] = train['Consumer complaint narrative'].apply(remove_punctuation)
test['Consumer complaint narrative'] = test['Consumer complaint narrative'].apply(remove_punctuation)
train['Consumer complaint narrative'].values[18]

'I initially obtained a loan in XXXXXXXX for a car lease for a XXXX XXXX XXXX I obtained a loan from XXXX XXXX XXXX XXXX XXXX as of XXXX it was changed to XXXX I am in Predatory loan and need help out I am XXXX upside down in this loan from XXXX they were going to report me to the credit agency for 2700 yes XXXX dollars When you ask for extension they put so much interest on the deferred payment plan I dont think I will ever be done paying for this loan Please Help'

# Create Datasets for commercial providers

In [None]:
# For Commercial Providers
from sklearn.model_selection import train_test_split
import csv

# Reduce both splits to the narrative and its label, using the generic column
# names "text" / "label".
train_basic = train[['Consumer complaint narrative', 'Product']]
test_basic = test[['Consumer complaint narrative', 'Product']]
train_basic = train_basic.rename(columns={'Consumer complaint narrative': 'text', 'Product': 'label'})
test_basic = test_basic.rename(columns={'Consumer complaint narrative': 'text', 'Product': 'label'})

train_basic.to_csv("/content/drive/MyDrive/Bachelor Thesis/Datasets/train_basic.csv", sep=',', index=False) # Levity, PlanetAI
test_basic.to_csv("/content/drive/MyDrive/Bachelor Thesis/Datasets/test_basic.csv", sep=',', index=False) # For All

# Hold out 15% of the training data for validation. A fixed random_state makes
# the shuffle deterministic, so re-running the notebook exports the same
# train/validation files instead of a different random split each time.
train_split_basic, val_split_basic = train_test_split(train_basic, test_size=0.15, random_state=42)

train_split_basic.to_csv("/content/drive/MyDrive/Bachelor Thesis/Datasets/train_split_basic.csv", sep=',', index=False) # Google
val_split_basic.to_csv("/content/drive/MyDrive/Bachelor Thesis/Datasets/val_split_basic.csv", sep=',', index=False) # Google

# Label column first, every field quoted.
# NOTE(review): a header row is written as well - confirm the AWS import accepts it.
train_basic_aws = train[['Product', 'Consumer complaint narrative']]
train_basic_aws.to_csv("/content/drive/MyDrive/Bachelor Thesis/Datasets/train_basic_aws.csv", sep=",", quoting=csv.QUOTE_ALL, index=False) # for AWS

In [None]:
# OpenAI

import json

# Number the product labels 1..N following their sorted order
label_values = sorted(train['Product'].unique())
label_map = {value: position for position, value in enumerate(label_values, start=1)}

# One record per training row: the prompt is the narrative prefixed with
# "Content:" and terminated by the "###" separator, the completion is the
# label number preceded by a single space.
json_list = [
    {
        'prompt': f"Content:{text}\n\n###\n\n",
        'completion': f" {label_map[label]}",
    }
    for label, text in zip(train['Product'], train['Consumer complaint narrative'])
]

print(label_map)
print('')
json_list[0]

{'Bank, checking or savings account': 1, 'Consumer Loan': 2, 'Credit card or prepaid card': 3, 'Credit reporting, credit repair services, or other personal consumer reports': 4, 'Debt collection': 5, 'Money transfer, virtual currency, or money service': 6, 'Mortgage': 7, 'Payday loan, title loan, or personal loan': 8, 'Student loan': 9, 'Vehicle loan or lease': 10}



{'prompt': 'Content:Date XXXX Amount XXXX litecoin Company XXXX Coinbase inc My account XXXX Action Failing to recognize an erroneous transaction I was trying to transfer Litecoin to my another account However I accidentally entered my Bitcoin address instead of my Litecoin address as the receiving address Of course this got rejected by the blockchain right away However XXXX confirmed this as a valid transaction As a consequence the XXXX litecoin has been withdrawn from my GDAX account although the transaction never happened Ever since Litecoin had dropped 30 in value and I lost the opportunity to sell it since XXXX had effectively confiscated my litecoins I contacted their customer supports and they havent got back to me in 2 months I attached two screenshots One shows the transfer history from my XXXX account and it says the transfer is complete However if I track the transfer address it says the Litecoins were never redeemed ie the transfer was never completed \n\n###\n\n',
 'comple

In [None]:
# Write all prompt/completion records to Drive in one file.
# NOTE(review): json.dump emits a single JSON array, not one object per line
# (JSONL) - confirm this is the format the downstream upload expects.
with open("/content/drive/MyDrive/Bachelor Thesis/Datasets/json_file_train_openai", "w") as fp:
    json.dump(json_list, fp)

In [None]:
# Microsoft Azure

# Skeleton of the Azure project file. "documents" starts out empty; one entry
# per exported training file is appended to it in a later cell.
json_file = {
    "projectFileVersion": "2022-05-01",
    "stringIndexType": "Utf16CodeUnit",
    "metadata": {
        "projectKind": "CustomSingleLabelClassification",
        "storageInputContainerName": "storagecontainer",
        "settings": {},
        "projectName": "document_classification_project",
        "multilingual": False,
        "description": "",
        "language": "en-us",
    },
    "assets": {
        "projectKind": "CustomSingleLabelClassification",
        # One {"category": ...} entry per class, in the order listed here.
        "classes": [
            {"category": category_name}
            for category_name in (
                "Payday loan, title loan, or personal loan",
                "Money transfer, virtual currency, or money service",
                "Mortgage",
                "Bank, checking or savings account",
                "Student loan",
                "Debt collection",
                "Consumer Loan",
                "Credit card or prepaid card",
                "Credit reporting",  # shortened: a label cant have more than 50 characters
                "Vehicle loan or lease",
            )
        ],
        "documents": [],
    },
}

In [None]:
!rm -rf "/content/txt"
!mkdir "/content/txt"

for index, row in tqdm(train.iterrows(), total=test.shape[0]):
    with open("/content/txt/" + str(row['Complaint ID']) + ".txt", "w") as f:
        f.write(row['Consumer complaint narrative'])
    label = row['Product']
    if label == "Credit reporting, credit repair services, or other personal consumer reports":
        label = "Credit reporting"
    metadata = {
      "location": str(row['Complaint ID']) + ".txt",
      "language": "en-us",
      "dataset": "train",
      "class": {
          "category": label
      }
    }
    json_file['assets']['documents'].append(metadata)

80121it [00:16, 4833.00it/s]


In [None]:
# Save the Azure project description (class list plus registered documents) to Drive.
with open("/content/drive/MyDrive/Bachelor Thesis/Datasets/json_file_train", "w") as fp:
    json.dump(json_file, fp)
# Bundle the text files under /content/txt into a single archive on Drive.
!zip -r "/content/drive/MyDrive/Bachelor Thesis/Datasets/txt_files_train.zip" "/content/txt"

# Continue with text preprocessing

In [None]:
def remove_X(text):
    """Remove every uppercase 'X' from `text` and tidy the gaps left behind.

    All 'X' characters are deleted, whether they belong to a mask such as
    "XXXX" or sit inside an ordinary word ("TX" -> "T"). Each run of two or
    more spaces, and each run of tabs, is then replaced by a single space.
    """
    without_x = text.translate({ord('X'): None})
    return re.sub(r' {2,}|\t+', ' ', without_x)

# Strip the 'X' masking characters from both splits, then show the example row.
train['Consumer complaint narrative'] = train['Consumer complaint narrative'].apply(remove_X)
test['Consumer complaint narrative'] = test['Consumer complaint narrative'].apply(remove_X)
train['Consumer complaint narrative'].values[18]

'I initially obtained a loan in for a car lease for a I obtained a loan from as of it was changed to I am in Predatory loan and need help out I am upside down in this loan from they were going to report me to the credit agency for 2700 yes dollars When you ask for extension they put so much interest on the deferred payment plan I dont think I will ever be done paying for this loan Please Help'

In [None]:
def remove_digits(text):
    """Drop every digit character from `text` and tidy the gaps left behind.

    A character is dropped when `str.isdigit()` is true for it. Each run of
    two or more spaces, and each run of tabs, is then replaced by a single
    space.
    """
    kept_chars = [char for char in text if not char.isdigit()]
    return re.sub(r' {2,}|\t+', ' ', ''.join(kept_chars))

# Remove digits from the narratives of both splits, then show the example row.
train['Consumer complaint narrative'] = train['Consumer complaint narrative'].apply(remove_digits)
test['Consumer complaint narrative'] = test['Consumer complaint narrative'].apply(remove_digits)
train['Consumer complaint narrative'].values[18]

'I initially obtained a loan in for a car lease for a I obtained a loan from as of it was changed to I am in Predatory loan and need help out I am upside down in this loan from they were going to report me to the credit agency for yes dollars When you ask for extension they put so much interest on the deferred payment plan I dont think I will ever be done paying for this loan Please Help'

In [None]:
# Lowercase every narrative in both splits, then show the example row.
train['Consumer complaint narrative'] = train['Consumer complaint narrative'].apply(str.lower)
test['Consumer complaint narrative'] = test['Consumer complaint narrative'].apply(str.lower)
train['Consumer complaint narrative'].values[18]

'i initially obtained a loan in for a car lease for a i obtained a loan from as of it was changed to i am in predatory loan and need help out i am upside down in this loan from they were going to report me to the credit agency for yes dollars when you ask for extension they put so much interest on the deferred payment plan i dont think i will ever be done paying for this loan please help'

In [None]:
#defining function for tokenization
def tokenization(text):
    """Split `text` on single space characters and return the non-empty pieces.

    Only the space character separates tokens; tabs or newlines stay inside
    the token they occur in.
    """
    return [piece for piece in text.split(' ') if piece]
#turning each narrative of both splits into its list of tokens
train['Consumer complaint narrative'] = train['Consumer complaint narrative'].apply(tokenization)
test['Consumer complaint narrative'] = test['Consumer complaint narrative'].apply(tokenization)
# print the tokens of the example row on one line, each shown as a quoted string
print(', '.join(repr(token) for token in train['Consumer complaint narrative'].values[18]))

'i', 'initially', 'obtained', 'a', 'loan', 'in', 'for', 'a', 'car', 'lease', 'for', 'a', 'i', 'obtained', 'a', 'loan', 'from', 'as', 'of', 'it', 'was', 'changed', 'to', 'i', 'am', 'in', 'predatory', 'loan', 'and', 'need', 'help', 'out', 'i', 'am', 'upside', 'down', 'in', 'this', 'loan', 'from', 'they', 'were', 'going', 'to', 'report', 'me', 'to', 'the', 'credit', 'agency', 'for', 'yes', 'dollars', 'when', 'you', 'ask', 'for', 'extension', 'they', 'put', 'so', 'much', 'interest', 'on', 'the', 'deferred', 'payment', 'plan', 'i', 'dont', 'think', 'i', 'will', 'ever', 'be', 'done', 'paying', 'for', 'this', 'loan', 'please', 'help'


In [None]:
#Stop words present in the library
stopwords = nltk.corpus.stopwords.words('english')
# Punctuation was stripped from the narratives earlier ("don't" -> "dont"), so
# the apostrophe forms in the list can never match a token. Add the
# punctuation-free variant of every stop word, and use a set so each membership
# test is O(1) instead of a scan over the whole list.
_punctuation_table = str.maketrans('', '', string.punctuation)
stopword_set = frozenset(stopwords) | frozenset(word.translate(_punctuation_table) for word in stopwords)

#defining the function to remove stopwords from tokenized text
def remove_stopwords(text):
    """Return the tokens of `text` (a list of strings) that are not stop words.

    A token is dropped when it equals an English stop word from NLTK, either
    as listed or with its punctuation removed. Order and duplicates of the
    remaining tokens are preserved.
    """
    output= [i for i in text if i not in stopword_set]
    return output
#filtering the stop words out of the token lists of both splits
train['Consumer complaint narrative'] = train['Consumer complaint narrative'].apply(remove_stopwords)
test['Consumer complaint narrative'] = test['Consumer complaint narrative'].apply(remove_stopwords)
# print the remaining tokens of the example row on one line
print(', '.join(repr(token) for token in train['Consumer complaint narrative'].values[18]))

'initially', 'obtained', 'loan', 'car', 'lease', 'obtained', 'loan', 'changed', 'predatory', 'loan', 'need', 'help', 'upside', 'loan', 'going', 'report', 'credit', 'agency', 'yes', 'dollars', 'ask', 'extension', 'put', 'much', 'interest', 'deferred', 'payment', 'plan', 'dont', 'think', 'ever', 'done', 'paying', 'loan', 'please', 'help'


In [None]:
def remove_nonascii(text):
    """Reduce each token in `text` (a list of strings) to plain ASCII.

    Every token is NFKD-normalised, which splits accented letters into a base
    letter plus combining marks and expands compatibility characters; all
    characters that still cannot be encoded as ASCII are then discarded.
    Tokens left empty by this (they consisted only of non-ASCII characters)
    are dropped, so joining the result with spaces cannot produce double
    spaces.

    Returns a new list; the input list is not modified.
    """
    ascii_list = []
    for word in text:
        new_word = unicodedata.normalize('NFKD', word).encode('ascii', 'ignore').decode('ascii')
        if new_word:
            ascii_list.append(new_word)
    return ascii_list

# Apply the ASCII normalisation to the token lists of both splits. This cell
# previously re-applied remove_stopwords (a copy-paste slip), so
# remove_nonascii was defined but never run.
train['Consumer complaint narrative']= train['Consumer complaint narrative'].apply(remove_nonascii)
test['Consumer complaint narrative']= test['Consumer complaint narrative'].apply(remove_nonascii)
print(', '.join(map(repr, train['Consumer complaint narrative'].values[18])))

'initially', 'obtained', 'loan', 'car', 'lease', 'obtained', 'loan', 'changed', 'predatory', 'loan', 'need', 'help', 'upside', 'loan', 'going', 'report', 'credit', 'agency', 'yes', 'dollars', 'ask', 'extension', 'put', 'much', 'interest', 'deferred', 'payment', 'plan', 'dont', 'think', 'ever', 'done', 'paying', 'loan', 'please', 'help'


In [None]:
# Turn each token list back into one space-separated string, then show the example row.
train['Consumer complaint narrative'] = train['Consumer complaint narrative'].apply(" ".join)
test['Consumer complaint narrative'] = test['Consumer complaint narrative'].apply(" ".join)
train['Consumer complaint narrative'].values[18]

'initially obtained loan car lease obtained loan changed predatory loan need help upside loan going report credit agency yes dollars ask extension put much interest deferred payment plan dont think ever done paying loan please help'

In [None]:
from transformers import GPT2Tokenizer

tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

total_tokens = 0
total_chars = 0
all_tokens = []

# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0; pd.concat
# produces the same stacked frame (the original row indexes are kept).
all_df = pd.concat([train, test])
# iterate through each document in the "Consumer complaint narrative" column
# (iterating the column directly avoids building a Series per row, as iterrows does)
for narrative in tqdm(all_df["Consumer complaint narrative"], total=all_df.shape[0]):
    # tokenize the document; anything beyond 2048 tokens is truncated, so the
    # token count is capped per document while the character count below is not
    tokens = tokenizer.encode(narrative, max_length=2048, truncation=True)
    # add tokens to list
    all_tokens.extend(tokens)
    # add the number of tokens to the total
    total_tokens += len(tokens)
    # add the number of characters to the total
    total_chars += len(narrative)

# calculate the average number of tokens and characters per document
avg_tokens = total_tokens / len(all_df)
avg_chars = total_chars / len(all_df)

print("\nAverage tokens per document:", avg_tokens)
print("Average characters per document:", avg_chars)

100%|██████████| 94260/94260 [02:33<00:00, 612.81it/s]


Average tokens per document: 106.96298535964354
Average characters per document: 707.9141311266709





In [None]:
# Distinct token ids after preprocessing, and the relative drop compared with
# the count stored in `unique_tokens` (the set is built once and reused).
unique_tokens_after = len(set(all_tokens))
reduction_percent = round((1 - unique_tokens_after / unique_tokens) * 100, 2)
print("Number of unique tokens in dataset:", unique_tokens_after)
print("Reduced dimensionality by " + str(reduction_percent) + "%")

Number of unique tokens in dataset: 23486
Reduced dimensionality by 30.96%


In [None]:
# Save the preprocessed train and test splits to Drive as tab-separated files,
# without writing the DataFrame index as a column.
train.to_csv("/content/drive/MyDrive/Bachelor Thesis/Datasets/train.csv", sep='\t', index=False)
test.to_csv("/content/drive/MyDrive/Bachelor Thesis/Datasets/test.csv", sep='\t', index=False)