Support Vector machine model, using TF-IDF to convert text to vectors

In [None]:
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV

import spacy

# Exporting model
import joblib

# Stop words
import nltk
nltk.download('stopwords')
nltk.download('punkt')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Word normalizer
from nltk.stem import WordNetLemmatizer
from nltk.metrics import edit_distance
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download("words")
from spellchecker import SpellChecker

# Cleaning unneeded repetitive words
import re

Import the data from the data file that is produced by the script.

In [None]:
# Load every mortgage complaint from the SQLite database, parsing both date
# columns with the timestamp format given for each of them.
data = pd.read_sql_query("SELECT * FROM 'mortgage complaints'", "sqlite:///StaterData.db", parse_dates={'Date received': '%Y-%m-%d %H:%M:%S', 'Date sent to company': '%Y-%m-%d %H:%M:%S'})

# Cleaning
# Convert the Yes/No columns to booleans. The selection on the right-hand side
# lists the columns in the same order as the assignment target: pandas assigns
# a multi-column selection by position, so a different order swaps the columns.
# NOTE(review): astype(bool) turns every value that is not mapped below into
# True when it is a non-empty string - confirm these columns hold no other values.
data[['Timely response?','Consumer disputed?']] = data[['Timely response?','Consumer disputed?']].replace({'Yes': True, 'No':False}).astype(bool)
data['Consumer consent provided?'] = data['Consumer consent provided?'].replace({'Consent provided': True, '':False}).astype(bool)
# Keep only the complaints that actually have a narrative; the copy makes the
# filtered frame independent so the column assignments below are unambiguous.
data = data[data['Consumer complaint narrative'].notna()].copy()
data['Consumer complaint narrative'] = data['Consumer complaint narrative'].str.lower()

data = data.drop(columns=['Sub-issue'])

Removing stopwords: by removing stopwords, the algorithm will focus on the important words.

In [None]:
# English stopword list from NLTK, held in a set for fast membership tests.
stop_words = set(stopwords.words('english'))


def remove_stopwords(text):
    """Return *text* without its English stopwords.

    The text is tokenized with NLTK, every token whose lower-cased form is in
    ``stop_words`` is dropped, and the remaining tokens are joined with single
    spaces.
    """
    kept_tokens = (
        token
        for token in nltk.word_tokenize(text)
        if token.lower() not in stop_words
    )
    return " ".join(kept_tokens)


data['Consumer complaint narrative'] = data['Consumer complaint narrative'].apply(remove_stopwords)

Even though the dataset is anonymised, the model must not be trained on names, for ethical reasons. If the model were later trained on a different dataset that does contain names, some people could be put at a disadvantage.

In [None]:
# spaCy English pipeline; its named-entity recogniser is used to find person names.
nlp = spacy.load('en_core_web_md')


def remove_names(text):
    """Return *text* with every name that spaCy tags as ``PERSON`` removed.

    Each detected name is removed everywhere it occurs as a standalone token
    sequence. The match is anchored on both sides so that a name is never cut
    out of the middle of a longer word (a plain ``str.replace`` would turn
    "also" into "so" when "al" is detected as a name).

    NOTE(review): the narratives are lower-cased earlier in the notebook; name
    detection presumably works better on the original casing - consider running
    this step before the lower-casing.
    """
    doc = nlp(text)

    # Collect each distinct name once, however often it was detected.
    names = {ent.text for ent in doc.ents if ent.label_ == 'PERSON'}

    # Longest names first, so a full name is removed before a shorter name it
    # contains; the secondary sort key keeps the order deterministic.
    for name in sorted(names, key=lambda n: (-len(n), n)):
        # The lookarounds require a non-word character or the string edge on both sides.
        text = re.sub(rf'(?<!\w){re.escape(name)}(?!\w)', '', text)
    return text


data['Consumer complaint narrative'] = data['Consumer complaint narrative'].apply(remove_names)

Remove text that doesn't add context, such as hashtags, URLs, commas, etc.

In [None]:
def clean_text(text):
    """Return *text* lower-cased with URLs, digits and punctuation removed.

    URLs are removed first, and the match ignores case, so links written in
    upper case or starting with "www." do not survive as junk tokens.

    Args:
        text: The raw text to clean.

    Returns:
        The cleaned text, lower-cased and without leading/trailing white space.
        Inner white space is left untouched.
    """
    # Remove links and URLs (anything starting with "http", or a word starting with "www.")
    text = re.sub(r'(?:http|\bwww\.)\S+', '', text, flags=re.IGNORECASE)

    # Remove numerical values
    text = re.sub(r'\d+', '', text)

    # Remove punctuation marks
    text = re.sub(r'[^\w\s]', '', text)

    # Remove leading/trailing white space and convert to lowercase
    text = text.strip().lower()

    return text

# Run the cleaner over every complaint narrative.
data['Consumer complaint narrative'] = (
    data['Consumer complaint narrative'].apply(clean_text)
)

Normalising words: reducing every word to its base form (lemma). That gives the algorithm a better opportunity to find patterns.

In [None]:
# Shared helpers for the normalisation step: a spell checker used to correct
# each token and a WordNet lemmatizer used to reduce it to its base form.
spell = SpellChecker()
lemmatizer = WordNetLemmatizer()

def lemmatize_word(word, tag):
    """Lemmatize *word* according to its part-of-speech *tag*.

    Tags starting with J, V, N or R select the WordNet classes adjective,
    verb, noun and adverb respectively. For any other tag the word is
    returned unchanged.
    """
    # (tag prefix, WordNet POS) pairs, checked in this order.
    prefix_to_wordnet = (
        ('J', 'a'),  # Adjective
        ('V', 'v'),  # Verb
        ('N', 'n'),  # Noun
        ('R', 'r'),  # Adverb
    )
    wn_tag = next(
        (wordnet_pos for prefix, wordnet_pos in prefix_to_wordnet if tag.startswith(prefix)),
        None,
    )

    # No WordNet class for this tag: leave the word as it is.
    if wn_tag is None:
        return word
    return lemmatizer.lemmatize(word, wn_tag)

def lemmatize_sentence(sentence):
    """Spell-correct and lemmatize every word of *sentence*.

    Args:
        sentence: The text to normalise.

    Returns:
        The corrected, lemmatized words joined with single spaces.
    """
    # Tokenize the sentence into words
    tokens = nltk.word_tokenize(sentence)

    # Part-of-speech (POS) tag each word; this yields (word, tag) pairs
    pos_tags = nltk.pos_tag(tokens)

    lemmas = []
    for word, tag in pos_tags:
        # The spell checker may return None when it has no candidate for a
        # word; fall back to the original word in that case.
        corrected = spell.correction(word) or word

        # Lemmatize the corrected word using the tag string of the original
        # word (not the whole (word, tag) pair).
        lemmas.append(lemmatize_word(corrected, tag))

    # Join the lemmas back into a sentence
    return ' '.join(lemmas)


data['Consumer complaint narrative'] = data['Consumer complaint narrative'].apply(lemmatize_sentence)

Tokenization: split each narrative into a list of words.

In [None]:
# Split every narrative into its list of word tokens.
data['Consumer complaint narrative'] = list(map(word_tokenize, data['Consumer complaint narrative']))

Removing the anonymized data that has been replaced with x's.

In [None]:
def _strip_masked_tokens(tokens):
    """Strip non-letters from each token, then drop tokens made up only of x's."""
    # First pass: keep only the ASCII letters of every token.
    letters_only = (re.sub('[^a-zA-Z]+', '', word) for word in tokens)
    # Second pass: discard tokens consisting solely of the letter x.
    return [word for word in letters_only if not re.match('^x+$', word)]


data['Consumer complaint narrative'] = data['Consumer complaint narrative'].apply(_strip_masked_tokens)

Split the data into training and testing sets.

In [None]:
# Hold out 10% of the complaints for testing. The fixed random_state makes the
# split - and therefore the accuracy reported further down - reproducible
# between runs.
train_data, test_data, train_labels, test_labels = train_test_split(
    data['Consumer complaint narrative'],
    data['Issue'],
    test_size=0.1,
    random_state=42,
)

# The narratives are token lists at this point; join each one back into a
# single space-separated string.
train_data = [' '.join(tokens) for tokens in train_data]
test_data = [' '.join(tokens) for tokens in test_data]


Create a TF-IDF vectorizer and fit

In [None]:
# Word-level TF-IDF vectorizer: English stop words are dropped, a token is a
# run of ASCII letters, and the resulting vectors are not normalised.
vectorizer = TfidfVectorizer(
    analyzer="word",
    tokenizer=None,
    preprocessor=None,
    stop_words="english",
    token_pattern=r'\b[a-zA-Z]+\b',
    use_idf=True,
    smooth_idf=True,
    norm=None,
)

# Fit the vectorizer on the training documents only.
vectorizer.fit(train_data)

Determine the best settings. This code is only run once and gives the best hyperparameter values for the SVM.

In [None]:
# # Create a support vector machine classifier.
# clf = SVC(kernel='linear')

# # Define the parameters to be tuned.
# parameters = {
#     'C': [0.1, 1, 10, 100],
#     'gamma': ['scale', 'auto'],
#     'class_weight': ['balanced', None],
# }

# # Create a GridSearchCV object to search over the parameter grid.
# grid_search = GridSearchCV(clf, parameters, cv=5)

# # Fit the GridSearchCV object to the training data.
# grid_search.fit(train_tfidf_vectors, train_labels)

# # Print the best hyperparameters and the corresponding mean cross-validated score.
# print("Best parameters:", grid_search.best_params_)
# print("Best score:", grid_search.best_score_)

Creating the SVM and fitting the vectorizer

In [None]:
# Turn the training documents into TF-IDF vectors (this fits the vectorizer
# on the training data) ...
train_tfidf_vectors = vectorizer.fit_transform(train_data)

# ... and transform the testing documents with the same fitted vectorizer.
test_tfidf_vectors = vectorizer.transform(test_data)

# Linear-kernel support vector machine classifier with balanced class weights.
clf = SVC(kernel='linear', C=10, gamma='scale', class_weight='balanced')


Train the classifier on the TF-IDF vectors. Takes the longest time

In [None]:
# Fit the SVM on the TF-IDF training vectors and their labels.
clf.fit(X=train_tfidf_vectors, y=train_labels)

Predicting and exporting for long term usage

In [None]:
# Predict the labels of the testing data
pred_labels = clf.predict(test_tfidf_vectors)

# Evaluate the accuracy of the classifier
accuracy = accuracy_score(test_labels, pred_labels, normalize=True)
# Dataset size in thousands of rows, used in the export file names
count = data.shape[0] // 1000

# Export the model to a file.
filename = f"model({count}K, {accuracy:.1%}).joblib"
joblib.dump(clf, filename)

# Export the fitted vectorizer next to the model: the classifier was trained
# on vectors produced by this vectorizer, so new text has to be transformed by
# the same fitted object before the saved model can classify it.
vectorizer_filename = f"vectorizer({count}K, {accuracy:.1%}).joblib"
joblib.dump(vectorizer, vectorizer_filename)

print(f"Exported to: {filename}")
print(f"Exported vectorizer to: {vectorizer_filename}")

Create classification report for more insights

In [None]:
# Label predictions for the test vectors.
predictions = clf.predict(test_tfidf_vectors)

# Build the per-class report; zero_division=1 is the value reported for a
# metric whose computation would divide by zero.
report = classification_report(
    y_true=test_labels,
    y_pred=predictions,
    zero_division=1,
)
print(report)

Test the model with a question

In [None]:
# Example question to classify.
new_question = "I need the credit score, but the website doesnt load"

# NOTE(review): the question goes straight into the vectorizer; it is not
# passed through the stop-word, cleaning and lemmatization steps that were
# applied to the training narratives - confirm this is intended.
# The vectorizer expects a sequence of documents, hence the one-element list.
new_question_vector = vectorizer.transform([new_question])

# Classify the vectorized question.
predicted_label = clf.predict(X=new_question_vector)

# Show the predicted label.
print(predicted_label)

The model is exported to a file and gives an accuracy of 59%. This is close to the other models