In [1]:
#Importing the libraries
import pandas as pd
import ssl

from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV

# Exporting model
import joblib

# SSL workaround, installed BEFORE any nltk.download() call.
# Previously this assignment sat at the very end of the cell, after all downloads had
# already run, so on a fresh kernel it could not influence them.
# NOTE(review): this swaps the default HTTPS context for an unverified one for the whole
# process, i.e. certificate checks are skipped for later HTTPS requests too. Presumably it
# is only needed for the NLTK downloads on machines with a broken certificate store — confirm.
ssl._create_default_https_context = ssl._create_unverified_context

# Stop words
import nltk
nltk.download('stopwords')
nltk.download('punkt')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Word normalizer
from nltk.stem import WordNetLemmatizer
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('omw-1.4')

# Cleaning unneeded repetitive words
import re

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/jeroendenotter/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/jeroendenotter/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/jeroendenotter/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/jeroendenotter/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/jeroendenotter/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [2]:
# Load every row of the complaints table; the two date columns are parsed with an explicit format.
DS1_data = pd.read_sql_query(
    "SELECT * FROM 'mortgage complaints'",
    "sqlite:///StaterData.db",
    parse_dates={
        'Date received': '%Y-%m-%d %H:%M:%S',
        'Date sent to company': '%Y-%m-%d %H:%M:%S',
    },
)

# Cleaning
# Convert the Yes/No columns to booleans. The SAME column list is used on both sides of the
# assignment: pandas pairs the columns positionally here, and the previous code listed them
# in opposite orders, which silently swapped the two columns.
# NOTE(review): astype(bool) turns any other non-empty value into True — confirm that only
# 'Yes'/'No' occur in these columns.
yes_no_columns = ['Timely response?', 'Consumer disputed?']
DS1_data[yes_no_columns] = (
    DS1_data[yes_no_columns]
    .replace({'Yes': True, 'No': False})
    .astype(bool)
)

# True only for the exact status 'Consent provided'. The previous replace(...).astype(bool)
# also mapped every other non-empty status string to True.
DS1_data['Consumer consent provided?'] = DS1_data['Consumer consent provided?'].eq('Consent provided')

# Keep only rows that actually have a narrative. The previous code computed this filter but
# discarded the result. copy() gives an independent frame so the assignment below is safe.
DS1_data = DS1_data[DS1_data['Consumer complaint narrative'].notna()].copy()
DS1_data['Consumer complaint narrative'] = DS1_data['Consumer complaint narrative'].str.lower()

# Working frame for the rest of the notebook, without the 'Sub-issue' column.
data = DS1_data.drop(columns=['Sub-issue'])

In [3]:
# Optional: uncomment the next line to work on a 250-row random sample instead of the full data
# data = DS1_data.sample(n=250)
# Number of complaints per "Issue" value (this column is used as the label when splitting the data)
data["Issue"].value_counts()

Trouble during payment process                                                      33782
Struggling to pay mortgage                                                          17439
Loan servicing, payments, escrow account                                            14721
Loan modification,collection,foreclosure                                            10789
Applying for a mortgage or refinancing an existing mortgage                         10490
Closing on a mortgage                                                                7408
Application, originator, mortgage broker                                             3746
Settlement process and costs                                                         2249
Incorrect information on your report                                                 1420
Credit decision / Underwriting                                                       1289
Problem with a credit reporting company's investigation into an existing problem      622
Improper u

In [4]:
# English stop words, kept in a set for fast membership checks
stop_words = set(stopwords.words('english'))


def remove_stopwords(text):
    """Return `text` with its stop-word tokens removed.

    The text is split with nltk.word_tokenize; a token is dropped when its
    lower-cased form is in `stop_words`. The remaining tokens are joined
    with single spaces.
    """
    kept_tokens = (
        token
        for token in nltk.word_tokenize(text)
        if token.lower() not in stop_words
    )
    return " ".join(kept_tokens)


data['Consumer complaint narrative'] = data['Consumer complaint narrative'].apply(remove_stopwords)

In [5]:
# Noise removal: digits, punctuation and leftover URL fragments
def clean_text(text):
    """Return a lower-cased, stripped copy of `text` with noise characters removed.

    Three substitutions run in a fixed order, each deleting its matches:
    runs of digits, every character that is neither a word character nor
    whitespace, and finally anything that starts with "http" followed by
    non-space characters. Inner whitespace is left untouched.
    """
    # Order matters: the URL pattern runs on text that has already lost its punctuation.
    for noise_pattern in (r'\d+', r'[^\w\s]', r'http\S+'):
        text = re.sub(noise_pattern, '', text)

    return text.strip().lower()

# Run clean_text over every narrative, element by element
data['Consumer complaint narrative'] = data['Consumer complaint narrative'].map(clean_text)

In [6]:
# Normalizing words to a common base form, e.g. (play, playing, played) -> play
# Single WordNet lemmatizer instance; lemmatize_word below calls it for every word
lemmatizer = WordNetLemmatizer()

# Lemmatize one word, choosing the WordNet part of speech from its POS tag
def lemmatize_word(word, tag):
    """Return the lemma of `word` for the given POS `tag`.

    The first letter of the tag selects the WordNet part of speech:
    J -> 'a' (adjective), V -> 'v' (verb), N -> 'n' (noun), R -> 'r' (adverb).
    For any other tag the word is returned unchanged and the lemmatizer
    is not called.
    """
    prefix_to_wordnet_pos = (('J', 'a'), ('V', 'v'), ('N', 'n'), ('R', 'r'))
    wordnet_pos = next(
        (pos for prefix, pos in prefix_to_wordnet_pos if tag.startswith(prefix)),
        None,
    )

    # No WordNet equivalent for this tag: leave the word as it is.
    if not wordnet_pos:
        return word

    return lemmatizer.lemmatize(word, wordnet_pos)

# Lemmatize every word of a sentence
def lemmatize_sentence(sentence):
    """Return `sentence` with each token replaced by its lemma.

    The sentence is tokenized with nltk.word_tokenize, tagged with
    nltk.pos_tag, each (token, tag) pair is passed to lemmatize_word, and
    the results are joined with single spaces.
    """
    tagged_tokens = nltk.pos_tag(nltk.word_tokenize(sentence))
    return ' '.join([lemmatize_word(token, pos) for token, pos in tagged_tokens])


# Lemmatize every narrative, element by element
data['Consumer complaint narrative'] = data['Consumer complaint narrative'].map(lemmatize_sentence)

In [7]:
# Tokenization: replace each narrative string with its list of word tokens
data['Consumer complaint narrative'] = data['Consumer complaint narrative'].map(word_tokenize)

In [8]:
def _keep_alphabetic_tokens(tokens):
    """Return the tokens reduced to their letters, without empty or x-only tokens.

    Every character outside a-z/A-Z is stripped from each token. A token is
    then dropped when nothing is left of it, or when it consists solely of
    the letter "x" (presumably redaction placeholders such as "xxxx" —
    confirm against the data).
    """
    letters_only = (re.sub('[^a-zA-Z]+', '', token) for token in tokens)
    # The `token and ...` check is the fix: tokens that contained no letters became '' and
    # used to stay in the list even though the intent was to remove them.
    return [token for token in letters_only if token and not re.match('^x+$', token)]


# Clean the token list of every narrative in a single pass
data['Consumer complaint narrative'] = data['Consumer complaint narrative'].apply(_keep_alphabetic_tokens)

In [9]:
# Split the data into training (90%) and testing (10%) sets.
# random_state pins the shuffle so that the split — and therefore every score reported
# further down — is identical on each run. Without it each execution produced a different split.
train_data, test_data, train_labels, test_labels = train_test_split(
    data['Consumer complaint narrative'],
    data['Issue'],
    test_size=0.1,
    random_state=42,
)

# The vectorizer expects one string per document, so join each token list back together.
train_data = [' '.join(tokens) for tokens in train_data]
test_data = [' '.join(tokens) for tokens in test_data]


In [10]:
# TF-IDF vectorizer restricted to purely alphabetic word tokens, with English stop words
# removed and no normalisation of the resulting vectors. Every other option is left at its
# default value.
# NOTE(review): the following cell rebinds `vectorizer` to TfidfVectorizer(ngram_range=(1, 3))
# before any vectors are produced, so the instance fitted here is not the one the
# classifier is trained with.
vectorizer = TfidfVectorizer(
    token_pattern=r'\b[a-zA-Z]+\b',
    stop_words="english",
    norm=None,
)
# Learn vocabulary and IDF weights from the training texts.
vectorizer.fit(train_data)

TfidfVectorizer(norm=None, stop_words='english',
                token_pattern='\\b[a-zA-Z]+\\b')

In [11]:
# TF-IDF features built from word unigrams, bigrams and trigrams
vectorizer = TfidfVectorizer(ngram_range=(1, 3))

# Fit on the training texts only, then encode the test texts with that same fitted vectorizer
train_tfidf_vectors = vectorizer.fit_transform(train_data)
test_tfidf_vectors = vectorizer.transform(test_data)

# Support vector machine classifier (linear kernel, balanced class weights); trained in a later cell
clf = SVC(
    kernel='linear',
    C=10,
    class_weight='balanced',
    gamma='scale',
)


In [12]:
# # Create a support vector machine classifier.
# clf = SVC(kernel='linear')

# # Define the parameters to be tuned.
# parameters = {
#     'C': [0.1, 1, 10, 100],
#     'gamma': ['scale', 'auto'],
#     'class_weight': ['balanced', None],
# }

# # Create a GridSearchCV object to search over the parameter grid.
# grid_search = GridSearchCV(clf, parameters, cv=5)

# # Fit the GridSearchCV object to the training data.
# grid_search.fit(train_tfidf_vectors, train_labels)

# # Print the best hyperparameters and the corresponding mean cross-validated score.
# print("Best parameters:", grid_search.best_params_)
# print("Best score:", grid_search.best_score_)

In [13]:
# Train the classifier on the training TF-IDF vectors and their "Issue" labels.
clf.fit(train_tfidf_vectors, train_labels)

SVC(C=10, class_weight='balanced', kernel='linear')

In [20]:
# Predict the labels of the testing data
pred_labels = clf.predict(test_tfidf_vectors)

# Fraction of test documents whose predicted label equals the true label
accuracy = accuracy_score(test_labels, pred_labels, normalize=True)
# Number of rows in the data set, in thousands; only used to tag the file names
count = data.shape[0] // 1000

# Export the model to a file.
filename = f"model({count}K, {accuracy:.1%}).joblib"
joblib.dump(clf, filename)

# Export the fitted vectorizer next to it. The classifier only accepts vectors produced by
# this exact fitted vectorizer, so the model file on its own could not be used to classify
# new text in another session.
vectorizer_filename = f"vectorizer({count}K, {accuracy:.1%}).joblib"
joblib.dump(vectorizer, vectorizer_filename)

print(f"Exported to: {filename}")
print(f"Vectorizer exported to: {vectorizer_filename}")

Exported to: model(104K, 58.5%).joblib


In [19]:
# Predict the labels for the test data.
predictions = clf.predict(test_tfidf_vectors)

# Create and print the per-class precision / recall / F1 report.
# zero_division=0: when a metric is undefined (for example precision of a class that was
# never predicted) it is reported as 0. The previous value of 1 reported such classes as
# perfect, which inflated the per-class and averaged scores.
report = classification_report(test_labels, predictions, zero_division=0)
print(report)

                                                                                  precision    recall  f1-score   support

                                        Application, originator, mortgage broker       0.37      0.14      0.21       366
                     Applying for a mortgage or refinancing an existing mortgage       0.54      0.62      0.58      1054
                                                           Closing on a mortgage       0.48      0.47      0.48       742
                                                  Credit decision / Underwriting       0.33      0.01      0.01       140
                         Credit monitoring or identity theft protection services       1.00      0.25      0.40         4
                                                     Improper use of your report       0.43      0.14      0.21        21
                                            Incorrect information on your report       0.34      0.37      0.35       156
                       

In [None]:
# Define a new question
new_question = "I need the credit score, but the website doesnt load"

# Put the question through the same preprocessing steps, in the same order, as the training
# narratives: lower-casing, stop-word removal, noise cleaning, lemmatization, and the
# per-token filter (letters only, x-only tokens dropped). Vectorizing the raw text instead
# would feed the classifier tokens in a form it never saw during training.
prepared_question = remove_stopwords(new_question.lower())
prepared_question = lemmatize_sentence(clean_text(prepared_question))
question_tokens = [re.sub('[^a-zA-Z]+', '', token) for token in word_tokenize(prepared_question)]
prepared_question = ' '.join(token for token in question_tokens if not re.match('^x+$', token))

# Transform the prepared question using the fitted TF-IDF vectorizer
new_question_vector = vectorizer.transform([prepared_question])

# Use the classifier to predict the label for the new question
predicted_label = clf.predict(new_question_vector)

# Print the predicted label
print(predicted_label)