# Support Vector Machine model

This notebook creates a Support Vector Machine (SVM) model based on the dataset provided by the data retriever script. The data has already been cleaned and optimized by that script, so the model can use it directly.

In [None]:
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV

Import the data from the database file that is produced by the data retriever script.

In [None]:
# Load every row of the complaints table from the local SQLite database.
complaints_query = "SELECT * FROM 'mortgage complaints'"
database_url = "sqlite:///StaterData.db"

data = pd.read_sql_query(sql=complaints_query, con=database_url)


Split the data into training and testing sets.

In [None]:
# Hold out 10% of the complaints for evaluation. The fixed random_state makes the
# split, and therefore every score reported further down, reproducible across
# kernel restarts.
train_data, test_data, train_labels, test_labels = train_test_split(
    data['Consumer complaint narrative'],
    data['Issue'],
    test_size=0.1,
    random_state=42,
)


def _as_text(narrative):
    """Return a single narrative as one string for the vectorizer.

    A sequence of tokens is joined with single spaces. A narrative that is
    already a string is returned unchanged, because ' '.join() on a string
    would insert a space between every character and destroy the words.
    """
    if isinstance(narrative, str):
        return narrative
    return ' '.join(narrative)


train_data = [_as_text(narrative) for narrative in train_data]
test_data = [_as_text(narrative) for narrative in test_data]


Create a TF-IDF vectorizer and fit

In [None]:
# Configure the TF-IDF vectorizer: English stop words are dropped, only purely
# alphabetic tokens are kept, IDF weighting is smoothed and the resulting rows
# are left un-normalised (norm=None).
vectorizer = TfidfVectorizer(
    stop_words="english",
    token_pattern=r'\b[a-zA-Z]+\b',
    analyzer="word",
    use_idf=True,
    smooth_idf=True,
    norm=None,
    tokenizer=None,
    preprocessor=None
)
# The vectorizer is intentionally not fitted here: the cell that builds the
# TF-IDF matrices calls vectorizer.fit_transform(train_data), which fits it on
# the same data. A separate fit in this cell only repeated that work.

Determine the best settings. This code only needs to be run once; it runs a grid search and reports the best options for the SVM settings.

In [None]:
# One-off hyper-parameter search, kept commented out so it does not run on every
# execution of the notebook.
# NOTE: `train_tfidf_vectors` is only created in a later cell (the one that calls
# vectorizer.fit_transform), so that cell must be run before enabling this search.
# NOTE(review): per the scikit-learn SVC documentation `gamma` is the coefficient
# for the 'rbf', 'poly' and 'sigmoid' kernels, so searching over it presumably has
# no effect with kernel='linear' — confirm before re-running.

# clf = SVC(kernel='linear')

# parameters = {
#     'C': [0.1, 1, 10, 100],
#     'gamma': ['scale', 'auto'],
#     'class_weight': ['balanced', None],
# }

# grid_search = GridSearchCV(clf, parameters, cv=5)
# grid_search.fit(train_tfidf_vectors, train_labels)

# print("Best parameters:", grid_search.best_params_)
# print("Best score:", grid_search.best_score_)

Create the SVM classifier and fit the vectorizer to produce the TF-IDF vectors.

In [None]:
# Linear-kernel SVM with balanced class weights; the settings are collected in
# one place so they are easy to compare with the grid-search output.
svm_settings = {
    'C': 10,
    'class_weight': 'balanced',
    'gamma': 'scale',
    'kernel': 'linear',
}
clf = SVC(**svm_settings)

# Fit the vectorizer on the training narratives and vectorise them in one step,
# then vectorise the test narratives with that same fitted vectorizer.
train_tfidf_vectors = vectorizer.fit_transform(train_data)
test_tfidf_vectors = vectorizer.transform(test_data)


Train the classifier on the TF-IDF vectors. This step takes the longest time.

In [None]:
# Fit the SVM on the training TF-IDF matrix and its matching issue labels.
clf.fit(X=train_tfidf_vectors, y=train_labels)

Predict the labels for the test set and report the accuracy. (Exporting the model for long-term usage is not done in this cell.)

In [None]:
# Predict an issue label for every held-out narrative and score the predictions.
pred_labels = clf.predict(test_tfidf_vectors)
accuracy = accuracy_score(test_labels, pred_labels, normalize=True)

# Number of rows in the full dataset, in whole thousands, for the report line.
total_rows = data.shape[0]
count = total_rows // 1000

print(f"Accuracy: {accuracy * 100:.2f}% with({count}k samples)")

Create classification report for more insights

In [None]:
# Reuse the test-set predictions produced in the accuracy cell instead of calling
# clf.predict() a second time on the same vectors.
predictions = pred_labels

# zero_division=1 sets the value reported for a metric whose denominator is zero
# (for example the precision of a class that is never predicted) to 1.
report = classification_report(test_labels, predictions, zero_division=1)
print(report)

# Summary
The model is exported to a file and gives an accuracy of 59% based on the 104,000 rows of data.
This accuracy is close to, but higher than, that of the other models, which use the same dataset.