In [None]:
# Peek at the CSV header to list the dataset's column names.
import csv

# newline='' is what the csv module asks for on files it reads, and
# 'utf-8-sig' decodes UTF-8 while dropping a leading byte-order mark, so the
# first column name is not polluted and the result does not depend on the
# platform's default codec.
with open('../dataset/complaints.csv', newline='', encoding='utf-8-sig') as csv_file:

    # reader over the opened file, splitting fields on commas
    csv_reader = csv.reader(csv_file, delimiter=',')

    # Only the first row is needed. next() with a default replaces the
    # loop-and-break and yields None (instead of raising) for an empty file.
    header_row = next(csv_reader, None)

# Same shape as before: a list whose element 0 is the list of column names.
# It is empty when the file contained no rows at all.
list_of_column_names = [] if header_row is None else [header_row]

# printing the result (guarded so an empty file no longer raises IndexError)
if list_of_column_names:
    print("List of column names : ",
          list_of_column_names[0])
else:
    print("No header row found: '../dataset/complaints.csv' is empty")

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from xgboost import XGBClassifier

# Read the complaints table. low_memory=False has pandas parse the file in one
# pass instead of in chunks.
data = pd.read_csv(
    '../dataset/complaints.csv',
    low_memory=False,
)

# Remove, in place, every row that has no complaint narrative text.
narrative_column = 'Consumer complaint narrative'
data.dropna(inplace=True, subset=[narrative_column])

In [None]:
# Integer label for each product of interest: a product's position in this
# list is the label it receives.
# NOTE(review): confirm these strings match the dataset's 'Product' values
# exactly; a product that is not listed here receives no label.
product_names = [
    'Credit reporting, repair, or other',
    'Debt collection',
    'Consumer Loan',
    'Mortgage',
]
category_mapping = {name: label for label, name in enumerate(product_names)}
# Derive the integer label column from the product name. Products that are not
# keys of category_mapping end up as NaN in 'Category'.
product_series = data['Product']
data['Category'] = product_series.map(category_mapping)

In [None]:
# Exploratory Data Analysis and Feature Engineering
# Bar chart of how many complaints fall into each mapped category.
fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(data=data, x='Category', ax=ax)
ax.set_title('Distribution of Categories')
ax.set_xlabel('Category')
ax.set_ylabel('Count')
plt.show()

In [None]:
# Text Pre-Processing
# 'Category' is NaN for every product that is not a key of category_mapping.
# Those rows have no usable class label, so they are excluded here instead of
# letting NaN targets reach the classifier.
labeled_rows = data[data['Category'].notna()]

# Fail early with a clear message if the mapping matched nothing at all.
if labeled_rows.empty:
    raise ValueError(
        "No rows have a mapped 'Category'; check that the keys of "
        "category_mapping match the values in the 'Product' column."
    )

# Features: the raw complaint text.
X = labeled_rows['Consumer complaint narrative']

# Target: the integer class label. With the NaN rows gone, every value is one
# of the mapping's integers, so the cast to int is lossless.
y = labeled_rows['Category'].astype(int)

In [None]:
# Hold out 20% of the samples for testing; the fixed random_state makes the
# shuffle, and therefore the split, repeatable.
split_parts = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
X_train, X_test, y_train, y_test = split_parts

In [None]:
# Text-classification pipeline: TF-IDF features feeding a support vector
# classifier (SVC comes from sklearn.svm and must be imported with the other
# scikit-learn names). The step names 'tfidf' and 'clf' are the prefixes that
# hyperparameter keys such as 'tfidf__max_df' and 'clf__C' refer to.
pipeline = Pipeline(steps=[
    ('tfidf', TfidfVectorizer()),
    ('clf', SVC()),
])

In [None]:
# Hyperparameter candidates for the grid search. Each key is
# '<pipeline step>__<parameter>'; each value lists the settings to try.
# NOTE(review): gamma only affects the 'rbf' and 'poly' kernels, so the
# 'linear' combinations are fitted once per gamma value with the same outcome.
param_grid = dict(
    tfidf__max_df=[0.25, 0.5, 0.75],
    tfidf__ngram_range=[(1, 1), (1, 2), (2, 2)],
    clf__C=[0.1, 1, 10],
    clf__kernel=['linear', 'rbf', 'poly'],
    clf__gamma=['scale', 'auto'],
)

In [None]:
# Exhaustive search over param_grid with 5-fold cross-validation on the
# training split; n_jobs=-1 spreads the fits over all available CPU cores.
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    cv=5,
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

In [None]:
# Report the winning hyperparameter combination and its score. best_score_ is
# the mean cross-validated score of that combination, not a test-set score.
search_summary = (
    ("Best parameters: ", grid_search.best_params_),
    ("Accuracy score: ", grid_search.best_score_),
)
for label, value in search_summary:
    print(label, value)