In [26]:

import pandas as pd
import pickle as pkl

import plotly.express as px
import plotly.graph_objs as go
from plotly.subplots import make_subplots

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import GradientBoostingClassifier



# Natural language processing

In this part of the assigment we are going to look at the comments which have been provided with the bank ratings. \
The goal is to find connections between the overal satisfaction of the customers and the comments they have provided. \
Aswell as to attempt to sort the comments into buckets depending on their content.

As we already have cleaned the the text data by removing the stop words, punctuation and spelling mistakes, we can start with the analysis. \
But we will also keep an original copy of the comments for comparison.
   

## Overall satisfaction 

First we will look at the overall satisfaction of the customers. \
We can then use different models to predict the overall satisfaction based on the comments.

1. Logistic regression
2. Naive Bayes
    - Gaussian
    - Multinomial
    - Bernoulli
3. Support Vector Machine
4. Random Forest
5. K Nearest Neighbors
6. Decision Tree
7. Gradient Boosting
8. Neural Network

We will use the original comments and the cleaned comments to see if there is a difference in the results.


In [27]:
# Loading the data
customer_data = pkl.load(open('./bank-data/cleaned_customers.pkl', 'rb'))

# dropping the columns that are not required
customer_data.drop(['customer_gender','customer_age','date', 'customer_location', 'customer_type', 'has_cc', 'has_mortgage','customer_age_norm'], axis=1, inplace=True)

# converting the True and False values to 1 and 0
customer_data = customer_data.replace({True: 1, False: 0})

In [28]:
unprepared = pkl.load(open('./bank-data/cleaned_datasets.pkl', 'rb'))
datasets = []

for dataset in unprepared:
    data = pd.merge(customer_data,  unprepared[dataset].drop(["date"], axis=1), on='customer_id', how='inner').drop(['customer_id'], axis=1)

    
    data.drop(['convenience', 'customer_service', 'online_banking', 'interest_rates', 'fees_charges', 'community_involvement', 'products_services', 'privacy_security', 'reputation'], axis=1, inplace=True)
    
    # TF-IDF vectorizer
    vectorizer = TfidfVectorizer(stop_words='english', max_features=1000)
    X = vectorizer.fit_transform(data['comments'])

    # Converting the sparse matrix to a dataframe
    X = pd.DataFrame.sparse.from_spmatrix(X)

    # Concatenating the dataframes
    data = pd.concat([data, X], axis=1)
    data.attrs["name"] = dataset

    datasets.append(data)

In [29]:
modles = {}

modles['LR'] =  {"modle": LogisticRegression()}
modles['NB'] =  {"modle": GaussianNB()}
modles['MNB'] = {"modle": MultinomialNB()}
modles['BNB'] = {"modle": BernoulliNB()}
modles['SVM'] = {"modle": SVC()}
modles['RF'] =  {"modle": RandomForestClassifier()}
modles['KNN'] = {"modle": KNeighborsClassifier()}
modles['DT'] =  {"modle": DecisionTreeClassifier()}
modles['GB'] =  {"modle": GradientBoostingClassifier()}
modles['MLP'] = {"modle": MLPClassifier()}


for dataset in datasets:

    X = data.drop(['satisfied', "comments"], axis=1)
    y = data['satisfied']

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

    X_train_dense =X_train.values
    X_test_dense = X_test.values
    
    
    for modle in modles:
        modles[modle][dataset.attrs["name"]] = {}
        
        modles[modle]['modle'].fit(X_train_dense, y_train)

        y_pred = modles[modle]['modle'].predict(X_test_dense)

        modles[modle][dataset.attrs["name"]]['accuracy'] = accuracy_score(y_test, y_pred)
        modles[modle][dataset.attrs["name"]]['confusion_matrix'] = confusion_matrix(y_test, y_pred)
        modles[modle][dataset.attrs["name"]]['classification_report'] = classification_report(y_test, y_pred)



In [30]:
for daset in datasets:
    model_names = []
    confusion_matrices = []

    for model in modles:
        model_names.append(model)
        confusion_matrices.append(modles[model][daset.attrs["name"]]['confusion_matrix'])

    # Create subplots for True Positive (TP), True Negative (TN), False Positive (FP), and False Negative (FN)
    fig = make_subplots(rows=2, cols=2, subplot_titles=("True Positive", "True Negative", "False Positive", "False Negative"))

    # Add bar plots for each confusion matrix value
    for i, model_name in enumerate(model_names):
        cm = confusion_matrices[i]
        fig.add_trace(go.Bar(x=[model_name], y=[cm[0][0]], name=model_name, showlegend=False), row=1, col=1)  # TP
        fig.add_trace(go.Bar(x=[model_name], y=[cm[0][1]], name=model_name, showlegend=False), row=1, col=2)  # TN
        fig.add_trace(go.Bar(x=[model_name], y=[cm[1][0]], name=model_name, showlegend=False), row=2, col=1)  # FP
        fig.add_trace(go.Bar(x=[model_name], y=[cm[1][1]], name=model_name, showlegend=False), row=2, col=2)  # FN

    # Update layout and axis titles
    fig.update_layout(title=f"Confusion Matrix Comparison (Dataset {daset.attrs['name']})", barmode="group")
    fig.update_xaxes(title_text="Models", row=1, col=1)
    fig.update_xaxes(title_text="Models", row=1, col=2)
    fig.update_xaxes(title_text="Models", row=2, col=1)
    fig.update_xaxes(title_text="Models", row=2, col=2)

    fig.update_yaxes(title_text="Count", row=1, col=1)
    fig.update_yaxes(title_text="Count", row=1, col=2)
    fig.update_yaxes(title_text="Count", row=2, col=1)
    fig.update_yaxes(title_text="Count", row=2, col=2)

    # Show plot
    fig.show()

## Natural language processing

Here we can see both the confusion matrix and the accuracy score for each model. \
Most of the models have a very high accuracy score, even higher that the models based on the scoring. \

On average we can say that the SVM has quite a good score but gets beaten by the Neural Network, when combined with the stopword dataset. \
But here we are at such a high accuracy score that it is hard to say which model is the best, as they are all very close to each other.

Generaly the Naive Bayes models have a much worse performance here than in the scoring part. \
Same goes for the KNN model, which has a comparatively low accuracy score.\

But which is still on par with the scoring part of task 3.

In [31]:
for daset in datasets:
    model_accuracies = []
    for model in modles:
        accuracy = modles[model][daset.attrs["name"]]['accuracy']
        model_accuracies.append({"Model": model, "Accuracy": accuracy})
    
    df = pd.DataFrame(model_accuracies)
    
    # Color scale for the bar chart
    color_scale = px.colors.sequential.Bluered
    
    # Normalize accuracy values between 0 and 1
    df['NormalizedAccuracy'] = (df['Accuracy'] - df['Accuracy'].min()) / (df['Accuracy'].max() - df['Accuracy'].min())
    
    # Map normalized accuracy to color scale
    df['Color'] = df['NormalizedAccuracy'].apply(lambda x: color_scale[int(x * (len(color_scale) - 1))])
    
    # Create a Plotly bar chart
    fig = go.Figure()
    
    for index, row in df.iterrows():
        fig.add_trace(go.Bar(x=[row['Model']], y=[row['Accuracy']], text=[row['Accuracy']], textposition='auto',
                             marker_color=row['Color'], name=row['Model']))
    
    fig.update_layout(title_text=f'Accuracy Comparison of Different Models (Dataset {daset.attrs["name"]})', xaxis_title='Models', yaxis_title='Accuracy',
                      yaxis=dict(range=[0.8, 1], tickformat=".2%"))
    
    # Show the graph
    fig.show()

# Conclusion

Task 5. Conclusions
• Discuss and compare the overall performance of the two different data
representations (i.e. tabular and text) for customer satisfaction classification.
• At the end of your notebook, summarise any insights which you gained from
your analysis of the data, discuss the challenges faced, and suggest ideas for
further analysis/classification which could be performed on the data.


## Overall performance

The performance of both models is quite good with the tabular data having an accuracy of 0.826 and the text data having an accuracy of 0.961. \
Concerning that both had less than 3000 samples to train on, this is quite good. \ 
The text having a better performance makes sense as it has more information over all than the tabular data. \
But as it is inherently less structured it takes more compute power and more fine tuning than the tabular data. 

While both work quite well, the tabular data is easier to work with and can be used to get a quick overview of the data. \
As it is much easier to visualize and more intuitive to understand. \
On the other hand text data is much easier to come by and can be used to get more information out of the data. \ 
Which gives it a higher potential for further analysis.

## Insights

Many of the insights have already been discussed on the way. But to summarize: 

Most of the bank's customers are between 30 and 60 years old, with Personal customers being more numerous than Business customers. \
Customer satisfaction is low, with only 42% of customers being satisfied. \
The most important factor affecting satisfaction is the "Online banking" dimension. \
Business customers are generally more satisfied than Personal customers, \
and customers without a mortgage are more satisfied than those with a mortgage.

During the analysis of the tabular data the support vector machine performed the best\
especially in combination with the datasets which had not been imputed. 

The text data itself also is best analyised with a neural network. \
While the Support Vector Machine performed well with the many of the differnt datasets, \
the neural network performed best with the unlemmatized data, which had been cleaned of stop words and punctuation and spelling mistakes.


## Challenges

The biggest challenge was how to handle the many different variables and parameters to change. \
My approach was to simply try out many different combinations and see which one performed best. \
Which increased the overall complexity of the task by a lot.

## 
Future analysis would include further exploration of the many diffferent parameters and variables. \
For example the different kernels of the support vector machine, or the different activation functions of the neural network. \
Aswell as the different parameters of the neural network, such as the number of layers and the number of neurons per layer. 

Additionally a more sufisticated approach to the imputation of the missing values could be used. \
Utilizing the information of the other variables to make a better guess at the missing values.
