In [1]:
# Import libraries
from wordcloud import WordCloud
import pandas as pd
import numpy as np
import re
import string
import spacy
import gensim
from gensim import corpora

# libraries for visualization
import pyLDAvis
import pyLDAvis.gensim
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
# Replace with name of file with customer reviews
my_file_name = "Customer Feedback Analysis H1 (1).xlsx"

In [3]:
# Load the csv file into the dataframe
review_data = pd.read_excel(my_file_name)
print(review_data.head(2))
print(len(review_data))

        Date  Rating                 Type  \
0 2023-04-30       2   Investment I Claim   
1 2023-02-23       1   Investment I Renew   

                                              Review         Classification  
0  Your withdrawal process is getting slower by t...    Delayed Liquidation  
1                    Your team isn’t doing their job  Poor Customer Service  
1828


In [4]:
# Make a copy of the original data to run it through my model FOR loop
original_data = pd.read_excel(my_file_name)
print(original_data['Review'][61])

Your App doesn't work in UK.  I am currently there and cannot use App to make my contributions.


In [6]:
# Function to clean the text
import nltk

def clean_text(text, preserved_words=None):
    if preserved_words is None:
        preserved_words = []  # If no preserved_words list is provided, initialize an empty list
    
    # Tokenize the text
    text_tokens = text.split()
    
    # Combine tokens back into a cleaned sentence while preserving specific words
    cleaned_tokens = []
    for token in text_tokens:
        # Check if the token is a preserved word (ignoring case)
        if any(token.lower() == word.lower() for word in preserved_words):
            cleaned_tokens.append(token)
        else:
            # Filter out unwanted words based on length and digits
            if not (token.isdigit() or len(token) <= 3):
                # Remove punctuation from non-preserved words
                cleaned_token = token.translate(str.maketrans('', '', string.punctuation))
                if cleaned_token:  # Check if the cleaned token is not empty
                    cleaned_tokens.append(cleaned_token)
    
    cleaned_text = ' '.join(cleaned_tokens)
    return cleaned_text.lower()


In [7]:
# More preprocessing on the data
review_data.dropna(axis=0, how='any', inplace=True)

preserved_words = ['app']
review_data['Review'] = review_data['Review'].apply(clean_text, preserved_words=preserved_words)

print('-------Dataset-------')
print(review_data['Rating'].value_counts())
print(len(review_data))

print(review_data['Review'][61])

-------Dataset-------
Rating
3    750
1    553
2    468
4     40
5     17
Name: count, dtype: int64
1828
your app doesnt work currently there cannot app make contributions


In [8]:
# Make a copy of the review data
df_review = review_data.copy()

from nltk.corpus import stopwords
stop_words = stopwords.words('english')
# function to remove stopwords
def remove_stopwords(text):
    textArr = text.split(' ')
    rem_text = " ".join([i for i in textArr if i not in stop_words])
    return rem_text

# remove stopwords from the text
df_review['Review']=df_review['Review'].apply(remove_stopwords)

In [9]:
# Function for lemmatization
nlp = spacy.load('en_core_web_md', disable=['parser', 'ner'])

def lemmatization(texts,allowed_postags=['NOUN', 'ADJ']): 
       output = []
       for sent in texts:
             doc = nlp(sent) 
             output.append([token.lemma_ for token in doc if token.pos_ in allowed_postags ])
       return output

In [10]:
text_list = df_review['Review'].tolist()

# Tokenize the words and show sample
tokenized_reviews = lemmatization(text_list)
print(tokenized_reviews[1])

# Check how the tex_list looks
text_list[:5]

['team']


['withdrawal process getting slower day someone emergency request withdrawal anymore',
 'team isn’t',
 'system failure frequent wait hours portal downtime slow response hospital approval requests better please',
 'staff isobel rude attitude needs trained customer service',
 'settlement customers claim riddled forth back arguments accompanied stress']

In [11]:
# Create Matrix Dictionary
dictionary = corpora.Dictionary(tokenized_reviews)
doc_term_matrix = [dictionary.doc2bow(rev) for rev in tokenized_reviews]

In [12]:
# We can view the words by using the dictionary
# View words in 10 documents in the matrix
id_words = [[(dictionary[id], count) for id, count in line] for line in doc_term_matrix[0:10]]
print(id_words)

[[('day', 1), ('emergency', 1), ('process', 1), ('request', 1), ('slow', 1), ('withdrawal', 2)], [('team', 1)], [('request', 1), ('slow', 1), ('approval', 1), ('downtime', 1), ('failure', 1), ('frequent', 1), ('hospital', 1), ('hour', 1), ('portal', 1), ('response', 1), ('system', 1), ('wait', 1)], [('attitude', 1), ('customer', 1), ('rude', 1), ('service', 1)], [('customer', 1), ('argument', 1), ('settlement', 1), ('stress', 1)], [('service', 1), ('poor', 1), ('yesterday', 1)], [('customer', 1), ('service', 1), ('client', 1), ('experience', 1), ('fast', 1), ('good', 1), ('guy', 1), ('guysnwill', 1), ('home', 1), ('leg', 1), ('men', 1), ('previous', 1), ('quarter', 1), ('think', 1), ('yearsmh', 1)], [('service', 2), ('poor', 1), ('platform', 1), ('prompt', 1)], [('service', 1)], [('request', 1), ('response', 1), ('poor', 1), ('account', 1), ('clientservice', 1), ('email', 1), ('last', 1), ('mail', 1), ('staff', 1), ('week', 1)]]


In [13]:
# Creating the object for LDA model using gensim library
LDA = gensim.models.ldamodel.LdaModel

# Build LDA model with 20 topics
lda_model = LDA(corpus=doc_term_matrix, id2word=dictionary, num_topics=20, random_state=100,
                chunksize=1000, passes=50,iterations=100)

In [14]:
# Dump and load model
from gensim.models import LdaModel
# Assume 'lda_model' is your trained LDA model
saved_lda = "lda_model_saved"
lda_model.save(saved_lda)

# Load the saved LDA model
lda_model_load = LdaModel.load(saved_lda)

In [15]:
# Display model Topics
lda_model.print_topics()

[(0,
  '0.202*"service" + 0.131*"customer" + 0.119*"poor" + 0.033*"response" + 0.031*"time" + 0.021*"satisfied" + 0.014*"treatment" + 0.013*"certificate" + 0.013*"issue" + 0.013*"office"'),
 (1,
  '0.232*"liquidation" + 0.064*"much" + 0.052*"deposit" + 0.036*"staff" + 0.036*"time" + 0.020*"long" + 0.019*"important" + 0.018*"hard" + 0.016*"partial" + 0.014*"website"'),
 (2,
  '0.061*"interest" + 0.054*"investment" + 0.046*"communication" + 0.038*"need" + 0.034*"platform" + 0.029*"person" + 0.028*"rate" + 0.026*"information" + 0.025*"client" + 0.024*"dissatisfied"'),
 (3,
  '0.147*"money" + 0.136*"account" + 0.097*"fund" + 0.037*"market" + 0.036*"payment" + 0.033*"investment" + 0.033*"day" + 0.031*"redemption" + 0.030*"today" + 0.024*"week"'),
 (4,
  '0.102*"payment" + 0.055*"claim" + 0.031*"wife" + 0.025*"terrible" + 0.021*"fact" + 0.020*"experience" + 0.019*"work" + 0.019*"happy" + 0.018*"week" + 0.016*"first"'),
 (5,
  '0.081*"delay" + 0.050*"time" + 0.042*"case" + 0.038*"response" + 

In [16]:
# Visualize the topics
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model, doc_term_matrix, dictionary)
#vis
pyLDAvis.display(vis)

In [17]:
# Metrics for the lda_model
print('\nPerplexity: ', lda_model.log_perplexity(doc_term_matrix,total_docs=10000))  

# Compute Coherence Score
from gensim.models.coherencemodel import CoherenceModel
coherence_model_lda = CoherenceModel(model=lda_model, texts=tokenized_reviews, dictionary=dictionary , coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Perplexity:  -7.980869797422682

Coherence Score:  0.37929016511075864


Coherence and Perplexity has improved from my previous models

### Padded SVM Model 

In [18]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split

In [19]:
# Get topic probabilities for each document in the dataset
topic_probs = []
for doc_bow in doc_term_matrix:
    doc_topics = lda_model.get_document_topics(doc_bow)
    probs = [prob for _, prob in doc_topics]
    topic_probs.append(probs)

In [20]:
# Length of topic_probs is num of docs in text
topic_probs[20]

[0.16009265, 0.62096995, 0.14810167]

In [21]:
print(len(topic_probs))
print(len(df_review.Classification))

1828
1828


In [22]:
# Convert the topic probabilities and labels to numpy arrays
X = pd.DataFrame(topic_probs).values
X[np.isnan(X)] = 0  # Replace NaN values with zero
y = df_review['Classification'].values

In [23]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the SVM classifier
svm = SVC(kernel='linear', C=1.0)
svm.fit(X_train, y_train)

# Make predictions on the test set
y_pred = svm.predict(X_test)

In [24]:
# Evaluate the SVM classifier
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

# Print the evaluation results
print('Accuracy:', accuracy)
print('Classification Report:')
print(report)

Accuracy: 0.2568306010928962
Classification Report:
                                       precision    recall  f1-score   support

                App/Web/Portal Issues       0.00      0.00      0.00         3
                       Approval Delay       0.25      0.96      0.40        91
                    Claims Adjustment       0.00      0.00      0.00         2
                  Delayed Certificate       0.00      0.00      0.00         2
               Delayed Claims Payment       0.00      0.00      0.00         3
                Delayed Enrollee Card       0.00      0.00      0.00         1
                Delayed Inflow Update       0.00      0.00      0.00        22
                  Delayed Liquidation       0.00      0.00      0.00        34
                   Delayed VIS update       0.00      0.00      0.00         1
                  Enrollee Card Delay       0.00      0.00      0.00         2
                  Gym/Wellness Access       0.00      0.00      0.00         2

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


### Accuracy using only the LDA generated topics is low, so I want to add the MOT as a feature to observe the effect on accuracy

In [25]:
from sklearn.preprocessing import LabelEncoder

# Convert the topic probabilities to a DataFrame with string column names
X_lda = pd.DataFrame(topic_probs).values
X_lda[np.isnan(X_lda)] = 0  # Replace NaN values with zero

# Encode the categorical feature 'category' using label encoding
label_encoder = LabelEncoder()
df_review['Type_encoded'] = label_encoder.fit_transform(df_review['Type'])
# Convert the encoded categorical feature to a DataFrame
X_category_encoded = pd.DataFrame(df_review['Type_encoded'], columns=['Type_encoded'])

# Combine LDA features and the encoded categorical feature
X = pd.concat([pd.DataFrame(X_lda), X_category_encoded], axis=1)
# Convert column names to strings
X.columns = X.columns.astype(str)

# Set the target variable
y = df_review['Classification'].values

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the SVM classifier
svm = SVC(kernel='linear', C=1.0)
svm.fit(X_train, y_train)

# Make predictions on the test set
y_pred = svm.predict(X_test)

# Evaluate the SVM classifier
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

# Print the evaluation results
print('Accuracy:', accuracy)
print('Classification Report:')
print(report)

Accuracy: 0.28415300546448086
Classification Report:
                                       precision    recall  f1-score   support

                App/Web/Portal Issues       0.00      0.00      0.00         3
                       Approval Delay       0.31      0.93      0.46        91
                    Claims Adjustment       0.00      0.00      0.00         2
                  Delayed Certificate       0.00      0.00      0.00         2
               Delayed Claims Payment       0.00      0.00      0.00         3
                Delayed Enrollee Card       0.00      0.00      0.00         1
                Delayed Inflow Update       0.00      0.00      0.00        22
                  Delayed Liquidation       0.00      0.00      0.00        34
                   Delayed VIS update       0.00      0.00      0.00         1
                  Enrollee Card Delay       0.00      0.00      0.00         2
                  Gym/Wellness Access       0.00      0.00      0.00         

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


###### Adding MOT improved it a bit

So After converting the column names to strings, the code has worked and the accuracy has increased a bit.

In [27]:
# save the svm model
import joblib
model_filename = "svm_model.pkl"
joblib.dump(svm, model_filename)

['svm_model.pkl']

In [28]:
# LOAD SAVED MODELS

# Load the SVM model
svm_model_loaded = joblib.load('svm_model.pkl')

In [29]:
# Using this to test code below
test_sample = df_review[['Review', 'Type']][:400]

In [30]:
# Data used to check accuracy against predicted labels
correct_preds = df_review[['Review', 'Classification']][:400]

In [31]:
# Function to preprocess the new reviews and get LDA topic probabilities
def preprocess_new_reviews(new_reviews):
    # Make use of clean_text, remove_stopwords
    new_reviews.Review.apply(clean_text, preserved_words=preserved_words)
    new_reviews.Review.apply(remove_stopwords)
    listed = new_reviews['Review'].tolist()
    tokenized_new_reviews = lemmatization(listed)
    dictionary = corpora.Dictionary(tokenized_new_reviews)

    # Create a bag-of-words corpus for the new_reviews
    corpus = [dictionary.doc2bow(doc) for doc in tokenized_new_reviews]

    # Get LDA topic probabilities for the new_reviews
    topic_probs = []
    for doc_bow in corpus:
        doc_topics = lda_model.get_document_topics(doc_bow)
        probs = [prob for _, prob in doc_topics]
        topic_probs.append(probs)
    
    X_data = pd.DataFrame(topic_probs).values
    X_data[np.isnan(X_data)] = 0  # Replace NaN values with zero

    return X_data

In [32]:
# Function to classify new reviews using SVM model
def classify_new_reviews(new_reviews_lda_probs, new_reviews_type):
    # Combine LDA features (new_reviews_lda_probs) with encoded 'Type' feature
    X_combined_features = pd.concat([pd.DataFrame(new_reviews_lda_probs), new_reviews_type], axis=1)
    # Convert column names to strings
    X_combined_features.columns = X_combined_features.columns.astype(str)
    # Replace NaN values with zero
    X_combined_features.fillna(0, inplace=True)

    # Make predictions using the SVM model
    predictions = svm.predict(X_combined_features)
    
    return predictions

In [33]:
new_reviews = test_sample

# Preprocess the new reviews and get LDA topic probabilities
lda_probs_new_reviews = preprocess_new_reviews(new_reviews)
print("Shape of x_data:", new_reviews.shape)
lda_probs_new_reviews[np.isnan(lda_probs_new_reviews)] = 0  # Replace NaN values with zero

# Sample new 'Type' values for the new reviews (replace this with actual data)
new_reviews_type = new_reviews.Type

# Encode the 'Type' feature using LabelEncoder
encoded_new_reviews_type = label_encoder.transform(new_reviews_type)
new_reviews_type_df = pd.DataFrame(encoded_new_reviews_type, columns=['Type_encoded'])

# Classify the new reviews using SVM model
predicted_labels = classify_new_reviews(lda_probs_new_reviews, new_reviews_type_df)

# Print the results
results_df = pd.DataFrame({
    'Review': new_reviews['Review'],
    'Type': new_reviews['Type'],
    'Predicted Label': predicted_labels
})
print(results_df)

# Read to excel file
file_path = 'output.xlsx'
results_df.to_excel(file_path, index=False)

Shape of x_data: (400, 2)
                                                Review                 Type  \
0    withdrawal process getting slower day someone ...   Investment I Claim   
1                                           team isn’t   Investment I Renew   
2    system failure frequent wait hours portal down...       Health I Claim   
3    staff isobel rude attitude needs trained custo...       Health I Claim   
4    settlement customers claim riddled forth back ...        Motor I Claim   
..                                                 ...                  ...   
395                                      response slow       Health I Claim   
396                            response team slow poor       Health I Claim   
397              redemption took long used much faster   Investment I Claim   
398  recently response timing terrible coupled inve...         Health I Buy   
399  recently liquidated account asking took foreve...   Investment I Claim   

        Predicted Label  

##### Applied Post-processing based corrections to the model to improve accuracy

In [34]:
# Trying to Hard-code some words to some labels ## Rule-based post-processing OR heuristics-based correction
def adjust_predicted_label(review_text, predicted_label):
    # Define keywords and their associated new labels
    keyword_labels = {
        'Technological Issues': {'app', 'software', 'update', 'bug', 'crash', 'error', 'glitch', 'slow', 'lag', 'freeze', 'dashboard', 'application', 'chatbot', 'website'},
        'Claim Adjustment': {'settlement', 'claims', 'claim'},
        'Delayed Liquidation': {'withdrawal', 'redemption', 'account', 'liquidation', 'investment'},
        'Policy Cover': {'bill', 'coverage', 'plan', 'HMO'},
        'Approval Delay': {'approval', 'confirmation'},
        'Interest Rate': {'interest', 'rate'},
        'Hospital QoS': {'hospital', 'clinics', 'doctor', 'doctors', 'drugs', 'drug', 'admit', 'dental', 'healthcare', 'care', 'prescription', 'prescribe'},
        'Gym/Wellness Access': {'gym', 'fitness'},
        'Slow Onboarding': {'plan', 'onboard'},
        'Enrollee Card Delay': {'card'},
        'Delayed Inflow Update': {'inflow'}, # New ones are under
        
        # Add more labels and their corresponding sets of keywords as needed
    }

    # Tokenize the review text
    review_tokens = review_text.lower().split() # Converts all to lowercase

    # Check if any keyword is present in the review text
    for label, keywords in keyword_labels.items():
        if any(keyword in review_tokens for keyword in keywords):
            return label

    # If no keyword is found, return the original predicted label
    return predicted_label

In [35]:
# Create an empty list to store the adjusted labels
adjusted_labels = []

review_texts = test_sample.Review

# Iterate through each review text and predicted label
for review_text, predicted_label in zip(review_texts, predicted_labels):
    # Apply the adjust_predicted_label function to get the adjusted label
    adjusted_label = adjust_predicted_label(review_text, predicted_label)
    
    # Append the adjusted label to the list
    adjusted_labels.append(adjusted_label)

# Convert the list of adjusted labels back to a pandas Series
adjusted_labels_series = pd.Series(adjusted_labels)

# Create a DataFrame to store the data
data_adj = pd.DataFrame({
    'Review Text': review_texts,
    'Predicted Labels': predicted_labels,
    'Adjusted Labels': adjusted_labels_series
})

# Save the DataFrame to an Excel file
data_adj.to_excel('adjusted.xlsx', index=False)

In [36]:
# Compare Accuracy of adjusted label against original labels (With Heuristics)
# Combine the predicted labels and the original reviews data
check_adj = pd.DataFrame({
    'Review': df_review['Review'][:400], # I have to use preprocessed, unless data won't merge correctly
    'Actual Labels': df_review['Classification'][:400],
    'Adjusted Labels': adjusted_labels_series,
})

# Calculate the accuracy
accuracy = (check_adj['Actual Labels'] == check_adj['Adjusted Labels']).mean()

print('Accuracy:', accuracy)

# Save the results to an Excel file
check_adj.to_excel('check_adj.xlsx', index=False)

Accuracy: 0.3825


In [37]:
# Check Accuracy of Model without Heuristics
# corrected labels is the dataframe correct_preds
# predicted labels for the new_reviews is the dataframe predicted_labels

# Combine the predicted labels and the original reviews data
check_df = pd.DataFrame({
    'Review': df_review['Review'][:400], # I have to use preprocessed, unless data won't merge correctly
    'Predicted Label': predicted_labels
})

# Merge with the corrected predicted labels to get the actual labels for comparison
check_df = check_df.merge(correct_preds, on='Review', suffixes=('_predicted', '_corrected'))

# Calculate the accuracy
accuracy = (check_df['Predicted Label'] == check_df['Classification']).mean()

print('Accuracy:', accuracy)

# Save the results to an Excel file
check_df.to_excel('check_df.xlsx', index=False)

Accuracy: 0.319047619047619
