<a href="https://colab.research.google.com/github/Narennnnn/narendra_maurya_trustpilot_reviews/blob/main/narendra_maurya_truspilot_reviews_improvised.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [177]:

import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import MultiLabelBinarizer
from imblearn.over_sampling import SMOTENC

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

data = pd.read_csv('/content/[Data] Novo Reviews - Novo\'s Trust Pilot Ratings.csv')
# Cleaning and preprocessing
data = data.loc[:, ~data.columns.str.contains('^Unnamed')]
if 'Issue' in data.columns:
    data = data.drop(columns=['Issue'])

stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    text = re.sub('[^a-zA-Z]', ' ', text)
    text = text.lower()
    tokens = word_tokenize(text)
    tokens = [lemmatizer.lemmatize(token) for token in tokens if token not in stop_words]
    return ' '.join(tokens)

data['processed_text'] = data['Review text'].apply(preprocess_text)



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [178]:
print(data.head())

                   rating                 rating title  \
0  Rated 5 out of 5 stars    Great bank for businesses   
1  Rated 3 out of 5 stars  Buggy when it matters most.   
2  Rated 5 out of 5 stars         Love digital banking   
3  Rated 3 out of 5 stars    Poor support and no wires   
4  Rated 4 out of 5 stars        Really easy to set up   

                                         Review text   Review date  \
0  It was easy to set up, with no hassle like som...  Feb 15, 2024   
1  Unfortunately I’m probably going to figure out...  Feb 19, 2024   
2  Love digital banking I keep now all my busines...   Feb 1, 2024   
3  A decent basic free business bank, but have re...   Feb 8, 2024   
4  Really easy to set up. Works as intended most ...  Jan 10, 2024   

  Date of Experience  rating_procesed  Year of review   Year of experience  \
0  December 01, 2023                5             2024                2023   
1  February 18, 2024                3             2024                2024

In [179]:
intent_categories = {
    'Account -> Lost password or Incorrect Password': ['password', 'login', 'account'],
    'Checks -> Mobile deposits -> Void checks': ['check', 'deposit', 'void'],
    'Debit card -> Declined -> Unauthorized transactions -> fraud': ['unauthorized', 'fraud'],
    'Invoices -> sent -> unpaid': ['invoice', 'unpaid'],
    'Invoices -> sent -> paid': ['invoice', 'paid'],
}


def assign_intent(text):
    intents = []
    for intent, keywords in intent_categories.items():
        if any(keyword in text for keyword in keywords):
            intents.append(intent)
    return intents

data['intents'] = data['processed_text'].apply(assign_intent)


In [180]:
# Initializing MultiLabelBinarizer for multi-label classification targets (y)
# and TF-IDF vectorizer for text feature extraction (X).

mlb = MultiLabelBinarizer()
y = mlb.fit_transform(data['intents'])

tfidf_vectorizer = TfidfVectorizer(max_features=1000, ngram_range=(1, 2))
X = tfidf_vectorizer.fit_transform(data['processed_text'])


In [181]:
# Splitting data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
forest_clf = RandomForestClassifier(random_state=42)

In [182]:

# Training the model
forest_clf.fit(X_train, y_train)

In [183]:
# Predict on the test data
y_pred = forest_clf.predict(X_test)

In [184]:
# Evaluate model performance
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')
print(classification_report(y_test, y_pred, target_names=mlb.classes_, zero_division=1))

Accuracy: 0.95
                                                              precision    recall  f1-score   support

              Account -> Lost password or Incorrect Password       1.00      0.98      0.99       190
                    Checks -> Mobile deposits -> Void checks       1.00      0.92      0.96        91
Debit card -> Declined -> Unauthorized transactions -> fraud       1.00      0.32      0.49        28
                                    Invoices -> sent -> paid       1.00      0.57      0.73        21
                                  Invoices -> sent -> unpaid       1.00      0.69      0.81        16

                                                   micro avg       1.00      0.87      0.93       346
                                                   macro avg       1.00      0.70      0.80       346
                                                weighted avg       1.00      0.87      0.92       346
                                                 samples avg     

In [187]:
user_inputs = [
    "I'm having trouble logging into my account. It says my password is incorrect.",
    "I tried to deposit a check using the mobile app but it didn't work.",
    "There's a dispute over an unpaid invoice I received.",
    "I want to pay an invoice that's pending.",
    "I forgot my account login details.",
    "I attempted to void a check but couldn't complete the process.",
    "I believe there are unauthorized transactions on my debit card.",
    "I need to confirm the status of an unpaid invoice.",
]
print("Predicted intents/categories for each input:")
for user_input in user_inputs:
    processed_input = preprocess_text(user_input)
    # Vectorize user input
    input_vectorized = tfidf_vectorizer.transform([processed_input])
    # Predict using the trained model
    predictions = forest_clf.predict(input_vectorized)
    # Inverse transform predictions to get intent labels
    predicted_labels = mlb.inverse_transform(predictions)

    print(f"Input: {user_input}")
    print("Predicted intents/categories:")
    for label in predicted_labels[0]:
        print("-", label)
    print()

Predicted intents/categories for each input:
Input: I'm having trouble logging into my account. It says my password is incorrect.
Predicted intents/categories:
- Account -> Lost password or Incorrect Password

Input: I tried to deposit a check using the mobile app but it didn't work.
Predicted intents/categories:
- Checks -> Mobile deposits -> Void checks

Input: There's a dispute over an unpaid invoice I received.
Predicted intents/categories:
- Invoices -> sent -> paid
- Invoices -> sent -> unpaid

Input: I want to pay an invoice that's pending.
Predicted intents/categories:
- Invoices -> sent -> paid
- Invoices -> sent -> unpaid

Input: I forgot my account login details.
Predicted intents/categories:
- Account -> Lost password or Incorrect Password

Input: I attempted to void a check but couldn't complete the process.
Predicted intents/categories:
- Checks -> Mobile deposits -> Void checks

Input: I believe there are unauthorized transactions on my debit card.
Predicted intents/cate