In [2]:
import pandas as pd
import nltk
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder

In [3]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\KIIT01\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\KIIT01\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\KIIT01\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\KIIT01\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [4]:
df = pd.read_excel("NLP_Data.xlsx")

In [5]:
df.dropna(subset=["Description of the Grievance"], inplace=True)

In [6]:
def preprocess_text(text):
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    tokens = word_tokenize(text.lower())
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]
    stemmer = PorterStemmer()
    tokens = [stemmer.stem(word) for word in tokens]
    lemmatizer = WordNetLemmatizer()
    tagged_tokens = pos_tag(tokens)
    lemmatized_tokens = []
    for word, tag in tagged_tokens:
        if tag.startswith('N'):  # Noun
            lemma = lemmatizer.lemmatize(word, pos='n')
        elif tag.startswith('V'):  # Verb
            lemma = lemmatizer.lemmatize(word, pos='v')
        else:
            lemma = word
        lemmatized_tokens.append(lemma)
    return " ".join(lemmatized_tokens)

In [7]:
df = df[df["Grievance Category"].notnull()]

In [8]:
df["Description of the Grievance"] = df["Description of the Grievance"].apply(preprocess_text)

In [9]:
df

Unnamed: 0,Description of the Grievance,Grievance Category,Grievance SubCategory
0,concern regard laboratori test bill twice,Billing/Financial Dispute,Provider Claim Issues
1,dassatifact provid,Quality Of Service,Not Satisfied With Provider Services
2,dissatisafact delay care,Access And Availability,Pharmacy
3,dissatisafact dental provid way conduct busi,Quality Of Service,Not Satisfied With Provider Services
4,dissatisfact inform dental benefit coverag mem...,Billing/Financial Dispute,Balance Billing
...,...,...,...
516,member appeal charg show offic visit charg tot...,Quality Of Service,Other
518,member cost member rx osphena mg tab member pa...,Benefit Package,Other
519,mbr mbr dissatisfi say sr horribl commun mbr p...,Quality Of Care,Pcp
520,member request appeal member dissatisfi member...,Quality Of Care,Other


In [10]:
# Split the data into features (X) and labels (y)
X = df["Description of the Grievance"].values
y_category = df["Grievance Category"].values
y_subcategory = df["Grievance SubCategory"].values

In [11]:
# Encode category and subcategory labels
label_encoder = LabelEncoder()
y_category_encoded = label_encoder.fit_transform(y_category)
y_subcategory_encoded = label_encoder.fit_transform(y_subcategory)

In [12]:
# Split the data into training and test sets for both category and subcategory
X_train, X_test, y_category_train, y_category_test, y_subcategory_train, y_subcategory_test = train_test_split(
    X, y_category_encoded, y_subcategory_encoded, test_size=0.1, random_state=123
)

In [15]:
# Vectorize the text data using TF-IDF
tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

In [16]:
# Create and train the category classifier
category_classifier = MultinomialNB()
category_classifier.fit(X_train_tfidf, y_category_train)


In [17]:
# Create and train the subcategory classifier
subcategory_classifier = MultinomialNB()
subcategory_classifier.fit(X_train_tfidf, y_subcategory_train)

In [18]:
y_category_pred = category_classifier.predict(X_test_tfidf)
y_subcategory_pred = subcategory_classifier.predict(X_test_tfidf)


In [19]:
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from sklearn.svm import SVC
rf = RandomForestClassifier(max_depth=100,max_features=100)
rf.fit(X_train_tfidf,y_category_train)
rf_predict = rf.predict(X_test_tfidf)
metrics.accuracy_score(y_category_test,rf_predict)

0.28

In [20]:
category_accuracy = accuracy_score(y_category_test, y_category_pred)
subcategory_accuracy = accuracy_score(y_subcategory_test, y_subcategory_pred)

In [21]:
# Print the accuracies
print(f"Category Accuracy: {category_accuracy:.2f}")
print(f"Subcategory Accuracy: {subcategory_accuracy:.2f}")

# Optionally, you can print a classification report for more detailed metrics
print("\nCategory Classification Report:")
print(classification_report(y_category_test, y_category_pred))

print("\nSubcategory Classification Report:")
print(classification_report(y_subcategory_test, y_subcategory_pred))

Category Accuracy: 0.28
Subcategory Accuracy: 0.14

Category Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.08      0.15        12
           1       0.00      0.00      0.00         5
           2       0.00      0.00      0.00         7
           4       0.00      0.00      0.00         1
           7       0.25      0.27      0.26        11
           8       0.27      0.71      0.39        14

    accuracy                           0.28        50
   macro avg       0.25      0.18      0.13        50
weighted avg       0.37      0.28      0.20        50


Subcategory Classification Report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         2
           1       0.00      0.00      0.00         3
           2       0.00      0.00      0.00         2
           5       0.00      0.00      0.00         3
           7       0.00      0.00      0.00         1
           9

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
