In [17]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

In [2]:
df = pd.read_csv(r"C:\Users\kannapr1\Downloads\customer_feedback_dataset.csv")

In [3]:
df.head()

Unnamed: 0,Review_ID,Customer_ID,Review_Text,Sentiment_Label,Rating,Service_Type
0,1287c703-a09e-40f3-b17c-92bc37b47790,2c946bca-ef75-45a7-9bde-43e7f7ab2979,The website is slow and unresponsive. I tried ...,Negative,1,Claim
1,8f879862-6cdf-48b8-8ffc-5d3cd8cf9b66,eee5da15-5e7d-4a21-9159-045b0799f7c3,Extremely dissatisfied with the level of servi...,Negative,1,Support
2,58219807-817d-4abe-ab42-26ed5610888d,9940be2e-0929-4b59-bd28-45aafae8ea26,One of the best experiences I've had with a se...,Positive,5,Claim
3,31bd9516-5a52-4f6a-8d3b-70a2f79903aa,f7e9d7be-34b1-4776-ae4d-845579ecf756,The service was neither great nor bad. I got m...,Neutral,4,Policy Purchase
4,7b34f439-42a6-4fe7-ad4e-373558000b1b,96ddd94a-9402-4b37-bd06-b04976f3a0bb,I had a terrible experience with the claims de...,Negative,1,Support


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 6 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Review_ID        5000 non-null   object
 1   Customer_ID      5000 non-null   object
 2   Review_Text      5000 non-null   object
 3   Sentiment_Label  5000 non-null   object
 4   Rating           5000 non-null   int64 
 5   Service_Type     5000 non-null   object
dtypes: int64(1), object(5)
memory usage: 234.5+ KB


In [9]:
df['Sentiment_Label'].unique()

array(['Negative', 'Positive', 'Neutral'], dtype=object)

In [11]:
df['Rating'].unique()

array([1, 5, 4, 2, 3], dtype=int64)

In [21]:
df['Service_Type'].unique()

array([0, 2, 1])

In [19]:
le = LabelEncoder()
df['Service_Type'] = le.fit_transform(df['Service_Type'])

In [23]:
df['Review_Text']

0       The website is slow and unresponsive. I tried ...
1       Extremely dissatisfied with the level of servi...
2       One of the best experiences I've had with a se...
3       The service was neither great nor bad. I got m...
4       I had a terrible experience with the claims de...
                              ...                        
4995    The service met my expectations, nothing more,...
4996    Had an average experience with the claims proc...
4997    One of the best experiences I've had with a se...
4998    Excellent service! The team was very professio...
4999    The service met my expectations, nothing more,...
Name: Review_Text, Length: 5000, dtype: object

In [25]:
df.isnull().sum()

Review_ID          0
Customer_ID        0
Review_Text        0
Sentiment_Label    0
Rating             0
Service_Type       0
dtype: int64

In [27]:
print("Text Cleaning & Tokenization")
print("Remove stopwords, punctuation, and special characters.")

Text Cleaning & Tokenization
Remove stopwords, punctuation, and special characters.


In [29]:
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [30]:
nltk.download("punkt")
nltk.download("stopwords")
nltk.download("wordnet")

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\kannapr1\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\kannapr1\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\kannapr1\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [31]:
stop_words = set(stopwords.words("english"))
lemmatizer = WordNetLemmatizer()

In [35]:
def clean_text(text):
    text = text.lower()
    text = re.sub(r'[^a-z0-9\s]', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

In [37]:
df['step1'] = df['Review_Text'].apply(clean_text)

In [39]:
df['step1']

0       the website is slow and unresponsive i tried t...
1       extremely dissatisfied with the level of servi...
2       one of the best experiences ive had with a ser...
3       the service was neither great nor bad i got my...
4       i had a terrible experience with the claims de...
                              ...                        
4995    the service met my expectations nothing more n...
4996    had an average experience with the claims proc...
4997    one of the best experiences ive had with a ser...
4998    excellent service the team was very profession...
4999    the service met my expectations nothing more n...
Name: step1, Length: 5000, dtype: object

In [41]:
df[['Review_Text', 'step1']]

Unnamed: 0,Review_Text,step1
0,The website is slow and unresponsive. I tried ...,the website is slow and unresponsive i tried t...
1,Extremely dissatisfied with the level of servi...,extremely dissatisfied with the level of servi...
2,One of the best experiences I've had with a se...,one of the best experiences ive had with a ser...
3,The service was neither great nor bad. I got m...,the service was neither great nor bad i got my...
4,I had a terrible experience with the claims de...,i had a terrible experience with the claims de...
...,...,...
4995,"The service met my expectations, nothing more,...",the service met my expectations nothing more n...
4996,Had an average experience with the claims proc...,had an average experience with the claims proc...
4997,One of the best experiences I've had with a se...,one of the best experiences ive had with a ser...
4998,Excellent service! The team was very professio...,excellent service the team was very profession...


In [43]:
def token_and_lemma(text):
    tokens = word_tokenize(text)  # Tokenize
    tokens = [lemmatizer.lemmatize(token) for token in tokens if token not in stop_words]  # Lemmatize & remove stopwords
    return tokens

In [45]:
df['step2'] = df['step1'].apply(token_and_lemma)

In [47]:
df[['Review_Text', 'step1', 'step2']]

Unnamed: 0,Review_Text,step1,step2
0,The website is slow and unresponsive. I tried ...,the website is slow and unresponsive i tried t...,"[website, slow, unresponsive, tried, purchase,..."
1,Extremely dissatisfied with the level of servi...,extremely dissatisfied with the level of servi...,"[extremely, dissatisfied, level, service, prov..."
2,One of the best experiences I've had with a se...,one of the best experiences ive had with a ser...,"[one, best, experience, ive, service, provider..."
3,The service was neither great nor bad. I got m...,the service was neither great nor bad i got my...,"[service, neither, great, bad, got, policy, ti..."
4,I had a terrible experience with the claims de...,i had a terrible experience with the claims de...,"[terrible, experience, claim, department, kept..."
...,...,...,...
4995,"The service met my expectations, nothing more,...",the service met my expectations nothing more n...,"[service, met, expectation, nothing, nothing, ..."
4996,Had an average experience with the claims proc...,had an average experience with the claims proc...,"[average, experience, claim, process, wasnt, d..."
4997,One of the best experiences I've had with a se...,one of the best experiences ive had with a ser...,"[one, best, experience, ive, service, provider..."
4998,Excellent service! The team was very professio...,excellent service the team was very profession...,"[excellent, service, team, professional, helpf..."


In [49]:
def sentiment_Label(Rating):
    if Rating >=4:
        return 'Positive'
    elif Rating == 3:
        return 'Neutral'
    else:
        return "Negative"
    

In [51]:
df['Sentiment_Label'] = df['Rating'].apply(sentiment_Label)

In [53]:
df

Unnamed: 0,Review_ID,Customer_ID,Review_Text,Sentiment_Label,Rating,Service_Type,step1,step2
0,1287c703-a09e-40f3-b17c-92bc37b47790,2c946bca-ef75-45a7-9bde-43e7f7ab2979,The website is slow and unresponsive. I tried ...,Negative,1,0,the website is slow and unresponsive i tried t...,"[website, slow, unresponsive, tried, purchase,..."
1,8f879862-6cdf-48b8-8ffc-5d3cd8cf9b66,eee5da15-5e7d-4a21-9159-045b0799f7c3,Extremely dissatisfied with the level of servi...,Negative,1,2,extremely dissatisfied with the level of servi...,"[extremely, dissatisfied, level, service, prov..."
2,58219807-817d-4abe-ab42-26ed5610888d,9940be2e-0929-4b59-bd28-45aafae8ea26,One of the best experiences I've had with a se...,Positive,5,0,one of the best experiences ive had with a ser...,"[one, best, experience, ive, service, provider..."
3,31bd9516-5a52-4f6a-8d3b-70a2f79903aa,f7e9d7be-34b1-4776-ae4d-845579ecf756,The service was neither great nor bad. I got m...,Positive,4,1,the service was neither great nor bad i got my...,"[service, neither, great, bad, got, policy, ti..."
4,7b34f439-42a6-4fe7-ad4e-373558000b1b,96ddd94a-9402-4b37-bd06-b04976f3a0bb,I had a terrible experience with the claims de...,Negative,1,2,i had a terrible experience with the claims de...,"[terrible, experience, claim, department, kept..."
...,...,...,...,...,...,...,...,...
4995,4008f56f-b790-4df6-8488-3d6d35d9b8d9,fd9858f1-c01c-4e2c-b8c9-13eb816f4bfc,"The service met my expectations, nothing more,...",Positive,4,2,the service met my expectations nothing more n...,"[service, met, expectation, nothing, nothing, ..."
4996,b3096e1a-f36d-4a00-8e77-c4972af791b2,6d6196fb-e167-4b53-b58e-761e82ccf72f,Had an average experience with the claims proc...,Negative,2,2,had an average experience with the claims proc...,"[average, experience, claim, process, wasnt, d..."
4997,7d5c898b-0765-4eb3-af7d-5e3927972ad0,a44cdb6e-4edf-4eae-a71b-13c2a3782795,One of the best experiences I've had with a se...,Positive,4,2,one of the best experiences ive had with a ser...,"[one, best, experience, ive, service, provider..."
4998,ed4260fa-9510-455e-b6f3-5775f86bfc20,547f0c00-1d72-41fe-933a-7f3b1b40bcb8,Excellent service! The team was very professio...,Positive,4,1,excellent service the team was very profession...,"[excellent, service, team, professional, helpf..."


In [55]:
print ("Use Word2Vec to convert text into numerical vectors")
print ("Will use word2Vector")

Use Word2Vec to convert text into numerical vectors
Will use word2Vector


In [57]:
from numpy import triu
import gensim


In [58]:
import gensim
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
import nltk

In [61]:
tokenized_corpus = [word_tokenize(doc.lower()) for doc in df['Review_Text']]

In [63]:
tokenized_corpus

[['the',
  'website',
  'is',
  'slow',
  'and',
  'unresponsive',
  '.',
  'i',
  'tried',
  'to',
  'purchase',
  'a',
  'policy',
  ',',
  'but',
  'the',
  'payment',
  'gateway',
  'kept',
  'failing',
  '.',
  'had',
  'to',
  'contact',
  'support',
  ',',
  'and',
  'even',
  'that',
  'was',
  'not',
  'very',
  'helpful',
  '.'],
 ['extremely',
  'dissatisfied',
  'with',
  'the',
  'level',
  'of',
  'service',
  'provided',
  '.',
  'i',
  'have',
  'been',
  'a',
  'loyal',
  'customer',
  'for',
  'years',
  ',',
  'but',
  'my',
  'recent',
  'experience',
  'was',
  'very',
  'disappointing',
  '.',
  'i',
  'expected',
  'better',
  'treatment',
  'and',
  'quicker',
  'responses',
  '.'],
 ['one',
  'of',
  'the',
  'best',
  'experiences',
  'i',
  "'ve",
  'had',
  'with',
  'a',
  'service',
  'provider',
  '.',
  'from',
  'policy',
  'purchase',
  'to',
  'claim',
  'settlement',
  ',',
  'everything',
  'was',
  'smooth',
  'and',
  'hassle-free',
  '.',
  'woul

In [65]:
word2vec_model = Word2Vec(sentences = tokenized_corpus, vector_size = 100, window=5, min_count=1, workers=4)
word2vec_model

<gensim.models.word2vec.Word2Vec at 0x2050fc73bf0>

In [67]:
wordvector = word2vec_model.wv["service"]
print("Vector for service: ", wordvector[:10])

Vector for service:  [ 0.25564018  0.05064033 -0.6444033  -0.54851437 -0.82526904 -0.64856803
 -0.03634242 -0.04280666 -0.51930887 -0.28824455]


In [69]:
def get_embedding(sentence, model):
    words = word_tokenize(sentence.lower())
    vectors = [model.wv[word] for word in words if word in model.wv]
    return sum(vectors) / len(vectors) if vectors else [0] * 100 

In [71]:
word2vec_embeddings = [get_embedding(text, word2vec_model) for text in df['Review_Text']]

In [77]:
word2vec_embeddings

[array([ 8.43204483e-02,  2.13390261e-01,  5.24630308e-01, -3.61380041e-01,
        -1.74967032e-02, -5.48303664e-01, -1.05903029e-01,  7.44951606e-01,
         4.84712988e-01, -5.66432737e-02, -7.66577944e-02, -2.66219079e-01,
        -8.39420930e-02,  5.22255599e-01,  4.78924625e-02,  1.03556961e-01,
        -1.45529762e-01,  1.06202643e-02,  3.29633392e-02, -3.13248247e-01,
         3.31617177e-01, -1.10618822e-01, -2.15683952e-01, -1.60914704e-01,
        -3.78669471e-01,  5.49762666e-01, -1.44779667e-01,  2.60480165e-01,
        -1.69071048e-01,  2.07620233e-01,  3.84984195e-01, -1.84583500e-01,
        -2.95781910e-01, -2.56449193e-01, -3.62851731e-02, -1.08393719e-02,
         2.68659413e-01, -3.80471796e-01,  1.62673488e-01, -3.52729917e-01,
        -3.34206760e-01,  4.42666560e-02,  1.02500260e-01, -3.24853331e-01,
         1.95077751e-02, -1.78579807e-01, -9.86076072e-02,  1.92842916e-01,
         4.72949862e-01,  3.12318563e-01, -3.35404724e-01,  5.04580624e-02,
         3.5

In [83]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 9 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   Review_ID         5000 non-null   object
 1   Customer_ID       5000 non-null   object
 2   Review_Text       5000 non-null   object
 3   Sentiment_Label   5000 non-null   object
 4   Rating            5000 non-null   int64 
 5   Service_Type      5000 non-null   int32 
 6   step1             5000 non-null   object
 7   step2             5000 non-null   object
 8   VectorizedReview  5000 non-null   object
dtypes: int32(1), int64(1), object(7)
memory usage: 332.2+ KB


In [81]:
df['VectorizedReview'] = word2vec_embeddings

In [85]:
df = df.drop(columns=['Review_ID', 'Customer_ID'])

In [87]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 7 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   Review_Text       5000 non-null   object
 1   Sentiment_Label   5000 non-null   object
 2   Rating            5000 non-null   int64 
 3   Service_Type      5000 non-null   int32 
 4   step1             5000 non-null   object
 5   step2             5000 non-null   object
 6   VectorizedReview  5000 non-null   object
dtypes: int32(1), int64(1), object(5)
memory usage: 254.0+ KB


In [89]:
pip install imbalanced-learn

Note: you may need to restart the kernel to use updated packages.


In [90]:
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from collections import Counter

In [93]:
df.head()

Unnamed: 0,Review_Text,Sentiment_Label,Rating,Service_Type,step1,step2,VectorizedReview
0,The website is slow and unresponsive. I tried ...,Negative,1,0,the website is slow and unresponsive i tried t...,"[website, slow, unresponsive, tried, purchase,...","[0.08432045, 0.21339026, 0.5246303, -0.3613800..."
1,Extremely dissatisfied with the level of servi...,Negative,1,2,extremely dissatisfied with the level of servi...,"[extremely, dissatisfied, level, service, prov...","[-0.016033683, -0.048339814, 0.12488672, 0.282..."
2,One of the best experiences I've had with a se...,Positive,5,0,one of the best experiences ive had with a ser...,"[one, best, experience, ive, service, provider...","[0.0577974, 0.17601965, -0.14766783, -0.312233..."
3,The service was neither great nor bad. I got m...,Positive,4,1,the service was neither great nor bad i got my...,"[service, neither, great, bad, got, policy, ti...","[-0.24666514, -0.30535468, 0.34272832, 0.20219..."
4,I had a terrible experience with the claims de...,Negative,1,2,i had a terrible experience with the claims de...,"[terrible, experience, claim, department, kept...","[0.1829823, 0.17531078, 0.75122577, -0.4905853..."


In [95]:
df[df['Sentiment_Label']== 'Positive']

Unnamed: 0,Review_Text,Sentiment_Label,Rating,Service_Type,step1,step2,VectorizedReview
2,One of the best experiences I've had with a se...,Positive,5,0,one of the best experiences ive had with a ser...,"[one, best, experience, ive, service, provider...","[0.0577974, 0.17601965, -0.14766783, -0.312233..."
3,The service was neither great nor bad. I got m...,Positive,4,1,the service was neither great nor bad i got my...,"[service, neither, great, bad, got, policy, ti...","[-0.24666514, -0.30535468, 0.34272832, 0.20219..."
10,One of the best experiences I've had with a se...,Positive,5,1,one of the best experiences ive had with a ser...,"[one, best, experience, ive, service, provider...","[0.0577974, 0.17601965, -0.14766783, -0.312233..."
11,"I purchased a policy online, and the entire pr...",Positive,4,2,i purchased a policy online and the entire pro...,"[purchased, policy, online, entire, process, s...","[-0.10808939, 0.23654623, 0.030341847, -0.1309..."
13,The claims process was smooth and efficient. I...,Positive,4,1,the claims process was smooth and efficient i ...,"[claim, process, smooth, efficient, pleasantly...","[0.26609713, -0.24362099, -0.171367, 0.1747370..."
...,...,...,...,...,...,...,...
4993,"I purchased a policy online, and the entire pr...",Positive,5,0,i purchased a policy online and the entire pro...,"[purchased, policy, online, entire, process, s...","[-0.10808939, 0.23654623, 0.030341847, -0.1309..."
4995,"The service met my expectations, nothing more,...",Positive,4,2,the service met my expectations nothing more n...,"[service, met, expectation, nothing, nothing, ...","[-0.25694117, 0.28128642, 0.5980729, 0.4273995..."
4997,One of the best experiences I've had with a se...,Positive,4,2,one of the best experiences ive had with a ser...,"[one, best, experience, ive, service, provider...","[0.0577974, 0.17601965, -0.14766783, -0.312233..."
4998,Excellent service! The team was very professio...,Positive,4,1,excellent service the team was very profession...,"[excellent, service, team, professional, helpf...","[-0.036330473, -0.109224394, -0.15881294, -0.0..."


In [101]:
Features = df[['Service_Type', 'VectorizedReview']]
Features.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 2 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   Service_Type      5000 non-null   int32 
 1   VectorizedReview  5000 non-null   object
dtypes: int32(1), object(1)
memory usage: 58.7+ KB


In [103]:
Target = df[['Rating']]

In [105]:
X_train, X_test, Y_train, Y_test = train_test_split(Features, Target, test_size=0.3, random_state=42, stratify=Target)

In [107]:
X_train.head(), X_test.head(), Y_train.head(), Y_test.head() 

(      Service_Type                                   VectorizedReview
 4745             1  [-0.026443155, -0.1641816, 0.26976407, -0.1377...
 1725             1  [-0.026443155, -0.1641816, 0.26976407, -0.1377...
 2070             0  [-0.19099526, 0.05414225, 0.058770563, -0.6924...
 4273             1  [0.08432045, 0.21339026, 0.5246303, -0.3613800...
 2124             0  [-0.49776027, 0.36703467, 0.48318094, 0.265985...,
       Service_Type                                   VectorizedReview
 2498             0  [-0.036330473, -0.109224394, -0.15881294, -0.0...
 4424             2  [0.0577974, 0.17601965, -0.14766783, -0.312233...
 814              1  [0.029032897, 0.27390227, -0.37853038, -0.4087...
 2256             0  [-0.15925379, -0.024809929, 0.3764363, -0.3142...
 53               0  [-0.016033683, -0.048339814, 0.12488672, 0.282...,
       Rating
 4745       2
 1725       1
 2070       1
 4273       2
 2124       2,
       Rating
 2498       5
 4424       4
 814        5
 2256

In [109]:
print("class distribution before smote")
Counter(Y_train)

class distribution before smote


Counter({'Rating': 1})

In [111]:
smote = SMOTE(random_state=42)

In [113]:
smote

In [119]:
print("class distribution after smote")
Counter(Y_train)

class distribution after smote


Counter({'Rating': 1})

In [117]:
X_train.info(), Y_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3500 entries, 4745 to 3968
Data columns (total 2 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   Service_Type      3500 non-null   int32 
 1   VectorizedReview  3500 non-null   object
dtypes: int32(1), object(1)
memory usage: 68.4+ KB
<class 'pandas.core.frame.DataFrame'>
Index: 3500 entries, 4745 to 3968
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   Rating  3500 non-null   int64
dtypes: int64(1)
memory usage: 54.7 KB


(None, None)