In [None]:
!pip install gensim

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
nltk.download('stopwords')
nltk.download('punkt_tab')
nltk.download('punkt')
nltk.download('brown')
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk import ngrams
from sklearn.feature_extraction.text import CountVectorizer
from wordcloud import WordCloud
from collections import Counter
import string
string.punctuation
import re
from textblob import TextBlob
from gensim.models import Word2Vec

In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [None]:
df = pd.read_csv('/content/drive/MyDrive/bank_reviews3.csv', encoding='ISO-8859-1')
df.head()

### DATA PREPROCESSING

In [None]:
df.shape

In [None]:
df.dtypes

In [None]:
df = df.drop(columns=['author','bank_image','review_title_by_user','rating_title_by_user'])

In [None]:
df['date'] = pd.to_datetime(df['date'], errors='coerce')

In [None]:
df.dtypes

In [None]:
df.duplicated().sum()

In [None]:
df.drop_duplicates(inplace=True)
df.duplicated().sum()

In [None]:
df.shape

In [None]:
df["review"].duplicated().sum()

In [None]:
df.columns = ['Date','Address','Bank','Rating', 'Review','Useful_Count']
df.head()

In [None]:
def rating_to_sentiment(r):
    """Map a star rating to a numeric polarity code.

    Returns 1 for ratings of 4 or more, 0 for a rating of exactly 3, and -1
    for every other value (anything that fails both comparisons, NaN included).
    """
    if r >= 4:
        return 1
    return 0 if r == 3 else -1

df['Polarity'] = df['Rating'].apply(rating_to_sentiment)

In [None]:
value_counts=df.Polarity.value_counts()
value_counts

In [None]:
value_counts.plot(kind="bar", colormap='viridis')
plt.xlabel("Polarity")
plt.ylabel("Frequency")
plt.title("Polarity Distribution")
plt.show()

In [None]:
df.isnull().sum()

In [None]:
df.reset_index(drop=True, inplace=True)
df.info()

df_og = df.copy()

### DATA CLEANING

In [None]:
def lower(text):
    """Lower-case every whitespace-separated word of ``text``.

    The text is split on whitespace and re-joined with single spaces, so
    leading/trailing whitespace is dropped and internal whitespace runs
    collapse to one space as a side effect.
    """
    return ' '.join(map(str.lower, text.split()))

df['Review']= df['Review'].apply(lambda x:lower(x))

print("Converted all the reviews to lower case ")
df.head()

In [None]:
# Running total of URL-like tokens stripped across all reviews.
link_removed = 0

def hyperlinks(text):
    """Delete every token starting with ``http`` or ``www`` from ``text``.

    Each match runs up to the next whitespace character. The number of
    removed tokens is added to the module-level ``link_removed`` counter.
    Returns the text with the matches replaced by empty strings (surrounding
    spaces are left in place).
    """
    global link_removed
    cleaned, n_links = re.subn(r'http\S+|www\S+', '', text)
    link_removed += n_links
    return cleaned

df['Review']= df['Review'].apply(lambda x:hyperlinks(x))
print(f"successfully removed {link_removed} hyperlinks")

In [None]:
# Running total of whitespace runs (2+ characters) collapsed across all reviews.
large_spaces = 0
def remove_large_spaces(text):
    """Collapse each run of two or more whitespace characters to one space.

    The number of collapsed runs is added to the module-level
    ``large_spaces`` counter; the result is stripped of leading and trailing
    whitespace before being returned.
    """
    global large_spaces
    collapsed, n_runs = re.subn(r'\s{2,}', ' ', text)
    large_spaces += n_runs
    return collapsed.strip()
df['Review']= df['Review'].apply(lambda x:remove_large_spaces(x))
print(f"successfully removed {large_spaces} large spaces")

In [None]:
stopword = set(nltk.corpus.stopwords.words('english'))
print("total stopwords : ", len(stopword))
print(stopword)

In [None]:
# Running total of stopword tokens dropped across all reviews.
stopword_count = 0

def remove_stopwords(text):
    """Drop every whitespace-separated token found in the ``stopword`` set.

    Tokens are compared exactly as written (no punctuation stripping), the
    number of dropped tokens is added to the module-level ``stopword_count``
    counter, and the surviving tokens are re-joined with single spaces.
    """
    global stopword_count
    kept = []
    for token in text.split():
        if token in stopword:
            stopword_count += 1
        else:
            kept.append(token)
    return ' '.join(kept)

df['Review'] = df['Review'].astype(str).apply(remove_stopwords)

print(f"Successfully removed {stopword_count} stopwords")

In [None]:
# Running total of punctuation characters removed across all reviews.
punctuation_count = 0

def remove_punctuation(text):
    """Remove every character listed in ``string.punctuation`` from ``text``.

    The number of removed characters is added to the module-level
    ``punctuation_count`` counter. Only ASCII punctuation is covered, since
    that is all ``string.punctuation`` contains.
    """
    global punctuation_count
    kept = [ch for ch in text if ch not in string.punctuation]
    punctuation_count += len(text) - len(kept)
    return ''.join(kept)


df['Review'] = df['Review'].astype(str).apply(remove_punctuation)
print(f"Successfully removed {punctuation_count} punctuation")

In [None]:
import re
# Running total of individual digits removed across all reviews.
number_count = 0

def remove_numbers(text):
    """Delete every run of digits from ``text``.

    The module-level ``number_count`` counter grows by the number of digit
    characters removed (not by the number of runs). Surrounding whitespace
    is left untouched.
    """
    global number_count
    digit_runs = re.compile(r'\d+')
    number_count += sum(map(len, digit_runs.findall(text)))
    return digit_runs.sub('', text)

df['Review'] = df['Review'].astype(str).apply(remove_numbers)

print(f"Successfully removed numbers. Total digits removed: {number_count}")

In [None]:
def remove_html(text):
    """Strip anything that looks like an HTML tag (``<...>``) from ``text``.

    The match is non-greedy and does not cross newlines, so an unmatched
    ``<`` or a tag split over several lines is left in place.
    """
    return re.sub(r'<.*?>', '', text)
df['Review']= df['Review'].apply(lambda x:remove_html(x))
print("successfully removed html tags")

In [None]:
# Running totals of date / time patterns removed across all reviews.
date_count = 0
time_count = 0

def remove_date_time(text):
    """Remove numeric dates (M/D/YY, MM/DD/YYYY) and clock times from ``text``.

    Times are ``H:MM`` / ``HH:MM`` with an optional AM/PM suffix written in
    any letter case. The suffix must be matched case-insensitively because
    the reviews are lower-cased before this step; with a case-sensitive
    pattern "10:30pm" is never matched at all, since the trailing word
    boundary fails between "30" and "pm".

    The module-level ``date_count`` and ``time_count`` counters are increased
    by the number of matches found in the incoming text. Returns the text
    with the matches deleted and outer whitespace stripped.
    """
    global date_count, time_count

    # Match MM/DD/YYYY or MM/DD/YY
    date_pattern = re.compile(r"\b\d{1,2}/\d{1,2}/\d{2,4}\b")
    # Match HH:MM or HH:MMAM / HH:MMPM (optional AM/PM, any letter case)
    time_pattern = re.compile(r"\b\d{1,2}:\d{2}(?:[AP]M)?\b", re.IGNORECASE)

    # Both counts are taken on the incoming text, before anything is removed.
    date_count += len(date_pattern.findall(text))
    time_count += len(time_pattern.findall(text))

    text_without_date = date_pattern.sub('', text)
    text_without_date_time = time_pattern.sub('', text_without_date)
    return text_without_date_time.strip()

df['Review'] = df['Review'].astype(str).apply(remove_date_time)

print(f"Successfully removed date and time.")
print(f"Total date patterns removed: {date_count}")
print(f"Total time patterns removed: {time_count}")

In [None]:
# Running totals of @mentions / #hashtags removed across all reviews.
mention_count = 0
hashtag_count = 0

def remove_mentions_hashtags(text):
    """Delete ``@word`` mentions and ``#word`` hashtags from ``text``.

    Mentions are removed first, then hashtags are searched in what remains.
    The module-level ``mention_count`` and ``hashtag_count`` counters grow by
    the number of removals, and the result is stripped of outer whitespace.
    """
    global mention_count, hashtag_count

    text, n_mentions = re.subn(r"@\w+", "", text)
    text, n_hashtags = re.subn(r"#\w+", "", text)
    mention_count += n_mentions
    hashtag_count += n_hashtags
    return text.strip()

df['Review'] = df['Review'].astype(str).apply(remove_mentions_hashtags)

print("Successfully removed mentions and hashtags.")
print(f"Mentions removed: {mention_count}")
print(f"Hashtags removed: {hashtag_count}")

In [None]:
# Running total of tokens whose spelling changed under stemming.
stemmed_word_changes = 0

def stem_text(text):
    """Porter-stem every whitespace-separated token of ``text``.

    The module-level ``stemmed_word_changes`` counter grows by the number of
    tokens whose stem differs from the original token. Returns the stems
    joined with single spaces.
    """
    global stemmed_word_changes
    stemmer = PorterStemmer()
    originals = text.split()
    stems = [stemmer.stem(token) for token in originals]
    stemmed_word_changes += sum(
        1 for before, after in zip(originals, stems) if before != after
    )
    return ' '.join(stems)

df['Review'] = df['Review'].astype(str).apply(stem_text)

print("Successfully stemmed the text.")
print(f"Total words changed after stemming: {stemmed_word_changes}")

In [None]:
def tokenize_data(data):
    """Split ``data`` into a list of tokens with NLTK's ``word_tokenize``."""
    return word_tokenize(data)

df['Review']= df['Review'].apply(lambda x:tokenize_data(x))
print("successfully tokenized the text")

In [None]:
df

### EDA

In [None]:
# Basic text statistics
# 'Review' holds token lists after the tokenization step. Casting a list with
# astype(str) yields its repr ("['good', 'servic']"), so character counts would
# include brackets, quotes and commas, and "unique words" would be computed on
# fragments such as "'good',". Work on the tokens themselves instead; plain
# strings are still accepted and split on whitespace.
review_tokens = df['Review'].dropna().apply(
    lambda review: review if isinstance(review, list) else str(review).split()
)
df['comment_length'] = review_tokens.apply(lambda tokens: len(' '.join(tokens)))
df['word_count'] = review_tokens.apply(len)
df['unique_word_count'] = review_tokens.apply(lambda tokens: len(set(tokens)))

# Histograms: one panel per statistic -> (column, bar colour, title, x label)
histogram_panels = [
    ('comment_length', 'skyblue', 'Distribution of Comment Length', 'Length'),
    ('word_count', 'lightgreen', 'Distribution of Word Count', 'Word Count'),
    ('unique_word_count', 'salmon', 'Distribution of Unique Word Count', 'Unique Words'),
]

fig, axes = plt.subplots(1, 3, figsize=(16, 5))
for ax, (column, color, title, xlabel) in zip(axes, histogram_panels):
    sns.histplot(df[column], bins=30, color=color, kde=True, ax=ax)
    ax.set_title(title)
    ax.set_xlabel(xlabel)
    ax.set_ylabel('Frequency')

fig.tight_layout()
plt.show()

In [None]:
# Boxplots: comment stats vs. rating
# One panel per statistic -> (column, seaborn palette, title, y label)
boxplot_panels = [
    ('comment_length', 'Blues', 'Comment Length by Rating', 'Length'),
    ('word_count', 'Greens', 'Word Count by Rating', 'Words'),
    ('unique_word_count', 'Reds', 'Unique Word Count by Rating', 'Unique Words'),
]

plt.figure(figsize=(18, 5))
for position, (column, palette, title, ylabel) in enumerate(boxplot_panels, start=1):
    plt.subplot(1, 3, position)
    sns.boxplot(x=df['Rating'], y=df[column], palette=palette)
    plt.title(title)
    plt.xlabel('Rating')
    plt.ylabel(ylabel)

plt.tight_layout()
plt.show()

In [None]:
# Correlation heatmap
plt.figure(figsize=(6, 4))
corr = df[['comment_length', 'word_count', 'unique_word_count']].corr()
sns.heatmap(corr, annot=True, cmap='coolwarm', fmt=".2f")
plt.title('Correlation Matrix')
plt.show()

In [None]:
# Per-rating average stats
rating_stats = df.groupby('Rating')[['comment_length', 'word_count', 'unique_word_count']].mean().round(1)
print("Average Comment Stats by Rating:")
rating_stats

In [None]:
plt.figure(figsize=(8, 5))
sns.histplot(df['Rating'], bins=10, kde=True, color='blue')
plt.title('Distribution of Ratings')
plt.xlabel('Rating')
plt.ylabel('Frequency')
plt.grid(True)
plt.show()

In [None]:
plt.figure(figsize=(8, 5))
sns.countplot(x='Rating', data=df, palette='pastel')
plt.title('Number of Reviews per Rating')
plt.xlabel('Rating')
plt.ylabel('Count')
plt.grid(axis='y')
plt.show()

In [None]:
df.columns

In [None]:
# Bucket reviews by calendar month. Dates that failed to parse are NaT and are
# dropped by the groupby below.
df['year_month'] = df['Date'].dt.to_period('M')

monthly_sentiment_counts = df.groupby(['year_month', 'Polarity']).size().unstack(fill_value=0)

print(monthly_sentiment_counts.tail())

# Draw on an explicit Axes: calling plt.figure() and then DataFrame.plot()
# without ax= makes pandas open a second figure and leaves the first one
# empty in the notebook output.
fig, ax = plt.subplots(figsize=(12, 6))
monthly_sentiment_counts.plot(kind='line', marker='o', ax=ax)

ax.set_title('Monthly Sentiment Trends in Customer Reviews')
ax.set_xlabel('Month')
ax.set_ylabel('Number of Reviews')
ax.legend(title='Sentiment')
ax.grid(True)
ax.tick_params(axis='x', rotation=45)
# Lay out after rotating the tick labels so they are taken into account.
fig.tight_layout()
plt.show()

In [None]:
city_sentiment_counts = df.groupby(['Address', 'Polarity']).size().unstack(fill_value=0)

print(city_sentiment_counts)

# Stacked bar chart
city_sentiment_counts.plot(kind='bar', stacked=True, figsize=(40, 20), colormap='Set2')
plt.title('City-wise Sentiment Distribution of Reviews')
plt.xlabel('City')
plt.ylabel('Number of Reviews')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

In [None]:
# One plain string per review. 'Review' may hold token lists at this point;
# astype(str) would turn them into list reprs whose quote characters end up
# attached to the words in the word cloud, so the tokens are joined instead.
comments = df['Review'].dropna().apply(
    lambda review: ' '.join(review) if isinstance(review, list) else str(review)
)

text_all = " ".join(comments)

# Word Cloud for Unigrams
# WordCloud expects a collection of stopword strings. Passing the string
# 'english' makes it iterate over the characters, so only the single letters
# e, n, g, l, i, s, h would be treated as stopwords.
english_stopwords = set(stopwords.words('english'))
wordcloud = WordCloud(width=800, height=400, background_color='white', stopwords=english_stopwords).generate(text_all)
plt.figure(figsize=(10, 5))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.title('Word Cloud - Unigrams')
plt.show()

# Bar Chart - Top 20 Unigrams
# max_features=20 keeps the 20 most frequent terms across the corpus.
vectorizer_uni = CountVectorizer(stop_words='english', max_features=20)
X_uni = vectorizer_uni.fit_transform(comments)
words = vectorizer_uni.get_feature_names_out()
counts = X_uni.sum(axis=0).A1
df_uni = pd.DataFrame({'word': words, 'frequency': counts}).sort_values(by='frequency', ascending=False)

plt.figure(figsize=(10, 6))
sns.barplot(data=df_uni, x='frequency', y='word', palette='magma')
plt.title('Top 20 Unigrams')
plt.xlabel('Frequency')
plt.ylabel('Word')
plt.grid(True)
plt.show()

In [None]:
# Word Cloud for Bigrams
# Relies on `comments` (one string per review) built in the unigram cell.
# max_features=100 keeps the 100 most frequent bigrams across the corpus.
vectorizer_bi = CountVectorizer(ngram_range=(2, 2), stop_words='english', max_features=100)
X_bi = vectorizer_bi.fit_transform(comments)
bigrams = vectorizer_bi.get_feature_names_out()
# Column sums of the document-term matrix = corpus-wide count per bigram.
counts_bi = X_bi.sum(axis=0).A1
text_bigrams = dict(zip(bigrams, counts_bi))
# Frequencies are supplied directly, so WordCloud does no tokenizing of its own.
wordcloud_bi = WordCloud(width=800, height=400, background_color='white').generate_from_frequencies(text_bigrams)

plt.figure(figsize=(10, 5))
plt.imshow(wordcloud_bi, interpolation='bilinear')
plt.axis('off')
plt.title('Word Cloud - Bigrams')
plt.show()

# Bar Chart - Top 20 Bigrams
# Sort the 100 retained bigrams by count and keep the 20 largest.
df_bi = pd.DataFrame({'bigram': bigrams, 'frequency': counts_bi}).sort_values(by='frequency', ascending=False).head(20)

plt.figure(figsize=(10, 6))
sns.barplot(data=df_bi, x='frequency', y='bigram', palette='magma')
plt.title('Top 20 Bigrams')
plt.xlabel('Frequency')
plt.ylabel('Bigram')
plt.grid(True)
plt.show()

In [None]:
vectorizer = CountVectorizer(stop_words='english', max_features=20)
X = vectorizer.fit_transform(df['Review'].dropna().astype(str))
word_freq = X.sum(axis=0).A1
words = vectorizer.get_feature_names_out()

freq_df = pd.DataFrame({'word': words, 'frequency': word_freq})
freq_df = freq_df.sort_values(by='frequency', ascending=False)

plt.figure(figsize=(10, 6))
sns.barplot(data=freq_df, x='frequency', y='word', palette='viridis')
plt.title('Top 20 Most Frequent Words in Comments')
plt.xlabel('Frequency')
plt.ylabel('Word')
plt.show()


#### Bag of Words

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

bow_vectorizer = CountVectorizer(stop_words='english', max_features=100)
X_bow = bow_vectorizer.fit_transform(df['Review'].astype(str))

print(f"BoW Shape: {X_bow.shape}")
print("BoW - Top 10 Features:")
print(bow_vectorizer.get_feature_names_out()[:10])

#### TF-IDF vectorization

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Store each review as one space-separated string. 'Review' may hold token
# lists here; astype(str) would persist their repr ("['good', 'servic']") in
# the DataFrame, so the tokens are joined instead. Non-list values are cast
# with str() exactly as before.
df['Review'] = df['Review'].apply(
    lambda review: ' '.join(review) if isinstance(review, list) else str(review)
)

# NOTE: get_feature_names_out() is sorted alphabetically, so the "[:10]"
# slices below show the first ten retained features, not the ten with the
# highest TF-IDF weight.

# TF-IDF Vectorization with unigrams (1-grams)
vectorizer_unigram = TfidfVectorizer(ngram_range=(1, 1), stop_words='english', max_features=100)
X_unigram = vectorizer_unigram.fit_transform(df['Review'])

print("Top 10 Unigram Features:")
print(vectorizer_unigram.get_feature_names_out()[:10])

# TF-IDF Vectorization with bigrams (2-grams)
vectorizer_bigram = TfidfVectorizer(ngram_range=(2, 2), stop_words='english', max_features=100)
X_bigram = vectorizer_bigram.fit_transform(df['Review'])

print("\nTop 10 Bigram Features:")
print(vectorizer_bigram.get_feature_names_out()[:10])

# TF-IDF Vectorization with both unigrams and bigrams (1-2 grams)
vectorizer_1_2gram = TfidfVectorizer(ngram_range=(1, 2), stop_words='english', max_features=100)
X_1_2gram = vectorizer_1_2gram.fit_transform(df['Review'])

print(f"\nTF-IDF Shape: {X_1_2gram.shape}")
print("Top 10 1-2 Gram Features:")
print(vectorizer_1_2gram.get_feature_names_out()[:10])

#### Word2Vec Embeddings with Gensim

In [None]:
# Preprocess and tokenize the reviews
def preprocess_text(text):
    """Turn a review into a list of lower-case, letters-only tokens.

    The value is cast to ``str`` and lower-cased, every character that is
    neither an ASCII letter nor whitespace is deleted, and the remainder is
    tokenized with ``word_tokenize``.
    """
    letters_only = re.sub(r'[^a-zA-Z\s]', '', str(text).lower())
    return word_tokenize(letters_only)

# Apply preprocessing
tokenized_reviews = df['Review'].apply(preprocess_text)

# Train Word2Vec model
w2v_model = Word2Vec(sentences=tokenized_reviews,
                     vector_size=100,
                     window=5,
                     min_count=2,
                     workers=4,
                     sg=1)

word_vec = w2v_model.wv['account']
print("Vector for 'account':", word_vec[:10])

print("\nTop 5 words similar to 'loan':")
w2v_model.wv.most_similar('loan', topn=5)

In [None]:
def get_avg_vector(tokens, model):
    """Average the embedding vectors of the tokens known to ``model``.

    Args:
        tokens: iterable of word strings.
        model: object exposing ``vector_size`` and a ``wv`` mapping that
            supports ``in`` and item lookup (e.g. a trained Word2Vec model).

    Returns:
        A vector of length ``model.vector_size``: the mean of the vectors of
        all in-vocabulary tokens, or all zeros when none is in vocabulary.
    """
    total = np.zeros(model.vector_size)
    hits = 0
    for vector in (model.wv[token] for token in tokens if token in model.wv):
        total += vector
        hits += 1
    if hits == 0:
        return total
    return total / hits


review_vectors = tokenized_reviews.apply(lambda x: get_avg_vector(x, w2v_model))
review_vectors_matrix = np.vstack(review_vectors.values)

print("\nShape of Word2Vec review matrix:", review_vectors_matrix.shape)

#### GloVe embedding

In [None]:
import gensim.downloader as api

In [None]:
glove = api.load("glove-wiki-gigaword-100")

def get_avg_glove_vector(text):
    """Return the mean GloVe vector of the words in ``text``.

    Args:
        text: a review as a string or as a list of tokens (joined with
            spaces). Any other value is cast with ``str`` so a stray
            non-string cell cannot raise ``AttributeError``.

    Returns:
        The element-wise mean of the vectors of all words present in the
        module-level ``glove`` model, or a zero vector of the model's own
        dimensionality when no word is known.
    """
    if isinstance(text, list):
        text = ' '.join(text)
    words = word_tokenize(str(text).lower())
    vectors = [glove[w] for w in words if w in glove]
    if not vectors:
        # Take the size from the loaded model rather than hard-coding 100, so
        # loading a different GloVe variant cannot produce ragged rows.
        return np.zeros(glove.vector_size)
    return np.mean(vectors, axis=0)

embeddings = df['Review'].fillna('').apply(get_avg_glove_vector)
X_glove = np.vstack(embeddings.values)

print(f"GloVe Embedding Shape: {X_glove.shape}")


####  VADER (Valence Aware Dictionary for Sentiment Reasoning)

In [None]:
nltk.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer

vader = SentimentIntensityAnalyzer()

df_og['vader_sentiment'] = df_og['Review'].apply(lambda x: vader.polarity_scores(str(x))['compound'])

# Label sentiment
df_og['vader_label'] = df_og['vader_sentiment'].apply(
    lambda x: 'positive' if x > 0.05 else 'negative' if x < -0.05 else 'neutral'
)

In [None]:
print(df_og['vader_label'].value_counts())

#### TextBlob

In [None]:
from textblob import TextBlob

df_og['textblob_polarity'] = df_og['Review'].apply(lambda x: TextBlob(str(x)).sentiment.polarity)
df_og['textblob_subjectivity'] = df_og['Review'].apply(lambda x: TextBlob(str(x)).sentiment.subjectivity)

df_og['textblob_label'] = df_og['textblob_polarity'].apply(
    lambda x: 'positive' if x > 0.1 else 'negative' if x < -0.1 else 'neutral'
)

In [None]:
print(df_og['textblob_label'].value_counts())

#### spaCy (via TextBlob)

In [None]:
import spacy
nlp = spacy.load('en_core_web_sm')

def spacy_textblob_sentiment(text):
    """Return the TextBlob polarity of ``text`` after passing it through spaCy.

    The value is cast to ``str``, parsed with the module-level ``nlp``
    pipeline, and the document's text is scored with TextBlob.
    """
    doc = nlp(str(text))
    # NOTE(review): doc.text appears to be the unmodified input string, so this
    # score should equal TextBlob(str(text)).sentiment.polarity; the spaCy
    # pass contributes nothing to the result - confirm whether a spaCy-native
    # sentiment component was intended.
    blob = TextBlob(doc.text)
    return blob.sentiment.polarity

df_og['spacy_textblob_polarity'] = df_og['Review'].apply(spacy_textblob_sentiment)

df_og['spacy_label'] = df_og['spacy_textblob_polarity'].apply(
    lambda x: 'positive' if x > 0.1 else 'negative' if x < -0.1 else 'neutral'
)

In [None]:
print(df_og['spacy_label'].value_counts())

### Comparison of VADER, TextBlob and spaCy

In [None]:
sample = df_og[['Review', 'vader_label', 'textblob_label', 'spacy_label']].sample(10, random_state=42)
print("Sample Sentiment Comparison:\n")
sample

In [None]:
print("\nSentiment Distribution:")
print("\nVADER:\n", df_og['vader_label'].value_counts())
print("\nTextBlob:\n", df_og['textblob_label'].value_counts())
print("\nspaCy+TextBlob:\n", df_og['spacy_label'].value_counts())

In [None]:
def categorize_sentiment(score):
    """Bucket a polarity score into 'Positive', 'Negative' or 'Neutral'.

    Scores strictly above 0.1 are positive and scores strictly below -0.1
    are negative; everything else, the two boundaries included, is neutral.
    """
    if score > 0.1:
        return 'Positive'
    if score < -0.1:
        return 'Negative'
    return 'Neutral'


In [None]:
# Replace the numeric scores in these columns with categorical labels.
# The dtype guard keeps the cell idempotent: once a column holds labels,
# running the cell again would otherwise compare strings with floats and
# raise TypeError.
for score_column in ['vader_sentiment', 'textblob_polarity', 'spacy_textblob_polarity']:
    if pd.api.types.is_numeric_dtype(df_og[score_column]):
        df_og[score_column] = df_og[score_column].apply(categorize_sentiment)


In [None]:
label_df = df_og[['vader_sentiment', 'textblob_polarity', 'spacy_textblob_polarity']]
label_counts = label_df.apply(pd.Series.value_counts).T.fillna(0)

label_counts.plot(kind='bar', stacked=True, figsize=(8, 5), colormap='Set2')
plt.title('Sentiment Label Distribution Across Tools')
plt.ylabel('Number of Reviews')
plt.xlabel('Sentiment Tool')
plt.legend(title='Sentiment')
plt.grid(True, axis='y', linestyle='--', alpha=0.6)
plt.tight_layout()
plt.show()


In [None]:
# Numeric scores per tool. Reviews are cast with str() first, as in the
# earlier scoring cells: a missing review is a float NaN, which VADER,
# TextBlob and spaCy all reject.
df_og['vader_score'] = df_og['Review'].apply(lambda x: vader.polarity_scores(str(x))['compound'])
df_og['textblob_score'] = df_og['Review'].apply(lambda x: TextBlob(str(x)).sentiment.polarity)
# NOTE(review): nlp(...).text appears to return the input text unchanged, so
# this column should always equal 'textblob_score' - confirm intent.
df_og['spacy_score'] = df_og['Review'].apply(lambda x: TextBlob(nlp(str(x)).text).sentiment.polarity)

In [None]:
def get_label(score, pos_thres=0.1, neg_thres=-0.1):
    """Convert a polarity score to 'positive', 'negative' or 'neutral'.

    Both thresholds are inclusive: a score equal to ``pos_thres`` counts as
    positive and a score equal to ``neg_thres`` counts as negative. The
    positive check is made first.
    """
    if score >= pos_thres:
        return 'positive'
    return 'negative' if score <= neg_thres else 'neutral'

label_columns = ['vader_label', 'textblob_label', 'spacy_label']

df_og['vader_label'] = df_og['vader_score'].apply(get_label)
df_og['textblob_label'] = df_og['textblob_score'].apply(get_label)
df_og['spacy_label'] = df_og['spacy_score'].apply(get_label)

# Identify disagreement cases: a row disagrees when its three labels are not
# all identical, i.e. it holds more than one distinct value.
df_og['disagreement'] = df_og[label_columns].nunique(axis=1) > 1

# Extract and view disagreement samples
disagreements = df_og[df_og['disagreement']]
print("Number of disagreements:", len(disagreements))

# Cap the sample size: sample(5) raises ValueError when fewer than five
# disagreeing rows exist.
sample_disagreements = disagreements[['Review'] + label_columns].sample(
    min(5, len(disagreements)), random_state=42
)
sample_disagreements



In [None]:
# `disagreements` is a subset of df_og, so the agreeing rows must be counted
# against df_og as well, not against the separately processed `df`.
summary = pd.DataFrame({
    'Agree': [len(df_og) - len(disagreements)],
    'Disagree': [len(disagreements)]
})
print("\nAgreement Summary:")
summary

### Modeling

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report
from imblearn.over_sampling import RandomOverSampler
from imblearn.over_sampling import SMOTE
import ast
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

### Logistic Regression and Naive Bayes

In [None]:
def _review_to_text(review):
    """Return a review as one plain string, whatever form it is stored in."""
    if isinstance(review, list):
        return ' '.join(review)
    if isinstance(review, str) and review.startswith('['):
        # Stringified token list such as "['good', 'servic']".
        return ' '.join(ast.literal_eval(review))
    return review

df['Review'] = df['Review'].apply(_review_to_text)


X = df['Review']
y = df['Polarity']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

# TF-IDF vectorization (fit on the training split only)
vectorizer = TfidfVectorizer(ngram_range=(1, 2), stop_words='english', max_features=5000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# SMOTE: synthesize minority-class samples on the training split only
smote = SMOTE(random_state=42)
X_train_bal, y_train_bal = smote.fit_resample(X_train_tfidf, y_train)

# Train on the balanced data. Fitting on X_train_tfidf / y_train here would
# leave the SMOTE output unused and the models would see the original class
# imbalance.
log_reg = LogisticRegression(max_iter=1000, class_weight='balanced')
log_reg.fit(X_train_bal, y_train_bal)

nb = MultinomialNB()
nb.fit(X_train_bal, y_train_bal)


# Predict on the untouched test split
y_pred_log = log_reg.predict(X_test_tfidf)
y_pred_nb = nb.predict(X_test_tfidf)

# Evaluation
print("Logistic Regression Results:")
print(classification_report(y_test, y_pred_log))

print("\nNaive Bayes Results:")
print(classification_report(y_test, y_pred_nb))



In [None]:

# Confusion matrix for logistic regression
# Pass the classifier's own class list to both calls so rows/columns follow a
# fixed order and the tick labels show the real polarity codes instead of the
# default positional indices 0, 1, 2.
cm = confusion_matrix(y_test, y_pred_log, labels=log_reg.classes_)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=log_reg.classes_)
disp.plot(cmap='Blues')
plt.title("Confusion Matrix - Logistic Regression")
plt.show()

# Confusion matrix for naive bayes
cm = confusion_matrix(y_test, y_pred_nb, labels=nb.classes_)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=nb.classes_)
disp.plot(cmap='Blues')
plt.title("Confusion Matrix -Naive Bayes")
plt.show()

### LSTM

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping

In [None]:
# Encode labels: -1 → 0, 0 → 1, 1 → 2
le = LabelEncoder()
y_enc = le.fit_transform(y)
num_classes = len(le.classes_)

# Hold out the test set BEFORE oversampling. Oversampling first duplicates
# minority reviews, and a later split then places copies of the same review
# in both train and test, which inflates every test metric.
X_trainval, X_test, y_trainval, y_test_enc = train_test_split(
    np.array(X), y_enc, test_size=0.2, stratify=y_enc, random_state=42
)

# Separate validation split for early stopping, so the test set is not used
# to select the best epoch.
X_fit, X_val, y_fit, y_val_enc = train_test_split(
    X_trainval, y_trainval, test_size=0.1, stratify=y_trainval, random_state=42
)

# Random oversampling on the raw training text only
ros = RandomOverSampler(random_state=42)
X_train, y_train_enc = ros.fit_resample(X_fit.reshape(-1, 1), y_fit)
X_train = X_train.flatten()

# One-hot encode targets (fixed width, even if a split misses a class)
y_train_cat = to_categorical(y_train_enc, num_classes=num_classes)
y_val_cat = to_categorical(y_val_enc, num_classes=num_classes)
y_test_cat = to_categorical(y_test_enc, num_classes=num_classes)

# Tokenization (vocabulary learned from the training text only)
tokenizer = Tokenizer(num_words=10000, oov_token="<OOV>")
tokenizer.fit_on_texts(X_train)

X_train_seq = tokenizer.texts_to_sequences(X_train)
X_val_seq = tokenizer.texts_to_sequences(X_val)
X_test_seq = tokenizer.texts_to_sequences(X_test)

# Padding
max_len = 100
X_train_pad = pad_sequences(X_train_seq, maxlen=max_len)
X_val_pad = pad_sequences(X_val_seq, maxlen=max_len)
X_test_pad = pad_sequences(X_test_seq, maxlen=max_len)

# Train Word2Vec embeddings on tokenized training data
tokenized = [word_tokenize(text.lower()) for text in X_train]
w2v_model = Word2Vec(sentences=tokenized, vector_size=100, window=5, min_count=2, workers=4)

# Create embedding matrix: row i holds the Word2Vec vector of the word with
# tokenizer index i; words unknown to Word2Vec keep an all-zero row.
embedding_dim = 100
vocab_size = 10000
embedding_matrix = np.zeros((vocab_size, embedding_dim))
for word, i in tokenizer.word_index.items():
    if i < vocab_size and word in w2v_model.wv:
        embedding_matrix[i] = w2v_model.wv[word]

# Build model
# NOTE(review): `weights=` and `input_length=` on Embedding are legacy Keras
# arguments - confirm they are honoured by the installed Keras version.
model = Sequential()
model.add(Embedding(
    input_dim=vocab_size,
    output_dim=embedding_dim,
    weights=[embedding_matrix],
    input_length=max_len,
    trainable=True
))
model.add(Bidirectional(LSTM(64, return_sequences=True)))
model.add(Dropout(0.3))
model.add(LSTM(32))
model.add(Dropout(0.3))
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(num_classes, activation='softmax'))

# Compile with lower learning rate
model.compile(
    loss='categorical_crossentropy',
    optimizer=Adam(learning_rate=5e-4),
    metrics=['accuracy']
)

# Early stopping callback
early_stop = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

# Train the model (validation data is disjoint from the test set)
history = model.fit(
    X_train_pad,
    y_train_cat,
    epochs=10,
    batch_size=32,
    validation_data=(X_val_pad, y_val_cat),
    callbacks=[early_stop],
    verbose=1
)

# Evaluation on the held-out, non-oversampled test set
y_pred = model.predict(X_test_pad)
y_pred_labels = np.argmax(y_pred, axis=1)
y_true_labels = np.argmax(y_test_cat, axis=1)

print(classification_report(
    y_true_labels,
    y_pred_labels,
    labels=np.arange(num_classes),
    target_names=le.classes_.astype(str)
))


In [None]:
y_pred_prob = model.predict(X_test_pad)

# Convert probabilities to predicted classes
y_pred = np.argmax(y_pred_prob, axis=1)

# True labels (already encoded to 0, 1, 2)
y_true = y_test_enc

# Classification Report
print("Classification Report:")
print(classification_report(y_true, y_pred, target_names=le.classes_.astype(str)))

# Accuracy Score
acc = accuracy_score(y_true, y_pred)
print(f"Accuracy: {acc:.4f}")

# Confusion Matrix
cm = confusion_matrix(y_true, y_pred)

plt.figure(figsize=(6, 4))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=le.classes_, yticklabels=le.classes_)
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix')
plt.tight_layout()
plt.show()


### DistilBERT

In [None]:
# Encode labels from 'Polarity' as consecutive integers (sorted label order)
le = LabelEncoder()
df['label'] = le.fit_transform(df['Polarity'])

# Apply RandomOverSampler on raw reviews
# The sampler needs a 2-D feature array, hence the reshape to one column.
# NOTE(review): this oversamples the FULL dataset. Any train/validation split
# made from the result can place copies of the same review on both sides,
# which inflates validation metrics - confirm this is acceptable.
ros = RandomOverSampler(random_state=42)
text_resampled, label_resampled = ros.fit_resample(
    np.array(df['Review']).reshape(-1, 1), df['label']
)

# Create a balanced DataFrame (flatten restores a 1-D array of review texts)
df_balanced = pd.DataFrame({'Review': text_resampled.flatten(), 'label': label_resampled})


In [None]:
import transformers
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset
import torch

# Encode labels
le = LabelEncoder()
df['label'] = le.fit_transform(df['Polarity'])

# Train-validation split on the ORIGINAL reviews, stratified by label.
# Splitting an already-oversampled frame would put copies of the same review
# in both splits and inflate the validation metrics.
train_df, val_df = train_test_split(
    df[['Review', 'label']], test_size=0.2, stratify=df['label'], random_state=42
)

# Balance the classes in the training split only
ros = RandomOverSampler(random_state=42)
train_text_bal, train_label_bal = ros.fit_resample(
    np.array(train_df['Review']).reshape(-1, 1), np.array(train_df['label'])
)

train_texts = [str(text) for text in train_text_bal.flatten()]
train_labels = np.asarray(train_label_bal).tolist()
val_texts = [str(text) for text in val_df['Review']]
val_labels = val_df['label'].tolist()

# Load tokenizer and encode data
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')
train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=128)
val_encodings = tokenizer(val_texts, truncation=True, padding=True, max_length=128)

# Convert to Hugging Face Datasets
train_dataset = Dataset.from_dict({**train_encodings, 'label': train_labels})
val_dataset = Dataset.from_dict({**val_encodings, 'label': val_labels})

# Load pre-trained model with one output per encoded class
model = DistilBertForSequenceClassification.from_pretrained(
    "distilbert-base-uncased", num_labels=len(le.classes_)
)

# Define training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=16,
    save_strategy="epoch",
    logging_dir='./logs',
    logging_steps=10,
)

# Trainer setup
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

# Train the model
trainer.train()

# Evaluate on the non-oversampled validation split
eval_result = trainer.evaluate()
print("\nEvaluation Results:", eval_result)


In [None]:
preds_output = trainer.predict(val_dataset)
preds = np.argmax(preds_output.predictions, axis=1)

print("DistilBERT:\n", classification_report(val_labels, preds))

In [None]:
# confusion matrix
cm = confusion_matrix(val_labels, preds)
labels = np.unique(np.concatenate([val_labels, preds]))

# Display confusion matrix
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=labels)
disp.plot(cmap="Blues")
plt.title("Confusion Matrix: DistilBERT")
plt.show()

### Comparison of Models

In [None]:
def get_scores(y_true, y_pred, model_name, positive_label=None, negative_label=None):
    """Summarize accuracy and per-class F1 for one model.

    The labels differ between models in this notebook: the classical models
    predict polarity codes -1/0/1, while the neural models predict encoded
    classes 0/1/2. Looking up fixed report keys '1' and '0' therefore reads
    the neutral class as "negative" for the former and neutral/negative as
    "positive"/"negative" for the latter. By default the largest label seen
    is treated as positive and the smallest as negative, which is correct for
    both encodings.

    Args:
        y_true: true labels.
        y_pred: predicted labels.
        model_name: display name stored under 'model'.
        positive_label: label of the positive class; defaults to the largest
            label present in ``y_true``/``y_pred``.
        negative_label: label of the negative class; defaults to the smallest
            label present.

    Returns:
        dict with keys 'model', 'accuracy', 'f1_positive', 'f1_negative'.
        An F1 value is NaN when its class does not appear in the report.
    """
    report = classification_report(y_true, y_pred, output_dict=True)

    seen_labels = np.unique(
        np.concatenate([np.asarray(y_true).ravel(), np.asarray(y_pred).ravel()])
    )
    if positive_label is None:
        positive_label = seen_labels[-1]
    if negative_label is None:
        negative_label = seen_labels[0]

    def _f1(label):
        # classification_report keys its per-class entries by str(label).
        return report.get(str(label), {}).get('f1-score', float('nan'))

    return {
        'model': model_name,
        'accuracy': report['accuracy'],
        'f1_positive': _f1(positive_label),
        'f1_negative': _f1(negative_label)
    }

results = []
results.append(get_scores(y_test, y_pred_log, 'Logistic Regression'))
results.append(get_scores(y_test, y_pred_nb, 'Naive Bayes'))
results.append(get_scores(y_true_labels, y_pred_labels, 'LSTM'))
results.append(get_scores(val_labels, preds, 'DistilBERT'))

# Convert to DataFrame
results_df = pd.DataFrame(results)

# Accuracy Comparison
plt.figure(figsize=(8, 5))
sns.barplot(x='model', y='accuracy', data=results_df, palette='viridis')
plt.title("Model Accuracy Comparison")
plt.ylim(0.5, 1.0)
plt.ylabel("Accuracy")
plt.xlabel("Model")
plt.tight_layout()
plt.show()

# F1-Score per Class
f1_df = results_df.melt(id_vars='model', value_vars=['f1_positive', 'f1_negative'], var_name='class', value_name='f1_score')
f1_df['class'] = f1_df['class'].map({'f1_positive': 'Positive', 'f1_negative': 'Negative'})

plt.figure(figsize=(9, 5))
sns.barplot(x='model', y='f1_score', hue='class', data=f1_df, palette='mako')
plt.title("F1-Score by Sentiment Class")
plt.ylim(0, 1)
plt.ylabel("F1 Score")
plt.xlabel("Model")
plt.legend(title='Class')
plt.tight_layout()
plt.show()


### Top complaint themes

In [None]:
def get_sentiment(r):
    """Label a star rating as 'positive' (>= 4) or 'negative' (<= 2).

    Ratings strictly between 2 and 4, and values that fail both comparisons
    such as NaN, yield ``None``.
    """
    return 'positive' if r >= 4 else ('negative' if r <= 2 else None)

# Work on a separate copy of the negative reviews. Reassigning `df` to the
# filtered subset would silently restrict every later cell that reads `df`
# to negative reviews only, and adding a column to the filtered slice would
# trigger SettingWithCopyWarning.
negative_mask = df['Rating'].apply(get_sentiment) == 'negative'
df_negative = df.loc[negative_mask].copy()

# Build the stopword set once; calling stopwords.words('english') per token
# re-reads the corpus list and does a linear scan for every single word.
english_stopwords = set(stopwords.words('english'))

# Clean and tokenize
def preprocess(text):
    """Return the lower-case-letter tokens of ``text`` worth counting.

    Every character other than a-z and whitespace is removed, the rest is
    tokenized, and English stopwords and tokens of one or two characters are
    dropped.
    """
    text = re.sub(r'[^a-z\s]', '', str(text))
    tokens = nltk.word_tokenize(text)
    return [word for word in tokens if word not in english_stopwords and len(word) > 2]

df_negative['tokens'] = df_negative['Review'].apply(preprocess)

# Flatten all tokens into a single list
all_tokens = [word for tokens in df_negative['tokens'] for word in tokens]

# Get most common words
word_counts = Counter(all_tokens)
top_words = word_counts.most_common(20)

# Convert to DataFrame for plotting
top_df = pd.DataFrame(top_words, columns=['word', 'count'])

# Bar Plot of Top Keywords
plt.figure(figsize=(10, 6))
sns.barplot(x='count', y='word', data=top_df, palette='rocket')
plt.title('Top Complaint Keywords in Negative Reviews')
plt.xlabel('Frequency')
plt.ylabel('Keyword')
plt.tight_layout()
plt.show()

# Word Cloud
wordcloud = WordCloud(width=800, height=400, background_color='white').generate_from_frequencies(word_counts)

plt.figure(figsize=(10, 5))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.title("Word Cloud of Complaint Themes")
plt.tight_layout()
plt.show()

# Bigram Analysis
# NOTE: bigrams are taken over the flattened token list, so a pair may span
# the boundary between two consecutive reviews.
bigrams = Counter(ngrams(all_tokens, 2))
top_bigrams = bigrams.most_common(10)
print("\nTop Complaint Bigrams:")
for phrase, count in top_bigrams:
    print(f"{' '.join(phrase)}: {count}")


### Aspect Based Sentiment Analysis

In [None]:
from collections import defaultdict

def get_sentiment(r):
    """Classify a star rating: 4 and above is 'positive', 2 and below is
    'negative', anything else (including NaN) is ``None``."""
    if r >= 4:
        return 'positive'
    if r <= 2:
        return 'negative'
    return None

df['Polarity'] = df['Rating'].apply(get_sentiment)

# Define aspect keywords (you can expand this)
aspect_keywords = ['service', 'loan', 'card', 'charges', 'staff', 'app', 'transaction', 'support']

# ABSA function
def extract_aspects(text):
    """Find aspect keywords in ``text`` and score the words around each hit.

    The reviews are Porter-stemmed during cleaning, so keywords such as
    'service', 'charges' and 'transaction' appear as 'servic', 'charg' and
    'transact' and would never equal the raw keyword. Tokens are therefore
    matched either literally or by stem, and the canonical keyword is
    reported as the aspect.

    Args:
        text: review text, parsed with the module-level spaCy ``nlp`` pipeline.

    Returns:
        A list of dicts with keys 'aspect' (keyword from ``aspect_keywords``),
        'opinion' (the matched token plus up to three tokens on each side)
        and 'Polarity' ('positive' / 'negative' / 'neutral', from the TextBlob
        polarity of that window with thresholds of +/-0.1).
    """
    stemmer = PorterStemmer()
    stem_to_aspect = {stemmer.stem(keyword): keyword for keyword in aspect_keywords}

    doc = nlp(text)
    aspects = []
    for token in doc:
        word = token.text.lower()
        aspect = word if word in aspect_keywords else stem_to_aspect.get(stemmer.stem(word))
        if aspect is None:
            continue
        # Find nearby adjective or opinion: a window of +/-3 tokens
        window = doc[max(token.i - 3, 0): min(token.i + 4, len(doc))]
        sentiment_phrase = window.text
        polarity = TextBlob(sentiment_phrase).sentiment.polarity
        label = 'positive' if polarity > 0.1 else 'negative' if polarity < -0.1 else 'neutral'
        aspects.append({'aspect': aspect, 'opinion': sentiment_phrase, 'Polarity': label})
    return aspects

# Apply to a sample (or full dataset)
# Cap the sample size: sample(20) raises ValueError on fewer than 20 rows.
sample_reviews = df.sample(min(20, len(df)), random_state=42)['Review']

all_aspects = [
    {**aspect, 'Review': review}
    for review in sample_reviews
    for aspect in extract_aspects(review)
]

# Convert to DataFrame
# Naming the columns explicitly keeps the frame well-formed when no aspect was
# found at all; an empty column-less frame would raise KeyError on selection.
aspects_df = pd.DataFrame(all_aspects, columns=['Review', 'aspect', 'opinion', 'Polarity'])
print("Extracted Aspect-Based Sentiment:")
aspects_df[['Review', 'aspect', 'opinion', 'Polarity']]

In [None]:
# Count by aspect
print("\nAspect Sentiment Counts:")
aspects_df.groupby(['aspect', 'Polarity']).size().unstack(fill_value=0)

### Department level insights

In [None]:
department_keywords = {
    "loan process": ["loan", "loan process", "home loan", "personal loan"],
    "branch experience": ["branch", "staff", "manager", "in person"],
    "customer service": ["customer service", "support", "call", "response"],
    "account opening": ["account opening", "new account", "open account"],
    "mobile app": ["app", "application", "mobile", "online"],
    "charges/fees": ["charges", "fees", "hidden charge", "deduction"],
    "credit card": ["credit card", "card limit", "card issue"]
}

# Compiled keyword patterns, filled lazily on first use of each keyword.
_department_pattern_cache = {}

def _department_pattern(keyword):
    """Return a compiled whole-word regex for ``keyword`` or its stemmed form.

    The reviews are Porter-stemmed during cleaning, so phrases like
    'customer service' appear as 'custom servic'; matching only the raw
    phrase would miss them.
    """
    pattern = _department_pattern_cache.get(keyword)
    if pattern is None:
        stemmer = PorterStemmer()
        stemmed = ' '.join(stemmer.stem(word) for word in keyword.split())
        # Longest variant first so the alternation prefers the fuller match.
        variants = sorted({keyword, stemmed}, key=len, reverse=True)
        alternation = '|'.join(re.escape(variant) for variant in variants)
        pattern = re.compile(rf'\b(?:{alternation})\b')
        _department_pattern_cache[keyword] = pattern
    return pattern

# Function to match and score sentiment
def extract_department_sentiments(text):
    """List the department keywords found in ``text`` with the review sentiment.

    Args:
        text: review text.

    Returns:
        A list of ``(department, keyword, sentiment_score, label)`` tuples,
        one per matching keyword. ``sentiment_score`` is the TextBlob polarity
        of the whole review (computed once, on the first match) and ``label``
        is 'positive' / 'negative' / 'neutral' with thresholds of +/-0.1.
        A review matching several keywords of one department yields several
        tuples for it.
    """
    sentiments = []
    sentiment_score = None
    label = None
    for dept, keywords in department_keywords.items():
        for kw in keywords:
            if _department_pattern(kw).search(text):
                if sentiment_score is None:
                    sentiment_score = TextBlob(text).sentiment.polarity
                    label = 'positive' if sentiment_score > 0.1 else 'negative' if sentiment_score < -0.1 else 'neutral'
                sentiments.append((dept, kw, sentiment_score, label))
    return sentiments

# Apply to dataset: one record per (review, matched keyword) pair
results = [
    {
        "department": dept,
        "matched_phrase": kw,
        "Review": review,
        "sentiment_score": score,
        "Polarity": label
    }
    for review in df['Review']
    for dept, kw, score, label in extract_department_sentiments(review)
]

# Create DataFrame
insights_df = pd.DataFrame(results)

# View sample insights
print("Sample Department-Level Insights:")
insights_df.head(10)




In [None]:
# Sentiment counts per department
summary = insights_df.groupby(['department', 'Polarity']).size().unstack(fill_value=0)
print("\nDepartment-Level Sentiment Summary:")
summary

In [None]:
# Stacked Bar Chart: Sentiment Count per Department
sentiment_order = ['positive', 'neutral', 'negative']  # Consistent order
summary = insights_df.groupby(['department', 'Polarity']).size().unstack(fill_value=0)
# reindex instead of plain column selection: a label that never occurs has no
# column after unstack(), and selecting it would raise KeyError.
summary = summary.reindex(columns=sentiment_order, fill_value=0)

summary.plot(kind='bar', stacked=True, figsize=(10, 6), colormap='Set2')
plt.title('Department-Level Sentiment Distribution')
plt.xlabel('Department / Service')
plt.ylabel('Number of Mentions')
plt.xticks(rotation=45, ha='right')
plt.legend(title='Sentiment')
plt.tight_layout()
plt.show()

# Pie Chart for Each Department
# Colours line up with sentiment_order: positive, neutral, negative.
pie_colors = ['green', 'gray', 'red']
for dept in summary.index:
    sentiment_counts = summary.loc[dept]
    plt.figure(figsize=(5, 5))
    plt.pie(sentiment_counts, labels=sentiment_counts.index, autopct='%1.1f%%', startangle=140, colors=pie_colors)
    plt.title(f'Sentiment Distribution for "{dept}"')
    plt.axis('equal')
    plt.tight_layout()
    plt.show()
