# In [None]:
# Import necessary libraries
import pandas as pd
import spacy
import numpy as np
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import PCA, LatentDirichletAllocation, NMF
from sklearn.manifold import TSNE
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import KMeans
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis
import gensim
import gensim.corpora as corpora
from gensim.models import LdaModel
from wordcloud import WordCloud

# Load the complaints export into a DataFrame
file_path = r"C:\projects\rows.csv"
df = pd.read_csv(file_path, low_memory=False)

# Show the column summary and the leading rows before any cleaning happens
print("\n🔹 Dataset Overview:")
df.info()
print("\n🔹 Sample Data:")
# NOTE(review): `display` is not imported in this file; presumably the
# IPython/Jupyter builtin — confirm before running outside a notebook.
display(df.head())

# Handle missing values: rows without a narrative are removed, then gaps in
# three categorical columns are replaced with explicit placeholder labels.
placeholder_labels = {
    "Company public response": "No response",
    "Tags": "No tags",
    "Consumer consent provided?": "Unknown",
}
df = df.dropna(subset=["Consumer complaint narrative"]).fillna(placeholder_labels)

# spaCy pipeline used by the text-cleaning step
nlp = spacy.load("en_core_web_sm")

# Preprocessing function
def preprocess_text(text):
    """Normalise one complaint narrative into a space-separated lemma string.

    Missing values (as judged by ``pd.isnull``) yield an empty string.
    Otherwise the text is lower-cased, run through the module-level ``nlp``
    pipeline, and only the lemmas of alphabetic, non-stop-word tokens are kept.
    """
    if pd.isnull(text):
        return ""

    kept_lemmas = []
    for tok in nlp(text.lower()):
        if tok.is_alpha and not tok.is_stop:
            kept_lemmas.append(tok.lemma_)
    return " ".join(kept_lemmas)

# Apply preprocessing to a reproducible random subset of the complaints.
# The requested size is capped at the number of available rows:
# DataFrame.sample (without replacement) raises ValueError when asked for
# more rows than the frame contains.
sample_size = min(5000, len(df))
df_sample = df.sample(n=sample_size, random_state=42)
df_sample["cleaned_text"] = df_sample["Consumer complaint narrative"].apply(preprocess_text)

# TF-IDF Vectorization
# Every row of df_sample is vectorized, so row i of tfidf_matrix corresponds
# to df_sample.iloc[i]; later steps index both by position. No dropna() here:
# preprocess_text always returns a string, and silently dropping a row would
# break that positional alignment.
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
tfidf_matrix = tfidf_vectorizer.fit_transform(df_sample["cleaned_text"])

# SpaCy Embeddings
# Second spaCy pipeline, used below to obtain document vectors
nlp_md = spacy.load("en_core_web_md")

def get_spacy_embedding(text):
    """Return the document vector the module-level ``nlp_md`` pipeline gives *text*."""
    return nlp_md(text).vector

# Embed every cleaned complaint. No rows are dropped: spacy_embeddings must
# keep exactly one row per df_sample row, because the cluster labels computed
# from it are assigned back onto df_sample and used to mask tfidf_matrix.
df_sample["spacy_embedding"] = df_sample["cleaned_text"].apply(get_spacy_embedding)
spacy_embeddings = np.vstack(df_sample["spacy_embedding"].tolist())

# Dimensionality Reduction (PCA)
# Each representation is projected onto two principal components. The sparse
# TF-IDF matrix is converted to a dense array before fitting.
pca_tfidf = PCA(n_components=2)
pca_spacy = PCA(n_components=2)
tfidf_2d = pca_tfidf.fit_transform(tfidf_matrix.toarray())
spacy_2d = pca_spacy.fit_transform(spacy_embeddings)

# Visualize TF-IDF & spaCy Embeddings
# One scatter figure per projection; only the spaCy plot overrides the colour.
for points_2d, plot_title, scatter_kwargs in (
    (tfidf_2d, "TF-IDF Embeddings (PCA)", {}),
    (spacy_2d, "spaCy Embeddings (PCA)", {"color": "red"}),
):
    plt.figure(figsize=(8, 6))
    plt.scatter(points_2d[:, 0], points_2d[:, 1], alpha=0.5, **scatter_kwargs)
    plt.title(plot_title)
    plt.xlabel("PC1")
    plt.ylabel("PC2")
    plt.show()

# Cosine Similarity (TF-IDF)
# Pairwise similarity between all sampled complaints. The diagonal is zeroed
# so a complaint is never reported as its own best match.
tfidf_cosine_sim = cosine_similarity(tfidf_matrix)
np.fill_diagonal(tfidf_cosine_sim, 0)

# Convert the flat position of the largest entry back into (row, column)
flat_best = tfidf_cosine_sim.argmax()
most_similar_idx = np.unravel_index(flat_best, tfidf_cosine_sim.shape)
first_pos, second_pos = most_similar_idx
cleaned_complaints = df_sample["cleaned_text"]

print(f"Most similar complaints (TF-IDF): {most_similar_idx}")
print(f"Complaint 1: {cleaned_complaints.iloc[first_pos]}")
print(f"Complaint 2: {cleaned_complaints.iloc[second_pos]}")

# Clustering with K-Means
# Complaints are grouped on their spaCy document vectors.
num_clusters = 5
kmeans = KMeans(n_clusters=num_clusters, random_state=42)
df_sample["cluster"] = kmeans.fit_predict(spacy_embeddings)

# Extract Top Words Per Cluster
# Row k of cluster_tfidf is the mean TF-IDF vector over the complaints
# assigned to cluster k (selected with a positional boolean mask).
clusters = df_sample["cluster"].values
cluster_tfidf = np.vstack([
    np.asarray(tfidf_matrix[clusters == label].mean(axis=0)).ravel()
    for label in range(num_clusters)
])

top_n = 10
feature_names = tfidf_vectorizer.get_feature_names_out()

for label, mean_weights in enumerate(cluster_tfidf):
    # Indices of the top_n largest mean weights, strongest first
    ranked = mean_weights.argsort()[::-1][:top_n]
    top_words = ", ".join(feature_names[pos] for pos in ranked)
    print(f"Cluster {label} top words: {top_words}")

# Topic Modeling (LDA)
# LDA is fitted on term counts. Terms appearing in more than 95% of the
# documents or in fewer than 2 documents are left out of the vocabulary.
num_topics = 5
count_vectorizer = CountVectorizer(stop_words='english', min_df=2, max_df=0.95)
count_matrix = count_vectorizer.fit_transform(df_sample["cleaned_text"])
lda = LatentDirichletAllocation(n_components=num_topics, random_state=42).fit(count_matrix)
lda_feature_names = count_vectorizer.get_feature_names_out()

# Topic Modeling (NMF)
# NMF is fitted on the TF-IDF matrix built earlier and shares its vocabulary.
nmf = NMF(n_components=num_topics, random_state=42).fit(tfidf_matrix)
nmf_feature_names = tfidf_vectorizer.get_feature_names_out()

def display_topics(model, feature_names, num_words=10):
    """Print the highest-weighted terms of every topic in a fitted model.

    Args:
        model: Object exposing ``components_``, iterated as one weight
            vector per topic.
        feature_names: Indexable mapping from a column position in
            ``components_`` to its term.
        num_words: Number of top terms listed for each topic.
    """
    for topic_number, weights in enumerate(model.components_, start=1):
        print(f"\nTopic {topic_number}:")
        # Ascending argsort reversed -> strongest term first, then truncated
        strongest_first = weights.argsort()[::-1][:num_words]
        terms = [feature_names[idx] for idx in strongest_first]
        print(", ".join(terms))

# Report the top terms per topic for each fitted model, LDA first
for heading, fitted_model, vocabulary in (
    ("\n🔹 Topics extracted using LDA:", lda, lda_feature_names),
    ("\n🔹 Topics extracted using NMF:", nmf, nmf_feature_names),
):
    print(heading)
    display_topics(fitted_model, vocabulary)

# LDA Visualization
# Whitespace-split tokens feed both the gensim dictionary and the
# bag-of-words corpus.
tokenized_docs = df_sample["cleaned_text"].str.split()
dictionary = corpora.Dictionary(tokenized_docs)
corpus = list(map(dictionary.doc2bow, tokenized_docs))

# Separate gensim LDA model, trained with the same topic count and seed
lda_settings = {
    "num_topics": num_topics,
    "random_state": 42,
    "passes": 10,
    "per_word_topics": True,
}
lda_model = LdaModel(corpus=corpus, id2word=dictionary, **lda_settings)

lda_vis = gensimvis.prepare(lda_model, corpus, dictionary)
# Kept as the final bare expression of the cell
pyLDAvis.display(lda_vis)

