In [None]:
import pandas as pd

# Location of the raw complaints export (absolute path on the author's machine).
file_path = r"C:\projects\rows.csv"

# low_memory=False makes pandas parse the file in one pass instead of in
# chunks, which avoids mixed-dtype inference on large columns.
df = pd.read_csv(filepath_or_buffer=file_path, low_memory=False)

df.info()  # prints column dtypes and non-null counts
df.head()  # first rows; rendered as the cell output


In [None]:
import spacy

# Smoke test: load the small English pipeline and tokenize a single sentence.
nlp = spacy.load("en_core_web_sm")
doc = nlp("Hello, how are you?")

# Collect the surface text of every token spaCy produced.
tokens = [tok.text for tok in doc]
print(tokens)


In [None]:
import pandas as pd

# Same source file as the first cell of the notebook; this cell repeats the load.
file_path = r"C:\projects\rows.csv"

# Parse in a single pass (low_memory=False) so column dtypes are inferred
# from the whole file rather than chunk by chunk.
df = pd.read_csv(filepath_or_buffer=file_path, low_memory=False)

df.info()  # prints column dtypes and non-null counts
df.head()  # first rows; rendered as the cell output


In [None]:
# Number of missing values in each column (isna is the canonical spelling of isnull).
df.isna().sum()


In [None]:
# Keep only rows that carry a complaint narrative; rows where that field is
# missing are removed (axis/how are spelled out but equal the defaults).
df = df.dropna(axis=0, how="any", subset=["Consumer complaint narrative"])


In [None]:
# Replace the remaining gaps in three optional columns with explicit
# placeholder labels. The filled frame is assigned back to `df` instead of
# using inplace=True, so the cell produces a new frame rather than mutating
# one in place.
df = df.fillna({
    "Company public response": "No response",
    "Tags": "No tags",
    "Consumer consent provided?": "Unknown",
})


In [None]:
# Re-check the per-column missing-value counts after the drop/fill steps.
df.isna().sum()


In [None]:
import spacy
import pandas as pd
import string

# Load spaCy model
# `nlp` is the module-level pipeline object that preprocess_text() calls;
# here it is bound to the small English pipeline.
nlp = spacy.load("en_core_web_sm")


In [None]:
def preprocess_text(text):
    """Normalise one complaint narrative into a space-separated string of lemmas.

    Missing values (anything ``pd.isnull`` reports as null) yield ``""``.
    Otherwise the text is lower-cased, run through the module-level ``nlp``
    pipeline, and reduced to the lemmas of tokens that are purely alphabetic
    and are not stop words.
    """
    # Guard clause: nothing to process for a missing narrative.
    if pd.isnull(text):
        return ""

    kept_lemmas = []
    for tok in nlp(text.lower()):
        # is_alpha drops punctuation/numbers, is_stop drops stop words.
        if tok.is_alpha and not tok.is_stop:
            kept_lemmas.append(tok.lemma_)

    return " ".join(kept_lemmas)


In [None]:
import pandas as pd

# Path of the complaints export; must point at the file on the local machine.
file_path = r"C:\projects\rows.csv"

# NOTE(review): reading the file again rebinds `df` to the raw contents, so
# the dropna/fillna results produced by earlier cells are no longer in `df`.
df = pd.read_csv(filepath_or_buffer=file_path, low_memory=False)

df.info()  # confirm the frame loaded: dtypes and non-null counts
df.head()  # first rows; rendered as the cell output


In [None]:
# Draw the working sample only from rows that actually have a narrative.
# `df` is reloaded from disk just before this cell, so without this filter
# rows with a missing narrative would be sampled and turned into empty
# documents by preprocess_text().
narrative_rows = df.dropna(subset=["Consumer complaint narrative"])

# Cap the request at the number of available rows so sample() cannot fail
# on a file with fewer than 5000 narratives.
sample_size = min(5000, len(narrative_rows))

# .copy() gives an independent frame so the column assignment below never
# writes into a view of `df`.
df_sample = narrative_rows.sample(n=sample_size, random_state=42).copy()
df_sample["cleaned_text"] = df_sample["Consumer complaint narrative"].apply(preprocess_text)
df_sample.head()


In [None]:
# List the column labels of the sample, which now include "cleaned_text".
df_sample.columns


In [None]:
# Side-by-side preview of raw narratives and their cleaned counterparts.
preview_columns = ["Consumer complaint narrative", "cleaned_text"]
df_sample[preview_columns].head(10)


In [None]:
# Count missing entries in the cleaned column. preprocess_text() maps missing
# input to "", so empty strings are not counted here.
df_sample["cleaned_text"].isna().sum()


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize TF-IDF Vectorizer; max_features=5000 caps the vocabulary size.
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
tfidf_matrix = tfidf_vectorizer.fit_transform(df_sample["cleaned_text"].dropna())

# Wrap the matrix in a DataFrame for inspection. The sparse accessor keeps
# the data sparse instead of materialising every zero with .toarray(), which
# for a documents x vocabulary matrix costs far more memory than the preview
# below is worth.
import pandas as pd
tfidf_df = pd.DataFrame.sparse.from_spmatrix(
    tfidf_matrix,
    columns=tfidf_vectorizer.get_feature_names_out(),
)

# Display first few rows
tfidf_df.head()


In [None]:
import spacy
import numpy as np

# Load spaCy's pre-trained model
# This rebinds the module-level `nlp` (previously "en_core_web_sm") to the
# medium English pipeline, so any later call that reads `nlp` uses this model.
# NOTE(review): presumably chosen because the medium model ships the word
# vectors that doc.vector relies on — confirm.
nlp = spacy.load("en_core_web_md")

# Document-level embedding helper
def get_spacy_embedding(text):
    """Return the spaCy document vector for ``text``.

    The text is processed with the module-level ``nlp`` pipeline and the
    resulting ``Doc.vector`` is returned unchanged.
    """
    return nlp(text).vector

# Embed every non-missing cleaned text; the result is aligned back to
# df_sample by index, so rows that were skipped would hold NaN.
clean_docs = df_sample["cleaned_text"].dropna()
df_sample["spacy_embedding"] = clean_docs.apply(get_spacy_embedding)

# Stack the per-document vectors into one 2-D array (one row per document).
embedded_rows = df_sample["spacy_embedding"].dropna()
spacy_embeddings = np.vstack(embedded_rows.values)

# Cell output: (number of embedded documents, vector width)
spacy_embeddings.shape  # Should be (num_samples, 300) for spaCy's 300-dimensional vectors


In [None]:
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA

# Project both representations onto their first two principal components.
# The sparse TF-IDF matrix is converted to a dense array before fitting.
pca_tfidf = PCA(n_components=2)
tfidf_2d = pca_tfidf.fit_transform(tfidf_matrix.toarray())

pca_spacy = PCA(n_components=2)
spacy_2d = pca_spacy.fit_transform(spacy_embeddings)

# One scatter plot per representation: (2-D points, title, extra scatter options).
scatter_specs = [
    (tfidf_2d, "TF-IDF Embeddings Visualized using PCA", {}),
    (spacy_2d, "spaCy Embeddings Visualized using PCA", {"color": 'red'}),
]

for points, plot_title, extra_style in scatter_specs:
    plt.figure(figsize=(8, 6))
    plt.scatter(points[:, 0], points[:, 1], alpha=0.5, **extra_style)
    plt.title(plot_title)
    plt.xlabel("PC1")
    plt.ylabel("PC2")
    plt.show()


In [None]:
from sklearn.metrics.pairwise import cosine_similarity

# Pairwise cosine similarity between all TF-IDF rows.
tfidf_cosine_sim = cosine_similarity(tfidf_matrix)

import numpy as np
# Zero the diagonal so a complaint is never matched with itself.
np.fill_diagonal(tfidf_cosine_sim, 0)

# Locate the largest remaining entry and turn its flat position into (row, col).
flat_position = tfidf_cosine_sim.argmax()
most_similar_idx = np.unravel_index(flat_position, tfidf_cosine_sim.shape)
print(f"Most similar complaints: {most_similar_idx}")

# Show the cleaned text of both complaints; rows are looked up by position.
first_pos, second_pos = most_similar_idx
print("Complaint 1:", df_sample.iloc[first_pos]["cleaned_text"])
print("Complaint 2:", df_sample.iloc[second_pos]["cleaned_text"])


In [None]:
import numpy as np

# Get feature names and IDF scores
feature_names = tfidf_vectorizer.get_feature_names_out()
idf_scores = tfidf_vectorizer.idf_

# IDF on its own only measures how rare a term is across documents, so
# sorting by it surfaces near-unique tokens (with many ties) rather than
# terms that matter in the corpus. Rank by the mean TF-IDF weight each term
# receives over all documents instead.
mean_tfidf = np.asarray(tfidf_matrix.mean(axis=0)).ravel()
important_words = sorted(
    zip(feature_names, mean_tfidf),
    key=lambda pair: pair[1],
    reverse=True,
)

# Display the top 10 most important words. Mean weights are small fractions,
# so four decimals are shown.
print("Top 10 important words in TF-IDF:")
for word, score in important_words[:10]:
    print(f"{word}: {score:.4f}")


In [None]:
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE

# Embed the spaCy document vectors in 2-D with t-SNE (fixed seed, perplexity 30).
tsne = TSNE(n_components=2, random_state=42, perplexity=30)
spacy_embeddings_2d = tsne.fit_transform(spacy_embeddings)

# Scatter the two t-SNE coordinates against each other.
x_coords, y_coords = spacy_embeddings_2d[:, 0], spacy_embeddings_2d[:, 1]
plt.figure(figsize=(10, 6))
plt.scatter(x_coords, y_coords, alpha=0.5)
plt.title("t-SNE Visualization of spaCy Embeddings")
plt.xlabel("Component 1")
plt.ylabel("Component 2")
plt.show()


In [None]:
from sklearn.metrics.pairwise import cosine_similarity

# Pairwise cosine similarity between all spaCy document vectors.
spacy_cosine_sim = cosine_similarity(spacy_embeddings)

# Zero the diagonal so a complaint is never matched with itself, then turn
# the flat position of the largest entry into a (row, col) pair.
np.fill_diagonal(spacy_cosine_sim, 0)
flat_position = spacy_cosine_sim.argmax()
most_similar_idx = np.unravel_index(flat_position, spacy_cosine_sim.shape)

# Report the pair and the cleaned text of both complaints (positional lookup).
print(f"Most similar complaints (spaCy embeddings): {most_similar_idx}")
print(f"Complaint 1: {df_sample.iloc[most_similar_idx[0]]['cleaned_text']}")
print(f"Complaint 2: {df_sample.iloc[most_similar_idx[1]]['cleaned_text']}")


In [None]:
from sklearn.cluster import KMeans

# Number of clusters to fit (tunable).
num_clusters = 5

# Fit K-Means on the spaCy embeddings with a fixed seed and keep the label
# assigned to each row.
kmeans = KMeans(n_clusters=num_clusters, random_state=42)
cluster_labels = kmeans.fit_predict(spacy_embeddings)
df_sample["cluster"] = cluster_labels

# Preview: cleaned text next to its cluster id.
preview_columns = ["cleaned_text", "cluster"]
df_sample[preview_columns].head(10)


In [None]:
import numpy as np

# Cluster id of every row in the sample, as a plain array.
clusters = df_sample["cluster"].values

# Mean TF-IDF weight of every term within each cluster: one row per cluster,
# one column per vocabulary term.
cluster_tfidf = np.vstack([
    np.asarray(tfidf_matrix[clusters == cluster_id].mean(axis=0)).ravel()
    for cluster_id in range(num_clusters)
])

# How many of the highest-weighted terms to list for each cluster.
top_n = 10
feature_names = tfidf_vectorizer.get_feature_names_out()

for i, cluster_weights in enumerate(cluster_tfidf):
    # Indices of the top_n largest weights, largest first.
    top_words_idx = np.argsort(cluster_weights)[::-1][:top_n]
    top_words = [feature_names[j] for j in top_words_idx]
    print(f"Cluster {i} top words: {', '.join(top_words)}")


In [None]:
# Print the first cleaned complaint that belongs to each cluster.
for i in range(num_clusters):
    in_cluster = df_sample["cluster"] == i
    sample_text = df_sample.loc[in_cluster, "cleaned_text"].iloc[0]
    print(f"Cluster {i} Example: {sample_text}\n")


In [None]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Both vectorizers share the same pruning rules: ignore terms found in more
# than 95% of documents or in fewer than 2, and drop English stop words.
shared_options = {"max_df": 0.95, "min_df": 2, "stop_words": 'english'}

# Raw term counts: the input used for LDA.
count_vectorizer = CountVectorizer(**shared_options)
count_matrix = count_vectorizer.fit_transform(df_sample["cleaned_text"])

# TF-IDF weights: the input used for NMF.
# NOTE(review): this rebinds tfidf_vectorizer / tfidf_matrix, replacing the
# objects an earlier cell built with max_features=5000.
tfidf_vectorizer = TfidfVectorizer(**shared_options)
tfidf_matrix = tfidf_vectorizer.fit_transform(df_sample["cleaned_text"])


In [None]:
from sklearn.decomposition import LatentDirichletAllocation

# Number of topics shared by the LDA and NMF models.
num_topics = 5

# Fit LDA on the raw term counts with a fixed seed; fit() returns the
# estimator itself, so construction and fitting are chained.
lda = LatentDirichletAllocation(n_components=num_topics, random_state=42).fit(count_matrix)

# Vocabulary terms in the column order of count_matrix.
lda_feature_names = count_vectorizer.get_feature_names_out()


In [None]:
from sklearn.decomposition import NMF

# Fit NMF on the TF-IDF weights with a fixed seed; fit() returns the
# estimator itself, so construction and fitting are chained.
nmf = NMF(n_components=num_topics, random_state=42).fit(tfidf_matrix)

# Vocabulary terms in the column order of tfidf_matrix.
nmf_feature_names = tfidf_vectorizer.get_feature_names_out()


In [None]:
def display_topics(model, feature_names, num_words=10):
    """Print the highest-weighted terms of every topic in a fitted model.

    Args:
        model: Object exposing ``components_``, an iterable with one weight
            array per topic.
        feature_names: Sequence mapping a column index to its term.
        num_words: How many top terms to print per topic.

    Topics are numbered from 1 in the printed output.
    """
    for topic_idx, weights in enumerate(model.components_):
        # Indices ordered from largest to smallest weight, truncated to num_words.
        ranked = weights.argsort()[::-1][:num_words]
        top_terms = [feature_names[term_id] for term_id in ranked]
        print(f"\nTopic {topic_idx + 1}:")
        print(", ".join(top_terms))

# Report the topics of both fitted models: (heading, model, vocabulary).
topic_reports = [
    ("\n🔹 Topics extracted using LDA:", lda, lda_feature_names),
    ("\n🔹 Topics extracted using NMF:", nmf, nmf_feature_names),
]

for heading, fitted_model, vocabulary in topic_reports:
    print(heading)
    display_topics(fitted_model, vocabulary)


In [None]:
# Environment check: the message below is only reached if the import succeeds.
import pyLDAvis
print("pyLDAvis installed successfully!")


In [None]:
# Environment check: the message below is only reached if both imports succeed.
import gensim
import numpy
print("Gensim & NumPy are working!")


In [None]:
# Re-import necessary libraries
import gensim
import gensim.corpora as corpora
from sklearn.feature_extraction.text import CountVectorizer

# Check if lda_model exists in the notebook namespace.
# NOTE(review): lda_model is only assigned in a later cell, so on a
# top-to-bottom run this cell reports it as missing.
if "lda_model" in globals():
    print(lda_model)
else:
    print("⚠️ 'lda_model' is not defined! Re-run the topic modeling code.")


In [None]:
import spacy

# Load spaCy model
nlp = spacy.load("en_core_web_sm")

# Use the sampled complaint narratives as the corpus. The placeholder
# sentences that used to be here were never replaced, which meant the
# 5-topic LDA model below was trained on only two documents.
text_data = (
    df_sample["Consumer complaint narrative"]
    .dropna()        # nlp.pipe() needs real strings
    .astype(str)
    .tolist()
)

# Process text: Tokenization, lemmatization, and stopword removal.
# The parser and NER components are not needed for lemmas, so they are
# disabled to keep the pass over the whole sample fast.
processed_texts = []
for doc in nlp.pipe(text_data, disable=["parser", "ner"]):
    tokens = [token.lemma_.lower() for token in doc if not token.is_stop and token.is_alpha]
    if tokens:  # skip narratives that are empty after filtering
        processed_texts.append(tokens)

# Preview only the first two documents instead of printing the whole corpus.
print("Processed Texts:", processed_texts[:2])  # Check the output


In [None]:
import gensim
import gensim.corpora as corpora

# Build the token <-> id mapping from the tokenized documents.
dictionary = corpora.Dictionary(processed_texts)

# Encode every document as a bag-of-words list of (token_id, count) pairs.
corpus = list(map(dictionary.doc2bow, processed_texts))

print("✅ Dictionary & Corpus created successfully!")


In [None]:
print("Sample Processed Texts:", processed_texts[:2])  # Show first 2 entries

# The token -> id mapping grows with the vocabulary, so only a bounded
# preview is printed instead of the whole dictionary.
preview_size = 20
dictionary_preview = dict(list(dictionary.token2id.items())[:preview_size])
print("Dictionary:", dictionary_preview)  # Show dictionary mapping (first entries)

print("First Corpus Entry:", corpus[0])  # Show first document in BoW format


In [None]:
from gensim.models import LdaModel

# Number of topics to fit (tunable).
num_topics = 5

# Training options kept together: fixed seed, ten passes over the corpus,
# and per-word topic assignments enabled.
lda_options = dict(
    num_topics=num_topics,
    random_state=42,
    passes=10,
    per_word_topics=True,
)

# Train the LDA model on the bag-of-words corpus.
lda_model = LdaModel(corpus=corpus, id2word=dictionary, **lda_options)

print("✅ LDA Model trained successfully!")


In [None]:
# Fetch (topic id, formatted term list) pairs with ten terms per topic and
# print them numbered from 1.
topic_summaries = lda_model.print_topics(num_words=10)
for idx, topic in topic_summaries:
    print(f"🔹 Topic {idx + 1}: {topic}")


In [None]:
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis

# Switch pyLDAvis to inline rendering for notebook output.
pyLDAvis.enable_notebook()

# Build the visualization data from the trained gensim model, its
# bag-of-words corpus and the dictionary it was trained with.
lda_vis = gensimvis.prepare(
    topic_model=lda_model,
    corpus=corpus,
    dictionary=dictionary,
)

# Render the interactive view as the cell output.
pyLDAvis.display(lda_vis)
