## Libraries importing

In [None]:
import re
import spacy 
import gensim
from gensim.models import Word2Vec
from collections import Counter
from sklearn.decomposition import PCA
import numpy as np
import plotly.express as px
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
from itertools import chain
from sklearn.metrics import silhouette_score



## Data creation 

In [1]:
data = [
    "Data science is very important; it can help in many areas.",
    "The data I worked with was mainly for health and insurance purposes.",
    "I heard the data was corrupted before it was used.",
    "No data could have been applied in any of this training.",
    "Big datasets often require preprocessing before analysis.",
    "Machine learning models depend heavily on the quality of data.",
    "Sometimes missing data causes serious problems in predictions.",
    "Raw data usually contains noise and irrelevant information.",
    "Data cleaning is an essential step in any data science pipeline.",
    "Structured and unstructured data are both valuable for insights."
]

## Preprocessing

In [None]:
# Load spaCy English model
nlp = spacy.load("en_core_web_sm")

processed_data = []
for text in data:
    # Clean: lowercase + keep only letters and spaces
    cleaned_text = re.sub(r'[^a-z\s]', ' ', text.lower())

    # Tokenize with spaCy
    doc = nlp(cleaned_text)

    # Lemmatize, remove stopwords, remove very short tokens (like 'a', 'is')
    tokens = [token.lemma_ for token in doc 
              if not token.is_stop and token.is_alpha and len(token) > 2]

    processed_data.append(" ".join(tokens))

print(processed_data)


['data science important help area', 'datum work mainly health insurance purpose', 'hear datum corrupt', 'datum apply training', 'big dataset require preprocesse analysis', 'machine learning model depend heavily quality datum', 'miss datum cause problem prediction', 'raw datum usually contain noise irrelevant information', 'datum cleaning essential step data science pipeline', 'structured unstructured datum valuable insight']


In [None]:
# Bag of words
vecotrizer = CountVectorizer()
x_data =  vecotrizer.fit_transform(processed_data)

df_vec = pd.DataFrame(
    x_data.toarray(),
    columns=vecotrizer.get_feature_names_out()
)
df_vec

Unnamed: 0,analysis,apply,area,big,cause,cleaning,contain,corrupt,data,dataset,datum,depend,essential,health,hear,heavily,help,important,information,insight,insurance,irrelevant,learning,machine,mainly,miss,model,noise,pipeline,prediction,preprocesse,problem,purpose,quality,raw,require,science,step,structured,training,unstructured,usually,valuable,work
0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1
2,0,0,0,0,0,0,0,1,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
4,1,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,1,0,0,0,0,0,0,1,1,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0
8,0,0,0,0,0,1,0,0,1,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0
9,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,1,0


In [None]:
tfidf = TfidfVectorizer()
x_tfidf = tfidf.fit_transform(processed_data)

df = pd.DataFrame(
    x_tfidf.toarray(),
    columns=tfidf.get_feature_names_out()
)


In [19]:
df

Unnamed: 0,analysis,apply,area,big,cause,cleaning,contain,corrupt,data,dataset,datum,depend,essential,health,hear,heavily,help,important,information,insight,insurance,irrelevant,learning,machine,mainly,miss,model,noise,pipeline,prediction,preprocesse,problem,purpose,quality,raw,require,science,step,structured,training,unstructured,usually,valuable,work
0,0.0,0.0,0.474295,0.0,0.0,0.0,0.0,0.0,0.403194,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.474295,0.474295,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.403194,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.194723,0.0,0.0,0.438653,0.0,0.0,0.0,0.0,0.0,0.0,0.438653,0.0,0.0,0.0,0.438653,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.438653,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.438653
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.674651,0.0,0.0,0.299486,0.0,0.0,0.0,0.674651,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.674651,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.299486,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.674651,0.0,0.0,0.0,0.0
4,0.447214,0.0,0.0,0.447214,0.0,0.0,0.0,0.0,0.0,0.447214,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.447214,0.0,0.0,0.0,0.0,0.447214,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.178322,0.401705,0.0,0.0,0.0,0.401705,0.0,0.0,0.0,0.0,0.0,0.0,0.401705,0.401705,0.0,0.0,0.401705,0.0,0.0,0.0,0.0,0.0,0.0,0.401705,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.488121,0.0,0.0,0.0,0.0,0.0,0.216683,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.488121,0.0,0.0,0.0,0.488121,0.0,0.488121,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.401705,0.0,0.0,0.0,0.178322,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.401705,0.0,0.0,0.401705,0.0,0.0,0.0,0.0,0.0,0.401705,0.0,0.0,0.0,0.0,0.0,0.0,0.401705,0.0,0.0,0.0,0.0,0.0,0.0,0.401705,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.420988,0.0,0.0,0.357878,0.0,0.186882,0.0,0.420988,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.420988,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.357878,0.420988,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.216683,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.488121,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.488121,0.0,0.488121,0.0,0.488121,0.0


In [None]:
#  Word2Vec on the tokens 
tokens = [s.split() for s in processed_data]         
print("Example token list:", tokens[0][:10])

w2v = Word2Vec(
    sentences=tokens,
    vector_size=100,
    window=4,
    min_count=1,  
    sg=1,         
    workers=1,
    epochs=200
)

#  Plot the words from the tokens
freq = Counter(chain.from_iterable(tokens))
unique_words = list(freq.keys())
words_in_vocab = [w for w in unique_words if w in w2v.wv.key_to_index]

# Sanity check: if you ever see single letters here, your tokens are wrong
assert all(len(w) > 1 for w in words_in_vocab if w.isalpha()), "Got single-letter tokens—check tokenization!"

vectors = np.vstack([w2v.wv[w] for w in words_in_vocab])

pca = PCA(n_components=3, random_state=42)
coords3d = pca.fit_transform(vectors)
explained = (pca.explained_variance_ratio_ * 100).round(1)

sizes = np.array([freq[w] for w in words_in_vocab])
marker_size = 6 + 2 * (sizes - sizes.min()) / (sizes.ptp() + 1e-9)

fig = px.scatter_3d(
    x=coords3d[:, 0], y=coords3d[:, 1], z=coords3d[:, 2],
    text=words_in_vocab, hover_name=words_in_vocab,
    hover_data={"freq": sizes},
    title=f"Word2Vec (trained on your tokens) — PCA to 3D "
          f"(PC1 {explained[0]}%, PC2 {explained[1]}%, PC3 {explained[2]}%)"
)
fig.update_traces(mode="markers+text", textposition="top center",
                  marker=dict(size=marker_size))
fig.update_layout(
    scene=dict(xaxis_title="PC1", yaxis_title="PC2", zaxis_title="PC3"),
    margin=dict(l=0, r=0, t=60, b=0)
)
fig.show()

Example token list: ['data', 'science', 'important', 'help', 'area']


In [7]:
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
emb = model.encode(data, normalize_embeddings=True)  # shape: (n, 384)
pca = PCA(n_components=3, random_state=42)
emb3d = pca.fit_transform(emb)

# Explained variance
explained = (pca.explained_variance_ratio_ * 100).round(1)
fig = px.scatter_3d(
    x=emb3d[:, 0], y=emb3d[:, 1], z=emb3d[:, 2],
    text=[f"s{i}" for i in range(len(data))],     # short labels on points
    hover_name=[i for i in data],
    hover_data={"sentence": data},
)

fig.update_traces(mode="markers+text", textposition="top center")
fig.update_layout(
    title=f"MiniLM Sentence Embeddings — PCA to 3D "
          f"(PC1 {explained[0]}%, PC2 {explained[1]}%, PC3 {explained[2]}%)",
    scene=dict(xaxis_title="PC1", yaxis_title="PC2", zaxis_title="PC3"),
    margin=dict(l=0, r=0, t=60, b=0),
)
fig.show()


NameError: name 'data' is not defined

In [None]:
k = 3

# 1) Cluster in embedding space (cosine-normalized already)
kmeans = KMeans(n_clusters=k, n_init="auto", random_state=42)
labels = kmeans.fit_predict(emb)

# 2) Quality + sizes
sil = silhouette_score(emb, labels)  # Euclidean on normalized vectors ~ cosine-ish
sizes = np.bincount(labels)

print(f"KMeans(k={k}) — silhouette: {sil:.3f}")
for c, n in enumerate(sizes):
    print(f"  cluster {c}: {n} sentences")

# 3) Project sentence embeddings AND cluster centers to the same 3D space
emb3d = pca.transform(emb)                     # from your previous PCA fit
centers3d = pca.transform(kmeans.cluster_centers_)

short_labels = [
    " ".join(s.split()[:5]) + ("…" if len(s.split()) > 5 else "")
    for s in data
]

# 4) Plot points
fig = px.scatter_3d(
    x=emb3d[:, 0], y=emb3d[:, 1], z=emb3d[:, 2],
    color=labels.astype(str),
    text=short_labels,
    hover_name=short_labels,
    hover_data={"sentence": data, "cluster": labels.astype(str)},
    title=f"MiniLM Sentence Embeddings — PCA(3D) + KMeans(k={k})"
)
fig.update_traces(mode="markers+text", textposition="top center")

# 5) Add cluster centers as larger X markers
fig.add_scatter3d(
    x=centers3d[:, 0], y=centers3d[:, 1], z=centers3d[:, 2],
    mode="markers+text",
    text=[f"C{c}" for c in range(k)],
    textposition="middle center",
    marker=dict(size=10, symbol="x"),
    name="centers",
    showlegend=True
)

fig.update_layout(margin=dict(l=0, r=0, t=60, b=0),
                  scene=dict(xaxis_title="PC1", yaxis_title="PC2", zaxis_title="PC3"))
fig.show()

KMeans(k=3) — silhouette: 0.024
  cluster 0: 1 sentences
  cluster 1: 7 sentences
  cluster 2: 2 sentences
