In [52]:
import chromadb
from sentence_transformers import SentenceTransformer
from dotenv import load_dotenv
import os
from huggingface_hub import login


In [None]:
# Authenticate with the Hugging Face Hub using the HF_TOKEN value
# read from the environment (after loading any .env file).
load_dotenv()
hf_token = os.environ.get('HF_TOKEN')
login(token=hf_token, add_to_git_credential=True)

In [59]:
# Open the persistent Chroma database stored under ../databases/news.
# (Alternative store for the sample data: "../databases/sample_news_vectorstore".)
client = chromadb.PersistentClient(
    path="../databases/news",
)

In [60]:
# Fetch the target collection, creating it when it does not exist yet.
# NOTE: nothing is deleted here - get_or_create_collection returns an existing
# collection as-is, so records written by earlier runs are still present.
# collection_name = "sample_news"
collection_name = "processed_news"
collection = client.get_or_create_collection(name=collection_name)

In [6]:
# Load the sentence-embedding model; the encode() calls in the cells below all go through it.
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

In [61]:
# Smoke test: encode a one-sentence batch and keep the first (only) embedding.
vector = model.encode(["Well hi there"])[0]

In [62]:
# Show the sample embedding so its values can be inspected.
vector

array([-9.46715921e-02,  4.27619480e-02,  5.51620498e-02, -5.10962738e-04,
        1.16203260e-02, -6.80130497e-02,  2.76405811e-02,  6.06974475e-02,
        2.88530309e-02, -1.74128171e-02, -4.94346656e-02,  2.30993368e-02,
       -1.28614372e-02, -4.31402922e-02,  2.17510257e-02,  4.26548645e-02,
        5.10499887e-02, -7.79727027e-02, -1.23247243e-01,  3.67455557e-02,
        4.54110606e-03,  9.47937742e-02, -5.53098507e-02,  1.70641206e-02,
       -2.92873308e-02, -4.47124951e-02,  2.06784457e-02,  6.39320165e-02,
        2.27427781e-02,  4.87789772e-02, -2.33503035e-03,  4.72859442e-02,
       -2.86258962e-02,  2.30624825e-02,  2.45130397e-02,  3.95681970e-02,
       -4.33176197e-02, -1.02316625e-01,  2.79874774e-03,  2.39304882e-02,
        1.61556154e-02, -8.99078418e-03,  2.07255688e-02,  6.40123338e-02,
        6.89179525e-02, -6.98360875e-02,  2.89764395e-03, -8.10988992e-02,
        1.71123203e-02,  2.50653620e-03, -1.06529057e-01, -4.87733148e-02,
       -1.67762041e-02, -

In [63]:
import pandas as pd
# Load the pre-processed news articles.
# NOTE(review): the displayed frame contains an "Unnamed: 0" column, which suggests the
# CSV was written with its index; index_col=0 would absorb it - confirm before changing.
df = pd.read_csv("../data/processed/processed_news.csv")

In [64]:
# Display the loaded frame (pandas truncates the middle rows in the rendered output).
# NOTE(review): df.head() would keep the saved output smaller.
df

Unnamed: 0,Category,Link,Title,Description,Sub_category,Author,Published_Date,Published_Date_con,is_https,length_description
0,lifestyle,https://www.news.com.au/lifestyle/real-life/ne...,Woman’s first Lotto ticket in 20 years wins,a lucky perth woman has become an instant mill...,News Life,Emma Kirk,"March 19, 2025 - 9:31PM",2025-03-19 21:31:00,True,139
1,lifestyle,https://www.news.com.au/lifestyle/real-life/ne...,"If you laugh at this video, you’re the problem",a viral trend is currently sweeping social med...,News Life,Rebekah Scanlan,"March 19, 2025 - 11:56AM",2025-03-19 11:56:00,True,131
2,lifestyle,https://www.news.com.au/lifestyle/real-life/ne...,‘Don’t understand’: Act puzzling Aussies,australians are fed up with one particular act...,News Life,Claudia Poposki,"March 18, 2025 - 6:47PM",2025-03-18 18:47:00,True,121
3,entertainment-news,https://www.9news.com/article/entertainment/te...,"'SNL' announces hosts, musical guests for its ...",snl50 the anniversary special was nbc s mos...,"television, news, entertainment-news, entertai...",9news.com,"Wed, 19 Mar 2025 19:24:54 GMT",2025-03-19 19:24:54+00:00,True,103
4,entertainment-news,https://www.9news.com/article/entertainment/mu...,"Lollapalooza 2025 lineup: Olivia Rodrigo, Sabr...",the headliners this year bring a mix of pop h...,"music, entertainment, television, programs, en...",9news.com,"Wed, 19 Mar 2025 18:10:52 GMT",2025-03-19 18:10:52+00:00,True,68
...,...,...,...,...,...,...,...,...,...,...
448,sports,https://www.news.com.au/sport/ufc/ufc-stunner-...,Bombshell UFC announcement for Aussie duo,australia could soon have two ufc champions w...,UFC,Andrew Jackson,"February 20, 2025 - 4:47PM",2025-02-20 16:47:00,True,103
449,sports,https://www.news.com.au/sport/alex-volkanovski...,Biggest month in Aussie UFC history: Two title...,two aussies are preparing for ufc title fights...,Sport,Brendan Bradford,"February 20, 2025 - 4:30PM",2025-02-20 16:30:00,True,135
450,sports,https://www.news.com.au/sport/more-sports/auss...,‘Monster’: Tragic death of troubled Aus legend,an aussie surfing prodigy who crashed out of t...,More Sports,Daniel Peters,"March 18, 2025 - 12:56PM",2025-03-18 12:56:00,True,146
451,sports,https://www.news.com.au/sport/more-sports/cham...,Bodybuilding star drops scary truth bomb,a champion natural bodybuilder has revealed th...,More Sports,Matthew Sullivan,"March 17, 2025 - 6:50PM",2025-03-17 18:50:00,True,126


In [65]:
# Texts to embed: the Description column, one entry per row.
documents = df['Description'].to_list()
# Encode all descriptions with a single encode() call, then convert the result
# to nested lists of Python floats.
vectors = (
    model.encode(documents)
    .astype(float)
    .tolist()
)

In [67]:
# Build one metadata dict per row from the six descriptive columns,
# renamed to the lowercase keys stored alongside each document.
metadatas = (
    df[['Category', 'Sub_category', 'Author', 'Published_Date', 'Link', 'Title']]
    .rename(columns={
        'Category': "category",
        'Sub_category': "sub_category",
        'Author': "author",
        'Published_Date': "published_date",
        'Link': "link",
        'Title': "title",
    })
    .to_dict(orient='records')
)


In [68]:
# One positional id per DataFrame row: "doc_0", "doc_1", ...
ids = [f"doc_{row_number}" for row_number in range(df.shape[0])]


In [69]:
# Write the documents, their embeddings and their metadata into the collection.
# upsert() is used instead of add() so this cell is safe to re-run: the ids are
# positional ("doc_0", ...) and the collection is reused between runs, so records
# whose id already exists are overwritten with the current data instead of
# being left at their previous values.
collection.upsert(
    ids=ids,
    documents=documents,
    embeddings=vectors,
    metadatas=metadatas
)

## Visualise the data

In [70]:
import numpy as np

# Read every stored record back from the collection for plotting.
result = collection.get(include=['embeddings', 'documents', 'metadatas'])
documents = result['documents']
vectors = np.array(result['embeddings'])
# Category label of each record, in the same order as `vectors`.
categories = [meta['category'] for meta in result['metadatas']]

In [71]:
# Distinct categories present in the collection.
# Sorted so the order (and any colour assignment derived from it) is identical on
# every run; the order of a bare list(set(...)) over strings can change between
# kernel sessions. key=str keeps the sort working even if a non-string value
# (e.g. a missing category) is present.
CATEGORIES = sorted(set(categories), key=str)

In [72]:
import random

# Category labels, in the order used to index the palette below.
CATEGORIES = [
    'business',
    'entertainment-news',
    'lifestyle',
    'sports',
    'sport',
    'technology',
    'insights & analysis',
    'world',
    'finance',
    'travel',
    'general',
    'entertainment',
    'politics',
    'investigations',
    'national',
]

# Palette of distinct colours; entry i belongs to CATEGORIES[i].
COLORS = [
    # Red-Orange, Green, Blue
    '#FF5733', '#33FF57', '#3357FF',
    # Pink, Gold, Purple
    '#FF33A6', '#FFD700', '#800080',
    # Tomato, Teal, Brown
    '#FF6347', '#008080', '#A52A2A',
    # Light Sea Green, Deep Pink, Dodger Blue
    '#20B2AA', '#FF1493', '#1E90FF',
    # Indigo, Dark Slate Gray, Chocolate
    '#4B0082', '#2F4F4F', '#D2691E',
]





In [73]:
# Colour for each document, looked up from its category.
# A dict replaces the repeated CATEGORIES.index() list scans, and a neutral
# fallback keeps the plot working when the collection holds a category that is
# missing from CATEGORIES or has no palette entry (previously a ValueError /
# IndexError).
FALLBACK_COLOR = '#808080'  # Gray
color_by_category = {}
for category, color in zip(CATEGORIES, COLORS):
    # setdefault: the first occurrence wins, matching list.index() semantics.
    color_by_category.setdefault(category, color)
colors = [color_by_category.get(c, FALLBACK_COLOR) for c in categories]

In [74]:
from sklearn.manifold import TSNE

In [75]:
# Reduce the stored embeddings to three dimensions with t-SNE (fixed seed).
# NOTE(review): reduced_vectors is reassigned by the 2-D fit in a later cell
# before any plot reads it, so this fit looks redundant - confirm.
tsne = TSNE(
    n_components=3,
    random_state=42,
    n_jobs=-1,
)
reduced_vectors = tsne.fit_transform(vectors)

In [76]:
import plotly.graph_objects as go

In [None]:
# Let's try a 2D chart: project the embeddings onto two t-SNE components.

tsne = TSNE(
    n_components=2,
    random_state=42,
    n_jobs=-1,
)
reduced_vectors = tsne.fit_transform(vectors)

In [78]:
# Create the 2D scatter plot: one marker per stored document, coloured by category.
fig = go.Figure(data=[go.Scatter(
    x=reduced_vectors[:, 0],
    y=reduced_vectors[:, 1],
    mode='markers',
    marker=dict(size=3, color=colors, opacity=0.7),
)])

# `scene` only configures 3D scenes, so on this 2D figure the axis titles are
# set on the cartesian axes directly; otherwise they are never rendered.
fig.update_layout(
    title='2D Chroma Vectorstore Visualization',
    xaxis_title='x',
    yaxis_title='y',
    width=1200,
    height=800,
    margin=dict(r=20, b=10, l=10, t=40)
)

fig.show()

In [80]:
# Let's try a 3D chart: project the embeddings onto three t-SNE components.

tsne = TSNE(
    n_components=3,
    random_state=42,
    n_jobs=-1,
)
reduced_vectors = tsne.fit_transform(vectors)

In [81]:
# Render the 3D projection as a scatter plot, one marker per document,
# coloured by category.
fig = go.Figure()
fig.add_trace(
    go.Scatter3d(
        x=reduced_vectors[:, 0],
        y=reduced_vectors[:, 1],
        z=reduced_vectors[:, 2],
        mode='markers',
        marker={'size': 3, 'color': colors, 'opacity': 0.7},
    )
)

# Title, axis labels, canvas size and margins.
fig.update_layout(
    title='3D Chroma Vector Store Visualization',
    scene={'xaxis_title': 'x', 'yaxis_title': 'y', 'zaxis_title': 'z'},
    width=1200,
    height=800,
    margin={'r': 20, 'b': 10, 'l': 10, 't': 40},
)

fig.show()