# Setup

In [30]:
# --- Runtime switches and Docker networking ---------------------------------
# CASSANDRA_START_FROM_SCRATCH is read by the keyspace cell to decide whether
# the keyspace is dropped before being recreated.
CASSANDRA_START_FROM_SCRATCH = True
DOCKER_INTERNAL_HOST = "host.docker.internal"
DOCKER_DNS = ["10.15.30.1"]

# --- Cluster topology --------------------------------------------------------
CASSANDRA_CLUSTER_NAME = "cassandra-cluster.jsanti30.vpn.itam.mx"
CASSANDRA_TOTAL_NODES = 3

# 1-based node ordinals; every per-node list below is derived from them so the
# lists always have CASSANDRA_TOTAL_NODES entries in matching order.
_NODE_ORDINALS = range(1, CASSANDRA_TOTAL_NODES + 1)

# All nodes share one IP address and are told apart by name and port.
CASSANDRA_NODE_IPS = ["10.15.20.32" for _ in _NODE_ORDINALS]
CASSANDRA_NODE_NAMES = [f"cassandra-node-{ordinal}" for ordinal in _NODE_ORDINALS]
CASSANDRA_NODE_HOSTNAMES = [
    f"{node_name}.jsanti30.vpn.itam.mx" for node_name in CASSANDRA_NODE_NAMES
]

# Per-node ports: a fixed base plus the node ordinal (node 1 -> base + 1, ...).
CASSANDRA_NODE_GOSSIP_PORTS = [7000 + ordinal for ordinal in _NODE_ORDINALS]
CASSANDRA_NODE_RPC_PORTS = [9040 + ordinal for ordinal in _NODE_ORDINALS]
CASSANDRA_NODE_SSL_GOSSIP_PORTS = [7500 + ordinal for ordinal in _NODE_ORDINALS]
CASSANDRA_NODE_JMX_PORTS = [7200 + ordinal for ordinal in _NODE_ORDINALS]

# --- Credentials ---------------------------------------------------------------
# NOTE(review): secrets are hard-coded and the "passowrd" spelling is kept
# verbatim; presumably it must match whatever generated the certificates --
# confirm before correcting it.
CASSANDRA_CA_CERT_PASSWORD = "cassandra_cluster_ca_cert_passowrd"
CASSANDRA_NODE_CERT_PASSWORD = "cassandra_cluster_cert_passowrd"
CASSANDRA_INIT_USER = "cassandra"
CASSANDRA_INIT_PASSWORD = "cassandra"

CASSANDRA_WORKDIR = "/var/lib/cassandra"

In [31]:
import os
from pathlib import Path

# Current working directory expressed relative to itself, as a plain string.
LOCALHOST_WORKDIR = os.path.relpath(Path.cwd())

# Sub-directories of the working directory used by the rest of the notebook.
DOCKER_MOUNTDIR, CASSANDRA_LOCALHOST_CLUSTER_CA_CERTDIR = (
    os.path.join(LOCALHOST_WORKDIR, subdir) for subdir in ("mount", "cluster_certs")
)

# Create the mount directory up front; a pre-existing directory is fine.
mount_path = Path(DOCKER_MOUNTDIR)
mount_path.mkdir(parents=True, exist_ok=True)

# Only sets the flag when the environment does not define it already; being the
# last expression, the effective value is echoed as the cell output.
os.environ.setdefault("CQLENG_ALLOW_SCHEMA_MANAGEMENT", "True")

'True'

# Session creation

In [32]:
from cassandra.cluster import Cluster
from cassandra.connection import DefaultEndPoint
from cassandra.auth import PlainTextAuthProvider
from cassandra.query import dict_factory

# One driver endpoint per node: each node is reached through its own hostname
# and native-transport (RPC) port.
cassandra_nodes = [
    DefaultEndPoint(hostname, rpc_port)
    for hostname, rpc_port in zip(CASSANDRA_NODE_HOSTNAMES, CASSANDRA_NODE_RPC_PORTS)
]

# Format the "host:port" pairs once and reuse them in both messages. Keeping the
# inner formatting out of the outer f-string also avoids nested same-quote
# f-strings, which are a SyntaxError on Python versions older than 3.12.
node_addresses = [f"{node.address}:{node.port}" for node in cassandra_nodes]
print(f"🔗 Connecting to: {node_addresses}")
print(f"JDBC URL: jdbc:cassandra://{','.join(node_addresses)}")

auth_provider = PlainTextAuthProvider(
    username=CASSANDRA_INIT_USER, password=CASSANDRA_INIT_PASSWORD
)
cluster = Cluster(contact_points=cassandra_nodes, auth_provider=auth_provider)

session = cluster.connect()
# Rows come back as dicts; the cqlengine cells further down reuse this session.
session.row_factory = dict_factory
print(f"✅ Connected to cluster: {cluster.metadata.cluster_name}")
# NOTE(review): every node shares one IP (see CASSANDRA_NODE_IPS) and differs
# only by port; if fewer hosts than CASSANDRA_TOTAL_NODES are reported here,
# check how the nodes advertise their native-transport address/port.
print(f"🌐 Nodes found: {len(cluster.metadata.all_hosts())}")

🔗 Connecting to: ['cassandra-node-1.jsanti30.vpn.itam.mx:9041', 'cassandra-node-2.jsanti30.vpn.itam.mx:9042', 'cassandra-node-3.jsanti30.vpn.itam.mx:9043']
JDBC URL: jdbc:cassandra://cassandra-node-1.jsanti30.vpn.itam.mx:9041,cassandra-node-2.jsanti30.vpn.itam.mx:9042,cassandra-node-3.jsanti30.vpn.itam.mx:9043
✅ Connected to cluster: cassandra-cluster.jsanti30.vpn.itam.mx
🌐 Nodes found: 2


In [33]:
import pprint
import pandas as pd
from IPython.display import display

# Notebook-wide pandas display settings: never truncate rows, columns or cell
# contents, and render HTML tables with a 1px border.
for option_name, option_value in {
    "display.max_rows": None,
    "display.max_columns": None,
    "display.max_colwidth": None,
    "display.html.use_mathjax": True,
    "display.html.border": 1,
    "display.html.table_schema": False,
    "display.expand_frame_repr": True,
}.items():
    pd.set_option(option_name, option_value)

# One column per host known to the driver, one row per attribute of the driver's
# host object (the frame is transposed so attributes read top to bottom).
known_hosts = cluster.metadata.all_hosts()
host_attributes = pd.DataFrame([vars(known_host) for known_host in known_hosts])
display(host_attributes.sort_index().transpose())

Unnamed: 0,0,1
endpoint,cassandra-node-3.jsanti30.vpn.itam.mx:9043,10.15.20.32:9042
conviction_policy,<cassandra.policies.SimpleConvictionPolicy object at 0x00000208D91856A0>,<cassandra.policies.SimpleConvictionPolicy object at 0x00000208D9185480>
host_id,4101b01e-284b-4ae0-b8ec-ae47cc87611e,0ba622d3-0c43-42dc-84df-4f2030b156df
_datacenter,dc1,dc1
_rack,rack1,rack1
lock,<unlocked _thread.RLock object owner=0 count=0 at 0x00000208D9825D00>,<unlocked _thread.RLock object owner=0 count=0 at 0x00000208D954DBC0>
is_up,True,False
listen_address,172.19.0.4,
listen_port,7003.0,
broadcast_address,10.15.20.32,10.15.20.32


In [None]:
# from cassandra.cluster import ResultSet

# peers_v2: ResultSet = session.execute("SELECT * FROM system.peers_v2")
# locals: ResultSet = session.execute("SELECT * FROM system.local")

# for peer_v2 in peers_v2:
#     print(" | ".join([f"{col}: {peer_v2[col]}" for col in peers_v2.column_names]))
# print("=" * 100)
# for local in locals:
#     print(" | ".join([f"{col}: {local[col]}" for col in locals.column_names]))
# print("=" * 100)

Total nodes (según Cassandra): 2
Total nodes (según Cassandra): 2
Total nodes (según Cassandra): 2
Total nodes (según Cassandra): 2
Total nodes (según Cassandra): 2
Total nodes (según Cassandra): 2
Total nodes (según Cassandra): 2
Total nodes (según Cassandra): 2
Total nodes (según Cassandra): 2


KeyboardInterrupt: 

### Keyspace

In [34]:
keyspace_name = "generic_analytics"

# Schema changes have to propagate through the cluster and can take longer than
# the driver's default per-request timeout (this cell was recorded failing with
# OperationTimedOut), so give DDL statements an explicit, more generous budget.
SCHEMA_CHANGE_TIMEOUT_SECONDS = 60.0

# Start from a clean slate only when explicitly requested in the setup cell.
if CASSANDRA_START_FROM_SCRATCH:
    session.execute(
        f"DROP KEYSPACE IF EXISTS {keyspace_name}",
        timeout=SCHEMA_CHANGE_TIMEOUT_SECONDS,
    )

OperationTimedOut: errors={'cassandra-node-3.jsanti30.vpn.itam.mx:9043': 'Client request timeout. See Session.execute[_async](timeout)'}, last_host=cassandra-node-3.jsanti30.vpn.itam.mx:9043

In [28]:
keyspace_name = "generic_analytics"

# Schema changes have to propagate through the cluster and can take longer than
# the driver's default per-request timeout (this cell was recorded failing with
# OperationTimedOut), so give DDL statements an explicit, more generous budget.
SCHEMA_CHANGE_TIMEOUT_SECONDS = 60.0

# Create the keyspace if needed, with one replica per node in datacenter 'dc1'.
session.execute(
    f"""
CREATE KEYSPACE IF NOT EXISTS {keyspace_name} 
WITH replication = {{
    'class': 'NetworkTopologyStrategy', 
    'dc1': {CASSANDRA_TOTAL_NODES}
}}
""",
    timeout=SCHEMA_CHANGE_TIMEOUT_SECONDS,
)
# Make the keyspace the session default so later statements can use bare table names.
session.set_keyspace(keyspace_name)

# Create Table
# Partition Key: city (distributes data)
# Clustering Column: user_id (sorts data within city)
session.execute(
    """
CREATE TABLE IF NOT EXISTS user_metrics (
    city text,
    user_id uuid,
    username text,
    session_duration int,
    last_access timestamp,
    PRIMARY KEY (city, user_id)
);
""",
    timeout=SCHEMA_CHANGE_TIMEOUT_SECONDS,
)

OperationTimedOut: errors={'cassandra-node-3.jsanti30.vpn.itam.mx:9043': 'Client request timeout. See Session.execute[_async](timeout)'}, last_host=cassandra-node-3.jsanti30.vpn.itam.mx:9043

In [29]:
import time
from cassandra import InvalidRequest

keyspace_name = "generic_analytics"

# Schema changes can take longer than the driver's default per-request timeout
# (this cell was recorded failing with OperationTimedOut).
SCHEMA_CHANGE_TIMEOUT_SECONDS = 60.0
USE_KEYSPACE_ATTEMPTS = 10

# Make sure the keyspace exists (does not fail when it already does).
session.execute(
    f"""
CREATE KEYSPACE IF NOT EXISTS {keyspace_name}
WITH replication = {{
  'class': 'NetworkTopologyStrategy',
  'dc1': {int(CASSANDRA_TOTAL_NODES)}
}};
""",
    timeout=SCHEMA_CHANGE_TIMEOUT_SECONDS,
)

session.cluster.control_connection.wait_for_schema_agreement()

# Idempotent retry: the coordinator may not know the new keyspace yet, in which
# case USE is rejected with InvalidRequest; wait for agreement and try again.
for _ in range(USE_KEYSPACE_ATTEMPTS):
    try:
        session.execute(f"USE {keyspace_name};")  # same effect as set_keyspace, but simpler
        break
    except InvalidRequest:
        time.sleep(0.5)
        session.cluster.control_connection.wait_for_schema_agreement()
else:
    # Every attempt was rejected: fail here instead of letting later cells run
    # against a session that has no keyspace selected.
    raise RuntimeError(
        f"Keyspace {keyspace_name!r} was still not usable after "
        f"{USE_KEYSPACE_ATTEMPTS} attempts"
    )





OperationTimedOut: errors={'cassandra-node-3.jsanti30.vpn.itam.mx:9043': 'Client request timeout. See Session.execute[_async](timeout)'}, last_host=cassandra-node-3.jsanti30.vpn.itam.mx:9043

### Insert

In [15]:
from faker import Faker

# Shared Faker instance; later cells use it to generate cities, user names,
# random integers and timestamps for the sample rows.
# NOTE(review): no seed is set, so the generated data differs on every run.
fake = Faker()

In [16]:
import uuid
from datetime import datetime, timezone
from cassandra.query import SimpleStatement
from cassandra import ConsistencyLevel
from cassandra.query import BatchStatement, BatchType

# 1. Prepare the statement once, outside the loop, so it is parsed a single time.
query = """
INSERT INTO user_metrics (city, user_id, username, session_duration, last_access)
VALUES (?, ?, ?, ?, ?)
"""
prepared = session.prepare(query)

# 2. Create the Batch object
# LOGGED ensures atomicity but adds disk overhead, UNLOGGED is faster.
# NOTE(review): the rows use random cities, so this batch spans many partitions.
batch_records = 200
batch = BatchStatement(batch_type=BatchType.LOGGED)
# The consistency level is set on the batch itself, not on the prepared statement.
batch.consistency_level = ConsistencyLevel.QUORUM
print(f"Preparing batch of {batch_records} records...")
for _ in range(batch_records):
    batch.add(
        prepared,
        (
            fake.city(),
            uuid.uuid4(),
            fake.user_name(),
            fake.random_int(min=1, max=3600),
            # The driver treats naive datetimes as UTC, so a naive local
            # datetime.now() would be stored shifted by the local UTC offset.
            # Pass an aware UTC timestamp to store the correct instant.
            datetime.now(timezone.utc),
        ),
    )

# 3. Execute the entire batch at once
session.execute(batch)
print("Batch successfully committed to the cluster.")

Preparing batch of 200 records...
Batch successfully committed to the cluster.


### Query

In [17]:
from cassandra.cluster import ResultSet
from typing import cast

# Full-table row count; .one() yields the single result row (a dict, because of
# the session's dict row factory).
count_result = cast(ResultSet, session.execute("SELECT count(*) FROM user_metrics"))
print(count_result.one())

{'count': 200}


In [18]:
from cassandra.cluster import ResultSet

# Fetch up to 100 rows from the table and print each one.
rows: ResultSet = session.execute("SELECT * FROM user_metrics LIMIT 100")
# current_rows holds the rows of the page that is currently loaded in the result set.
for row in rows.current_rows:
    print(row)

{'city': 'Andreashire', 'user_id': UUID('f2f89890-f6af-4ffe-aa61-0f086e783271'), 'last_access': datetime.datetime(2026, 2, 13, 17, 23, 41, 657000), 'session_duration': 2832, 'username': 'bradleyjulie'}
{'city': 'North Taylor', 'user_id': UUID('6440e8b7-e881-47e7-a1e7-ca6ba87af585'), 'last_access': datetime.datetime(2026, 2, 13, 17, 23, 41, 677000), 'session_duration': 427, 'username': 'stanleyjuan'}
{'city': 'North Trevorhaven', 'user_id': UUID('94c5e907-6b85-4348-aa84-f7bd10c32520'), 'last_access': datetime.datetime(2026, 2, 13, 17, 23, 41, 636000), 'session_duration': 3298, 'username': 'luis05'}
{'city': 'North Williammouth', 'user_id': UUID('9348b906-d2a5-4713-93af-cf8f438440e7'), 'last_access': datetime.datetime(2026, 2, 13, 17, 23, 41, 674000), 'session_duration': 761, 'username': 'qthomas'}
{'city': 'East Jessicastad', 'user_id': UUID('90f87eec-b2e6-44da-a495-a850247e7b0c'), 'last_access': datetime.datetime(2026, 2, 13, 17, 23, 41, 598000), 'session_duration': 3459, 'username': '

In [19]:
from cassandra.cluster import ResultSet

# List the distinct partition-key values (cities) present in the table.
# The statement has no placeholders, hence the empty parameter list.
rows: ResultSet = session.execute("SELECT DISTINCT city FROM user_metrics", [])
# current_rows holds the rows of the page that is currently loaded in the result set.
for row in rows.current_rows:
    print(row)

{'city': 'Andreashire'}
{'city': 'North Taylor'}
{'city': 'North Trevorhaven'}
{'city': 'North Williammouth'}
{'city': 'East Jessicastad'}
{'city': 'West Leon'}
{'city': 'East Thomasmouth'}
{'city': 'Elizabethville'}
{'city': 'Port Chelsea'}
{'city': 'Carlosberg'}
{'city': 'New Zachary'}
{'city': 'Rickystad'}
{'city': 'Christinashire'}
{'city': 'North Mark'}
{'city': 'Anthonyview'}
{'city': 'South Hollystad'}
{'city': 'Jonesmouth'}
{'city': 'Lake Jennifer'}
{'city': 'North Charlesland'}
{'city': 'Nathanport'}
{'city': 'Kathychester'}
{'city': 'Nicoleberg'}
{'city': 'East Kiara'}
{'city': 'New Adamland'}
{'city': 'Jonesfurt'}
{'city': 'Lake Nathantown'}
{'city': 'Jeffreymouth'}
{'city': 'Ayersbury'}
{'city': 'Rodriguezberg'}
{'city': 'Pamelaborough'}
{'city': 'Port Thomas'}
{'city': 'Escobarchester'}
{'city': 'Lake Jeffport'}
{'city': 'Matthewsstad'}
{'city': 'Port Abigail'}
{'city': 'Bakershire'}
{'city': 'East Tylerchester'}
{'city': 'Rhodeston'}
{'city': 'East Jenna'}
{'city': 'Moore

### Find nodes storing random data

In [20]:
import random
import pprint

# Pick a random point in the signed 64-bit token range (presumably the range of
# the cluster's partitioner -- confirm) and take the first row at or after it.
random_token = random.randint(-9223372036854775808, 9223372036854775807)
query = "SELECT * FROM user_metrics WHERE token(city) >= %s LIMIT 1"
row: dict | None = cast(ResultSet, session.execute(query, [random_token])).one()
if row is None:
    # The random token lies above every stored partition: wrap around the ring
    # and take the first row of the table instead of failing on row['city'].
    row = cast(ResultSet, session.execute("SELECT * FROM user_metrics LIMIT 1")).one()
if row is None:
    raise LookupError("user_metrics is empty; insert rows before looking up replicas")
print(f"Random user_metric: {pprint.pformat(row)}")

# Binding the primary key to a prepared statement lets the driver compute the
# routing key, which the cluster metadata maps to the replicas owning that row.
prepared = session.prepare("SELECT * FROM user_metrics WHERE city=? AND user_id=?")
bound = prepared.bind([row['city'], row['user_id']])
routing_key = bound.routing_key
nodes = cluster.metadata.get_replicas(keyspace_name, routing_key)

print(f"Nodes storing '{row['city']}':")
for node in nodes:
    print(f" - Host: {node.address}, Gossip Port: {node.broadcast_port}")

Random user_metric: {'city': 'Meaganmouth',
 'last_access': datetime.datetime(2026, 2, 13, 17, 23, 41, 671000),
 'session_duration': 1398,
 'user_id': UUID('98f19a8d-0e06-4178-a24f-45953d37f139'),
 'username': 'kimberlyrodriguez'}
Nodes storing 'Meaganmouth':
 - Host: cassandra-node-3.jsanti30.vpn.itam.mx, Gossip Port: 7003
 - Host: 10.15.20.32, Gossip Port: 7002


### ORM-like

In [22]:
from typing import cast
from cassandra.cqlengine import columns
from cassandra.cqlengine.query import ModelQuerySet
from cassandra.cqlengine.models import Model
from cassandra.cqlengine.management import sync_table, create_keyspace_network_topology
from cassandra.cqlengine import connection

# 1. Connect the engine
# Reuse the already-connected driver session for cqlengine instead of opening a
# second connection. NOTE(review): cqlengine expects the session to use
# dict_factory rows, which the session-creation cell configures -- confirm.
connection.set_session(session)


# 2. Define your "Generic" Model
class UserMetrics(Model):
    """cqlengine model mapped to the ``user_metrics`` table.

    The primary key is ``(city, user_id)``: ``city`` is the partition key and
    ``user_id`` the clustering column. The remaining columns are plain data.
    """

    __table_name__ = "user_metrics"

    # Partition key. It is flagged explicitly with partition_key=True; without
    # that flag the first primary_key=True column would play this role.
    city = columns.Text(primary_key=True, partition_key=True)

    # Second primary-key column, i.e. the clustering key. A random UUID is
    # generated when the caller does not supply one.
    user_id = columns.UUID(primary_key=True, default=uuid.uuid4)

    # Attributes (Data)
    # index=True asks cqlengine to maintain a secondary index on this column.
    username = columns.Text(index=True)
    session_duration = columns.Integer()
    last_access = columns.DateTime()


# 3. Create Keyspace and Sync Table (Equivalent to CREATE TABLE)
# One replica per node in datacenter "dc1".
dc_replication = {"dc1": CASSANDRA_TOTAL_NODES}
create_keyspace_network_topology(keyspace_name, dc_replication)

# Let every node agree on the schema before the table is synchronised.
control_connection = session.cluster.control_connection
control_connection.wait_for_schema_agreement()
sync_table(UserMetrics)

In [23]:
from datetime import timezone

# 4. Use it like an ORM
# Create UserMetrics
# Model.create() already persists the new row, so no follow-up save() is needed.
new_metric: UserMetrics = UserMetrics.create(
    city=fake.city(),
    username=fake.name(),
    session_duration=120,
    # Naive datetimes are interpreted as UTC when written, so pass an aware UTC
    # timestamp instead of naive local time to store the correct instant.
    last_access=datetime.now(timezone.utc),
)
print(f"Saved user_metris: {new_metric}")


# 5. Query UserMetrics
# session_duration is not part of the primary key, hence allow_filtering().
user_metrics = (
    cast(ModelQuerySet, UserMetrics.objects())
    .filter(session_duration__gte=120)
    .allow_filtering() # Non-recommended, used for non-primary key queries
)
for user_metric in user_metrics:
    print(f"Query user_metris: {user_metric}")

Saved user_metris: UserMetrics <city=East Amber, user_id=9a66d341-dfcd-4056-8493-4e5a3f66f756>
Query user_metris: UserMetrics <city=Andreashire, user_id=f2f89890-f6af-4ffe-aa61-0f086e783271>
Query user_metris: UserMetrics <city=North Taylor, user_id=6440e8b7-e881-47e7-a1e7-ca6ba87af585>
Query user_metris: UserMetrics <city=North Trevorhaven, user_id=94c5e907-6b85-4348-aa84-f7bd10c32520>
Query user_metris: UserMetrics <city=North Williammouth, user_id=9348b906-d2a5-4713-93af-cf8f438440e7>
Query user_metris: UserMetrics <city=East Jessicastad, user_id=90f87eec-b2e6-44da-a495-a850247e7b0c>
Query user_metris: UserMetrics <city=West Leon, user_id=1d1db621-6bf8-4ef0-8944-64a0ba77c9f9>
Query user_metris: UserMetrics <city=East Thomasmouth, user_id=539acee6-71ea-4069-a5df-2dd0b47490bc>
Query user_metris: UserMetrics <city=Elizabethville, user_id=d9f240f3-0087-4ad2-b8fb-1bfe46cf7c1c>
Query user_metris: UserMetrics <city=Port Chelsea, user_id=54adfe01-d281-4427-bfbf-e9ffd87722c9>
Query user_metr

### Insert/Create

In [24]:
from typing import cast

# Insert one row that expires on its own: the TTL is one day, in seconds.
one_day_seconds = 24 * 60 * 60
expiring_metrics = cast(ModelQuerySet, UserMetrics.ttl(one_day_seconds))

# Last expression of the cell, so the created model instance is displayed.
expiring_metrics.create(
    city=fake.city(),
    username=fake.user_name(),
    session_duration=fake.random_int(10, 1000),
    last_access=fake.date_time(),
)

UserMetrics(city='Mathewfort', user_id=UUID('cb6cc1d3-43df-4d1c-924a-2bf6cadf59ff'), username='darrylmendez', session_duration=261, last_access=datetime.datetime(1988, 2, 7, 22, 39, 23))

In [25]:
from typing import cast
from cassandra.cqlengine.query import BatchQuery
from cassandra.cqlengine.query import ModelQuerySet

# Queue 100 inserts on one cqlengine batch; the batch is sent when the
# ``with`` block exits.
with BatchQuery() as metrics_batch:
    for _ in range(100):
        fake_metric = dict(
            city=fake.city(),
            username=fake.user_name(),
            session_duration=fake.random_int(10, 1000),
            last_access=fake.date_time(),
        )
        cast(ModelQuerySet, UserMetrics.batch(metrics_batch)).create(**fake_metric)

### Delete

In [26]:
# Select every row whose session lasted at least 120 and delete the rows one by
# one. session_duration is not part of the primary key, hence allow_filtering().
metrics_queryset = cast(ModelQuerySet, UserMetrics.objects())
user_metrics = metrics_queryset.filter(
    session_duration__gte=120
).allow_filtering()  # Non-recommended, used for non-primary key queries

for user_metric in user_metrics:
    user_metric.delete()
    print(f"Delete user_metris: {user_metric}")

Delete user_metris: UserMetrics <city=Chavezshire, user_id=4120b54c-44ff-4175-b9db-abfdcd71aad2>
Delete user_metris: UserMetrics <city=Andreashire, user_id=f2f89890-f6af-4ffe-aa61-0f086e783271>
Delete user_metris: UserMetrics <city=North Taylor, user_id=6440e8b7-e881-47e7-a1e7-ca6ba87af585>
Delete user_metris: UserMetrics <city=North Trevorhaven, user_id=94c5e907-6b85-4348-aa84-f7bd10c32520>
Delete user_metris: UserMetrics <city=North Williammouth, user_id=9348b906-d2a5-4713-93af-cf8f438440e7>
Delete user_metris: UserMetrics <city=East Jessicastad, user_id=90f87eec-b2e6-44da-a495-a850247e7b0c>
Delete user_metris: UserMetrics <city=West Leon, user_id=1d1db621-6bf8-4ef0-8944-64a0ba77c9f9>
Delete user_metris: UserMetrics <city=East Thomasmouth, user_id=539acee6-71ea-4069-a5df-2dd0b47490bc>
Delete user_metris: UserMetrics <city=Lake Tiffanychester, user_id=6cd9e818-e7ff-4188-b2eb-23cfa407e0e4>
Delete user_metris: UserMetrics <city=Elizabethville, user_id=d9f240f3-0087-4ad2-b8fb-1bfe46cf7c1

In [None]:
# from cassandra.cqlengine import connection

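# # This neither flushes memtables nor triggers a compaction: it only shortens gc_grace_seconds, i.e. how long tombstones (such as those left by the deletes above) are kept before a later compaction may purge them.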
# connection.execute(f"ALTER TABLE {keyspace_name}.user_metrics WITH gc_grace_seconds = 10")

: 

: 