In [1]:
!nodetool status

Datacenter: datacenter1
Status=Up/Down
|/ State=Normal/Leaving/Joining/Moving
--  Address     Load        Tokens  Owns (effective)  Host ID                               Rack 
UN  172.23.0.3  232.27 KiB  16      100.0%            e052f66c-071c-49b5-af26-649090b33406  rack1
UN  172.23.0.4  228.83 KiB  16      100.0%            99665ca9-df67-4e1c-b77c-71fd8947e936  rack1
UN  172.23.0.2  227.01 KiB  16      100.0%            571d4ea2-5fcc-45c4-9cb0-d3cea73296ac  rack1



In [2]:
#Connect to the Cassandra cluster
from cassandra.cluster import Cluster
cluster = Cluster(['p6-db-1', 'p6-db-2', 'p6-db-3'])
cass = cluster.connect()

In [3]:
#q1

In [4]:
from cassandra.query import SimpleStatement

cluster = Cluster(['p6-db-1', 'p6-db-2', 'p6-db-3'])
cass = cluster.connect()

cass.execute("DROP KEYSPACE IF EXISTS weather")

cass.execute("""
    CREATE KEYSPACE weather 
    WITH replication = {'class': 'SimpleStrategy', 'replication_factor': '3'}
""")

cass.set_keyspace('weather')

cass.execute("""
    CREATE TYPE station_record (
        tmin int,
        tmax int
    )
""")

cass.execute("""
    CREATE TABLE stations (
        id text,
        name text static,
        date date,
        record station_record,
        PRIMARY KEY (id, date)
    ) WITH CLUSTERING ORDER BY (date ASC)
""")

print(cass.execute("describe table weather.stations ").one().create_statement)

CREATE TABLE weather.stations (
    id text,
    date date,
    name text static,
    record station_record,
    PRIMARY KEY (id, date)
) WITH CLUSTERING ORDER BY (date ASC)
    AND additional_write_policy = '99p'
    AND bloom_filter_fp_chance = 0.01
    AND caching = {'keys': 'ALL', 'rows_per_partition': 'NONE'}
    AND cdc = false
    AND comment = ''
    AND compaction = {'class': 'org.apache.cassandra.db.compaction.SizeTieredCompactionStrategy', 'max_threshold': '32', 'min_threshold': '4'}
    AND compression = {'chunk_length_in_kb': '16', 'class': 'org.apache.cassandra.io.compress.LZ4Compressor'}
    AND memtable = 'default'
    AND crc_check_chance = 1.0
    AND default_time_to_live = 0
    AND extensions = {}
    AND gc_grace_seconds = 864000
    AND max_index_interval = 2048
    AND memtable_flush_period_in_ms = 0
    AND min_index_interval = 128
    AND read_repair = 'BLOCKING'
    AND speculative_retry = '99p';


In [5]:
from pyspark.sql import SparkSession
spark = (SparkSession.builder
         .appName("p6")
         .config('spark.jars.packages', 'com.datastax.spark:spark-cassandra-connector_2.12:3.4.0')
         .config("spark.sql.extensions", "com.datastax.spark.connector.CassandraSparkExtensions")
         .getOrCreate())

:: loading settings :: url = jar:file:/usr/local/lib/python3.10/dist-packages/pyspark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /root/.ivy2/cache
The jars for the packages stored in: /root/.ivy2/jars
com.datastax.spark#spark-cassandra-connector_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-6b9dcc44-d02d-4c6c-896c-90e0cb2dc9a9;1.0
	confs: [default]
	found com.datastax.spark#spark-cassandra-connector_2.12;3.4.0 in central
	found com.datastax.spark#spark-cassandra-connector-driver_2.12;3.4.0 in central
	found com.datastax.oss#java-driver-core-shaded;4.13.0 in central
	found com.datastax.oss#native-protocol;1.5.0 in central
	found com.datastax.oss#java-driver-shaded-guava;25.1-jre-graal-sub-1 in central
	found com.typesafe#config;1.4.1 in central
	found org.slf4j#slf4j-api;1.7.26 in central
	found io.dropwizard.metrics#metrics-core;4.1.18 in central
	found org.hdrhistogram#HdrHistogram;2.1.12 in central
	found org.reactivestreams#reactive-streams;1.0.3 in central
	found com.github.stephenc.jcip#jcip-annotations;1.0-1 in central
	found com.gith

In [6]:
from pyspark.sql.functions import substring

spark = (SparkSession.builder
         .appName("p6")
         .config('spark.jars.packages', 'com.datastax.spark:spark-cassandra-connector_2.12:3.4.0')
         .config("spark.sql.extensions", "com.datastax.spark.connector.CassandraSparkExtensions")
         .getOrCreate())

stations = spark.read.text("ghcnd-stations.txt")

stations = stations.select(
    substring(stations['value'], 1, 11).alias('id'),
    substring(stations['value'], 39, 2).alias('state'),
    substring(stations['value'], 42, 30).alias('name')
)

stations = stations.filter(stations['state'] == 'WI')

for row in stations.collect():
    cass.execute(
        """
        INSERT INTO stations (id, name)
        VALUES (%s, %s)
        """,
        (row['id'], row['name'])
    )

                                                                                

In [7]:
rows = cass.execute("SELECT COUNT(*) FROM weather.stations")

for row in rows:
    print(row[0])

1313


In [8]:
#q2

In [9]:
rows = cass.execute("SELECT name FROM stations WHERE id = 'USW00014837'")

for row in rows:
    print(row.name)

MADISON DANE CO RGNL AP       


In [10]:
#q3

In [11]:
rows = cass.execute("SELECT TOKEN(id) FROM stations WHERE id = 'USC00470273'")

for row in rows:
    print(row[0])

-9014250178872933741


In [12]:
#q4

In [13]:
import subprocess

# Get the token for USC00470273
rows = cass.execute("SELECT TOKEN(id) FROM stations WHERE id = 'USC00470273'")
usc_token = rows[0][0]

# Run nodetool ring
output = subprocess.check_output(["nodetool", "ring"]).decode()

# Parse the output
lines = output.split("\n")[4:-1]  # Remove the header and footer
tokens = [int(line.split()[7]) for line in lines]  # Change the index to get the token

# Sort the tokens
tokens.sort()

# Find the token that comes after the token for USC00470273
next_token = None
for token in tokens:
    if token > usc_token:
        next_token = token
        break

# Handle the case where the ring "wraps around"
if next_token is None:
    next_token = tokens[0]

print(next_token)


  usc_token = rows[0][0]


IndexError: list index out of range

In [14]:
#q5

In [17]:
import grpc
import station_pb2
import station_pb2_grpc

# Open a gRPC channel
channel = grpc.insecure_channel('localhost:5440')

# Create a stub (client)
stub = station_pb2_grpc.StationStub(channel)

# Create a valid request message
station_max_request = station_pb2.StationMaxRequest(station='USW00014837')

# Make the call
response = stub.StationMax(station_max_request)

response.tmax


_InactiveRpcError: <_InactiveRpcError of RPC that terminated with:
	status = StatusCode.UNAVAILABLE
	details = "failed to connect to all addresses; last error: UNKNOWN: ipv4:127.0.0.1:5440: Failed to connect to remote host: Connection refused"
	debug_error_string = "UNKNOWN:failed to connect to all addresses; last error: UNKNOWN: ipv4:127.0.0.1:5440: Failed to connect to remote host: Connection refused {created_time:"2023-11-22T19:57:48.054030763+00:00", grpc_status:14}"
>

In [18]:
# Create a DataFrame that corresponds to the stations table in Cassandra
df = spark.read.format("org.apache.spark.sql.cassandra")\
    .option("spark.cassandra.connection.host", "p6-db-1,p6-db-2,p6-db-3")\
    .option("keyspace", "weather")\
    .option("table", "stations")\
    .load()

# Create a temporary view named stations
df.createOrReplaceTempView("stations")

In [19]:
#q6

In [20]:
# List the tables/views available in the Spark catalog
tables = spark.catalog.listTables()

# Print the tables/views
for table in tables:
    print(table.name)

stations


In [21]:
#q7

In [23]:
from pyspark.sql.functions import avg

# Calculate the average difference between tmax and tmin for each station
result = spark.sql("""
    SELECT id, AVG(record.tmax - record.tmin) as avg_diff
    FROM stations
    WHERE record.tmax IS NOT NULL AND record.tmin IS NOT NULL
    GROUP BY id
""")

# Convert the result to a dictionary
result_dict = {row['id']: row['avg_diff'] for row in result.collect()}

result_dict



{}

In [25]:
#q8

In [27]:
!docker exec -it p6-db-1 nodetool status

/usr/bin/sh: 1: docker: not found


In [28]:
#q9

In [29]:
import grpc
import station_pb2
import station_pb2_grpc

# Open a gRPC channel
channel = grpc.insecure_channel('localhost:5440')

# Create a stub (client)
stub = station_pb2_grpc.StationStub(channel)

# Create a valid request message
station_max_request = station_pb2.StationMaxRequest(station='USW00014837')

# Make the call
response = stub.StationMax(station_max_request)

response.error


_InactiveRpcError: <_InactiveRpcError of RPC that terminated with:
	status = StatusCode.UNAVAILABLE
	details = "failed to connect to all addresses; last error: UNKNOWN: ipv4:127.0.0.1:5440: Failed to connect to remote host: Connection refused"
	debug_error_string = "UNKNOWN:failed to connect to all addresses; last error: UNKNOWN: ipv4:127.0.0.1:5440: Failed to connect to remote host: Connection refused {grpc_status:14, created_time:"2023-11-22T20:09:46.038250423+00:00"}"
>

In [None]:
#q10

In [32]:
# Open a gRPC channel
channel = grpc.insecure_channel('localhost:5440')

# Create a stub (client)
stub = station_pb2_grpc.StationStub(channel)

# Create a valid request message
record_temps_request = station_pb2.RecordTempsRequest(
    station='USW00014837',
    date='2023-11-22',
    tmin=10,
    tmax=20
)

# Make the call
response = stub.RecordTemps(record_temps_request)

response.error

_InactiveRpcError: <_InactiveRpcError of RPC that terminated with:
	status = StatusCode.UNAVAILABLE
	details = "failed to connect to all addresses; last error: UNKNOWN: ipv4:127.0.0.1:5440: Failed to connect to remote host: Connection refused"
	debug_error_string = "UNKNOWN:failed to connect to all addresses; last error: UNKNOWN: ipv4:127.0.0.1:5440: Failed to connect to remote host: Connection refused {grpc_status:14, created_time:"2023-11-22T20:12:17.753643897+00:00"}"
>