# 1.4_data_modeling_cassandra_query
<img src="https://upload.wikimedia.org/wikipedia/commons/5/5e/Cassandra_logo.svg" width="100" height="100">

In [None]:
#! pip install cassandra-driver

# Import Apache Cassandra python package
import cassandra
from cassandra.cluster import Cluster

# Connect to the cluster
# Every later step in this notebook uses `session`, so a failed connection is
# reported and then re-raised instead of continuing without a session (which
# would only surface later as "name 'session' is not defined").
try:
    cluster = Cluster(['127.0.0.1']) # If running locally
    session = cluster.connect()
except Exception as e:
    print(e)
    raise

# Create Keyspace
try:
    session.execute("""
        CREATE KEYSPACE IF NOT EXISTS udacity 
        WITH REPLICATION = { 'class' : 'SimpleStrategy', 'replication_factor' : 1 }
    """)
except Exception as e:
    print(e)
# Replication Strategy and factor information are covered in depth in Lesson 3. Remember, this will be the strategy and replication factor on a one node local instance. 

# Set keyspace
try:
    session.set_keyspace("udacity")
except Exception as e:
    print(e)

# Data Modeling in Cassandra is QUERY-FOCUSED, and that focus is on the WHERE clause.

Query: get every album in the music library that was released in a given year  `SELECT * FROM music_library WHERE year=1965`

<img src="images/1.4_data_modeling_cassandra_query.jpg" width="30%">

In [None]:
# 1st attempt: CREATE TABLE with a simple primary key (year)
create_query = (
    "CREATE TABLE IF NOT EXISTS music_library "
    "(year int, city text, artist_name text, album_name text, PRIMARY KEY (year))"
)
try:
    session.execute(create_query)
except Exception as e:
    print(e)

# INSERT INTO
insert_query = (
    "INSERT INTO music_library (year, artist_name, album_name, city)"
    " VALUES (%s, %s, %s, %s)"
)

# Rows are inserted in this exact order; the order matters for the
# demonstration because `year` alone is the primary key in this table.
albums = [
    (1970, "The Beatles", "Let It Be", "Liverpool"),
    (1965, "The Beatles", "Rubber Soul", "Oxford"),
    (1965, "The Who", "My Generation", "London"),
    (1966, "The Monkees", "The Monkees", "Los Angeles"),
    (1970, "The Carpenters", "Close To You", "San Diego"),
]

# Each insert has its own try/except so one failed row does not stop the rest.
for album in albums:
    try:
        session.execute(insert_query, album)
    except Exception as e:
        print(e)

# QUERY
select_query = "SELECT * FROM music_library WHERE year=1965"
try:
    rows = session.execute(select_query)
except Exception as e:
    print(e)
else:
    # Iterate only when the SELECT succeeded; otherwise `rows` would be
    # undefined or left over from a previously executed cell.
    for row in rows:
        print(row.year, row.artist_name, row.album_name, row.city)

Is the output only `1965 The Who My Generation London`? Why is `1965 The Beatles Rubber Soul Oxford` missing when it was inserted as well?  
Cassandra doesn't allow duplicate rows, and since `year` is the primary key, the table holds only one row for each year.  
Every time you insert a new row with year=1965, it overwrites the previously inserted row for that year. That's how The Who overwrote The Beatles.  
Fix: use a composite primary key (partition key = year, clustering column = artist_name).

In [None]:
# 2nd attempt: CREATE TABLE with composite primary key (year, artist_name)
create_query = (
    "CREATE TABLE IF NOT EXISTS music_library2 "
    "(year int, artist_name text, album_name text, city text, PRIMARY KEY(year, artist_name))"
)
try:
    session.execute(create_query)
except Exception as e:
    print(e)

# INSERT INTO
insert_query = (
    "INSERT INTO music_library2 (year, artist_name, album_name, city)"
    " VALUES (%s, %s, %s, %s)"
)

# Same rows, in the same order, as the first attempt.
albums = [
    (1970, "The Beatles", "Let It Be", "Liverpool"),
    (1965, "The Beatles", "Rubber Soul", "Oxford"),
    (1965, "The Who", "My Generation", "London"),
    (1966, "The Monkees", "The Monkees", "Los Angeles"),
    (1970, "The Carpenters", "Close To You", "San Diego"),
]

# Each insert has its own try/except so one failed row does not stop the rest.
for album in albums:
    try:
        session.execute(insert_query, album)
    except Exception as e:
        print(e)

# QUERY
select_query = "SELECT * FROM music_library2 WHERE year=1965"
try:
    rows = session.execute(select_query)
except Exception as e:
    print(e)
else:
    # Iterate only when the SELECT succeeded, so a failure never re-prints
    # the result set left over from an earlier cell.
    for row in rows:
        print(row.year, row.artist_name, row.album_name, row.city)

The output should be:  
1965 The Beatles Rubber Soul Oxford  
1965 The Who My Generation London  

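This query should not raise the ALLOW FILTERING error: that error is triggered by the WHERE clause (filtering on a column that is not part of the primary key), not by `SELECT *`. Even so, `SELECT *` may work on a smaller dataset but is generally advised against; name the columns you need explicitly.  
Here's the recommended way to write the last query.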

In [None]:
# QUERY
# Same lookup as before, but naming the columns explicitly instead of SELECT *.
select_query = "SELECT year, artist_name, album_name, city FROM music_library2 WHERE year=1965"
try:
    rows = session.execute(select_query)
except Exception as e:
    print(e)
else:
    # Iterate only when the SELECT succeeded, so a failure never re-prints
    # the result set left over from an earlier cell.
    for row in rows:
        print(row.year, row.artist_name, row.album_name, row.city)

Here's another example: a query that will definitely return the ALLOW FILTERING error.  
The reason: its WHERE clause filters on a non-key column, i.e. a column (`city`) that is not part of the primary key.  

In [None]:
# Get all the albums from 1965 and London
# `city` is not part of music_library's primary key (year), so this query is
# the one expected to fail; the error is printed rather than raised.
select_query = "SELECT year, artist_name, album_name, city FROM music_library WHERE year=1965 AND city='London'"
try:
    rows = session.execute(select_query)
except Exception as e:
    print(e)
else:
    # Print rows only if the query actually succeeded. Without this guard a
    # failed query would fall through and re-print the previous cell's `rows`.
    for row in rows:
        print(row.year, row.artist_name, row.album_name, row.city)

In [None]:
# DROP TABLES
# IF EXISTS mirrors the IF NOT EXISTS used when creating the tables, so this
# cleanup can be re-run without printing an error for an already-dropped table.
for table_name in ["music_library", "music_library2"]:
    try:
        session.execute(f"DROP TABLE IF EXISTS {table_name}")
    except Exception as e:
        print(e)

# Close the connection
session.shutdown()
cluster.shutdown()