In [74]:
import numpy as np
import cassandra 
import pandas as pd
from cassandra.cluster import Cluster
import json
import glob
import csv
import math


event_file_paths = glob.glob('./event_data/*.csv')
print(len(event_file_paths))
event_file_paths[:3]

30


['./event_data/2018-11-26-events.csv',
 './event_data/2018-11-25-events.csv',
 './event_data/2018-11-29-events.csv']

In [75]:
sample_event = pd.read_csv(event_file_paths[0])
sample_event.head(3)

Unnamed: 0,artist,auth,firstName,gender,itemInSession,lastName,length,level,location,method,page,registration,sessionId,song,status,ts,userId
0,Muse,Logged In,Jordan,F,3,Hicks,259.26485,free,"Salinas, CA",PUT,NextSong,1540010000000.0,814,Supermassive Black Hole [Phones Control Voltag...,200,1543190000000.0,37.0
1,,Logged In,Jordan,F,4,Hicks,,free,"Salinas, CA",PUT,Logout,1540010000000.0,814,,307,1543190000000.0,37.0
2,,Logged Out,,,5,,,free,,GET,Home,,814,,200,1543190000000.0,


In [76]:
all_data = []
columns = pd.read_csv(event_file_paths[0]).columns.tolist()
for path in event_file_paths:
    df = pd.read_csv(path)
    all_data += [row.values.tolist() for index, row in df.iterrows()]

print(len(all_data))
print(columns)
for i in range(3):
    print(all_data[i])

8056
['artist', 'auth', 'firstName', 'gender', 'itemInSession', 'lastName', 'length', 'level', 'location', 'method', 'page', 'registration', 'sessionId', 'song', 'status', 'ts', 'userId']
['Muse', 'Logged In', 'Jordan', 'F', 3, 'Hicks', 259.26485, 'free', 'Salinas, CA', 'PUT', 'NextSong', 1540010000000.0, 814, 'Supermassive Black Hole [Phones Control Voltage Remix]', 200, 1543190000000.0, 37.0]
[nan, 'Logged In', 'Jordan', 'F', 4, 'Hicks', nan, 'free', 'Salinas, CA', 'PUT', 'Logout', 1540010000000.0, 814, nan, 307, 1543190000000.0, 37.0]
[nan, 'Logged Out', nan, nan, 5, nan, nan, 'free', nan, 'GET', 'Home', nan, 814, nan, 200, 1543190000000.0, nan]


## All_data csv file

In [77]:
with open('all_data.csv', 'w') as f:
    writer = csv.writer(f)
    writer.writerow(columns)
    for row in all_data:
        if not isinstance(row[0], str) and math.isnan(row[0]):
            continue
        writer.writerow(row)

In [78]:
df = pd.read_csv('all_data.csv')
print(len(df))
df.head(3)

6820


Unnamed: 0,artist,auth,firstName,gender,itemInSession,lastName,length,level,location,method,page,registration,sessionId,song,status,ts,userId
0,Muse,Logged In,Jordan,F,3,Hicks,259.26485,free,"Salinas, CA",PUT,NextSong,1540010000000.0,814,Supermassive Black Hole [Phones Control Voltag...,200,1543190000000.0,37.0
1,Mobin Master Feat. Robin S.,Logged In,Jayden,M,1,Fox,428.64281,free,"New Orleans-Metairie, LA",PUT,NextSong,1541030000000.0,842,Show Me Love,200,1543190000000.0,101.0
2,Duncan Dhu,Logged In,Bronson,M,0,Harris,211.90485,free,"Eugene, OR",PUT,NextSong,1540910000000.0,827,Rozando La Eternidad,200,1543190000000.0,33.0


In [87]:
def create_connection():
    try:
        cluster = Cluster(['localhost'])
        session = cluster.connect()
        session.execute('DROP KEYSPACE IF EXISTS sparkifydb')
        session.execute("""CREATE KEYSPACE IF NOT EXISTS sparkifydb
                        WITH REPLICATION = {
                            'class': 'SimpleStrategy',
                            'replication_factor': 1
                        }
                        """)
        session.execute("USE sparkifydb;")
        return cluster, session
        
    except Exception as e:
        print(e)



In [80]:
from cassandra.cqlengine import columns
from cassandra.cqlengine.models import Model
from cassandra.cqlengine.management import sync_table

class session_iis(Model):
    artist = columns.Text()
    song_title = columns.Text()
    length = columns.Float()
    session_id = columns.Integer(partition_key=True)
    item_in_session = columns.SmallInt(partition_key=True)


class session_user(Model):
    artist = columns.Text()
    song_title = columns.Text()
    first_name = columns.Text()
    last_name = columns.Text()
    user_id = columns.Float(partition_key=True)
    session_id = columns.Integer(partition_key=True)
    item_in_session = columns.SmallInt(primary_key=True)


class listened_to(Model):
    first_name = columns.Text(partition_key=True)
    last_name = columns.Text(partition_key=True)
    song_title = columns.Text(partition_key=True)


# sync_table(session_iis)
# sync_table(session_user)
# sync_table(listened_to)

In [101]:
drop_table_session_iis = "DROP TABLE IF EXISTS session_iis"
drop_table_session_user = "DROP TABLE IF EXISTS session_user"
drop_table_listened_to = "DROP TABLE IF EXISTS listened_to"

create_table_session_iis = """
CREATE TABLE IF NOT EXISTS session_iis (
    artist TEXT,
    song_title TEXT,
    length FLOAT,
    session_id INT,
    item_in_session INT,
    PRIMARY KEY ((session_id, item_in_session))
);
"""

create_table_session_user = """
CREATE TABLE IF NOT EXISTS session_user (
    artist TEXT,
    song_title TEXT,
    first_name TEXT,
    last_name TEXT,
    user_id FLOAT,
    session_id INT,
    item_in_session SMALLINT,
    PRIMARY KEY ((user_id, session_id), item_in_session)
);
"""

create_table_listened_to = """
CREATE TABLE IF NOT EXISTS listened_to (
    first_name TEXT,
    last_name TEXT,
    song_title TEXT,
    PRIMARY KEY (song_title, first_name, last_name)
);
"""

create_queries = [
    create_table_session_iis,
    create_table_session_user,
    create_table_listened_to
]

drop_queries = [
    drop_table_session_iis,
    drop_table_session_user,
    drop_table_listened_to
]

table_names = ['session_iis', 'session_user', 'listened_to']


def create_new_tables():
    for i, (drop_query, create_query) in enumerate(zip(drop_queries, create_queries)):
        session.execute(drop_query)
        session.execute(create_query)
        print('CREAT TABLE', table_names[i])

In [82]:
insert_session_iis = """
INSERT INTO session_iis
(artist, song_title, length, session_id, item_in_session)
VALUES
(%s, %s, %s, %s, %s)
"""

insert_session_user = """
INSERT INTO session_user
(artist, song_title, first_name, last_name, user_id, session_id, item_in_session)
VALUES
(%s, %s, %s, %s, %s, %s, %s)
"""

insert_listened_to = """
INSERT INTO listened_to
(first_name, last_name, song_title)
VALUES
(%s, %s, %s)
"""

In [83]:
def insert_all_data():
    all_data = pd.read_csv('all_data.csv')

    for index, row in all_data.iterrows():
        session_iis_data = row[['artist', 'song', 'length', 'sessionId', 'itemInSession']].values.tolist()
        session_user_data = row[['artist', 'song', 'firstName', 'lastName', 'userId', 'sessionId', 'itemInSession']].values.tolist()
        listened_to_data = row[['firstName', 'lastName', 'song']].values.tolist()

        session.execute(insert_session_iis, session_iis_data)
        session.execute(insert_session_user, session_user_data)
        session.execute(insert_listened_to, listened_to_data)

In [102]:
def test_data():
    res = session.execute("SELECT COUNT(*) FROM session_iis")
    for row in res:
        print('session_iis')
        print(row.count)

    res = session.execute("SELECT artist, song_title, length FROM \
        session_iis WHERE session_id = 338 AND item_in_session = 4")
    for row in res:
        print(row.artist, '|', row.song_title, '|', row.length)

    print()

    res = session.execute('SELECT COUNT(*) FROM session_user')
    for row in res:
        print('session_user')
        print(row.count)

    res = session.execute("SELECT artist, song_title, first_name, last_name FROM \
        session_user WHERE user_id = 10 AND session_id = 182")
    for row in res:
        print(row.artist, '|', row.song_title, '|', row.first_name, '|', \
            row.last_name)

    print()

    res = session.execute('SELECT COUNT(*) FROM listened_to')
    for row in res:
        print('listened_to')
        print(row.count)

    res = session.execute("SELECT first_name, last_name FROM listened_to \
        WHERE song_title = 'All Hands Against His Own'")
    for row in res:
        print(row.first_name, '|', row.last_name)

In [84]:
def close_connection():
    session.shutdown()
    cluster.shutdown()

In [103]:
cluster, session = create_connection()
create_new_tables()

CREAT TABLE session_iis
CREAT TABLE session_user
CREAT TABLE listened_to


In [104]:
insert_all_data()

In [105]:
test_data()

session_iis
6820
Faithless | Music Matters (Mark Knight Dub) | 495.30731201171875

session_user
6820
Down To The Bone | Keep On Keepin' On | Sylvie | Cruz
Three Drives | Greece 2000 | Sylvie | Cruz
Sebastien Tellier | Kilometer | Sylvie | Cruz
Lonnie Gordon | Catch You Baby (Steve Pitron & Max Sanna Radio Edit) | Sylvie | Cruz

listened_to
6618
Jacqueline | Lynch
Sara | Johnson
Tegan | Levine


In [106]:
def drop_tables():
    for query in drop_queries:
        session.execute(query)

drop_tables()

In [107]:
close_connection()