# Test Code for Unzipping Local Files

In [41]:
from pyspark.sql import SparkSession
import pandas as pd
import numpy as np
from zipfile import ZipFile

import pyspark.sql.functions as F
import pyspark
from pyspark import SparkContext, SparkConf

## General

In [58]:
sc.stop()
spark.stop()

In [59]:
configure = SparkConf().setAppName('name').setMaster('local')
sc = SparkContext(conf = configure)

In [60]:
# getOrCreate() modifies existing spark session 
spark = SparkSession.builder.appName('local_name').config('config option', 'config value').getOrCreate()

## Song Data

In [61]:
with ZipFile('data/song-data.zip', 'r') as zip_ref:
    zip_ref.extractall('data/local_song_data')

In [62]:
songs = spark.read.json('data/local_song_data/song_data/*/*/*/*.json')

In [63]:
songs.printSchema()

root
 |-- artist_id: string (nullable = true)
 |-- artist_latitude: double (nullable = true)
 |-- artist_location: string (nullable = true)
 |-- artist_longitude: double (nullable = true)
 |-- artist_name: string (nullable = true)
 |-- duration: double (nullable = true)
 |-- num_songs: long (nullable = true)
 |-- song_id: string (nullable = true)
 |-- title: string (nullable = true)
 |-- year: long (nullable = true)



In [64]:
songs.describe().show()

+-------+------------------+------------------+---------------+------------------+-----------+------------------+---------+------------------+--------------------+-----------------+
|summary|         artist_id|   artist_latitude|artist_location|  artist_longitude|artist_name|          duration|num_songs|           song_id|               title|             year|
+-------+------------------+------------------+---------------+------------------+-----------+------------------+---------+------------------+--------------------+-----------------+
|  count|                71|                31|             71|                31|         71|                71|       71|                71|                  71|               71|
|   mean|              null| 36.55297161290323|           null|-73.25123258064517|       null|239.72967605633804|      1.0|              null|                null|785.9577464788732|
| stddev|              null|12.431023413063544|           null| 36.05807592882607|       n

In [65]:
songs_table = songs.select('song_id', 'title', 'artist_id', 'year', 'duration')
song_pd = songs_table.toPandas()

In [75]:
song_pd.shape

(71, 5)

### Partitioned songs_table

How do I transform the DataFrame to be partitioned?

In [66]:
distributed_songs = sc.parallelize(song_pd)

In [71]:
songs_partitioned = distributed_songs.partitionBy(4, ['year', 'artist_id'])

In [72]:
songs_partitioned_df = songs_partitioned.toPandas()

AttributeError: 'RDD' object has no attribute 'toPandas'

### Partitioned Table Save as Parquet File

In [47]:
out_path = 'output/songs_played.parquet'

In [26]:
# songs_table.write.save(out_path, format = 'parquet', header = True)
##<INSERT PARTIONED TABLES NAME>.write.parquet('songs_file_2.parquet')

AttributeError: 'DataFrame' object has no attribute 'parallelize'

In [None]:
sc.parallelize(nums)

## Log Data

In [2]:
with ZipFile('data/log-data.zip', 'r') as zip_ref:
    zip_ref.extractall('data/local_log_data')

In [3]:
spark = SparkSession.builder.getOrCreate()

In [4]:
log_df = spark.read.json('data/local_log_data')

In [5]:
log_df.printSchema()

root
 |-- artist: string (nullable = true)
 |-- auth: string (nullable = true)
 |-- firstName: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- itemInSession: long (nullable = true)
 |-- lastName: string (nullable = true)
 |-- length: double (nullable = true)
 |-- level: string (nullable = true)
 |-- location: string (nullable = true)
 |-- method: string (nullable = true)
 |-- page: string (nullable = true)
 |-- registration: double (nullable = true)
 |-- sessionId: long (nullable = true)
 |-- song: string (nullable = true)
 |-- status: long (nullable = true)
 |-- ts: long (nullable = true)
 |-- userAgent: string (nullable = true)
 |-- userId: string (nullable = true)



In [6]:
log_df.describe().show()

+-------+------------------+----------+---------+------+------------------+--------+------------------+-----+--------------------+------+-------+--------------------+------------------+--------------------+------------------+--------------------+--------------------+-----------------+
|summary|            artist|      auth|firstName|gender|     itemInSession|lastName|            length|level|            location|method|   page|        registration|         sessionId|                song|            status|                  ts|           userAgent|           userId|
+-------+------------------+----------+---------+------+------------------+--------+------------------+-----+--------------------+------+-------+--------------------+------------------+--------------------+------------------+--------------------+--------------------+-----------------+
|  count|              6820|      8056|     7770|  7770|              8056|    7770|              6820| 8056|                7770|  8056|   80

NameError: name 'songs' is not defined