In [1]:
from pyspark.sql import SparkSession
import pandas as pd

# jupyter notebook full-width display
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

# pandas formatting
pd.set_option('display.float_format', '{:.2f}'.format)
# NOTE: underscore separaters ('_') are better than commas (',') because 
# numbers with underscores work in Python without any extra effort.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 200)

In [2]:
# spark = SparkSession.builder.appName('PySparkPandas').getOrCreate()
# needs more memory
spark = SparkSession.builder.appName("PySparkPandas").config("spark.driver.memory", "32g").getOrCreate()
# spark.sparkContext.getConf().getAll()

In [3]:
# fix filetypes
from pyspark.sql.types import *

schema = StructType([
    StructField('id', StringType(), True),
    StructField('song', StringType(), True),
    StructField('artist', StringType(), True),
    StructField('acousticness', FloatType(), True),
    StructField('danceability', FloatType(), True),
    StructField('duration_ms', IntegerType(), True),
    StructField('energy', FloatType(), True),
    StructField('instrumentalness', FloatType(), True),
    StructField('key', IntegerType(), True),
    StructField('liveness', FloatType(), True),
    StructField('loudness', FloatType(), True),
    StructField('mode', IntegerType(), True),
    StructField('speechiness', FloatType(), True),
    StructField('tempo', FloatType(), True),
    StructField('time_signature', IntegerType(), True),
    StructField('valence', FloatType(), True)
])

# this is not importing correctly - it is utf-8, not sure why i can't get it to work...
# https://spark.apache.org/docs/3.1.1/api/python/reference/api/pyspark.sql.DataFrameReader.csv.html
df_pyspark = spark.read.csv('all_audio_features_sql.csv', header=True, schema=schema, encoding='utf-8')
df_pyspark.printSchema()

root
 |-- id: string (nullable = true)
 |-- song: string (nullable = true)
 |-- artist: string (nullable = true)
 |-- acousticness: float (nullable = true)
 |-- danceability: float (nullable = true)
 |-- duration_ms: integer (nullable = true)
 |-- energy: float (nullable = true)
 |-- instrumentalness: float (nullable = true)
 |-- key: integer (nullable = true)
 |-- liveness: float (nullable = true)
 |-- loudness: float (nullable = true)
 |-- mode: integer (nullable = true)
 |-- speechiness: float (nullable = true)
 |-- tempo: float (nullable = true)
 |-- time_signature: integer (nullable = true)
 |-- valence: float (nullable = true)



In [4]:
df_pandas = pd.read_csv('all_audio_features_sql.csv')

In [5]:
df_pandas.dtypes

id                   object
name                 object
name:1               object
acousticness        float64
danceability        float64
duration              int64
energy              float64
instrumentalness    float64
key                   int64
liveness            float64
loudness            float64
mode                  int64
speechiness         float64
tempo               float64
time_signature        int64
valence             float64
dtype: object

In [6]:
df_pandas.head(1)

Unnamed: 0,id,name,name:1,acousticness,danceability,duration,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence
0,1dizvxctg9dHEyaYTFufVi,Nancy Fletcher,Gz And Hustlas (feat. Nancy Fletcher),0.16,0.65,275893,0.81,0.0,1,0.36,-4.9,1,0.31,91.89,4,0.79


### COMPARISONS

In [7]:
%%time
df_pyspark.count()

Wall time: 4.3 s


11839780

In [8]:
%%time
df_pandas.count()[0]

Wall time: 1.31 s


11839780

In [9]:
%%time
df_pyspark.filter('danceability>0.1').agg({'loudness':'count'}).show()
# df_pyspark.filter('danceability>0.1').agg({'loudness':'mean'}).show()

+---------------+
|count(loudness)|
+---------------+
|       11628325|
+---------------+

Wall time: 5.86 s


In [10]:
%%time
df_pandas[df_pandas.danceability>0.1].loudness.count()
# df_pandas[df_pandas.danceability>0.1].loudness.mean()

Wall time: 1.12 s


11669029

In [11]:
%%time
df_pyspark.filter('danceability>0.9').agg({'loudness':'count'}).show()
# df_pyspark.filter('danceability>0.9').agg({'loudness':'mean'}).show()

+---------------+
|count(loudness)|
+---------------+
|         156955|
+---------------+

Wall time: 4.07 s


In [12]:
%%time
df_pandas[df_pandas.danceability>0.9].loudness.count()
# df_pandas[df_pandas.danceability>0.9].loudness.mean()

Wall time: 80.3 ms


98574

In [13]:
%%time
df_pyspark.describe()

Wall time: 1min 54s


DataFrame[summary: string, id: string, song: string, artist: string, acousticness: string, danceability: string, duration_ms: string, energy: string, instrumentalness: string, key: string, liveness: string, loudness: string, mode: string, speechiness: string, tempo: string, time_signature: string, valence: string]

In [14]:
%%time
df_pyspark.describe().toPandas()
# the data is completely mangled

Wall time: 1min 48s


Unnamed: 0,summary,id,song,artist,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence
0,count,11839780,11839780,11835826,11727876.0,11797075.0,11728106.0,11828702.0,11832811.0,11746123.0,11837357.0,11838067.0,11739238.0,11838973.0,11839328.0,11739161.0,11839654.0
1,mean,,,,0.4736658799489669,0.5182978325806825,242308.7797768881,1412.609524746616,457.2130004621424,178.0146751400441,83.22883839299617,42.32047341108817,38.22132535348546,15.86723746672765,127.043354862941,10.180670577735496,7.576268620775127
2,stddev,,,,0.4873683644301125,0.2017196549585961,164951.64868950774,23770.52914833623,13065.770014763097,8116.976337836692,5649.234172238063,4203.281188593474,3730.9494677455714,2646.6503381744,1943.3144906716352,1418.0251567283788,1578.6425297615417
3,min,0000QBRGPosiFRXKmMYnsO,!!!,"""""Cachaito"""" López Y """"Guajiro"""" Mirabal De B...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,-60.0,-37.0,-49.195,-49.33,-34.0,-35.25
4,max,7zzzHZ2sGSdBizrykHrWtd,ｄｅｔｒｏｉｔ７,��迌,999.0,6.0,19672058.0,3610500.0,2180000.0,1659533.0,1857000.0,1177160.0,1392693.0,1590733.0,1248173.0,1089066.0,904893.0


In [15]:
%%time
df_pandas.describe()

Wall time: 3.73 s


Unnamed: 0,acousticness,danceability,duration,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence
count,11839780.0,11839780.0,11839780.0,11839780.0,11839780.0,11839780.0,11839780.0,11839780.0,11839780.0,11839780.0,11839780.0,11839780.0,11839780.0
mean,0.48,0.51,242287.31,0.5,0.27,5.23,0.21,-12.01,0.66,0.1,116.69,3.82,0.45
std,0.39,0.2,165215.34,0.3,0.37,3.53,0.18,7.09,0.47,0.15,31.18,0.6,0.28
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-60.0,0.0,0.0,0.0,0.0,0.0
25%,0.05,0.37,167333.0,0.23,0.0,2.0,0.1,-15.98,0.0,0.04,92.94,4.0,0.2
50%,0.46,0.53,217506.0,0.52,0.0,5.0,0.13,-9.99,1.0,0.05,116.47,4.0,0.44
75%,0.9,0.67,280973.0,0.76,0.69,8.0,0.26,-6.68,1.0,0.08,135.61,4.0,0.69
max,1.0,1.0,19672058.0,1.0,1.0,11.0,1.0,6.28,1.0,0.97,249.99,5.0,1.0


### CONCLUSIONS:

* Pandas is (way) faster on this computer without any multithreading / distributed processing
* They give different answers for some reason...
    * PySpark isn't importing correctly even though Pandas can, and both default to utf-8 enconding (the enconding for the csv)
    * converting the pandas dataframe to pyspark worked even worse
        * super slow, memory error without changing config, errors 100% of the time (even .head())

### "De"-Bugging
more like bug watching

In [16]:
df_pyspark.filter('danceability>0.99').agg({'loudness':'count'}).show()

+---------------+
|count(loudness)|
+---------------+
|           6791|
+---------------+



In [17]:
df_pandas[df_pandas.danceability>0.99].loudness.count()

110

In [18]:
df_pandas[df_pandas.danceability>=0.99].loudness.count()

110

In [19]:
df_pyspark99 = df_pyspark.filter(df_pyspark.danceability > 0.99)

In [20]:
df_pyspark99.count()

6791

In [21]:
df_pyspark99_pandas = df_pyspark99.toPandas()

In [22]:
list_spark99 = df_pyspark99_pandas.id.tolist()

In [23]:
df_pandas[df_pandas.id.isin(list_spark99)].describe()

Unnamed: 0,acousticness,danceability,duration,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence
count,6797.0,6797.0,6797.0,6797.0,6797.0,6797.0,6797.0,6797.0,6797.0,6797.0,6797.0,6797.0,6797.0
mean,0.98,0.35,203641.71,0.11,0.48,5.07,0.15,-22.49,0.73,0.06,102.33,3.65,0.26
std,0.11,0.16,162987.49,0.1,0.41,3.3,0.12,5.7,0.44,0.05,31.12,0.82,0.23
min,0.0,0.0,10453.0,0.0,0.0,0.0,0.03,-47.23,0.0,0.0,0.0,0.0,0.0
25%,0.99,0.25,87777.0,0.04,0.01,2.0,0.09,-26.36,0.0,0.04,77.15,3.0,0.08
50%,0.99,0.33,169426.0,0.09,0.55,5.0,0.12,-22.21,1.0,0.05,95.07,4.0,0.18
75%,0.99,0.43,268493.0,0.16,0.9,8.0,0.16,-18.45,1.0,0.06,125.12,4.0,0.38
max,1.0,1.0,1412613.0,0.93,0.99,11.0,0.94,-3.79,1.0,0.91,208.4,5.0,1.0
