### Spark Configuration

In [56]:
# Display the installed PySpark version so the environment is recorded with the notebook.
import pyspark
pyspark.__version__

'3.3.0'

In [57]:
from pyspark.sql import SparkSession, functions as f
from pyspark import SparkFiles
import plotly.express as px

In [58]:
# Create the Spark session (or reuse the one that is already running).
spark_session = (
    SparkSession.builder
    .appName('PySpark Recomendation')
    .getOrCreate()
)

### Loading Dataset

In [59]:
# Register the remote songs CSV with Spark, then read the fetched local copy.
url_data = 'https://github.com/MpRonald/datasets/blob/main/dados_musicas.csv?raw=true'
spark_session.sparkContext.addFile(url_data)
path_data = f"file://{SparkFiles.get('dados_musicas.csv')}"
data = (
    spark_session.read
    .options(header=True, sep=';', inferSchema=True)
    .csv(path_data)
)
# Preview up to 20 rows as a pandas DataFrame.
data.limit(20).toPandas()

22/09/22 10:53:24 WARN SparkContext: The path https://github.com/MpRonald/datasets/blob/main/dados_musicas.csv?raw=true has been added already. Overwriting of added paths is not supported in the current version.


Unnamed: 0,valence,year,acousticness,artists,danceability,duration_ms,energy,explicit,id,instrumentalness,key,liveness,loudness,mode,name,popularity,speechiness,tempo,artists_song
0,0.285,2000,0.00239,Coldplay,0.429,266773,0.661,0,3AJwUDP919kvQ9QcozQPxg,0.000121,11,0.234,-7.227,1,Yellow,84,0.0281,173.372,Coldplay - Yellow
1,0.613,2000,0.143,OutKast,0.843,270507,0.806,1,0I3q5fE6wg7LIfHGngUTnV,0.0,4,0.0771,-5.946,0,Ms. Jackson,80,0.269,94.948,OutKast - Ms. Jackson
2,0.4,2000,0.00958,Linkin Park,0.556,216880,0.864,0,60a0Rd6pjrkxjPbaKzXjfq,0.0,3,0.209,-5.87,0,In the End,84,0.0584,105.143,Linkin Park - In the End
3,0.543,2000,0.00664,3 Doors Down,0.545,233933,0.865,0,6ZOBP3NvffbU4SZcrnt1k6,1.1e-05,11,0.168,-5.708,0,Kryptonite,78,0.0286,99.009,3 Doors Down - Kryptonite
4,0.76,2000,0.0302,Eminem,0.949,284200,0.661,1,3yfqSUWxFvZELEM4PmlwIR,0.0,5,0.0454,-4.244,0,The Real Slim Shady,80,0.0572,104.504,Eminem - The Real Slim Shady
5,0.941,2000,0.000996,Disturbed,0.695,279213,0.876,1,40rvBMQizxkIqnjPdEWY1v,7e-06,3,0.106,-4.262,0,Down with the Sickness,76,0.0553,89.954,Disturbed - Down with the Sickness
6,0.722,2000,0.0616,Nelly,0.85,291782,0.7,1,3Gf5nttwcX9aaSQXRWidEZ,2e-06,7,0.244,-6.49,1,Ride Wit Me,74,0.0478,101.875,Nelly - Ride Wit Me
7,0.507,2000,0.0371,Eminem,0.78,404107,0.768,1,3UmaczJpikHgJFyBTAJVoz,2e-06,6,0.518,-4.325,0,Stan,78,0.238,80.063,Eminem - Stan
8,0.861,2000,0.031,*NSYNC,0.61,200400,0.926,0,62bOmKYxYg7dhrC6gH9vFn,0.0012,8,0.0821,-4.843,0,Bye Bye Bye,71,0.0479,172.638,*NSYNC - Bye Bye Bye
9,0.894,2000,0.3,Britney Spears,0.751,211160,0.834,0,6naxalmIoLFWR0siv8dnQQ,1.8e-05,1,0.355,-5.444,0,Oops!...I Did It Again,78,0.0437,95.053,Britney Spears - Oops!...I Did It Again


In [60]:
# Print rows vertically (one field per line) without truncating the values.
data.show(vertical=True, truncate=False)

-RECORD 0--------------------------------------------------------------------
 valence          | 0.285                                                    
 year             | 2000                                                     
 acousticness     | 0.00239                                                  
 artists          | Coldplay                                                 
 danceability     | 0.429                                                    
 duration_ms      | 266773                                                   
 energy           | 0.6609999999999999                                       
 explicit         | 0                                                        
 id               | 3AJwUDP919kvQ9QcozQPxg                                   
 instrumentalness | 1.21E-4                                                  
 key              | 11                                                       
 liveness         | 0.234                                       

In [61]:
# Show each column's name and the type assigned when the CSV was read with inferSchema=True.
data.printSchema()

root
 |-- valence: double (nullable = true)
 |-- year: integer (nullable = true)
 |-- acousticness: double (nullable = true)
 |-- artists: string (nullable = true)
 |-- danceability: double (nullable = true)
 |-- duration_ms: integer (nullable = true)
 |-- energy: double (nullable = true)
 |-- explicit: integer (nullable = true)
 |-- id: string (nullable = true)
 |-- instrumentalness: double (nullable = true)
 |-- key: integer (nullable = true)
 |-- liveness: double (nullable = true)
 |-- loudness: double (nullable = true)
 |-- mode: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- popularity: integer (nullable = true)
 |-- speechiness: double (nullable = true)
 |-- tempo: double (nullable = true)
 |-- artists_song: string (nullable = true)



In [62]:
# Shape of the dataset as (number of rows, number of columns).
data.count(), len(data.columns)

(20311, 19)

In [63]:
# Count the null entries per column; the result is a single summary row.
data.select(
    [
        f.count(f.when(f.isnull(column_name), 1)).alias(column_name)
        for column_name in data.columns
    ]
).show()

+-------+----+------------+-------+------------+-----------+------+--------+---+----------------+---+--------+--------+----+----+----------+-----------+-----+------------+
|valence|year|acousticness|artists|danceability|duration_ms|energy|explicit| id|instrumentalness|key|liveness|loudness|mode|name|popularity|speechiness|tempo|artists_song|
+-------+----+------------+-------+------------+-----------+------+--------+---+----------------+---+--------+--------+----+----+----------+-----------+-----+------------+
|      0|   0|           0|      0|           0|          0|     0|       0|  0|               0|  0|       0|       0|   0|   0|         0|          0|    0|           0|
+-------+----+------------+-------+------------+-----------+------+--------+---+----------------+---+--------+--------+----+----+----------+-----------+-----+------------+



In [64]:
# List the distinct values of the `year` column in ascending order.
print(
    sorted(
        data.select('year')
        .distinct()
        .collect()
    )
)

[Row(year=2000), Row(year=2001), Row(year=2002), Row(year=2003), Row(year=2004), Row(year=2005), Row(year=2006), Row(year=2007), Row(year=2008), Row(year=2009), Row(year=2010), Row(year=2011), Row(year=2012), Row(year=2013), Row(year=2014), Row(year=2015), Row(year=2016), Row(year=2017), Row(year=2018), Row(year=2019), Row(year=2020)]


                                                                                

### Correlation Matrix

In [65]:
# Register the per-year aggregates CSV with Spark, then read the fetched local copy.
url_data_years = 'https://github.com/MpRonald/datasets/raw/main/data_by_year.csv'
spark_session.sparkContext.addFile(url_data_years)
path_data_years = f"file://{SparkFiles.get('data_by_year.csv')}"
data_year = (
    spark_session.read
    .options(header=True, sep=',', inferSchema=True)
    .csv(path_data_years)
)
# Preview up to 10 rows as a pandas DataFrame.
data_year.limit(10).toPandas()

22/09/22 10:53:30 WARN SparkContext: The path https://github.com/MpRonald/datasets/raw/main/data_by_year.csv has been added already. Overwriting of added paths is not supported in the current version.


Unnamed: 0,mode,year,acousticness,danceability,duration_ms,energy,instrumentalness,liveness,loudness,speechiness,tempo,valence,popularity,key
0,1,1921,0.886896,0.418597,260537.166667,0.231815,0.344878,0.20571,-17.048667,0.073662,101.531493,0.379327,0.653333,2
1,1,1922,0.938592,0.482042,165469.746479,0.237815,0.434195,0.24072,-19.275282,0.116655,100.884521,0.535549,0.140845,10
2,1,1923,0.957247,0.577341,177942.362162,0.262406,0.371733,0.227462,-14.129211,0.093949,114.01073,0.625492,5.389189,0
3,1,1924,0.9402,0.549894,191046.707627,0.344347,0.581701,0.235219,-14.231343,0.092089,120.689572,0.663725,0.661017,10
4,1,1925,0.962607,0.573863,184986.92446,0.278594,0.418297,0.237668,-14.146414,0.111918,115.521921,0.621929,2.604317,5
5,1,1926,0.660817,0.59988,156881.657475,0.211467,0.333093,0.23237,-18.492538,0.483704,109.648033,0.43691,1.422351,9
6,1,1927,0.936179,0.648268,184993.598374,0.264321,0.391328,0.16845,-14.422374,0.11361,114.846524,0.6597,0.801626,7
7,1,1928,0.938617,0.534288,214827.906423,0.207948,0.494835,0.175289,-17.191983,0.159911,106.772262,0.495713,1.525773,1
8,1,1929,0.601427,0.64767,168999.412815,0.241801,0.215204,0.236,-16.530376,0.490001,110.948357,0.63653,0.340336,7
9,1,1930,0.936715,0.518176,195150.285343,0.333524,0.352206,0.221311,-12.869221,0.11991,109.871194,0.616238,0.926715,2


In [66]:
# Keep only the yearly aggregates from 2000 onwards.
data_year_2000 = data_year.where('year >= 2000')
# Preview up to 10 of the remaining rows as a pandas DataFrame.
data_year_2000.limit(10).toPandas()

Unnamed: 0,mode,year,acousticness,danceability,duration_ms,energy,instrumentalness,liveness,loudness,speechiness,tempo,valence,popularity,key
0,1,2000,0.289323,0.590918,242724.642638,0.625413,0.101168,0.197686,-8.247766,0.089205,118.999323,0.559475,46.684049,7
1,1,2001,0.286842,0.583318,240307.79601,0.626986,0.107214,0.187026,-8.305095,0.089182,117.765399,0.541479,48.750125,7
2,1,2002,0.282624,0.57616,239503.283,0.64127,0.088048,0.193911,-7.68664,0.084308,119.239738,0.542397,48.6555,7
3,1,2003,0.256471,0.575763,244670.57523,0.660165,0.083049,0.196976,-7.485545,0.093926,120.914622,0.530504,48.626407,7
4,1,2004,0.280559,0.56768,237378.708037,0.648868,0.077934,0.202199,-7.601655,0.094239,121.290346,0.524489,49.273143,7
5,1,2005,0.255764,0.572281,237229.588205,0.653209,0.090194,0.190082,-7.466159,0.093334,121.617967,0.532531,50.953333,0
6,1,2006,0.279986,0.56823,234042.914359,0.650326,0.077701,0.188289,-7.265501,0.085847,121.798615,0.520028,51.313846,7
7,1,2007,0.254081,0.563414,241049.962564,0.668305,0.072957,0.196127,-7.044536,0.084347,124.087516,0.516794,51.075897,7
8,1,2008,0.249192,0.579193,240107.315601,0.671461,0.063662,0.198431,-6.843804,0.077356,123.509934,0.527542,50.630179,0
9,1,2009,0.261929,0.56419,238140.013265,0.670749,0.075872,0.205252,-7.046015,0.085458,123.463808,0.50717,51.440816,0


In [67]:
# Number of rows remaining after the year >= 2000 filter.
data_year_2000.count()

21

In [68]:
# Number of columns in the filtered yearly data.
len(data_year_2000.columns)

14

In [70]:
# Plotly Express takes a pandas DataFrame, so convert the (small) Spark frame first.
# Sort by year beforehand: px.line connects points in row order, and Spark does not
# guarantee any row order without an explicit sort.
fig = px.line(data_year_2000.orderBy('year').toPandas(), x='year', y='loudness',
              markers=True, title='Loudness variation by YEARS')
# Call show() separately: it returns None, so chaining it onto the assignment
# would leave `fig` bound to None instead of the figure.
fig.show()