# Desafio 1
## Install pyspark

In [133]:
%pip install pyspark

Note: you may need to restart the kernel to use updated packages.


## Initialize spark session

In [134]:
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName('Desafio_1').getOrCreate()

## Load dataframe

In [135]:
df = spark.read.csv("content/spotify.csv", header=True, inferSchema=True)
df.printSchema()

root
 |-- Unnamed: 0: integer (nullable = true)
 |-- track_id: string (nullable = true)
 |-- artists: string (nullable = true)
 |-- album_name: string (nullable = true)
 |-- track_name: string (nullable = true)
 |-- popularity: string (nullable = true)
 |-- duration_ms: string (nullable = true)
 |-- explicit: string (nullable = true)
 |-- danceability: string (nullable = true)
 |-- energy: string (nullable = true)
 |-- key: string (nullable = true)
 |-- loudness: string (nullable = true)
 |-- mode: string (nullable = true)
 |-- speechiness: string (nullable = true)
 |-- acousticness: string (nullable = true)
 |-- instrumentalness: double (nullable = true)
 |-- liveness: string (nullable = true)
 |-- valence: string (nullable = true)
 |-- tempo: double (nullable = true)
 |-- time_signature: double (nullable = true)
 |-- track_genre: string (nullable = true)



## Select interest columns

In [136]:
selected_columns = [
    'track_id', 
    'artists', 
    'track_name', 
    'popularity', 
    'duration_ms',
    'explicit', 
    'danceability', 
    'energy', 
    'loudness', 
    'speechiness',
    'acousticness',
    'liveness',
    'valence',
    'track_genre'
    ]

f_df = df.select(*selected_columns)
f_df.printSchema()

root
 |-- track_id: string (nullable = true)
 |-- artists: string (nullable = true)
 |-- track_name: string (nullable = true)
 |-- popularity: string (nullable = true)
 |-- duration_ms: string (nullable = true)
 |-- explicit: string (nullable = true)
 |-- danceability: string (nullable = true)
 |-- energy: string (nullable = true)
 |-- loudness: string (nullable = true)
 |-- speechiness: string (nullable = true)
 |-- acousticness: string (nullable = true)
 |-- liveness: string (nullable = true)
 |-- valence: string (nullable = true)
 |-- track_genre: string (nullable = true)



## Clean dataframe

### Inconsistent values

#### Popularity

In [137]:
from pyspark.sql.functions import col, asc

f_df.select('popularity').where(~col('popularity').rlike('^[0-9]+$')).show(df.count(), truncate=False) # Non number popularity


+------------------------------------------------------+
|popularity                                            |
+------------------------------------------------------+
| Pt. 1) [Music from the Original TV Series]"          |
| Adagio ""Nimrod"" (Arr. J. Meisl for String Quartet)"|
| Op. 46"                                              |
| Op. 310"                                             |
| Niedermeier & Whitehead                              |
| wann wird die Zeit erscheinen?"""                    |
| frohlocket"""                                        |
| wann wird die Zeit erscheinen?"""                    |
| Op. 2: Coda. Alla Polacca - Live"                    |
| o starker König"""                                   |
| frohlocket"""                                        |
| o starker König"""                                   |
| frohlocket"""                                        |
| Op.52 No.6                                           |
| Op.52 No.6                   

#### Track genre

In [138]:

f_df.select('track_genre').where(~col('track_genre').rlike('^[a-zA-Z\- ]+$')).show(df.count(), truncate=False) # Number genre


+-----------+
|track_genre|
+-----------+
|4          |
|4          |
|4          |
|4          |
|0.576      |
|4          |
|3          |
|4          |
|4          |
|4          |
|3          |
|4          |
|3          |
|68.958     |
|68.958     |
|3          |
|4          |
|4          |
|74.077     |
|3          |
|125.262    |
|4          |
|117.11     |
|4          |
|148.759    |
|4          |
|4          |
|4          |
|4          |
|4          |
|4          |
|4          |
|105.188    |
|151.539    |
|4          |
|3          |
|4          |
|4          |
|148.759    |
|4          |
|4          |
|114.211    |
|4          |
|76.791     |
|68.453     |
|4          |
|4          |
|4          |
|3          |
|3          |
|4          |
|4          |
|3          |
|3          |
|4          |
|4          |
|89.01      |
|4          |
|4          |
|3          |
|5          |
|3          |
|1          |
|5          |
|5          |
|5          |
|1          |
|4          |
|3    

#### Acousticness

In [139]:
f_df.select('acousticness').where(~col('acousticness').rlike('^[0-9.\-e]+$')).show()

+------------+
|acousticness|
+------------+
|  Amonasro)"|
|       False|
|    Popolo)"|
+------------+



#### Energy

In [140]:
f_df.select('energy').where(~col('energy').rlike('^[0-9.\-e]+$')).show()

+--------------------+
|              energy|
+--------------------+
|Hearts And Soul -...|
|               False|
|               False|
|               False|
|               False|
|               False|
|               False|
|               False|
|               False|
|               False|
|               False|
|               False|
|               False|
|               False|
|               False|
|                Aida|
|              Coro)"|
|               False|
|               False|
|               False|
+--------------------+
only showing top 20 rows



### Filter inconsistent values

In [141]:

f_df = (
    f_df
    .withColumn('popularity', col('popularity').cast('int'))
    .withColumn('duration_ms', col('duration_ms').cast('int'))
    .withColumn('danceability', col('danceability').cast('double'))
    .withColumn('energy', col('energy').cast('double'))
    .withColumn('loudness', col('loudness').cast('double'))
    .withColumn('speechiness', col('speechiness').cast('double'))
    .withColumn('acousticness', col('acousticness').cast('double'))
    .withColumn('liveness', col('liveness').cast('double'))
    .withColumn('valence', col('valence').cast('double'))
    .where(~col('track_genre').rlike('^[a-zA-Z\- ]+$'))
)

f_df.printSchema()



root
 |-- track_id: string (nullable = true)
 |-- artists: string (nullable = true)
 |-- track_name: string (nullable = true)
 |-- popularity: integer (nullable = true)
 |-- duration_ms: integer (nullable = true)
 |-- explicit: string (nullable = true)
 |-- danceability: double (nullable = true)
 |-- energy: double (nullable = true)
 |-- loudness: double (nullable = true)
 |-- speechiness: double (nullable = true)
 |-- acousticness: double (nullable = true)
 |-- liveness: double (nullable = true)
 |-- valence: double (nullable = true)
 |-- track_genre: string (nullable = true)



## Write filtered dataframe to file

In [149]:
f_df.coalesce(1).write.csv('content/f_spotify', header=True, mode='overwrite')