In [1]:
from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession
from pyspark.sql import Row

In [2]:
from pyspark.sql.functions import when

In [3]:
spark = SparkSession.builder \
    .appName("CrearDataFrame") \
    .master("local") \
    .getOrCreate()

In [4]:
df = spark.read.csv("C:/Dropbox/CursoSpark/spotify.csv", header=True, sep=",")

In [5]:
df.show()

+---+--------------------+--------------------+--------------------+--------------------+----------+-----------+--------+------------+------+---+--------+----+-----------+------------+----------------+--------+-------+-------+--------------+-----------+
|_c0|            track_id|             artists|          album_name|          track_name|popularity|duration_ms|explicit|danceability|energy|key|loudness|mode|speechiness|acousticness|instrumentalness|liveness|valence|  tempo|time_signature|track_genre|
+---+--------------------+--------------------+--------------------+--------------------+----------+-----------+--------+------------+------+---+--------+----+-----------+------------+----------------+--------+-------+-------+--------------+-----------+
|  0|5SuOikwiRyPMVoIQD...|         Gen Hoshino|              Comedy|              Comedy|        73|     230666|   False|       0.676| 0.461|  1|  -6.746|   0|      0.143|      0.0322|        1.01e-06|   0.358|  0.715| 87.917|            

In [6]:
df = df.na.drop()

In [7]:
df.columns

['_c0',
 'track_id',
 'artists',
 'album_name',
 'track_name',
 'popularity',
 'duration_ms',
 'explicit',
 'danceability',
 'energy',
 'key',
 'loudness',
 'mode',
 'speechiness',
 'acousticness',
 'instrumentalness',
 'liveness',
 'valence',
 'tempo',
 'time_signature',
 'track_genre']

In [8]:
df = df.dropDuplicates()

In [9]:
num_filas = df.count()
num_columnas = len(df.columns)
print("Numero de filas:", num_filas)
print("Numero de columnas:", num_columnas)

Numero de filas: 113999
Numero de columnas: 21


In [10]:
df = df.drop('track_id')

In [11]:
df.show(3)

+----+-------------+--------------------+----------------+----------+-----------+--------+------------+------+---+--------+----+-----------+------------+----------------+--------+-------+-------+--------------+-----------+
| _c0|      artists|          album_name|      track_name|popularity|duration_ms|explicit|danceability|energy|key|loudness|mode|speechiness|acousticness|instrumentalness|liveness|valence|  tempo|time_signature|track_genre|
+----+-------------+--------------------+----------------+----------+-----------+--------+------------+------+---+--------+----+-----------+------------+----------------+--------+-------+-------+--------------+-----------+
| 539| Boyce Avenue|Cover Sessions, V...|Someone Like You|        57|     276146|   False|       0.439| 0.207|  1|  -9.573|   1|     0.0297|       0.608|             0.0|   0.186|  0.264|136.514|             4|   acoustic|
| 831|Little Walter|The Essential Lit...|         My Babe|        59|     162040|   False|       0.676| 0.33

In [12]:
df = df.withColumn('explicit', when(df.explicit == True, 1).otherwise(0))

In [13]:
df.show()

+----+--------------------+--------------------+--------------------+----------+-----------+--------+------------+------+---+--------+----+-----------+------------+----------------+--------+-------+-------+--------------+-----------+
| _c0|             artists|          album_name|          track_name|popularity|duration_ms|explicit|danceability|energy|key|loudness|mode|speechiness|acousticness|instrumentalness|liveness|valence|  tempo|time_signature|track_genre|
+----+--------------------+--------------------+--------------------+----------+-----------+--------+------------+------+---+--------+----+-----------+------------+----------------+--------+-------+-------+--------------+-----------+
| 539|        Boyce Avenue|Cover Sessions, V...|    Someone Like You|        57|     276146|       0|       0.439| 0.207|  1|  -9.573|   1|     0.0297|       0.608|             0.0|   0.186|  0.264|136.514|             4|   acoustic|
| 831|       Little Walter|The Essential Lit...|             My 

In [14]:
from pyspark.sql.functions import col

# Supongamos que 'columna' es el nombre de la columna que deseas convertir
nombre_columna = 'popularity'

# Convertir la columna a tipo entero y filtrar las filas inválidas
df = df.withColumn(nombre_columna, col(nombre_columna).cast('int')) \
       .filter(col(nombre_columna).isNotNull())

# Mostrar el DataFrame resultante
df.show()


+----+--------------------+--------------------+--------------------+----------+-----------+--------+------------+------+---+--------+----+-----------+------------+----------------+--------+-------+-------+--------------+-----------+
| _c0|             artists|          album_name|          track_name|popularity|duration_ms|explicit|danceability|energy|key|loudness|mode|speechiness|acousticness|instrumentalness|liveness|valence|  tempo|time_signature|track_genre|
+----+--------------------+--------------------+--------------------+----------+-----------+--------+------------+------+---+--------+----+-----------+------------+----------------+--------+-------+-------+--------------+-----------+
| 539|        Boyce Avenue|Cover Sessions, V...|    Someone Like You|        57|     276146|       0|       0.439| 0.207|  1|  -9.573|   1|     0.0297|       0.608|             0.0|   0.186|  0.264|136.514|             4|   acoustic|
| 831|       Little Walter|The Essential Lit...|             My 

In [15]:
num_filas = df.count()
num_columnas = len(df.columns)
print("Numero de filas:", num_filas)
print("Numero de columnas:", num_columnas)

Numero de filas: 113865
Numero de columnas: 20


In [16]:
df.select('track_genre').distinct().show(n= df.count(), truncate= False)

+-----------------+
|track_genre      |
+-----------------+
|anime            |
|singer-songwriter|
|folk             |
|hardstyle        |
|pop              |
|alternative      |
|death-metal      |
|detroit-techno   |
|idm              |
|k-pop            |
|j-dance          |
|ambient          |
|guitar           |
|goth             |
|cantopop         |
|blues            |
|study            |
|malay            |
|breakbeat        |
|dance            |
|groove           |
|indian           |
|german           |
|sad              |
|spanish          |
|french           |
|electronic       |
|brazil           |
|dub              |
|deep-house       |
|edm              |
|rock-n-roll      |
|power-pop        |
|progressive-house|
|swedish          |
|synth-pop        |
|chill            |
|bluegrass        |
|j-rock           |
|party            |
|hip-hop          |
|reggaeton        |
|techno           |
|grunge           |
|hard-rock        |
|indie-pop        |
|jazz             |


In [17]:
genero_numero = {
    "anime": 0,
    "singer-songwriter": 1,
    "folk": 2,
    "hardstyle": 3,
    "pop": 4,
    "alternative": 5,
    "death-metal": 6,
    "detroit-techno": 7,
    "idm": 8,
    "k-pop": 9,
    "j-dance": 10,
    "ambient": 11,
    "guitar": 12,
    "goth": 13,
    "cantopop": 14,
    "blues": 15,
    "study": 16,
    "malay": 17,
    "breakbeat": 18,
    "dance": 19,
    "groove": 20,
    "indian": 21,
    "german": 22,
    "sad": 23,
    "spanish": 24,
    "french": 25,
    "electronic": 26,
    "brazil": 27,
    "dub": 28,
    "deep-house": 29,
    "edm": 30,
    "rock-n-roll": 31,
    "power-pop": 32,
    "progressive-house": 33,
    "swedish": 34,
    "synth-pop": 35,
    "chill": 36,
    "bluegrass": 37,
    "j-rock": 38,
    "party": 39,
    "hip-hop": 40,
    "reggaeton": 41,
    "techno": 42,
    "grunge": 43,
    "hard-rock": 44,
    "indie-pop": 45,
    "jazz": 46,
    "new-age": 47,
    "show-tunes": 48,
    "trip-hop": 49,
    "punk-rock": 50,
    "country": 51,
    "hardcore": 52,
    "industrial": 53,
    "british": 54,
    "metalcore": 55,
    "songwriter": 56,
    "j-idol": 57,
    "honky-tonk": 58,
    "kids": 59,
    "metal": 60,
    "soul": 61,
    "turkish": 62,
    "mpb": 63,
    "psych-rock": 64,
    "grindcore": 65,
    "pop-film": 66,
    "salsa": 67,
    "happy": 68,
    "mandopop": 69,
    "dancehall": 70,
    "r-n-b": 71,
    "club": 72,
    "indie": 73,
    "electro": 74,
    "latino": 75,
    "samba": 76,
    "drum-and-bass": 77,
    "heavy-metal": 78,
    "house": 79,
    "pagode": 80,
    "chicago-house": 81,
    "funk": 82,
    "alt-rock": 83,
    "disney": 84,
    "children": 85,
    "sleep": 86,
    "dubstep": 87,
    "gospel": 88,
    "world-music": 89,
    "acoustic": 90,
    "rockabilly": 91,
    "rock": 92,
    "ska": 93,
    "opera": 94,
    "black-metal": 95,
    "iranian": 96,
    "romance": 97,
    "emo": 98,
    "reggae": 99,
    "j-pop": 100,
    "tango": 101,
    "punk": 102,
    "disco": 103,
    "classical": 104,
    "latin": 105,
    "sertanejo": 106,
    "afrobeat": 107,
    "garage": 108,
    "forro": 109,
    "trance": 110,
    "minimal-techno": 111,
    "comedy": 112,
    "piano": 113
}


In [18]:
genero_claves = genero_numero.keys()

In [19]:
df.columns

['_c0',
 'artists',
 'album_name',
 'track_name',
 'popularity',
 'duration_ms',
 'explicit',
 'danceability',
 'energy',
 'key',
 'loudness',
 'mode',
 'speechiness',
 'acousticness',
 'instrumentalness',
 'liveness',
 'valence',
 'tempo',
 'time_signature',
 'track_genre']

In [20]:
df = df.drop('artists', 'albun_name')

In [21]:
df = df.drop('album_name')

In [22]:
filas = df.select('_c0','track_name').collect()


In [23]:
cancion_id_dict = {fila['_c0']: fila['track_name'] for fila in filas}

In [24]:
cancion_id_dict

{'539': 'Someone Like You',
 '831': 'My Babe',
 '1087': 'Makoti',
 '1090': 'Triunfo - Live',
 '1096': 'Black Organ',
 '1109': 'Bienvenidos A Mi Mundo',
 '1283': 'Feitiço',
 '1793': 'Electro Movimiento',
 '1974': 'Tropical Garden',
 '2260': "Can't Stop",
 '2363': 'Looking For Somebody (To Love)',
 '2514': 'Happy Pills',
 '2638': 'Presión',
 '2783': 'Halloween',
 '2989': 'De Paso Nomás',
 '3050': 'Little Dark Age',
 '3307': 'I Like Him',
 '3723': 'Lithium',
 '4817': 'The Narrow Path',
 '4925': 'Walk With Us - For Black Lives Matter',
 '4947': 'Tha',
 '5020': "il vento d'oro",
 '5240': 'Every Breath You Take',
 '5339': 'The Reluctant Heroes',
 '5880': 'Unholy God',
 '6363': 'Left Hand March',
 '6440': 'One With the Forest',
 '6473': 'Köld',
 '6697': 'Your War',
 '6851': 'Dronning Ellisiv',
 '6940': 'Puritania - Live',
 '7127': 'Moon Over Memphis',
 '7743': '100 Degrees in the Shade',
 '8048': 'Sweet Home Alabama',
 '8126': "I've Got My Love To Keep Me Warm",
 '8134': 'Rudolph The Red-Nose

In [25]:
df.columns

['_c0',
 'track_name',
 'popularity',
 'duration_ms',
 'explicit',
 'danceability',
 'energy',
 'key',
 'loudness',
 'mode',
 'speechiness',
 'acousticness',
 'instrumentalness',
 'liveness',
 'valence',
 'tempo',
 'time_signature',
 'track_genre']

In [26]:
df.show(2)

+---+----------------+----------+-----------+--------+------------+------+---+--------+----+-----------+------------+----------------+--------+-------+-------+--------------+-----------+
|_c0|      track_name|popularity|duration_ms|explicit|danceability|energy|key|loudness|mode|speechiness|acousticness|instrumentalness|liveness|valence|  tempo|time_signature|track_genre|
+---+----------------+----------+-----------+--------+------------+------+---+--------+----+-----------+------------+----------------+--------+-------+-------+--------------+-----------+
|539|Someone Like You|        57|     276146|       0|       0.439| 0.207|  1|  -9.573|   1|     0.0297|       0.608|             0.0|   0.186|  0.264|136.514|             4|   acoustic|
|831|         My Babe|        59|     162040|       0|       0.676| 0.334|  5| -10.572|   1|     0.0508|       0.862|             0.0|    0.32|  0.867|  158.0|             4|   acoustic|
+---+----------------+----------+-----------+--------+-----------

In [27]:
df = df.withColumn('duration_ms', col('duration_ms').cast('int')) \
    .filter(col('duration_ms').isNotNull())

In [28]:
df = df.withColumn('key', col('key').cast('int')) \
    .filter(col('key').isNotNull())

In [29]:
df = df.withColumn('mode', col('mode').cast('int')) \
    .filter(col('mode').isNotNull())

In [30]:
df = df.withColumn('time_signature', col('time_signature').cast('int')) \
    .filter(col('time_signature').isNotNull())

In [31]:
df = df.withColumn('danceability', col('danceability').cast('float')) \
    .filter(col('danceability').isNotNull())

In [32]:
df = df.withColumn('energy', col('energy').cast('float')) \
    .filter(col('energy').isNotNull())

In [33]:
df = df.withColumn('loudness', col('loudness').cast('float')) \
    .filter(col('loudness').isNotNull())

In [34]:
df = df.withColumn('speechiness', col('speechiness').cast('float')) \
    .filter(col('speechiness').isNotNull())

In [35]:
df = df.withColumn('acousticness', col('acousticness').cast('float')) \
    .filter(col('acousticness').isNotNull())

In [36]:
df = df.withColumn('instrumentalness', col('instrumentalness').cast('float')) \
    .filter(col('instrumentalness').isNotNull())

In [37]:

df = df.withColumn('liveness', col('liveness').cast('float')) \
    .filter(col('liveness').isNotNull())

In [38]:
df = df.withColumn('valence', col('valence').cast('float')) \
    .filter(col('valence').isNotNull())

In [39]:
df = df.withColumn('tempo', col('tempo').cast('float')) \
    .filter(col('tempo').isNotNull())

In [40]:
df = df.withColumn('_c0', col('_c0').cast('int')) \
    .filter(col('_c0').isNotNull())

In [41]:
df.dtypes

[('_c0', 'int'),
 ('track_name', 'string'),
 ('popularity', 'int'),
 ('duration_ms', 'int'),
 ('explicit', 'int'),
 ('danceability', 'float'),
 ('energy', 'float'),
 ('key', 'int'),
 ('loudness', 'float'),
 ('mode', 'int'),
 ('speechiness', 'float'),
 ('acousticness', 'float'),
 ('instrumentalness', 'float'),
 ('liveness', 'float'),
 ('valence', 'float'),
 ('tempo', 'float'),
 ('time_signature', 'int'),
 ('track_genre', 'string')]

In [42]:
from pyspark.ml.feature import StringIndexer
indexer = StringIndexer(inputCol='track_genre', outputCol='class_numeric')
indexer_model = indexer.fit(df)
df_indexed = indexer_model.transform(df)

df_indexed.show()

+----+--------------------+----------+-----------+--------+------------+------+---+--------+----+-----------+------------+----------------+--------+-------+-------+--------------+-----------+-------------+
| _c0|          track_name|popularity|duration_ms|explicit|danceability|energy|key|loudness|mode|speechiness|acousticness|instrumentalness|liveness|valence|  tempo|time_signature|track_genre|class_numeric|
+----+--------------------+----------+-----------+--------+------------+------+---+--------+----+-----------+------------+----------------+--------+-------+-------+--------------+-----------+-------------+
| 539|    Someone Like You|        57|     276146|       0|       0.439| 0.207|  1|  -9.573|   1|     0.0297|       0.608|             0.0|   0.186|  0.264|136.514|             4|   acoustic|         97.0|
| 831|             My Babe|        59|     162040|       0|       0.676| 0.334|  5| -10.572|   1|     0.0508|       0.862|             0.0|    0.32|  0.867|  158.0|            

In [43]:
df_indexed.columns

['_c0',
 'track_name',
 'popularity',
 'duration_ms',
 'explicit',
 'danceability',
 'energy',
 'key',
 'loudness',
 'mode',
 'speechiness',
 'acousticness',
 'instrumentalness',
 'liveness',
 'valence',
 'tempo',
 'time_signature',
 'track_genre',
 'class_numeric']

In [1]:
import pandas as pd

In [59]:
df_pandas = pd.DataFrame(df_indexed.collect())

In [60]:
df_pandas

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18
0,539,Someone Like You,57,276146,0,0.4390,0.207,1,-9.573000,1,0.0297,0.60800,0.000000,0.1860,0.26400,136.514008,4,acoustic,97.0
1,831,My Babe,59,162040,0,0.6760,0.334,5,-10.572000,1,0.0508,0.86200,0.000000,0.3200,0.86700,158.000000,4,acoustic,97.0
2,1087,Makoti,0,274933,0,0.4690,0.824,8,-10.785000,1,0.0944,0.47800,0.268000,0.0760,0.78100,220.080994,4,afrobeat,0.0
3,1090,Triunfo - Live,34,273426,0,0.5400,0.945,1,-5.021000,1,0.2710,0.31500,0.000000,0.4050,0.55100,92.013000,4,afrobeat,0.0
4,1096,Black Organ,34,191506,0,0.8370,0.598,1,-9.047000,0,0.1810,0.39100,0.238000,0.0598,0.98300,145.218994,4,afrobeat,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
113860,112862,Gringo,38,213514,1,0.7190,0.634,10,-6.393000,0,0.0958,0.06600,0.000025,0.0918,0.37800,149.934006,4,turkish,95.0
113861,113058,The Blessing,60,423193,0,0.3670,0.463,11,-7.638000,1,0.0335,0.12100,0.000010,0.0983,0.18700,139.996002,4,world-music,96.0
113862,113432,"Estudiar Con Sonidos de Truenos, Pt. 03",28,97581,0,0.1610,0.897,10,-22.398001,0,0.0804,0.00163,0.920000,0.8630,0.02470,110.366997,4,world-music,96.0
113863,113599,Meditación Budista (Musica de Relajacion para ...,25,396033,0,0.0748,0.202,1,-16.792999,1,0.0368,0.98700,0.939000,0.1240,0.03400,66.177002,4,world-music,96.0


In [61]:
df_pandas.to_csv("C:/Dropbox/CursoSpark/pandas_spotify.csv")