In [1]:
from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession, SQLContext
from pyspark.sql.functions import col, avg, count, desc, floor, corr
import numpy as np
import time

# Build (or reuse) the Spark session for this analysis.
# Dynamic allocation is switched on together with shuffle tracking, while the
# external shuffle service is explicitly disabled. Executors use one core
# each, and the driver / block-manager ports are pinned to fixed values.
spark_session = (
    SparkSession.builder
    .master("spark://192.168.2.223:7077")
    .appName("Song Analysis with Decade Tempo")
    .config("spark.dynamicAllocation.enabled", True)
    .config("spark.dynamicAllocation.shuffleTracking.enabled", True)
    .config("spark.shuffle.service.enabled", False)
    .config("spark.dynamicAllocation.executorIdleTimeout", "300s")
    .config("spark.executor.cores", 1)
    .config("spark.driver.port", 9999)
    .config("spark.blockManager.port", 10005)
    .getOrCreate()
)

# SQL / DataFrame entry point built on top of the session's SparkContext.
# NOTE(review): SQLContext is a legacy wrapper (deprecated in Spark 3.x);
# `spark_session.read` offers the same reader — kept here because later cells
# reference `sqlContext`.
spark_context = spark_session.sparkContext
sqlContext = SQLContext(spark_context)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/03/05 10:52:09 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [2]:
# Record the wall-clock start (seconds since the epoch) so the final cell can
# report the total elapsed time of the analysis.
start_time = time.time()

In [None]:
# Load the dataset from HDFS into a cached DataFrame.
#
# The previous URI was 'hdfs://9000/user/...': it had no host, so "9000" sat in
# the authority position and was treated as a hostname rather than a port.
# NOTE(review): the NameNode is assumed to live on the same machine as the
# Spark master (192.168.2.223) and listen on port 9000 — confirm against the
# cluster's fs.defaultFS setting.
HDFS_NAMENODE = "192.168.2.223:9000"
DATASET_PATH = f"hdfs://{HDFS_NAMENODE}/user/ubuntu/MillionSongSubset_aggregated_4GB.csv"

# header=True: the first line holds column names.
# inferSchema=True: Spark makes an extra pass over the file to work out column
# types. cache() marks the DataFrame for reuse by the later aggregation cells.
df = sqlContext.read.csv(DATASET_PATH, header=True, inferSchema=True).cache()

# Preview the first rows (show() prints 20 rows by default).
df.show()

In [None]:
# Keep only songs released after 1970 that have a positive Tempo and Duration.
# Rows where any of the three columns is null are dropped as well, since a
# comparison against null never evaluates to true.
released_after_1970 = col("Year") > 1970
has_positive_tempo = col("Tempo") > 0
has_positive_duration = col("Duration") > 0

df_filtered = df.where(released_after_1970 & has_positive_tempo & has_positive_duration)

In [None]:
# Bucket every song into its decade (e.g. 1987 -> 1980) ...
decade_bucket = floor(col("Year") / 10) * 10
df_with_decade = df_filtered.withColumn("Decade", decade_bucket)

# ... then compute mean tempo, mean duration and song count per decade,
# ordered chronologically.
songs_by_decade = df_with_decade.groupBy("Decade")
df_decade_tempo = songs_by_decade.agg(
    avg("Tempo").alias("average_tempo"),
    avg("Duration").alias("average_duration"),
    count("*").alias("songs_count"),
).sort("Decade")

In [None]:
# Same aggregates as the per-decade table, but grouped by individual year.
songs_by_year = df_with_decade.groupBy("Year")
df_nondecade_tempo = songs_by_year.agg(
    avg("Tempo").alias("average_tempo"),
    avg("Duration").alias("average_duration"),
    count("*").alias("songs_count"),
).sort("Year")

# Discard sparsely populated years: only years with more than this many songs
# are kept, so the averages are not driven by a handful of tracks.
MIN_SONGS_EXCLUSIVE = 5
df_nondecade_tempo_filtered = (
    df_nondecade_tempo
    .where(col("songs_count") > MIN_SONGS_EXCLUSIVE)
    .sort("Year")
)

In [None]:
# Print the per-decade averages and song counts (first 20 rows by default).
df_decade_tempo.show()

In [None]:
# Print the per-year averages for years with more than 5 songs
# (first 20 rows by default).
df_nondecade_tempo_filtered.show()

In [None]:
# Correlation (Spark's `corr`) of Tempo and of Duration with the release Year,
# computed over all filtered songs. The result is a single-row DataFrame.
tempo_vs_year = corr("Tempo", "Year").alias("tempo_year_corr")
duration_vs_year = corr("Duration", "Year").alias("duration_year_corr")

correlation_df = df_filtered.agg(tempo_vs_year, duration_vs_year)

correlation_df.show()

In [None]:
# Same correlations as above, but against the coarser Decade bucket instead of
# the exact Year. The result is a single-row DataFrame.
tempo_vs_decade = corr("Tempo", "Decade").alias("tempo_decade_corr")
duration_vs_decade = corr("Duration", "Decade").alias("duration_decade_corr")

correlation_decade_df = df_with_decade.agg(tempo_vs_decade, duration_vs_decade)

correlation_decade_df.show()

In [None]:
# Wall-clock seconds elapsed since `start_time` was recorded.
elapsed_seconds = time.time() - start_time
print("Time taken: ", elapsed_seconds)

In [None]:
# Shut down the Spark session (and its underlying SparkContext) now that the
# analysis is finished.
spark_session.stop()