In [1]:
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import scipy.stats as st

In [2]:
# Locate the CSV file to be read.
spotify_data_path = "Resources/data.csv"

# Read the Spotify songs data.
spotify_data = pd.read_csv(spotify_data_path)

# Tempo

In [3]:
# Create a DataFrame with only the "popularity" and "tempo" columns.
tempo_df = spotify_data[["popularity", "tempo"]]
tempo_df.head()

Unnamed: 0,popularity,tempo
0,4,80.954
1,5,60.936
2,5,110.339
3,3,100.109
4,2,101.665


In [4]:
# Establish the bins.
popularity_bins = [0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100]
labels = ["Least Popular (1)", "2", "3", "4", "5", "6", "7", "8", "9", "Most Popular (10)"]

In [5]:
# Use "pd.cut" to categorize popularity based on the bins.
tempo_df["Popularity Bin"] = pd.cut(tempo_df["popularity"], popularity_bins, labels=labels, include_lowest=True)
tempo_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


Unnamed: 0,popularity,tempo,Popularity Bin
0,4,80.954,Least Popular (1)
1,5,60.936,Least Popular (1)
2,5,110.339,Least Popular (1)
3,3,100.109,Least Popular (1)
4,2,101.665,Least Popular (1)


In [6]:
# Group by "Popularity Bin" and find the mean tempo for each one.
tempo_means = pd.DataFrame(tempo_df.groupby(["Popularity Bin"]).mean()["tempo"])
tempo_means

Unnamed: 0_level_0,tempo
Popularity Bin,Unnamed: 1_level_1
Least Popular (1),109.809226
2,113.385109
3,118.369305
4,119.354498
5,120.338871
6,120.172103
7,121.074647
8,120.72036
9,121.948016
Most Popular (10),121.187795


# Speechiness

In [7]:
# Create a DataFrame with only the "popularity" and "speechiness" columns.
speechiness_df = spotify_data[["popularity", "speechiness"]]
speechiness_df.head()

Unnamed: 0,popularity,speechiness
0,4,0.0366
1,5,0.415
2,5,0.0339
3,3,0.0354
4,2,0.038


In [8]:
# Use "pd.cut" to categorize popularity based on the bins.
speechiness_df["Popularity Bin"] = pd.cut(speechiness_df["popularity"], popularity_bins, labels=labels, include_lowest=True)
speechiness_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


Unnamed: 0,popularity,speechiness,Popularity Bin
0,4,0.0366,Least Popular (1)
1,5,0.415,Least Popular (1)
2,5,0.0339,Least Popular (1)
3,3,0.0354,Least Popular (1)
4,2,0.038,Least Popular (1)


In [9]:
# Group by "Popularity Bin" and find the mean speechiness for each one.
speechiness_means = pd.DataFrame(speechiness_df.groupby(["Popularity Bin"]).mean()["speechiness"])
speechiness_means

Unnamed: 0_level_0,speechiness
Popularity Bin,Unnamed: 1_level_1
Least Popular (1),0.167179
2,0.080372
3,0.063355
4,0.069378
5,0.077383
6,0.08021
7,0.093281
8,0.10128
9,0.102342
Most Popular (10),0.121782


# Loudness

In [10]:
# Create a DataFrame with only the "popularity" and "loudness" columns.
loudness_df = spotify_data[["popularity", "loudness"]]
loudness_df.head()

Unnamed: 0,popularity,loudness
0,4,-20.096
1,5,-12.441
2,5,-14.85
3,3,-9.316
4,2,-10.096


In [11]:
# Use "pd.cut" to categorize popularity based on the bins.
loudness_df["Popularity Bin"] = pd.cut(loudness_df["popularity"], popularity_bins, labels=labels, include_lowest=True)
loudness_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


Unnamed: 0,popularity,loudness,Popularity Bin
0,4,-20.096,Least Popular (1)
1,5,-12.441,Least Popular (1)
2,5,-14.85,Least Popular (1)
3,3,-9.316,Least Popular (1)
4,2,-10.096,Least Popular (1)


In [12]:
# Group by "Popularity Bin" and find the mean loudness for each one.
loudness_means = pd.DataFrame(loudness_df.groupby(["Popularity Bin"]).mean()["loudness"])
loudness_means

Unnamed: 0_level_0,loudness
Popularity Bin,Unnamed: 1_level_1
Least Popular (1),-15.017333
2,-13.693482
3,-12.162322
4,-11.060683
5,-9.439159
6,-8.482017
7,-7.871956
8,-6.844695
9,-6.59371
Most Popular (10),-5.846


# Instrumentalness

In [13]:
# Create a DataFrame with only the "popularity" and "instrumentalness" columns.
instrumentalness_df = spotify_data[["popularity", "instrumentalness"]]
instrumentalness_df.head()

Unnamed: 0,popularity,instrumentalness
0,4,0.878
1,5,0.0
2,5,0.913
3,3,2.8e-05
4,2,2e-06


In [14]:
# Use "pd.cut" to categorize popularity based on the bins.
instrumentalness_df["Popularity Bin"] = pd.cut(instrumentalness_df["popularity"], popularity_bins, labels=labels, include_lowest=True)
instrumentalness_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


Unnamed: 0,popularity,instrumentalness,Popularity Bin
0,4,0.878,Least Popular (1)
1,5,0.0,Least Popular (1)
2,5,0.913,Least Popular (1)
3,3,2.8e-05,Least Popular (1)
4,2,2e-06,Least Popular (1)


In [15]:
# Group by "Popularity Bin" and find the mean instrumentalness for each one.
instrumentalness_means = pd.DataFrame(instrumentalness_df.groupby(["Popularity Bin"]).mean()["instrumentalness"])
instrumentalness_means

Unnamed: 0_level_0,instrumentalness
Popularity Bin,Unnamed: 1_level_1
Least Popular (1),0.32492
2,0.199388
3,0.144451
4,0.126959
5,0.106812
6,0.081055
7,0.058884
8,0.023805
9,0.013756
Most Popular (10),0.003519


# Acousticness

In [16]:
# Create a DataFrame with only the "popularity" and "acousticness" columns.
acousticness_df = spotify_data[["popularity", "acousticness"]]
acousticness_df.head()

Unnamed: 0,popularity,acousticness
0,4,0.982
1,5,0.732
2,5,0.961
3,3,0.967
4,2,0.957


In [17]:
# Use "pd.cut" to categorize popularity based on the bins.
acousticness_df["Popularity Bin"] = pd.cut(acousticness_df["popularity"], popularity_bins, labels=labels, include_lowest=True)
acousticness_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


Unnamed: 0,popularity,acousticness,Popularity Bin
0,4,0.982,Least Popular (1)
1,5,0.732,Least Popular (1)
2,5,0.961,Least Popular (1)
3,3,0.967,Least Popular (1)
4,2,0.957,Least Popular (1)


In [18]:
# Group by "Popularity Bin" and find the mean acousticness for each one.
acousticness_means = pd.DataFrame(acousticness_df.groupby(["Popularity Bin"]).mean()["acousticness"])
acousticness_means

Unnamed: 0_level_0,acousticness
Popularity Bin,Unnamed: 1_level_1
Least Popular (1),0.845349
2,0.724277
3,0.478301
4,0.379986
5,0.325177
6,0.291735
7,0.272957
8,0.237937
9,0.251331
Most Popular (10),0.240312


# Key

In [19]:
# Create a DataFrame with only the "popularity" and "key" columns.
key_df = spotify_data[["popularity", "key"]]
key_df.head()

Unnamed: 0,popularity,key
0,4,10
1,5,7
2,5,3
3,3,5
4,2,3


In [20]:
# Use "pd.cut" to categorize popularity based on the bins.
key_df["Popularity Bin"] = pd.cut(key_df["popularity"], popularity_bins, labels=labels, include_lowest=True)
key_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


Unnamed: 0,popularity,key,Popularity Bin
0,4,10,Least Popular (1)
1,5,7,Least Popular (1)
2,5,3,Least Popular (1)
3,3,5,Least Popular (1)
4,2,3,Least Popular (1)


In [23]:
# Group by "Popularity Bin" and find the mean key for each one.
key_means = pd.DataFrame(key_df.groupby(["Popularity Bin"]).mean()["key"])
key_means

Unnamed: 0_level_0,key
Popularity Bin,Unnamed: 1_level_1
Least Popular (1),5.185133
2,5.083505
3,5.136957
4,5.226351
5,5.255078
6,5.244874
7,5.201031
8,5.261572
9,5.319608
Most Popular (10),4.794872


# Summary Tables

In [26]:
# Combine the three variables that seem statistically significant into a summary table with their mean scores by popularity.
df_1 = pd.merge(instrumentalness_means, loudness_means, how="left", on=["Popularity Bin"])
summary_df = pd.merge(df_1, acousticness_means, how="left", on=["Popularity Bin"])
summary_df

Unnamed: 0_level_0,instrumentalness,loudness,acousticness
Popularity Bin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Least Popular (1),0.32492,-15.017333,0.845349
2,0.199388,-13.693482,0.724277
3,0.144451,-12.162322,0.478301
4,0.126959,-11.060683,0.379986
5,0.106812,-9.439159,0.325177
6,0.081055,-8.482017,0.291735
7,0.058884,-7.871956,0.272957
8,0.023805,-6.844695,0.237937
9,0.013756,-6.59371,0.251331
Most Popular (10),0.003519,-5.846,0.240312
