In [None]:
#This is a Project about Spotify Music app
#firstly import all the important libraries pandas,numpy, matplotlib and seaborn.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the tracks dataset from "tracks.csv" into a DataFrame with pd.read_csv,
# then preview it: .head(5) displays the first five rows.
df_tracks = pd.read_csv('tracks.csv')
df_tracks.head(5)

In [None]:
# Count missing values per column before doing any analysis.
# .isna() marks each missing cell as True; .sum() then adds them up column by column.
df_tracks.isna().sum()

In [None]:
# Observation from the null check (author's run): the "name" column had 71 nulls
# and no other column had any -- re-check against the current output.

# Summarise the dataset structure: .info() prints the number of rows and columns,
# each column's dtype and non-null count, and the memory usage.
df_tracks.info()


In [None]:
# Find the least popular songs on Spotify.
# Sorting by "popularity" in ascending order puts the lowest scores first;
# the first 15 rows of that ordering are kept (note: 15 rows, not 10).

tracks_by_popularity = df_tracks.sort_values(by='popularity', ascending=True)
least_famous_song = tracks_by_popularity.head(15)
least_famous_song

In [None]:
# Summary statistics of the dataset.
# .describe() computes the statistics; .T (transpose) turns each described
# column into a row so the table is easier to read.
df_tracks.describe().T

In [None]:
# Show the 15 most popular songs among those with popularity above 90.
# .query() keeps only rows with popularity > 90; sorting in descending order
# puts the highest popularity first, and the first 15 rows are displayed.

very_popular = df_tracks.query('popularity>90')
most_famous_song = very_popular.sort_values(by='popularity', ascending=False)
most_famous_song.iloc[:15]

In [None]:
# Index the tracks by release date and make that index a DatetimeIndex.
# The cell is safe to re-run: "release_date" is only moved into the index while
# it is still a column (the original in-place set_index raised KeyError on a
# second run because the column was already gone).
if "release_date" in df_tracks.columns:
    df_tracks = df_tracks.set_index("release_date")
elif df_tracks.index.name != "release_date":
    # Neither a column nor the current index: fail loudly instead of converting
    # an unrelated index to datetimes.
    raise KeyError("release_date")

# pd.to_datetime parses the index values as dates; it leaves an index that is
# already datetime unchanged.
# NOTE(review): if the file mixes date formats (e.g. year-only next to full
# dates), newer pandas may need an explicit `format=` here -- confirm against the data.
df_tracks.index = pd.to_datetime(df_tracks.index)

# Preview the first five rows with the new index.
df_tracks.head()

In [None]:
# Look up the artist(s) of the row at integer position 39.
# .iloc is zero-based, so position 39 is the 40th row of the dataset.
# Selecting with a list (['artists']) keeps the column label in the result.
artists_only = df_tracks[['artists']]
artists_only.iloc[39]

In [None]:
# Convert song duration from milliseconds to whole seconds.
# The new "duration" column replaces "duration_ms".
# The conversion is vectorised (divide, round, cast) instead of calling a Python
# lambda per row; Series.round() rounds halves to even, like the built-in round().
# The cell is safe to re-run: the original in-place drop raised KeyError on a
# second run because "duration_ms" had already been removed.
if "duration_ms" in df_tracks.columns:
    df_tracks = (
        df_tracks
        .assign(duration=(df_tracks["duration_ms"] / 1000).round().astype("int64"))
        .drop(columns="duration_ms")
    )
elif "duration" not in df_tracks.columns:
    # Nothing to convert and nothing already converted: surface the problem.
    raise KeyError("duration_ms")

In [None]:
# Preview the first five values of the new "duration" column (seconds).
df_tracks["duration"].head()

In [None]:
# Draw a Pearson correlation heatmap of the numeric track features.
# "key", "mode" and "explicit" are left out of the correlation matrix.
# Only numeric/bool columns are kept before calling .corr(): pandas >= 2.0 no
# longer drops non-numeric columns silently and raises on them instead, so the
# selection is made explicit here (older pandas gave the same result implicitly).
numeric_features = (
    df_tracks
    .drop(["key", "mode", "explicit"], axis=1)
    .select_dtypes(include=["number", "bool"])
)
corr_df = numeric_features.corr(method="pearson")

plt.figure(figsize=(20, 10))
# vmin/vmax/center pin the colour scale to the full correlation range [-1, 1].
heatmap = sns.heatmap(
    corr_df,
    annot=True,
    fmt=".1g",
    vmin=-1,
    vmax=1,
    center=0,
    cmap="inferno",
    linewidths=1,
    linecolor="black",
)
heatmap.set_title("Correlation Heatmap Between Variable")
# Rotate the x tick labels so the column names do not overlap.
heatmap.set_xticklabels(heatmap.get_xticklabels(), rotation=90)

In [None]:
# Take a small random sample of the tracks for the regression plots below.
# The sample size is SAMPLE_FRACTION of the rows, truncated to an integer.
# A fixed random_state makes the sample (and every plot built from it)
# reproducible across kernel restarts; the original draw was unseeded.
SAMPLE_FRACTION = 0.004
SAMPLE_SEED = 42
sample_df = df_tracks.sample(int(SAMPLE_FRACTION * len(df_tracks)), random_state=SAMPLE_SEED)
print(len(sample_df))
                             

In [None]:
# Scatter plot with a fitted regression line: loudness (y) against energy (x),
# drawn in green from the sampled tracks.
plt.figure(figsize=(16, 8))
loudness_ax = sns.regplot(x="energy", y="loudness", data=sample_df, color="g")
loudness_ax.set(title="Loudness vs Energy")

In [None]:
# Scatter plot with a fitted regression line: popularity (y) against
# acousticness (x), drawn in red from the sampled tracks.
plt.figure(figsize=(16, 8))
popularity_ax = sns.regplot(x="acousticness", y="popularity", data=sample_df, color="r")
popularity_ax.set(title="Popularity vs Acousticness")

In [None]:
# Derive the release year of every track from the "release_date" index.
# The index values are copied into a "dates" column (parsed as datetimes), and
# `years` holds the year component of that column for the plots below.

release_dates = df_tracks.index.get_level_values('release_date')
df_tracks['dates'] = pd.to_datetime(release_dates)
years = df_tracks['dates'].dt.year

In [None]:
# Histogram of the number of songs per release year, drawn with sns.displot.
# discrete=True gives every year its own bar.
year_grid = sns.displot(data=years, kind='hist', discrete=True, height=5, aspect=2)
year_grid.set(title ='Number of songs per year')


In [None]:
# Bar plot of song duration (seconds) per release year.
# `fig` stays bound to the Figure: the original rebound it to the list returned
# by `.set(...)`, which discarded the figure handle.
total_dr = df_tracks.duration
fig_dims = (18, 7)
fig, ax = plt.subplots(figsize=fig_dims)
# errwidth=False draws the error bars with zero line width, i.e. hides them.
# NOTE(review): newer seaborn versions deprecate `errwidth`; confirm the installed
# version and consider switching the error bars off at the source instead.
sns.barplot(x=years, y=total_dr, ax=ax, errwidth=False)
ax.set_title('Year vs Duration')
# Rotate the year labels so they stay readable.
ax.tick_params(axis="x", labelrotation=90)

In [None]:
# Line plot of song duration (seconds) over the release years; seaborn
# aggregates the tracks that share a year into a single line.
# `fig` stays bound to the Figure: the original rebound it to the list returned
# by `.set(...)`, which discarded the figure handle.
total_dr = df_tracks.duration
sns.set_style(style="whitegrid")
fig_dims = (10, 5)
fig, ax = plt.subplots(figsize=fig_dims)
sns.lineplot(x=years, y=total_dr, ax=ax)
ax.set_title('Year vs Duration')
# Tilt the year labels so they do not overlap.
ax.tick_params(axis="x", labelrotation=60)

In [None]:
# Load the second dataset, which carries a genre for each track, from
# "SpotifyFeatures.csv".
df_genre = pd.read_csv('SpotifyFeatures.csv')

In [None]:
# Preview the first five rows of the genre dataset.
df_genre.head(5)

In [None]:
# Horizontal bar plot comparing song duration (milliseconds) across genres.
# The original cell also called sns.color_palette("rocket", as_cmap=True) and
# discarded the result; that call only returns a colormap and changes nothing,
# so it is removed here.
# NOTE(review): if the "rocket" palette was meant to be applied, pass it to the
# plot explicitly -- confirm the intended look.
fig, ax = plt.subplots()
sns.barplot(y="genre", x="duration_ms", data=df_genre, ax=ax)
ax.set_title("Duration of the songs in different Genre")
ax.set_xlabel("Duration in Milliseconds")
ax.set_ylabel("Genre")


In [None]:
# Show the top 5 genres by popularity.
sns.set_style(style="darkgrid")
plt.figure(figsize=(10, 5))

# The 10 most popular individual tracks; kept under its original name in case
# later cells rely on it.
famous_Top5songs = df_genre.sort_values('popularity', ascending=False).head(10)

# The original plot was built from those 10 tracks, so it showed only the genres
# that happened to appear among them rather than the top genres overall.
# Rank genres by their mean popularity across all tracks and keep the best five.
top5_genres = (
    df_genre
    .groupby('genre', as_index=False)['popularity']
    .mean()
    .nlargest(5, 'popularity')
)
sns.barplot(y='genre', x='popularity', data=top5_genres).set(title="Top 5 Genres by popularity")
