# Spotify Exploratory Data Analysis

## In this project we analyze the Spotify songs dataset available on the Kaggle platform, using several Python libraries to explore and visualize the data.

# Importing Libraries

In [None]:
import pandas as pd                    #data manipulation and analysis
import numpy as np                     #To perform mathematical operations on array
import seaborn as sns                  
import matplotlib.pyplot as plt        #used to generate visualizations

# Importing Data

### Data Collection

In [None]:
# Load the Kaggle Spotify songs CSV into a DataFrame, then display it
spotify = pd.read_csv(
    filepath_or_buffer='/kaggle/input/30000-spotify-songs/spotify_songs.csv'
)
spotify

In [None]:
# Remove the track identifier column (columns only -- no rows are dropped).
# The result is assigned back and errors='ignore' is used so that the cell is
# idempotent: re-running it after the column is already gone no longer raises
# a KeyError, which the in-place version did.
spotify = spotify.drop(columns=['track_id'], errors='ignore')
spotify

In [None]:
# Remove the album and playlist identifier columns.
# Assigning the result back with errors='ignore' keeps the cell safe to re-run:
# the in-place version raised a KeyError on a second execution because the
# columns had already been removed.
spotify = spotify.drop(columns=['track_album_id', 'playlist_id'], errors='ignore')
spotify

# Let's Perform Data Description

In [None]:
spotify.head()     # Preview the first rows of the dataset (pandas shows 5 by default)

In [None]:
spotify.tail()      # Preview the last rows of the dataset (pandas shows 5 by default)

In [None]:
spotify.shape       # (rows, columns) tuple -- the original author recorded 32833 rows and 20 columns here

In [None]:
spotify.info()   # Concise summary of the frame: column names, non-null counts, dtypes and memory usage

In [None]:
spotify.describe()       # Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns

In [None]:
spotify.columns        # Index holding the labels of all remaining columns

# Data Cleaning and Preparation

#### Finding Null values

In [None]:
spotify.isnull().sum()      # Number of missing (null) values in each column

In [None]:
# Share of missing values per column, as a percentage of the total row count
Null_Per = spotify.isna().sum().div(len(spotify)).mul(100)

# Draw the percentages as a bar chart, one bar per column
fig, ax = plt.subplots(figsize=(10, 6))
Null_Per.plot.bar(ax=ax)
ax.set_title("Percentage of Null values By Column", fontsize=25)
ax.set_xlabel("Columns", fontsize=20)
ax.set_ylabel("Percentage Of Null Values", fontsize=20)
ax.tick_params(axis="x", labelrotation=90)   # keep the column names vertical
fig.tight_layout()
plt.show()


#### We can see that there are null values in the columns `track_name`, `track_artist` and `track_album_name`.

#### Handling missing values

In [None]:
# Replace missing text fields with explicit placeholder values.
# The previous form -- spotify[col].fillna(..., inplace=True) -- is chained
# assignment: recent pandas versions warn about it, and with Copy-on-Write enabled
# it only modifies a temporary object, leaving `spotify` unchanged. Filling through
# the DataFrame and assigning the result back works on every pandas version.
spotify = spotify.fillna(
    {
        'track_name': 'Unknown',
        'track_artist': 'Unknown',
        'track_album_name': 'Not Found',
    }
)

# Exploratory Data Analysis

## 1. Who are the Top 5 most Popular Artists ?

In [None]:
# Count the tracks attributed to each artist and keep the five largest counts
top_five_artists = (
    spotify.groupby("track_artist")["track_name"]
    .count()
    .sort_values(ascending=False)
    .head(5)
)
top_five_artists

In [None]:
# Horizontal bars: one per artist, bar length = number of tracks
ax = top_five_artists.plot(kind="barh", title="Top 5 popular artist")
ax.set_ylabel('track_artist', fontsize=10)   # artists on the vertical axis
ax.set_xlabel('count', fontsize=10)          # track counts on the horizontal axis
plt.show()

#### According to the chart, the top 5 artists, ranked by the number of their tracks in the dataset, are:
#### 1. Martin Garrix
#### 2. Queen
#### 3. The Chainsmokers
#### 4. David Guetta
#### 5. Don Omar

## 2.Which are the Top 5 loudest tracks ?

In [None]:
# Keep only loudness and track name, order from loudest to quietest, take the first five rows
top_five_loudest_tracks = (
    spotify.loc[:, ["loudness", "track_name"]]
    .sort_values("loudness", ascending=False)
    .head(5)
)
top_five_loudest_tracks

In [None]:
# Bar chart of loudness for the five loudest tracks
fig, ax = plt.subplots(figsize=(7, 5))
sns.barplot(data=top_five_loudest_tracks, x="loudness", y="track_name", ax=ax)
ax.set_title("Top 5 loudest tracks")
ax.set_ylabel('track_name', fontsize=10)   # track names on the vertical axis
ax.set_xlabel('loudness', fontsize=10)     # loudness on the horizontal axis
plt.show()

#### The top 5 loudest tracks, in decreasing order of loudness, are:
#### 1. Raw Power - Iggy Pop Mix (1.275)
#### 2. Escape From Love - Curbi Remix (1.135)
#### 3. Rockstar (0.642)
#### 4. Nails (0.551)
#### 5. Crema (0.326)

## 3. Who are the artists behind the 5 most danceable songs?

In [None]:
# Five rows with the highest danceability, together with the track and artist names
top_five_artists_danceable_songs = (
    spotify.loc[:, ["danceability", "track_name", "track_artist"]]
    .sort_values("danceability", ascending=False)
    .head(5)
)
top_five_artists_danceable_songs
                                                                                                                                    

In [None]:
# Bar chart: danceability of the top-five rows, labelled by artist
fig, ax = plt.subplots(figsize=(7, 5))
sns.barplot(data=top_five_artists_danceable_songs, x="danceability", y="track_artist", ax=ax)
ax.set_title("Top 5 Artist with the Most Danceability Track")
plt.show()

#### From the above chart the top artists with most danceability track are 
#### Fusion Groove Orchestra(0.983),
#### DJ ZsuZsu(0.981),
#### DJ Goozo(0.979),
#### Vanilla Ice(0.979),
#### DJ Casper(0.978)

## 4.Which are the Top 5 Instrumental Tracks ?

In [None]:
# Five rows with the highest instrumentalness, together with the track and artist names.
# NOTE(review): rows are not de-duplicated, so one track can show up more than once
# (the result listing in this notebook shows "Sandstorm - Radio Edit" twice) --
# consider dropping duplicate tracks before ranking.
top_five_instrumental_tracks = (
    spotify.loc[:, ["instrumentalness", "track_name", "track_artist"]]
    .sort_values("instrumentalness", ascending=False)
    .head(5)
)
top_five_instrumental_tracks

In [None]:
# Horizontal bar chart of the five highest instrumentalness scores.
# This replaces the former pie chart: its autopct labels showed each track's share
# of the five scores' sum (roughly 20% each), not the instrumentalness itself, and
# the scores are not parts of a whole.
fig, ax = plt.subplots(figsize=(7, 5))
positions = list(range(len(top_five_instrumental_tracks)))
scores = top_five_instrumental_tracks["instrumentalness"].to_numpy()
ax.barh(positions, scores)

# Positional ticks keep one bar per row even when a track name repeats
ax.set_yticks(positions)
ax.set_yticklabels(top_five_instrumental_tracks["track_name"])
ax.invert_yaxis()                       # highest score at the top

# Print the exact score next to each bar, since the values are very close together
for position, score in zip(positions, scores):
    ax.text(score, position, f" {score:.3f}", va="center")
ax.margins(x=0.15)                      # leave room for the value labels

ax.set_xlabel("instrumentalness", fontsize=10)
ax.set_ylabel("track_name", fontsize=10)
ax.set_title("Top 5 Instrumentalness Tracks")
plt.show()

#### The top 5 instrumental tracks, ranked by instrumentalness, are:
#### 1.Chill Waves & Wind in Leaves
#### 2.Summer Rain
#### 3.Sandstorm - Radio Edit
#### 4.Genesis
#### 5.Sandstorm - Radio Edit

## 5.Top 10 Energetic tracks

In [None]:
# Ten rows with the highest energy, together with the track name
top_ten_energy_tracks = (
    spotify.loc[:, ["energy", "track_name"]]
    .sort_values("energy", ascending=False)
    .head(10)
)
top_ten_energy_tracks

In [None]:
# Bar chart of the energy score of the ten most energetic tracks.
# sns.barplot is used (as in the other ranking charts) so that bar height is the
# actual energy value; the previous sns.countplot drew how many times each track
# name occurs in the selection while the y axis was labelled "energy".
# Bars follow the row order of top_ten_energy_tracks (highest energy first).
# If a track name repeats, seaborn aggregates those rows into a single bar.
plt.figure(figsize=(7,5))
sns.barplot(x="track_name", y="energy", data=top_ten_energy_tracks)
plt.xticks(rotation='vertical')                      # long track names: keep them readable
plt.title('Top 10 Energetic Tracks',fontsize=15)
plt.ylabel('energy',fontsize=10)                     # y axis: energy score
plt.xlabel('track_name',fontsize=10)                 # x axis: track names
plt.show()

#### The Top 10 Energetic tracks are
#### 1.Forest Rain (1.000)
#### 2.Chill Waves & Wind in Leaves (1.000)
#### 3.Rain Forest and Tropical Beach Sound (1.000)
#### 4.Run To You (0.999)
#### 5.Satisfaction - RL Grime Remix (0.999)
#### 6.Gentle Waves on Rocks (0.999)
#### 7.Staatsfeind (0.999)
#### 8.Nightfall by the Sea (0.999)
#### 9.Captain Jack - Short Mix (0.999)
#### 10.Immortal - Single Edit (0.999)

## 6. Loudness VS Energy

In [None]:
# Loudness and energy of the 1000 loudest rows.
# NOTE(review): only the loudest rows are kept, so any chart built from this frame
# describes that subset rather than the whole dataset -- confirm this is intended.
loudness_vs_energy = (
    spotify.loc[:, ["loudness", "energy"]]
    .sort_values("loudness", ascending=False)
    .head(1000)
)
loudness_vs_energy

In [None]:
# Scatter plot of loudness (y) against energy (x) for the rows in loudness_vs_energy.
# The title is set through pyplot and the cell ends with plt.show(), matching the
# other plotting cells; the previous trailing `.set(...)` call made the notebook
# echo the list it returns underneath the figure.
plt.figure(figsize=(12,7))
sns.scatterplot(data=loudness_vs_energy, x="energy", y="loudness", color="c")
plt.title("loudness vs energy")
plt.show()

## 7.Duration of tracks

In [None]:
# Durations (in milliseconds) of the 1000 longest rows, longest first.
# NOTE(review): this keeps only the longest tracks, not the full dataset -- confirm
# that is the intended input for the duration chart.
duration = (
    spotify.loc[:, ["duration_ms"]]
    .sort_values("duration_ms", ascending=False)
    .head(1000)
)
duration

In [None]:
# Histogram (100 bins) of the durations held in `duration`.
# The cell now ends with plt.show(), like the other plotting cells; previously the
# last expression was set_ylabel(...), so its Text object was echoed as cell output.
plt.figure(figsize=(12,7))
hist_plot = duration['duration_ms'].hist(bins=100)
hist_plot.set_title('duration of tracks')
hist_plot.set_xlabel('duration_ms')      # x axis: track length in milliseconds
hist_plot.set_ylabel('count')            # y axis: number of tracks per bin
plt.show()

# Conclusion

**This exploratory analysis of the Spotify dataset visualizes the most popular artists (by number of tracks), the loudest tracks, the artists with the most danceable tracks, the tracks with the highest instrumentalness, the most energetic tracks, loudness versus energy, and the durations of the longest tracks.
As a next step, the project could be extended by analyzing and charting playlist genres and their sub-genres, and by creating histograms for tempo, speechiness, acousticness, instrumentalness, liveness and valence.**
  