# Data Cleaning

In [1]:
import pandas as pd

## Data Loading

Imported the seven raw CSV chunks, each already augmented with scraped song release dates, and concatenated them into a single DataFrame.

In [13]:
# Read each raw CSV chunk into its own DataFrame. The names on the left are
# bound, in order, to the files listed in chunk_files.
chunk_files = (
    "Spotify_Youtube_with_release_first7000.csv",
    "7001_9999_update_date.csv",
    "10000_11998_update_date.csv",
    "11999_14000_update_date.csv",
    "RawData14001-17000_with_release.csv",
    "RawData17001-19000_with_release.csv",
    "RawData19001-20717_with_release.csv",
)
(
    part_one,
    part_two_one,
    part_two_two,
    part_two_three,
    part_three_one,
    part_three_two,
    part_three_three,
) = map(pd.read_csv, chunk_files)

In [37]:
# Stack the seven chunks top to bottom. ignore_index=True gives the combined
# frame a fresh 0..n-1 index instead of repeating each chunk's own index.
raw_chunks = [
    part_one,
    part_two_one,
    part_two_two,
    part_two_three,
    part_three_one,
    part_three_two,
    part_three_three,
]
raw_data = pd.concat(raw_chunks, ignore_index=True)
raw_data.head()

Unnamed: 0.1,Unnamed: 0,Artist,Url_spotify,Track,Album,Album_type,Uri,Danceability,Energy,Key,...,Views,Likes,Comments,Description,Licensed,official_video,Stream,spotify_track_id,release_date_raw,release_date
0,0,Gorillaz,https://open.spotify.com/artist/3AA28KZvwAUcZu...,Feel Good Inc.,Demon Days,album,spotify:track:0d28khcov6AiegSCpG5TuT,0.818,0.705,6.0,...,693555221.0,6220896.0,169907.0,Official HD Video for Gorillaz' fantastic trac...,True,True,1040235000.0,0d28khcov6AiegSCpG5TuT,2005-05-23,2005-05-23
1,1,Gorillaz,https://open.spotify.com/artist/3AA28KZvwAUcZu...,Rhinestone Eyes,Plastic Beach,album,spotify:track:1foMv2HQwfQ2vntFf9HFeG,0.676,0.703,8.0,...,72011645.0,1079128.0,31003.0,The official video for Gorillaz - Rhinestone E...,True,True,310083700.0,1foMv2HQwfQ2vntFf9HFeG,2010-03-03,2010-03-03
2,2,Gorillaz,https://open.spotify.com/artist/3AA28KZvwAUcZu...,New Gold (feat. Tame Impala and Bootie Brown),New Gold (feat. Tame Impala and Bootie Brown),single,spotify:track:64dLd6rVqDLtkXFYrEUHIU,0.695,0.923,1.0,...,8435055.0,282142.0,7399.0,Gorillaz - New Gold ft. Tame Impala & Bootie B...,True,True,63063470.0,64dLd6rVqDLtkXFYrEUHIU,2022-08-31,2022-08-31
3,3,Gorillaz,https://open.spotify.com/artist/3AA28KZvwAUcZu...,On Melancholy Hill,Plastic Beach,album,spotify:track:0q6LuUqGLUiCPP1cbdwFs3,0.689,0.739,2.0,...,211754952.0,1788577.0,55229.0,Follow Gorillaz online:\nhttp://gorillaz.com \...,True,True,434663600.0,0q6LuUqGLUiCPP1cbdwFs3,2010-03-03,2010-03-03
4,4,Gorillaz,https://open.spotify.com/artist/3AA28KZvwAUcZu...,Clint Eastwood,Gorillaz,album,spotify:track:7yMiX7n9SBvadzox8T5jzT,0.663,0.694,10.0,...,618480958.0,6197318.0,155930.0,The official music video for Gorillaz - Clint ...,True,True,617259700.0,7yMiX7n9SBvadzox8T5jzT,2001,2001-01-01


## Data Pruning

Dropped irrelevant columns: `Unnamed: 0`, the artist's Spotify page URL (`Url_spotify`), and the YouTube video description (`Description`)

In [38]:
# Remove columns the analysis does not need.
# NOTE(review): 'Unnamed: 0' is presumably an index saved into the CSVs - confirm.
columns_to_drop = ['Unnamed: 0', 'Url_spotify', 'Description']
raw_data = raw_data.drop(columns=columns_to_drop)

Counted the missing (NA) values in each column

In [53]:
# Count missing values per column: isna() flags each missing cell and
# sum() adds up the flags column by column.
na_counts = raw_data.isna().sum()
na_counts

Artist                0
Track                 0
Album                 0
Album_type            0
Uri                   0
Danceability          2
Energy                2
Key                   2
Loudness              2
Speechiness           2
Acousticness          2
Instrumentalness      2
Liveness              2
Valence               2
Tempo                 2
Duration_ms           2
Url_youtube         470
Title               470
Channel             470
Views               470
Likes               541
Comments            569
Licensed            470
official_video      470
Stream              576
spotify_track_id      0
release_date_raw      4
release_date         94
dtype: int64

Removed 1,117 songs that are missing YouTube video/licensing details, streaming data, song attributes, or a release date. In all these cases, interpolation does not make sense.

In [97]:
# Keep only rows where every required field is present; a row is dropped
# as soon as any one of these columns is missing.
required_columns = ["Url_youtube", "Stream", "Danceability", "release_date"]
data = raw_data.dropna(axis="index", how="any", subset=required_columns)

Because we are only interested in licensed, official videos rather than fan uploads, we keep only songs whose YouTube video is licensed, is marked as an official video, and does not have "lyric" in its title.

In [100]:
# Build one boolean mask per condition, then apply them together.
is_licensed = data["Licensed"] == True
is_official = data["official_video"] == True
# Case-insensitive match on "lyric" anywhere in the title; a missing title
# counts as "not a lyric video" (na=False).
is_lyric_video = data["Title"].str.contains(r"lyric", case=False, na=False)
data = data[is_licensed & is_official & ~is_lyric_video]

In [104]:
# Preview the first five rows of the filtered data.
data.head(n=5)

Unnamed: 0,Artist,Track,Album,Album_type,Uri,Danceability,Energy,Key,Loudness,Speechiness,...,Channel,Views,Likes,Comments,Licensed,official_video,Stream,spotify_track_id,release_date_raw,release_date
0,Gorillaz,Feel Good Inc.,Demon Days,album,spotify:track:0d28khcov6AiegSCpG5TuT,0.818,0.705,6.0,-6.679,0.177,...,Gorillaz,693555221.0,6220896.0,169907.0,True,True,1040235000.0,0d28khcov6AiegSCpG5TuT,2005-05-23,2005-05-23
1,Gorillaz,Rhinestone Eyes,Plastic Beach,album,spotify:track:1foMv2HQwfQ2vntFf9HFeG,0.676,0.703,8.0,-5.815,0.0302,...,Gorillaz,72011645.0,1079128.0,31003.0,True,True,310083700.0,1foMv2HQwfQ2vntFf9HFeG,2010-03-03,2010-03-03
2,Gorillaz,New Gold (feat. Tame Impala and Bootie Brown),New Gold (feat. Tame Impala and Bootie Brown),single,spotify:track:64dLd6rVqDLtkXFYrEUHIU,0.695,0.923,1.0,-3.93,0.0522,...,Gorillaz,8435055.0,282142.0,7399.0,True,True,63063470.0,64dLd6rVqDLtkXFYrEUHIU,2022-08-31,2022-08-31
3,Gorillaz,On Melancholy Hill,Plastic Beach,album,spotify:track:0q6LuUqGLUiCPP1cbdwFs3,0.689,0.739,2.0,-5.81,0.026,...,Gorillaz,211754952.0,1788577.0,55229.0,True,True,434663600.0,0q6LuUqGLUiCPP1cbdwFs3,2010-03-03,2010-03-03
4,Gorillaz,Clint Eastwood,Gorillaz,album,spotify:track:7yMiX7n9SBvadzox8T5jzT,0.663,0.694,10.0,-8.627,0.171,...,Gorillaz,618480958.0,6197318.0,155930.0,True,True,617259700.0,7yMiX7n9SBvadzox8T5jzT,2001,2001-01-01
