# <center style="color: blue" >Part 1 - Preparing the dataset (Cleaned Version)</center>


***
## <center style="color: blue" >Imports needed for data exploration </center>

In [26]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns


***
## <center style="color: blue" >Import the data set</center>

In [27]:
music_df = pd.read_csv('../data/charts.csv')

***
## <center style="color: blue" >Explore the data set</center>

In [28]:
music_df.head()

Unnamed: 0,title,rank,date,artist,url,region,chart,trend,streams
0,Chantaje (feat. Maluma),1,2017-01-01,Shakira,https://open.spotify.com/track/6mICuAdrwEjh6Y6...,Argentina,top200,SAME_POSITION,253019.0
1,Vente Pa' Ca (feat. Maluma),2,2017-01-01,Ricky Martin,https://open.spotify.com/track/7DM4BPaS7uofFul...,Argentina,top200,MOVE_UP,223988.0
2,Reggaetón Lento (Bailemos),3,2017-01-01,CNCO,https://open.spotify.com/track/3AEZUABDXNtecAO...,Argentina,top200,MOVE_DOWN,210943.0
3,Safari,4,2017-01-01,"J Balvin, Pharrell Williams, BIA, Sky",https://open.spotify.com/track/6rQSrBHf7HlZjtc...,Argentina,top200,SAME_POSITION,173865.0
4,Shaky Shaky,5,2017-01-01,Daddy Yankee,https://open.spotify.com/track/58IL315gMSTD37D...,Argentina,top200,MOVE_UP,153956.0


In [29]:
music_df.tail()

Unnamed: 0,title,rank,date,artist,url,region,chart,trend,streams
26173509,BYE,46,2021-07-31,Jaden,https://open.spotify.com/track/3OUyyDN7EZrL7i0...,Vietnam,viral50,MOVE_UP,
26173510,Pillars,47,2021-07-31,My Anh,https://open.spotify.com/track/6eky30oFiQbHUAT...,Vietnam,viral50,NEW_ENTRY,
26173511,Gái Độc Thân,48,2021-07-31,Tlinh,https://open.spotify.com/track/2klsSb2iTfgDh95...,Vietnam,viral50,MOVE_DOWN,
26173512,Renegade (feat. Taylor Swift),49,2021-07-31,Big Red Machine,https://open.spotify.com/track/1aU1wpYBSpP0M6I...,Vietnam,viral50,MOVE_DOWN,
26173513,Letter to Jarad,50,2021-07-31,"LRN Slime, Shiloh Dynasty",https://open.spotify.com/track/508QhA2SncMbh5C...,Vietnam,viral50,MOVE_DOWN,


In [30]:
music_df.shape

(26173514, 9)

In [31]:
music_df.dtypes

title       object
rank         int64
date        object
artist      object
url         object
region      object
chart       object
trend       object
streams    float64
dtype: object

In [32]:
music_df.columns

Index(['title', 'rank', 'date', 'artist', 'url', 'region', 'chart', 'trend',
       'streams'],
      dtype='object')

In [33]:
music_df.isnull().sum()

title           11
rank             0
date             0
artist          18
url              0
region           0
chart            0
trend            0
streams    5851610
dtype: int64

## <center style="color: blue" >Remove unwanted rows and columns.</center>

### Drop rows where title or artist is null.

<span style="color: red">OBS -  We don't want to drop rows with null in streams, since it doesn't mean that there is missing values - just missing listeners.</span>

In [34]:
music_df = music_df.dropna(subset=['title', 'artist'])

In [35]:
# Check that values have been properly removed.
music_df.isnull().sum()

title            0
rank             0
date             0
artist           0
url              0
region           0
chart            0
trend            0
streams    5851588
dtype: int64

### Drop rows with duplicates in both title and artist.

Since we only want to make predictions on wheither a song can hit a top list or not, we don't need duplicates of the same song from different weeks.
    
(A song appearing on a list for more than one week.)

In [36]:
music_df = music_df.drop_duplicates(subset=['title', 'artist'], keep="first")
music_df

Unnamed: 0,title,rank,date,artist,url,region,chart,trend,streams
0,Chantaje (feat. Maluma),1,2017-01-01,Shakira,https://open.spotify.com/track/6mICuAdrwEjh6Y6...,Argentina,top200,SAME_POSITION,253019.0
1,Vente Pa' Ca (feat. Maluma),2,2017-01-01,Ricky Martin,https://open.spotify.com/track/7DM4BPaS7uofFul...,Argentina,top200,MOVE_UP,223988.0
2,Reggaetón Lento (Bailemos),3,2017-01-01,CNCO,https://open.spotify.com/track/3AEZUABDXNtecAO...,Argentina,top200,MOVE_DOWN,210943.0
3,Safari,4,2017-01-01,"J Balvin, Pharrell Williams, BIA, Sky",https://open.spotify.com/track/6rQSrBHf7HlZjtc...,Argentina,top200,SAME_POSITION,173865.0
4,Shaky Shaky,5,2017-01-01,Daddy Yankee,https://open.spotify.com/track/58IL315gMSTD37D...,Argentina,top200,MOVE_UP,153956.0
...,...,...,...,...,...,...,...,...,...
26160571,Imparfait inconnu,28,2021-07-28,Adé,https://open.spotify.com/track/3W7x967esdawMwz...,France,viral50,NEW_ENTRY,
26162386,Still Holy (feat. Ryan Ofei & Naomi Raine),47,2021-07-28,"Tribl, Maverick City Music",https://open.spotify.com/track/4Cw36mfQ1z4JnjF...,South Africa,viral50,NEW_ENTRY,
26165777,怎叹,48,2021-07-29,郑鱼,https://open.spotify.com/track/02jBlxoKmrSXrHE...,Singapore,viral50,NEW_ENTRY,
26168170,ילד השדה,45,2021-07-30,Yehuda Elias,https://open.spotify.com/track/4JaLSvGiHdm4RK8...,Israel,viral50,NEW_ENTRY,


### Drop columns that we can't use for predictions.

In [37]:
# Finding unique types in chart
music_df.chart.unique()

array(['top200', 'viral50'], dtype=object)

**There are two different values in "Chart" - 'top200' and 'viral50'.**

In this project we are interested in inspecting and operating on data for the top chart of spotify. 

This is in our opinion the 'top200'-list, as the 'viral50'-list is a list determinated by Social media. 

The more talk and chatting on a specific track, determine the probability of ending up on the 'viral50'-list.

This is the reason we  remove this from the dataset.

In [39]:
music_df = music_df[music_df.chart.str.contains("viral50") == False]

In [40]:
music_df

Unnamed: 0,title,rank,date,artist,url,region,chart,trend,streams
0,Chantaje (feat. Maluma),1,2017-01-01,Shakira,https://open.spotify.com/track/6mICuAdrwEjh6Y6...,Argentina,top200,SAME_POSITION,253019.0
1,Vente Pa' Ca (feat. Maluma),2,2017-01-01,Ricky Martin,https://open.spotify.com/track/7DM4BPaS7uofFul...,Argentina,top200,MOVE_UP,223988.0
2,Reggaetón Lento (Bailemos),3,2017-01-01,CNCO,https://open.spotify.com/track/3AEZUABDXNtecAO...,Argentina,top200,MOVE_DOWN,210943.0
3,Safari,4,2017-01-01,"J Balvin, Pharrell Williams, BIA, Sky",https://open.spotify.com/track/6rQSrBHf7HlZjtc...,Argentina,top200,SAME_POSITION,173865.0
4,Shaky Shaky,5,2017-01-01,Daddy Yankee,https://open.spotify.com/track/58IL315gMSTD37D...,Argentina,top200,MOVE_UP,153956.0
...,...,...,...,...,...,...,...,...,...
24917263,คิดมาก,157,2017-12-31,Palmy,https://open.spotify.com/track/7fil1CerP6aA0hU...,Thailand,top200,NEW_ENTRY,1445.0
24917289,โต้รุ่ง (feat. โจอี้ บอย),183,2017-12-31,"Boom Boom Cash, Joey Boy",https://open.spotify.com/track/3QSftcwKhotprns...,Thailand,top200,NEW_ENTRY,1285.0
25088276,Na Szczycie,182,2018-01-29,Grubson,https://open.spotify.com/track/7aQViYYpIkpJwyC...,Poland,top200,NEW_ENTRY,7386.0
25096450,All Right,175,2018-01-29,Red Velvet,https://open.spotify.com/track/2SahFuW1EZN9MXN...,Thailand,top200,NEW_ENTRY,1488.0


**The dataset has a trend column. This is the movement of a song on the chart, whether it has moved up or down the chart. We know that this dataset is collected weekley, and therefore there are dublicates of the songs on different positions. We are not interested in the same song multiple times, so we will try to remove these dublicates so we can get a much cleaner dataset.**


**The trend colum is in our oppinion not nessesary for analysing the dataset, while we do not make any measurements on the movement of the individual tracks. This is the reason why we remove it.**

**And the column chart only holds a single value 'top200', so now we can remove this one aswell.**

In [41]:
music_df = music_df.drop(['trend', 'chart'], axis=1)

In [45]:
# Check what columns we have left.
list(music_df)

['title', 'rank', 'date', 'artist', 'url', 'region', 'streams']

In [46]:
# Check the shape
music_df.shape

(100220, 7)

***
## <center style="color: blue" >Convert data values</center>

In [47]:
# Convert streams column to integer values - and convert NaN values to 0.
music_df['streams'] = np.nan_to_num(music_df['streams']).astype(int)

In [48]:
# Check that values have been converted properly.
music_df.head()

Unnamed: 0,title,rank,date,artist,url,region,streams
0,Chantaje (feat. Maluma),1,2017-01-01,Shakira,https://open.spotify.com/track/6mICuAdrwEjh6Y6...,Argentina,253019
1,Vente Pa' Ca (feat. Maluma),2,2017-01-01,Ricky Martin,https://open.spotify.com/track/7DM4BPaS7uofFul...,Argentina,223988
2,Reggaetón Lento (Bailemos),3,2017-01-01,CNCO,https://open.spotify.com/track/3AEZUABDXNtecAO...,Argentina,210943
3,Safari,4,2017-01-01,"J Balvin, Pharrell Williams, BIA, Sky",https://open.spotify.com/track/6rQSrBHf7HlZjtc...,Argentina,173865
4,Shaky Shaky,5,2017-01-01,Daddy Yankee,https://open.spotify.com/track/58IL315gMSTD37D...,Argentina,153956


In [49]:
music_df.dtypes

title      object
rank        int64
date       object
artist     object
url        object
region     object
streams     int32
dtype: object

***
## <center style="color: blue" >Reset index after cleaning the dataset.</center>

In [50]:
# Sort the values based on the column 'rank'.
music_df = music_df.sort_values('rank', ascending=True)

In [51]:
# Reset the index files, so every entry has their own id.
music_df.reset_index(drop=True, inplace=True)

In [52]:
music_df

Unnamed: 0,title,rank,date,artist,url,region,streams
0,Chantaje (feat. Maluma),1,2017-01-01,Shakira,https://open.spotify.com/track/6mICuAdrwEjh6Y6...,Argentina,253019
1,Waterval,1,2021-11-28,K3,https://open.spotify.com/track/6Y3VGa2XlHkuJnS...,Belgium,73917
2,Kafamda kentsel dönüşümler,1,2020-08-01,İkiye On Kala,https://open.spotify.com/track/7iEfRnNqAcTse7A...,Turkey,140603
3,POLSKIE TANGO,1,2020-08-01,"Taco Hemingway, Lanek",https://open.spotify.com/track/2fNLY6Ks5tKg2e4...,Poland,157811
4,Bir İhtimal Biliyorum,1,2019-01-01,Gülşen,https://open.spotify.com/track/24mpJgP2YZUBif9...,Turkey,108528
...,...,...,...,...,...,...,...
100215,Tuổi Gì Mà Chẳng Thích Lì Xì,200,2020-01-20,"Bich Phuong, Bình Gold",https://open.spotify.com/track/6tOyEmIo6e3VzO9...,Vietnam,2321
100216,El Problema,200,2020-05-01,Ricardo Arjona,https://open.spotify.com/track/5UJsYyBi0CdSJl0...,Guatemala,3445
100217,未見過世面的招積,200,2021-10-19,Kay Tse,https://open.spotify.com/track/6Eh74dmYzQtnMUy...,Hong Kong,3232
100218,Beamer Boi,200,2020-05-02,"Khontkar, Myndless Grimes",https://open.spotify.com/track/7zzI3SbctTJN6wD...,Turkey,21700


***
## <center style="color: blue" >Save the cleaned dataset</center>

In [53]:
#Saving found data to csv file
from pathlib import Path  
path = Path('../data/Data sets (Martins stages)/Stage 1 - Cleaned dataset.csv')  
path.parent.mkdir(parents=True, exist_ok=True)  
music_df.to_csv(path)