In [1]:
import os
import random

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer #needed for OneHot-encoding
from sklearn.decomposition import PCA #PCA
from sklearn.preprocessing import LabelEncoder # integer-encoding: to transform categorical variable into integer
from sklearn.preprocessing import OneHotEncoder  # OneHot-encoding
from sklearn.preprocessing import LabelEncoder  #needed for OneHot-encoding
from sklearn.preprocessing import StandardScaler # to scale our data

%config IPCompleter.greedy=True #to enable intellisense: see it when hitting tab

In [2]:
# Set seed
# Seed both the standard-library RNG and numpy's global RNG. pandas and
# scikit-learn draw from numpy's generator when no explicit random_state is
# given, so seeding `random` alone does not make those calls reproducible.
SEED = 2312
random.seed(SEED)
np.random.seed(SEED)

# Data preprocessing:

In [3]:
# Directory holding the project's CSV files. Set the ML_LAB_DATA_DIR environment
# variable to point at your own checkout; when it is unset the original
# author's location is used, so existing behaviour is unchanged.
DATA_DIR = os.environ.get("ML_LAB_DATA_DIR", "C:/Users/Shaurya/Documents/GitHub/ML-Lab/data")
# Load the raw Spotify track-feature table.
music_df = pd.read_csv(f"{DATA_DIR}/SpotifyFeatures.csv")

In [4]:
# Is there a missing value anywhere in the table? (True if at least one cell is NaN/None)
music_df.isna().to_numpy().any()

False

In [5]:
# Flag rows that are exact copies of an earlier row, then show only the flagged ones.
# An empty Series in the output means there are no duplicated rows.
search = music_df.duplicated()
print(search[search])

Series([], dtype: bool)


In [6]:
# Preview the first five rows of the raw table.
music_df.head(n=5)

Unnamed: 0,genre,artist_name,track_name,track_id,popularity,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence
0,Movie,Henri Salvador,C'est beau de faire un Show,0BRjO6ga9RKCKjfDqeFgWV,0,0.611,0.389,99373,0.91,0.0,C#,0.346,-1.828,Major,0.0525,166.969,4/4,0.814
1,Movie,Martin & les fées,Perdu d'avance (par Gad Elmaleh),0BjC1NfoEOOusryehmNudP,1,0.246,0.59,137373,0.737,0.0,F#,0.151,-5.559,Minor,0.0868,174.003,4/4,0.816
2,Movie,Joseph Williams,Don't Let Me Be Lonely Tonight,0CoSDzoNIKCRs124s9uTVy,3,0.952,0.663,170267,0.131,0.0,C,0.103,-13.879,Minor,0.0362,99.488,5/4,0.368
3,Movie,Henri Salvador,Dis-moi Monsieur Gordon Cooper,0Gc6TVm52BwZD07Ki6tIvf,0,0.703,0.24,152427,0.326,0.0,C#,0.0985,-12.178,Major,0.0395,171.758,4/4,0.227
4,Movie,Fabien Nataf,Ouverture,0IuslXpMROHdEPvSl1fTQK,4,0.95,0.331,82625,0.225,0.123,F,0.202,-21.15,Major,0.0456,140.576,4/4,0.39


### Variables:
- popularity: ?
- key: The estimated overall key of the track. Integers map to pitches using standard Pitch Class notation (E.g.: 0 = C)
- mode: modality (major or minor) of a track (Major = 1 and Minor = 0) ? 
- time_signature: estimated overall time signature (German: Taktangabe) of a track. It's a notational convention that specifies how many beats are in each bar (or measure).
- title: maybe there is a way to also include strings? 
- genre: one-hot encoding
- track id: integer encoding

## Transform variables:

### 1. key:

In [7]:
# How many tracks are there per musical key?
music_df.loc[:, "key"].value_counts()

C     27583
G     26390
D     24077
C#    23201
A     22671
F     20279
B     17661
E     17390
A#    15526
F#    15222
G#    15159
D#     7566
Name: key, dtype: int64

In [8]:
# No LabelEncoder here: the integer for every note is predetermined
# (C = 0, C# = 1, ..., B = 11), so the mapping is written out by hand,
# listed in ascending order of its integer code.
key_mapping = {
    "C": 0, "C#": 1, "D": 2, "D#": 3, "E": 4, "F": 5,
    "F#": 6, "G": 7, "G#": 8, "A": 9, "A#": 10, "B": 11,
}
# Replace each note name by its integer code.
music_df["key"] = music_df["key"].map(key_mapping)
music_df.head(n=5)

Unnamed: 0,genre,artist_name,track_name,track_id,popularity,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence
0,Movie,Henri Salvador,C'est beau de faire un Show,0BRjO6ga9RKCKjfDqeFgWV,0,0.611,0.389,99373,0.91,0.0,1,0.346,-1.828,Major,0.0525,166.969,4/4,0.814
1,Movie,Martin & les fées,Perdu d'avance (par Gad Elmaleh),0BjC1NfoEOOusryehmNudP,1,0.246,0.59,137373,0.737,0.0,6,0.151,-5.559,Minor,0.0868,174.003,4/4,0.816
2,Movie,Joseph Williams,Don't Let Me Be Lonely Tonight,0CoSDzoNIKCRs124s9uTVy,3,0.952,0.663,170267,0.131,0.0,0,0.103,-13.879,Minor,0.0362,99.488,5/4,0.368
3,Movie,Henri Salvador,Dis-moi Monsieur Gordon Cooper,0Gc6TVm52BwZD07Ki6tIvf,0,0.703,0.24,152427,0.326,0.0,1,0.0985,-12.178,Major,0.0395,171.758,4/4,0.227
4,Movie,Fabien Nataf,Ouverture,0IuslXpMROHdEPvSl1fTQK,4,0.95,0.331,82625,0.225,0.123,5,0.202,-21.15,Major,0.0456,140.576,4/4,0.39


### 2. Genre:

In [9]:
# How many tracks are there per genre label?
music_df.loc[:, "genre"].value_counts()

Comedy              9681
Soundtrack          9646
Indie               9543
Jazz                9441
Pop                 9386
Electronic          9377
Children’s Music    9353
Folk                9299
Hip-Hop             9295
Rock                9272
Alternative         9263
Classical           9256
Rap                 9232
World               9096
Soul                9089
Blues               9023
R&B                 8992
Anime               8936
Reggaeton           8927
Ska                 8874
Reggae              8771
Dance               8701
Country             8664
Opera               8280
Movie               7806
Children's Music    5403
A Capella            119
Name: genre, dtype: int64

#### One-hot encoding of the genre variable:

In [10]:
# One-hot encode the genre: one 0/1 indicator column per genre label, named
# "genre_is_<label>". (The former `columns=` argument was dropped: it is
# ignored when a single Series is passed.)
# dtype is pinned to uint8, the historical default, because pandas >= 2.0
# returns booleans otherwise, which would turn the exported 0/1 into True/False.
dum_df = pd.get_dummies(music_df["genre"], prefix="genre_is", dtype=np.uint8)
music_df = music_df.join(dum_df)
# Use the `columns=` keyword: the positional axis argument of DataFrame.drop
# was removed in pandas 2.0, so `drop(["genre"], 1)` raises a TypeError there.
music_df = music_df.drop(columns=["genre"])

In [11]:
# Column names after the genre dummies were added.
music_df.columns.tolist()

['artist_name',
 'track_name',
 'track_id',
 'popularity',
 'acousticness',
 'danceability',
 'duration_ms',
 'energy',
 'instrumentalness',
 'key',
 'liveness',
 'loudness',
 'mode',
 'speechiness',
 'tempo',
 'time_signature',
 'valence',
 'genre_is_A Capella',
 'genre_is_Alternative',
 'genre_is_Anime',
 'genre_is_Blues',
 "genre_is_Children's Music",
 'genre_is_Children’s Music',
 'genre_is_Classical',
 'genre_is_Comedy',
 'genre_is_Country',
 'genre_is_Dance',
 'genre_is_Electronic',
 'genre_is_Folk',
 'genre_is_Hip-Hop',
 'genre_is_Indie',
 'genre_is_Jazz',
 'genre_is_Movie',
 'genre_is_Opera',
 'genre_is_Pop',
 'genre_is_R&B',
 'genre_is_Rap',
 'genre_is_Reggae',
 'genre_is_Reggaeton',
 'genre_is_Rock',
 'genre_is_Ska',
 'genre_is_Soul',
 'genre_is_Soundtrack',
 'genre_is_World']

In [12]:
# The data spells "Children's Music" with two different apostrophes (' and ’),
# which produced two separate dummies; add them up into a single indicator.
music_df["genre_is_Children_music"] = music_df["genre_is_Children's Music"].add(
    music_df["genre_is_Children’s Music"]
)

In [13]:
# Sanity check of the merged indicator: min and max should stay within 0 and 1.
music_df.loc[:, "genre_is_Children_music"].describe()

count    232725.000000
mean          0.063405
std           0.243691
min           0.000000
25%           0.000000
50%           0.000000
75%           0.000000
max           1.000000
Name: genre_is_Children_music, dtype: float64

In [14]:
# The straight-apostrophe dummy is now covered by genre_is_Children_music.
music_df = music_df.drop("genre_is_Children's Music", axis="columns")


In [15]:
# The curly-apostrophe dummy is now covered by genre_is_Children_music.
music_df = music_df.drop("genre_is_Children’s Music", axis="columns")

In [16]:
# Remove one genre dummy (A Capella); a row with all remaining genre dummies
# equal to 0 then stands for that genre.
music_df = music_df.drop("genre_is_A Capella", axis="columns")

In [17]:
# Column names after the genre dummies were cleaned up.
music_df.columns.tolist()

['artist_name',
 'track_name',
 'track_id',
 'popularity',
 'acousticness',
 'danceability',
 'duration_ms',
 'energy',
 'instrumentalness',
 'key',
 'liveness',
 'loudness',
 'mode',
 'speechiness',
 'tempo',
 'time_signature',
 'valence',
 'genre_is_Alternative',
 'genre_is_Anime',
 'genre_is_Blues',
 'genre_is_Classical',
 'genre_is_Comedy',
 'genre_is_Country',
 'genre_is_Dance',
 'genre_is_Electronic',
 'genre_is_Folk',
 'genre_is_Hip-Hop',
 'genre_is_Indie',
 'genre_is_Jazz',
 'genre_is_Movie',
 'genre_is_Opera',
 'genre_is_Pop',
 'genre_is_R&B',
 'genre_is_Rap',
 'genre_is_Reggae',
 'genre_is_Reggaeton',
 'genre_is_Rock',
 'genre_is_Ska',
 'genre_is_Soul',
 'genre_is_Soundtrack',
 'genre_is_World',
 'genre_is_Children_music']

In [18]:
# Number of columns currently in the table.
music_df.shape[1]

42

### 3. Mode:

In [19]:
# How many tracks are in a major vs. a minor mode?
# (Bracket/.loc access is required: `music_df.mode` is a DataFrame method.)
music_df.loc[:, "mode"].value_counts()

Major    151744
Minor     80981
Name: mode, dtype: int64

In [20]:
# Recode the mode as a 0/1 indicator (1 = Major, 0 = Minor) and rename the
# column so its name states what a 1 means.
# Series.map is used instead of Series.replace: turning strings into integers
# through replace depends on pandas silently downcasting the object column to
# int64, which is deprecated as of pandas 2.2. map builds the integer column
# directly (any label other than Major/Minor would become NaN).
music_df["mode"] = music_df["mode"].map({"Major": 1, "Minor": 0})
# Reassign instead of inplace=True so the cell reads as a plain transformation.
music_df = music_df.rename(columns={"mode": "Mode is Major"})


In [21]:
music_df["Mode is Major"].value_counts()

1    151744
0     80981
Name: Mode is Major, dtype: int64

### 4. time_signature

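I will transform the string into a fraction (float). (Is that correct? → music expert Chan, what do you think?) Note: this fractional encoding is reverted further below ("fixing wrong encoding") and replaced by one-hot dummies.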

In [22]:
# How many tracks are there per time signature?
music_df.loc[:, "time_signature"].value_counts()

4/4    200760
3/4     24111
5/4      5238
1/4      2608
0/4         8
Name: time_signature, dtype: int64

In [23]:
# Lookup table: time-signature string -> numeric value of that fraction.
time_signature_mapping = {
    "0/4": 0,
    "1/4": 1/4,
    "3/4": 3/4,
    "4/4": 1,
    "5/4": 5/4,
}
# Replace every time-signature string by its numeric value, then show the counts.
music_df["time_signature"] = music_df["time_signature"].map(time_signature_mapping)
music_df["time_signature"].value_counts()

1.00    200760
0.75     24111
1.25      5238
0.25      2608
0.00         8
Name: time_signature, dtype: int64

In [24]:
# fixing wrong encoding: turn the numeric fractions back into their
# time-signature labels and one-hot encode those labels instead.
# The result is assigned back to the column explicitly. The previous
# `music_df.time_signature.replace(..., inplace=True)` modified a Series
# obtained through attribute access, which does not reliably write back to
# music_df (and does nothing under pandas' Copy-on-Write mode).
time_signature_labels = {1: '4/4', 0.75: '3/4', 1.25: '5/4', 0.25: '1/4', 0: '0/4'}
music_df['time_signature'] = (
    music_df['time_signature'].replace(time_signature_labels).astype(object)
)
# dtype pinned to uint8 (the historical default) so the dummies stay 0/1
# instead of becoming booleans on pandas >= 2.0. The former `columns=`
# argument was dropped: it is ignored when a single Series is passed.
dum_df = pd.get_dummies(music_df["time_signature"], prefix="time_sig_is", dtype=np.uint8)
music_df = music_df.join(dum_df)
# `columns=` keyword: the positional axis argument was removed in pandas 2.0.
music_df = music_df.drop(columns=["time_signature"])

# removing the extra dummy to avoid the dummy variable trap
music_df = music_df.drop(columns=['time_sig_is_0/4'])

In [25]:
# Column names of the final, cleaned dataset.
music_df.columns.tolist()

['artist_name',
 'track_name',
 'track_id',
 'popularity',
 'acousticness',
 'danceability',
 'duration_ms',
 'energy',
 'instrumentalness',
 'key',
 'liveness',
 'loudness',
 'Mode is Major',
 'speechiness',
 'tempo',
 'valence',
 'genre_is_Alternative',
 'genre_is_Anime',
 'genre_is_Blues',
 'genre_is_Classical',
 'genre_is_Comedy',
 'genre_is_Country',
 'genre_is_Dance',
 'genre_is_Electronic',
 'genre_is_Folk',
 'genre_is_Hip-Hop',
 'genre_is_Indie',
 'genre_is_Jazz',
 'genre_is_Movie',
 'genre_is_Opera',
 'genre_is_Pop',
 'genre_is_R&B',
 'genre_is_Rap',
 'genre_is_Reggae',
 'genre_is_Reggaeton',
 'genre_is_Rock',
 'genre_is_Ska',
 'genre_is_Soul',
 'genre_is_Soundtrack',
 'genre_is_World',
 'genre_is_Children_music',
 'time_sig_is_1/4',
 'time_sig_is_3/4',
 'time_sig_is_4/4',
 'time_sig_is_5/4']

In [26]:
# Preview the first five rows of the encoded table.
music_df.head(n=5)

Unnamed: 0,artist_name,track_name,track_id,popularity,acousticness,danceability,duration_ms,energy,instrumentalness,key,...,genre_is_Rock,genre_is_Ska,genre_is_Soul,genre_is_Soundtrack,genre_is_World,genre_is_Children_music,time_sig_is_1/4,time_sig_is_3/4,time_sig_is_4/4,time_sig_is_5/4
0,Henri Salvador,C'est beau de faire un Show,0BRjO6ga9RKCKjfDqeFgWV,0,0.611,0.389,99373,0.91,0.0,1,...,0,0,0,0,0,0,0,0,1,0
1,Martin & les fées,Perdu d'avance (par Gad Elmaleh),0BjC1NfoEOOusryehmNudP,1,0.246,0.59,137373,0.737,0.0,6,...,0,0,0,0,0,0,0,0,1,0
2,Joseph Williams,Don't Let Me Be Lonely Tonight,0CoSDzoNIKCRs124s9uTVy,3,0.952,0.663,170267,0.131,0.0,0,...,0,0,0,0,0,0,0,0,0,1
3,Henri Salvador,Dis-moi Monsieur Gordon Cooper,0Gc6TVm52BwZD07Ki6tIvf,0,0.703,0.24,152427,0.326,0.0,1,...,0,0,0,0,0,0,0,0,1,0
4,Fabien Nataf,Ouverture,0IuslXpMROHdEPvSl1fTQK,4,0.95,0.331,82625,0.225,0.123,5,...,0,0,0,0,0,0,0,0,1,0


## Drop all columns we are not using (string valued):

I am also removing the track ID because I think we don't need it right now.

In [27]:
# Drop the string-valued identifier columns that are not used as features.
# `columns=` keyword: the positional axis argument of DataFrame.drop was
# removed in pandas 2.0, so `drop([...], 1)` raises a TypeError there.
music_df = music_df.drop(columns=["artist_name", "track_name", "track_id"])

## Export Encoded Data

In [28]:
# Write the encoded table (including its row index) to disk.
music_df.to_csv("C:/Users/Shaurya/Documents/GitHub/ML-Lab/data/EncodedData_updated.csv")

# Data Analysis

## 1. First Overview 

In [29]:
# DataFrame.info() prints its summary itself and returns None; wrapping it in
# print() only appended a stray "None" line to the output.
music_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 232725 entries, 0 to 232724
Data columns (total 42 columns):
 #   Column                   Non-Null Count   Dtype  
---  ------                   --------------   -----  
 0   popularity               232725 non-null  int64  
 1   acousticness             232725 non-null  float64
 2   danceability             232725 non-null  float64
 3   duration_ms              232725 non-null  int64  
 4   energy                   232725 non-null  float64
 5   instrumentalness         232725 non-null  float64
 6   key                      232725 non-null  int64  
 7   liveness                 232725 non-null  float64
 8   loudness                 232725 non-null  float64
 9   Mode is Major            232725 non-null  int64  
 10  speechiness              232725 non-null  float64
 11  tempo                    232725 non-null  float64
 12  valence                  232725 non-null  float64
 13  genre_is_Alternative     232725 non-null  uint8  
 14  genr

In [30]:
from sklearn.model_selection import train_test_split 
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import MinMaxScaler
from sklearn import datasets
from sklearn.neural_network import MLPRegressor

In [31]:
# Separate the features (X: every column except popularity) from the
# regression target (y: popularity).
# `columns=` keyword: the positional axis argument of DataFrame.drop was
# removed in pandas 2.0, so `drop(["popularity"], 1)` raises a TypeError there.
X = music_df.drop(columns=["popularity"])
y = music_df["popularity"]

### Split data into training and test set:

In [32]:
# Hold out 20% of the rows as a test set; the fixed random_state makes the
# split repeatable.
_split = train_test_split(X, y, test_size=0.2, random_state=0)
X_tr, X_t, y_tr, y_t = _split

In [33]:
# Write each of the four train/test pieces to its own CSV file, named after
# the variable that holds it.
for _name, _part in {"X_tr": X_tr, "X_t": X_t, "y_tr": y_tr, "y_t": y_t}.items():
    _part.to_csv(f"C:/Users/Shaurya/Documents/GitHub/ML-Lab/data/{_name}.csv")