# Project 6 Moosic (unsupervised ML --> playlists)

### Import necessary libraries

In [None]:
# Standard library
import os

# Third-party: data handling and plotting
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

# Third-party: scikit-learn clustering, distance metrics and scalers
# NOTE: the estimator is spelled "KMeans"; importing "Kmeans" raises ImportError.
from sklearn.cluster import KMeans
from sklearn.metrics import pairwise_distances
from sklearn.metrics.pairwise import manhattan_distances
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import QuantileTransformer
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import StandardScaler

### Load the data and set `song_name` and `artist` as the index

In [None]:
# Small dataset (10 songs), indexed by (artist, song_name).
# NOTE: the following cell assigns the same two names for the 1000-song file,
# so run only the cell for the dataset you want to work with.
path = r"C:\Users\krugm\OneDrive\Work\Data Science\WBS\Bootcamp\Bootcamp\Project 6 - Moosic\Data\audio_features_10_songs\df_audio_features_10.csv"
songs_data_df = pd.read_csv(
    filepath_or_buffer=path,
    index_col=["artist", "song_name"],
)

In [None]:
# Big dataset (1000 songs), indexed by (artist, song_name).
# NOTE: this reuses `path` and `songs_data_df`, replacing whatever the
# 10-song cell above loaded.
path = r"C:\Users\krugm\OneDrive\Work\Data Science\WBS\Bootcamp\Bootcamp\Project 6 - Moosic\Data\audio_features_1000_songs\df_audio_features_1000.csv"
songs_data_df = pd.read_csv(
    filepath_or_buffer=path,
    index_col=["artist", "song_name"],
)

In [None]:
# Preview the loaded song table (the notebook renders a cell's last expression).
songs_data_df

### Drop unnecessary columns

In [None]:
# Keep only the columns used for the distance calculations; the listed
# columns are removed, the row index is left untouched.
songs_data_clean_df = songs_data_df.drop(
    columns=[
        "id",
        "html",
        "duration_ms",
        "time_signature",
        "mode",
        "key",
        "speechiness",
    ]
)

In [None]:
# Preview the table after dropping the unused columns.
songs_data_clean_df

## Use SciKit-Learn to calculate the euclidean distances

In [None]:
# Song-by-song distance matrix. "euclidean" is the default metric of
# pairwise_distances; it is spelled out here for readability.
# Rows and columns both carry the (artist, song_name) labels; dropping
# columns did not change the index, so the cleaned frame's index is used.
eucl_song = pd.DataFrame(
    data=pairwise_distances(songs_data_clean_df, metric="euclidean"),
    index=songs_data_clean_df.index,
    columns=songs_data_clean_df.index,
)

In [None]:
# Show the Euclidean distance matrix.
eucl_song

### Plot a heatmap of the pairwise Euclidean distances

In [None]:
# Open a 12x8 inch figure and draw the Euclidean distance matrix into it.
# The trailing semicolon keeps the notebook from echoing the return value.
plt.figure(figsize=(12, 8))
sns.heatmap(data=eucl_song);

## Use SciKit-Learn to calculate the Manhattan distances

In [None]:
# Song-by-song Manhattan distance matrix, labelled on both axes with the
# (artist, song_name) index. Dropping columns did not change the index,
# so the cleaned frame's index is used.
manh_song = pd.DataFrame(
    data=manhattan_distances(songs_data_clean_df),
    index=songs_data_clean_df.index,
    columns=songs_data_clean_df.index,
)

In [None]:
# Show the Manhattan distance matrix.
manh_song

### Plot a heatmap of the pairwise Manhattan distances

In [None]:
# Open a 12x8 inch figure and draw the Manhattan distance matrix into it.
# The trailing semicolon keeps the notebook from echoing the return value.
plt.figure(figsize=(12, 8))
sns.heatmap(data=manh_song);

# Data Scaling

### Manual Approach

In [None]:
def min_max_scaling(col):
    """Rescale the values of ``col`` linearly onto the interval [0, 1].

    Args:
        col: Iterable of numbers, e.g. a pandas Series or a list.

    Returns:
        A list with one scaled value per input value, in input order.
        The smallest input maps to 0.0 and the largest to 1.0. An empty
        input yields an empty list; an input whose values are all equal
        (including a single value) yields all zeros.
    """
    values = list(col)
    if not values:
        return []

    low = min(values)
    span = max(values) - low
    if span == 0:
        # Constant input: there is no range to scale by, so map everything
        # to 0.0 instead of dividing by zero.
        return [0.0] * len(values)

    return [(val - low) / span for val in values]

In [None]:
def standard_scaling(col, ddof=None):
    """Standardise ``col`` by subtracting its mean and dividing by its std.

    Args:
        col: Object that is iterable over its values and exposes ``mean()``
            and ``std()``, such as a pandas Series.
        ddof: Optional delta degrees of freedom forwarded to ``col.std``.
            ``None`` (default) calls ``col.std()`` unchanged, i.e. with the
            container's own default (the sample standard deviation for a
            pandas Series). Pass ``0`` for the population standard
            deviation, which is what scikit-learn's StandardScaler uses.

    Returns:
        A list with ``(value - mean) / std`` for every value, in input
        order. If the standard deviation is zero or undefined (constant or
        single-value input) the values are only centred, giving zeros
        instead of NaN.
    """
    mean = col.mean()
    std = col.std() if ddof is None else col.std(ddof=ddof)
    # `std > 0` is False for both 0 and NaN; fall back to a neutral divisor
    # so a column without spread does not turn into NaN.
    scale = std if std > 0 else 1.0
    return [(val - mean) / scale for val in col]

### Comparison of different Scalings of euclidean distances

In [None]:
# Side-by-side view of the first column of the Euclidean distance matrix:
# raw values next to their manually min-max scaled and standardised versions.
eucl_song.iloc[:, 0].to_frame(name="original").assign(
    min_max_scaling=min_max_scaling(eucl_song.iloc[:, 0]),
    standard_scaling=standard_scaling(eucl_song.iloc[:, 0]),
)

### Comparison of different Scalings of Manhattan distances

In [None]:
# Side-by-side view of the first column of the Manhattan distance matrix:
# raw values next to their manually min-max scaled and standardised versions.
manh_song.iloc[:, 0].to_frame(name="original").assign(
    min_max_scaling=min_max_scaling(manh_song.iloc[:, 0]),
    standard_scaling=standard_scaling(manh_song.iloc[:, 0]),
)

## Normalisation of all columns of the Euclidean distance DataFrame (min_max_scaling)

In [None]:
# Apply the manual min-max scaling to every column of the Euclidean distance
# matrix. apply() already builds a new frame, so no extra copy is taken.
# NOTE(review): this scales the distance matrix, not the feature table in
# songs_data_clean_df; confirm that is intended.
songs_norm_mima_df = eucl_song.apply(func=min_max_scaling, axis="index")

In [None]:
# Show the min-max scaled distance matrix.
songs_norm_mima_df

### Normalised Heatmap (min_max_scaling)

In [None]:
# Open a 12x8 inch figure and draw the manually min-max scaled matrix.
# The trailing semicolon keeps the notebook from echoing the return value.
plt.figure(figsize=(12, 8))
sns.heatmap(data=songs_norm_mima_df);

## Normalisation of all columns of the Euclidean distance DataFrame (standard_scaling)

In [None]:
# Apply the manual standard scaling to every column of the Euclidean distance
# matrix. apply() already builds a new frame, so no extra copy is taken.
# NOTE(review): this scales the distance matrix, not the feature table in
# songs_data_clean_df; confirm that is intended.
songs_norm_stand_df = eucl_song.apply(func=standard_scaling, axis="index")

In [None]:
# Show the standard-scaled distance matrix.
songs_norm_stand_df

### Normalised Heatmap (standard_scaling)

In [None]:
# Open a 12x8 inch figure and draw the manually standard-scaled matrix.
# The trailing semicolon keeps the notebook from echoing the return value.
plt.figure(figsize=(12, 8))
sns.heatmap(data=songs_norm_stand_df);

# SciKit Learn Normalisation (Scalers)

### MinMaxScaler

In [None]:
# Min-max scale every column of the Euclidean distance matrix with
# scikit-learn, then wrap the result in a DataFrame that carries the same
# row and column labels as the input.
song_sk_pre_mima = MinMaxScaler().fit(eucl_song).transform(eucl_song)
song_sk_pre_mima_df = pd.DataFrame(
    data=song_sk_pre_mima,
    index=eucl_song.index,
    columns=eucl_song.columns,
)

In [None]:
# Show the MinMaxScaler result.
song_sk_pre_mima_df

###  RobustScaler

In [None]:
# Scale every column of the Euclidean distance matrix with scikit-learn's
# RobustScaler, then wrap the result in a DataFrame that carries the same
# row and column labels as the input.
song_sk_pre_robu = RobustScaler().fit(eucl_song).transform(eucl_song)
song_sk_pre_robu_df = pd.DataFrame(
    data=song_sk_pre_robu,
    index=eucl_song.index,
    columns=eucl_song.columns,
)

In [None]:
# Show the RobustScaler result.
song_sk_pre_robu_df

### StandardScaler

In [None]:
# Standardise every column of the Euclidean distance matrix with
# scikit-learn, then wrap the result in a DataFrame that carries the same
# row and column labels as the input.
song_sk_pre_stand = StandardScaler().fit(eucl_song).transform(eucl_song)
song_sk_pre_stand_df = pd.DataFrame(
    data=song_sk_pre_stand,
    index=eucl_song.index,
    columns=eucl_song.columns,
)

In [None]:
# Show the StandardScaler result.
song_sk_pre_stand_df

### QuantileTransformer

In [None]:
# Quantile-transform every column of the Euclidean distance matrix.
# n_quantiles is capped at the number of rows: the library default of 1000
# exceeds the sample count of the 10-song dataset, in which case scikit-learn
# emits a warning and clips the value to the sample count anyway. Capping it
# here gives the same result without the warning.
song_sk_pre_quat = QuantileTransformer(
    n_quantiles=min(1000, len(eucl_song))
).fit_transform(eucl_song)
# Wrap the result in a DataFrame with the same row and column labels.
song_sk_pre_quat_df = pd.DataFrame(
    song_sk_pre_quat,
    index=eucl_song.index,
    columns=eucl_song.columns,
)

# Heatmaps from SciKit-Learn Normalisation

In [None]:
# Open a 12x8 inch figure and draw the MinMaxScaler result.
# The trailing semicolon keeps the notebook from echoing the return value.
plt.figure(figsize=(12, 8))
sns.heatmap(data=song_sk_pre_mima_df);

In [None]:
# Open a 12x8 inch figure and draw the RobustScaler result.
# The trailing semicolon keeps the notebook from echoing the return value.
plt.figure(figsize=(12, 8))
sns.heatmap(data=song_sk_pre_robu_df);

In [None]:
# Open a 12x8 inch figure and draw the StandardScaler result.
# The trailing semicolon keeps the notebook from echoing the return value.
plt.figure(figsize=(12, 8))
sns.heatmap(data=song_sk_pre_stand_df);

In [None]:
# Open a 12x8 inch figure and draw the QuantileTransformer result.
# The trailing semicolon keeps the notebook from echoing the return value.
plt.figure(figsize=(12, 8))
sns.heatmap(data=song_sk_pre_quat_df);