# CoderSchool Final Project Genres
## Music Recommendation System

In [82]:
import pandas as pd
import numpy as np

# Part 1 - Data cleaning

In [83]:
# Load the raw song list; later cells read its 'genres' and 'audio_features'
# columns. The relative path is resolved against the current working directory.
full_df = pd.read_json('MasterSongList.json')

### Clean the genres

We need to remove the list format

In [84]:
# Work on a copy so the raw frame stays untouched, then collapse every
# 'genres' entry into a single string by joining its elements with ''.
full_df2 = full_df.copy()
joined_genres = full_df2['genres'].apply(''.join)
full_df2['genres'] = joined_genres

And only want to keep the first genre

In [85]:
def split_first_genre(genre):
    """Return the part of ``genre`` that precedes its first ':'.

    An empty value is handed back unchanged; a value without any ':' is
    returned whole.
    """
    if len(genre) == 0:
        return genre
    first_part, _sep, _rest = genre.partition(':')
    return first_part

# Keep only the leading genre token of every song.
first_genres = full_df2['genres'].map(split_first_genre)
full_df2['genres'] = first_genres

### Audio Features

We now only want to keep the audio features and the genre, let's create a new dataframe: df

In [86]:
# Column names given, in order, to the values of each 'audio_features' entry.
features_headers = [
    'key',
    'energy',
    'liveliness',
    'tempo',
    'speechiness',
    'acousticness',
    'instrumentalness',
    'time_signature',
    'duration',
    'loudness',
    'valence',
    'danceability',
    'mode',
    'time_signature_confidence',
    'tempo_confidence',
    'key_confidence',
    'mode_confidence',
]

# One row per song, one column per audio feature.
audio_features_column = full_df2['audio_features']
features_list = audio_features_column.tolist()
df = pd.DataFrame(features_list, columns=features_headers)

# The genre column is aligned on the index.
# NOTE(review): assumes full_df2 carries a default 0..n-1 index so labels line
# up with the freshly built frame; confirm.
df['genres'] = full_df2['genres']

### NaN rows

Let's remove the songs with no genres

In [87]:
# Treat an empty genre string as missing, then drop those songs.
# The results are assigned back instead of calling `replace(..., inplace=True)`
# on the column selection: under pandas Copy-on-Write an in-place call on
# `df['genres']` no longer updates `df`, and re-assigning also keeps the cell
# safe to re-run.
df['genres'] = df['genres'].replace('', np.nan)
df = df.dropna(subset=['genres'])

Let's have a look at the NaN rows and their distribution among the genres

In [88]:
def checknan(x):
    """Tell whether ``x`` is NaN by delegating to ``numpy.isnan``.

    Works element-wise when ``x`` is an array.
    """
    nan_flag = np.isnan(x)
    return nan_flag

In [89]:
# Hard-coded list of genre labels before any merging (note that 'hawaiian '
# carries a trailing space). It is not referenced by any later cell shown in
# this notebook.
genres_df = ['bluegrass', 'blues & blues rock', "children's", 'christian', 'classical', 'country', 'dance', "dubstep & drum 'n' bass", 'easy listening', 'electronica', 'film scores', 'folk', 'funk', 'hawaiian ', 'indie', "int'l", 'international/world', 'jazz', 'latin', 'nature sounds', 'oldies', 'pop', 'r&b', 'rap', 'reggae & ska', 'reggaeton', 'rock', 'showtunes', 'singer-songwriter']

The dataset is quite imbalanced. First, let's:
- drop the NaN rows for genres that have at least 1000 non-NaN rows
- replace the NaN values by the genre's median for genres that have fewer than 1000 non-NaN rows
- combine some of the similar genres that have few rows: international & Hawaiian, etc.

In [90]:
# Separate copy used for the genre merging below, so `df` keeps its original labels.
df_bal = df.copy()

Let's group all the international songs

In [91]:
# Relabel every genre containing "hawa" as 'international/world' ...
contains_hawa = df_bal['genres'].str.contains("hawa")
df_bal.loc[contains_hawa, 'genres'] = 'international/world'

# ... and do the same for the abbreviated "int'l" label.
is_intl = df_bal['genres'] == "int'l"
df_bal.loc[is_intl, 'genres'] = 'international/world'

In [92]:
# Showtunes and film scores are merged into a single 'film/show' genre.
genre_labels = df_bal['genres']
film_or_show = (genre_labels == 'showtunes') | (genre_labels == 'film scores')
df_bal.loc[film_or_show, 'genres'] = 'film/show'

Let's differentiate genres that have more/less than 1000 non-NaN rows

In [93]:
# Genre labels after the merges above ('film/show' and 'international/world'
# replace the labels folded into them). The next cell iterates over this list.
new_genres_df = ['bluegrass', 'blues & blues rock', "children's", 'christian', 'classical', 'country', 'dance', "dubstep & drum 'n' bass", 'easy listening', 'electronica', 'film/show', 'folk', 'funk', 'indie', 'international/world', 'jazz', 'latin', 'nature sounds', 'oldies', 'pop', 'r&b', 'rap', 'reggae & ska', 'reggaeton', 'rock', 'singer-songwriter']

In [94]:
# Split the genres by how many of their songs have a non-NaN 'speechiness':
# at least 1000 -> large_genres, otherwise -> small_genres.
large_genres, small_genres = [], []

for genre_name in new_genres_df:
    genre_speechiness = df_bal.loc[df_bal['genres'] == genre_name, 'speechiness']
    is_missing = genre_speechiness.apply(checknan)
    present_count = len(is_missing) - int(is_missing.sum())
    bucket = large_genres if present_count >= 1000 else small_genres
    bucket.append(genre_name)

Let's drop NaN on large genres

In [95]:
# For every large genre keep only the songs without any NaN value.
# The per-genre frames are collected first and concatenated once, instead of
# growing `new_df` with a concat per iteration starting from an empty frame.
# The genre order of `large_genres` is preserved, so the row order is unchanged.
large_frames = [
    df_bal[df_bal['genres'] == genre_name].dropna(axis=0, how='any')
    for genre_name in large_genres
]

# pd.concat refuses an empty list, so fall back to an empty frame in that case.
new_df = pd.concat(large_frames) if large_frames else pd.DataFrame()

Let's replace NaN by median on small genres

In [96]:
# For every small genre fill the NaN values with that genre's per-column median.
# numeric_only=True is required: the frame still holds the string 'genres'
# column, and recent pandas raises TypeError instead of silently skipping it.
small_frames = []
for genre_name in small_genres:
    songs = df_bal[df_bal['genres'] == genre_name]
    small_frames.append(songs.fillna(songs.median(numeric_only=True)))

# Append all small-genre frames to the rows already in `new_df` in one concat.
# NOTE: re-running this cell appends the same rows again.
new_df = pd.concat([new_df] + small_frames)

Now we don't have any NaN value left. However we can see below that the dataframe is not well balanced

# Part 2 - Select data

### Scale features

First let's randomize the data

In [126]:
# Shuffle all rows with a fixed seed and renumber the index from 0.
# NOTE: this rebinds `new_df`, so re-running the cell shuffles the already
# shuffled frame again.
new_df = new_df.sample(frac=1, random_state=101).reset_index(drop=True)

In [127]:
# Target: the genre label; features: every remaining column.
y = new_df['genres']
X = new_df.drop(labels='genres', axis='columns')

Let's scale the features

In [128]:
from sklearn.preprocessing import StandardScaler
# `scaler` is kept at notebook level because format_audio() reuses it later.
scaler = StandardScaler()
# NOTE(review): the scaler is fitted on every row of X, before the train/test
# splits made further down, so the held-out rows take part in the fit.
X_scale = scaler.fit_transform(X)

### Over and undersampling

In order to avoid imbalanced data, we will also try to use a combination of over and under sampling

In [100]:
from imblearn.combine import SMOTEENN

# Combined over- and under-sampling of the scaled features.
smote_enn = SMOTEENN(random_state=0)

# imbalanced-learn renamed `fit_sample` to `fit_resample` and later removed the
# old name; prefer the new method and fall back only on releases lacking it.
if hasattr(smote_enn, 'fit_resample'):
    X_resampled, y_resampled = smote_enn.fit_resample(X_scale, y)
else:
    X_resampled, y_resampled = smote_enn.fit_sample(X_scale, y)

We now also have 2 new data sources: X_resampled and y_resampled on which we could test our model

# Part 3: Try classifiers

For all our classifiers we will use a pipeline (SelectKBest + classifier) as well as GridSearchCV

In [101]:
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest
# `sklearn.grid_search` has been removed from scikit-learn; GridSearchCV lives
# in `sklearn.model_selection`, the module this notebook already relies on for
# train_test_split.
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report

Here are the classifiers we are going to try:
- kNN on X_scale and y
- kNN on X_resampled and y_resampled
- LogReg on X_scale and y
- LogReg on X_resampled and y_resampled

### kNN on X_resampled and y_resampled

In [102]:
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

In [103]:
# Hold out 10% of the resampled data.
X_train, X_test, y_train, y_test = train_test_split(
    X_resampled, y_resampled, test_size=0.1, random_state=101
)

# Pipeline: SelectKBest feature selection followed by a kNN classifier.
selector2 = SelectKBest()
knn2 = KNeighborsClassifier()
steps_knn2 = [('feature_selection', selector2), ('kneighbors', knn2)]
pipeline_knn2 = Pipeline(steps_knn2)

# Grid explored by the search: number of kept features x number of neighbours.
parameters_knn2 = {
    'feature_selection__k': [5, 7, 10, 12],
    'kneighbors__n_neighbors': [3, 5, 7, 10],
}

In [104]:
# Grid search over `parameters_knn2` using the pipeline defined above.
# `grid_knn2` is the fitted object that predict_genre() calls later on.
grid_knn2 = GridSearchCV(pipeline_knn2, param_grid=parameters_knn2)
grid_knn2.fit(X_train, y_train)

GridSearchCV(cv=None, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('feature_selection', SelectKBest(k=10, score_func=<function f_classif at 0x10edcda60>)), ('kneighbors', KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform'))]),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'feature_selection__k': [5, 7, 10, 12], 'kneighbors__n_neighbors': [3, 5, 7, 10]},
       pre_dispatch='2*n_jobs', refit=True, scoring=None, verbose=0)

In [105]:
# Predict the genres of the held-out rows with the fitted grid search.
# The two report lines are left disabled; uncomment them to inspect the
# selected estimator and the per-genre metrics.
#print(grid_knn2.best_estimator_)
predictions_knn2 = grid_knn2.predict(X_test)
#print(classification_report(y_test, predictions_knn2))

We can notice that the resampled data gives way better results. We will keep only this kNN classifier as the first result is too low. However we can note the computation time increased significantly: so we will reduce some of the parameters later on.

NOTE: we should be careful about overfitting with this specific model

### Logistic Regression

We will also use GridSearchCV for LogReg but we will include more parameters as several things might be interesting: class_weight (to balance automatically the data)

from sklearn.linear_model import LogisticRegression

# Split the scaled (not resampled) data, holding out 10%.
# NOTE: this rebinds X_train / X_test / y_train / y_test used by the kNN cells.
X_train, X_test, y_train, y_test = train_test_split(
    X_scale, y, test_size=0.1, random_state=101
)

# Pipeline: SelectKBest followed by a class-weighted logistic regression.
selector_lr1 = SelectKBest()
lr1 = LogisticRegression(max_iter=5000, class_weight='balanced')
steps_lr1 = [('feature_selection', selector_lr1), ('LogReg', lr1)]
pipeline_lr1 = Pipeline(steps_lr1)

# Grid: number of kept features x solver x multi-class strategy.
parameters_lr1 = {
    'feature_selection__k': [5, 8, 12],
    'LogReg__solver': ['newton-cg', 'sag', 'saga', 'lbfgs'],
    'LogReg__multi_class': ['ovr', 'multinomial'],
}

grid_lr1 = GridSearchCV(pipeline_lr1, param_grid=parameters_lr1, verbose=3)
grid_lr1.fit(X_train, y_train)

# Report the selected estimator and its metrics on the held-out rows.
print(grid_lr1.best_estimator_)
predictions_lr1 = grid_lr1.predict(X_test)
print(classification_report(y_test, predictions_lr1))

Let's try the same thing but without the balanced class weights

# Same pipeline as the previous search, but the logistic regression uses its
# default class weights. The X_train / X_test split currently in scope is reused.
selector_lr2 = SelectKBest()
lr2 = LogisticRegression(max_iter=5000)
steps_lr2 = [('feature_selection', selector_lr2), ('LogReg', lr2)]
pipeline_lr2 = Pipeline(steps_lr2)

# Grid: number of kept features x solver x multi-class strategy.
parameters_lr2 = {
    'feature_selection__k': [5, 8, 12],
    'LogReg__solver': ['newton-cg', 'sag', 'saga', 'lbfgs'],
    'LogReg__multi_class': ['ovr', 'multinomial'],
}

grid_lr2 = GridSearchCV(pipeline_lr2, param_grid=parameters_lr2, verbose=3)
grid_lr2.fit(X_train, y_train)

# Report the selected estimator and its metrics on the held-out rows.
print(grid_lr2.best_estimator_)
predictions_lr2 = grid_lr2.predict(X_test)
print(classification_report(y_test, predictions_lr2))

# Function

### Function 1: audio features formatting

In [129]:
def format_audio(audio_features, fitted_scaler=None):
    """Turn one song's audio features into a scaled, single-row 2-D array.

    Parameters
    ----------
    audio_features : array-like
        Feature values of a single song; they are flattened into one row.
        Presumably in the same order as the columns the scaler was fitted
        on (TODO confirm against the caller).
    fitted_scaler : object with a ``transform`` method, optional
        Scaler to apply. When omitted, the notebook-level ``scaler`` is used,
        which is the original behaviour.

    Returns
    -------
    Whatever ``transform`` returns for the ``(1, n_features)`` input array.
    """
    # reshape(1, -1): a single sample must be presented as one row.
    features = np.asarray(audio_features).reshape(1, -1)
    # The global is only looked up when no scaler is passed explicitly.
    active_scaler = scaler if fitted_scaler is None else fitted_scaler
    return active_scaler.transform(features)

### Function 2: go through classifier

In [130]:
def predict_genre(scaled_song):
    """Return the prediction of the notebook-level ``grid_knn2`` for ``scaled_song``.

    ``scaled_song`` is passed to ``grid_knn2.predict`` unchanged.
    """
    return grid_knn2.predict(scaled_song)

### Testing

In [135]:
# print(full_df2['audio_features'][15])
# print(full_df2['genres'][15])

In [136]:
# results = format_audio(full_df2['audio_features'][15])
# print(results)

In [134]:
# predict_genre(results)