# Dependencies

In [122]:
import pandas as pd
import category_encoders as ce

from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.decomposition import PCA
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.preprocessing import StandardScaler

# Import Features Dataframe

In [123]:
# Load the audio-feature table. skiprows=[1, 2, 3] keeps file line 0 as the header
# and drops the next three lines (presumably extra header levels -- TODO confirm).
# low_memory=False makes pandas parse the file in one pass instead of in chunks.
feature_df = pd.read_csv('features.csv', skiprows=[1,2,3], low_memory=False)

In [124]:
# Name the identifier column 'track_id' so it can serve as the merge key later on.
feature_df = feature_df.rename({'feature': 'track_id'}, axis='columns')

# Import Core Dataframe

In [125]:
# Load the track metadata: skip the first file line, use the first column as the
# index, and parse with the Python engine.
# `error_bad_lines` was deprecated in pandas 1.3 and removed in pandas 2.0;
# `on_bad_lines='warn'` is its replacement and keeps the same behaviour (malformed
# lines are skipped and each one is reported).
# NOTE(review): the recorded output reports a skipped line ("unexpected end of data"),
# so the CSV on disk may be truncated -- confirm the file is complete.
core_df = pd.read_csv('tracks.csv', skiprows=1, index_col=0, engine='python', on_bad_lines='warn')

Skipping line 53396: unexpected end of data


# Subset, Clean, and Merge DataFrames

In [126]:
# Keep the metadata columns used downstream plus the target, then discard the row
# whose index label is 'track_id' (presumably a leftover header line -- TODO confirm).
tracks = (
    core_df
    .loc[:, ['producer', 'location', 'composer', 'type', 'id', 'genre_top']]
    .drop(index='track_id')
)

In [127]:
# Move the index into a regular column and call that column 'track_id'.
tracks = tracks.reset_index().rename(columns={'index': 'track_id'})

In [128]:
# Cast the merge key to int so it compares equal to the key in the feature table.
tracks = tracks.astype({'track_id': int})

In [129]:
# Rows without a target label cannot be used for training or evaluation.
tracks = tracks[tracks['genre_top'].notna()]

In [130]:
# Left join: every labelled track is kept; feature columns are NaN where no
# matching track_id exists in feature_df.
merged = tracks.merge(feature_df, how='left', on='track_id')

In [131]:
# Remove the two identifier columns before modelling.
merged = merged.drop(columns=['track_id', 'id'])

# Test/Train Split

In [132]:
# Separate the target from the predictors.
y = merged['genre_top']
X = merged.drop(columns=['genre_top'])

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)
# Show the shapes of the four pieces as a quick sanity check.
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

((22740, 522), (5686, 522), (22740,), (5686,))

# Reduce the number of categories in y_train and y_test

In [133]:
# Display how many training rows fall under each genre label.
y_train.value_counts()

Rock                   7180
Experimental           4684
Electronic             4106
Hip-Hop                1465
Folk                   1211
Pop                     934
Classical               783
International           667
Instrumental            501
Jazz                    402
Old-Time / Historic     322
Spoken                  237
Country                  89
Soul-RnB                 81
Blues                    72
Easy Listening            6
Name: genre_top, dtype: int64

In [134]:
def top_genre(track):
    """Collapse a genre label into the reduced label set.

    Returns `track` unchanged when it is one of the kept labels ('Rock',
    'Experimental', 'Electronic', 'Hip-Hop', 'Folk', 'Pop', 'Classical',
    'Other'); every other value is mapped to 'Other'.
    """
    kept_labels = ('Rock', 'Experimental', 'Electronic', 'Hip-Hop',
                   'Folk', 'Pop', 'Classical', 'Other')
    return track if track in kept_labels else 'Other'

In [135]:
# Relabel both target series with the reduced genre set.
y_train, y_test = (labels.apply(top_genre) for labels in (y_train, y_test))

# Build preprocessing pipeline

In [136]:
# Split the column names by position: the first four columns (expected to be the
# metadata columns -- verify against X_train.columns) and everything after them,
# which is fed to the PCA branch.
hash_features = X_train.columns[:4].tolist()
pca_features = X_train.columns[4:].tolist()

In [141]:
# Numeric branch: standardise each column, fill missing values with the column
# mean, then project onto 15 principal components.
pca_processing = make_pipeline(
    StandardScaler(),
    SimpleImputer(strategy='mean'),
    # Seeded so the projection is reproducible when PCA selects its randomized
    # SVD solver (the 'auto' choice depends on data size and scikit-learn version).
    PCA(n_components=15, random_state=42)
)

# Categorical branch for the free-text-like columns: hash each column into 8 outputs.
hash_processing = make_pipeline(
    ce.HashingEncoder(n_components=8)
)

# Categorical branch for 'type': one indicator column per category, named after it.
one_hot_processing = make_pipeline(
    ce.OneHotEncoder(use_cat_names=True)
)

# Route each group of columns to its branch. Columns are selected with one-element
# lists (not bare strings) so every transformer receives a 2-D frame rather than a
# 1-D column. Each hashed column gets its own fitted copy of hash_processing.
preprocess = make_column_transformer(
    (hash_processing, ['producer']),
    (hash_processing, ['location']),
    (hash_processing, ['composer']),
    (one_hot_processing, ['type']),
    (pca_processing, pca_features)
)

# Execute Model

In [142]:
# Full model: column-wise preprocessing followed by a logistic-regression classifier.
classifier = LogisticRegression(solver='lbfgs', max_iter=2000, multi_class='auto')
model = make_pipeline(preprocess, classifier)

In [143]:
# Fit on the training split (fit returns the pipeline itself, so `model` is unchanged),
# then report the pipeline's score on both splits.
model.fit(X_train, y_train)
train_accuracy = model.score(X_train, y_train)
test_accuracy = model.score(X_test, y_test)
print('Train logistic regression score:', train_accuracy)
print('Test logistic regression score:', test_accuracy)

Train logistic regression score: 0.4310905892700088
Test logistic regression score: 0.42279282448118183
