In [22]:
import numpy as np
import pandas as pd
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 100)
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
import math
from numpy.random import RandomState
state = 322

from sklearn.model_selection import cross_val_score, train_test_split

from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder

from sklearn.linear_model import LogisticRegression 

from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import accuracy_score, confusion_matrix, f1_score, roc_curve, roc_auc_score, precision_score, recall_score, mean_absolute_error, mean_squared_error

from sklearn.compose import make_column_selector, make_column_transformer, ColumnTransformer



In [2]:
# Raw string keeps the Windows backslash literal: in an ordinary string '\d' is an
# invalid escape sequence, which newer Python versions warn about.
# NOTE(review): absolute, machine-specific location - change DATA_DIR to run elsewhere.
DATA_DIR = r'C:\data'

# Training table, test table and the sample submission file.
df_train = pd.read_csv(f'{DATA_DIR}/kaggle_music_genre_train.csv')
df_test = pd.read_csv(f'{DATA_DIR}/kaggle_music_genre_test.csv')
sample = pd.read_csv(f'{DATA_DIR}/sample_submit.csv')

In [3]:
# List every column available in the raw training table.
df_train.columns

Index(['instance_id', 'track_name', 'acousticness', 'danceability',
       'duration_ms', 'energy', 'instrumentalness', 'key', 'liveness',
       'loudness', 'mode', 'speechiness', 'tempo', 'obtained_date', 'valence',
       'music_genre'],
      dtype='object')

In [4]:
# Columns removed from both the training and the test feature frames.
cols_to_drop = [
    'duration_ms',
    'obtained_date',
    'track_name',
    'loudness',
    'key',
    'mode',
    'instance_id',
]

In [5]:
# Target: the genre label of every training row.
y = df_train['music_genre']

# Training features: remove the target and the columns in cols_to_drop in one pass.
X = df_train.drop(columns=['music_genre', *cols_to_drop])

# Test features: same column removal applied to the test table.
X_test = df_test.drop(columns=cols_to_drop)

X.head()

Unnamed: 0,acousticness,danceability,energy,instrumentalness,liveness,speechiness,tempo,valence
0,0.48,0.67,0.351,0.0176,0.115,0.0463,101.384,0.45
1,0.243,0.452,0.67,5.1e-05,0.108,0.0352,113.071,0.539
2,0.228,0.454,0.804,0.0,0.181,0.371,80.98,0.344
3,0.0558,0.847,0.873,3e-06,0.325,0.0804,116.007,0.966
4,0.227,0.742,0.575,2e-06,0.176,0.0487,76.494,0.583


In [6]:
# Preview the first few target labels.
y.head()

0        Country
1           Rock
2    Alternative
3        Hip-Hop
4    Alternative
Name: music_genre, dtype: object

In [7]:
# Encoder used to map the string genre labels to integer class ids.
le = LabelEncoder()


In [8]:
# Encode genre names as integer class ids.
# The labels are read from df_train rather than from `y` itself so that re-running
# this cell never re-fits the encoder on already-encoded integers, which would
# replace the genre names stored in le.classes_ with numbers.
y = le.fit_transform(df_train['music_genre'])

In [9]:
# Show the encoded target array.
y

array([4, 9, 0, ..., 0, 8, 5])

In [10]:
# Columns remaining in the feature frame.
X.columns

Index(['acousticness', 'danceability', 'energy', 'instrumentalness',
       'liveness', 'speechiness', 'tempo', 'valence'],
      dtype='object')

In [11]:
# Value substituted for missing tempo readings.
TEMPO_FILL_VALUE = 120

# Impute the training features and the test features identically, so the model is
# never asked to predict on a NaN tempo that it was shielded from during training.
# NOTE(review): assumes df_test carries a 'tempo' column like df_train - confirm.
X['tempo'] = X['tempo'].fillna(TEMPO_FILL_VALUE)
X_test['tempo'] = X_test['tempo'].fillna(TEMPO_FILL_VALUE)

In [12]:
# Names of all numeric feature columns, kept as a pandas Index.
num_columns = X.select_dtypes(include=np.number).columns

In [13]:
# Display the numeric column names collected in the previous cell.
num_columns

Index(['acousticness', 'danceability', 'energy', 'instrumentalness',
       'liveness', 'speechiness', 'tempo', 'valence'],
      dtype='object')

In [14]:
from sklearn.pipeline import Pipeline, make_pipeline

# First pipeline draft: standardise the features, then a random forest.
# NOTE: `pipeline` is reassigned in a later cell before anything is fitted.
pipeline = Pipeline(
    steps=[
        ('scaler', StandardScaler()),
        ('model', RandomForestClassifier()),
    ]
)
pipeline


In [15]:
# Column selectors resolved at fit time from the dtypes of the incoming frame.
numeric_selector = make_column_selector(dtype_include='number')
categorical_selector = make_column_selector(dtype_include='category')

# Scale numeric columns, one-hot encode categorical columns (dropping the first
# level of each), and pass every other column through unchanged.
column_transformer = make_column_transformer(
    (StandardScaler(), numeric_selector),
    (OneHotEncoder(drop='first'), categorical_selector),
    remainder='passthrough',
)
column_transformer

In [16]:
# Final estimator: a random forest with a fixed seed.
model = RandomForestClassifier(random_state=42)
# Chain the column transformer and the classifier; make_pipeline names the steps
# after the lowercased class names ('columntransformer', 'randomforestclassifier').
pipeline = make_pipeline(column_transformer, model)
pipeline

In [17]:
# Numeric feature names again, this time as a plain Python list.
num_columns = list(X.select_dtypes(include=np.number).columns)
num_columns

['acousticness',
 'danceability',
 'energy',
 'instrumentalness',
 'liveness',
 'speechiness',
 'tempo',
 'valence']

In [18]:
# Hold out 25% of the training rows for validation; the seed fixes the shuffle.
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

In [19]:
# Fit the column transformer and the random forest on the training split.
pipeline.fit(X_train, y_train)

In [26]:
# Inspect the pipeline's steps by their auto-generated names.
pipeline.named_steps

{'columntransformer': ColumnTransformer(remainder='passthrough',
                   transformers=[('standardscaler', StandardScaler(),
                                  <sklearn.compose._column_transformer.make_column_selector object at 0x00000211CB960A00>),
                                 ('onehotencoder', OneHotEncoder(drop='first'),
                                  <sklearn.compose._column_transformer.make_column_selector object at 0x00000211CB960DC0>)]),
 'randomforestclassifier': RandomForestClassifier(random_state=42)}