# Second visualization
## Which song has higher chance of bigger hit based on the weeks on rank (artist)

In [1]:
# import necessary library
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
import statsmodels.api as sm
from random import gauss
from mpl_toolkits.mplot3d import Axes3D
from scipy import stats as stats

from sklearn import metrics
from sklearn.metrics import precision_score, recall_score
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, RandomizedSearchCV, StratifiedKFold
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.datasets import make_regression

from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.decomposition import PCA
from sklearn.svm import SVC
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer

%matplotlib inline
plt.style.use('seaborn')

In [2]:
# open csv file that will be used
df_00 = pd.read_csv('data/df_00.csv')
df_10 = pd.read_csv('data/df_10.csv')

In [3]:
df_00.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50650 entries, 0 to 50649
Data columns (total 25 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   track             50650 non-null  object 
 1   artist            50650 non-null  object 
 2   danceability      50650 non-null  float64
 3   energy            50650 non-null  float64
 4   key               50650 non-null  int64  
 5   loudness          50650 non-null  float64
 6   mode              50650 non-null  int64  
 7   speechiness       50650 non-null  float64
 8   acousticness      50650 non-null  float64
 9   instrumentalness  50650 non-null  float64
 10  liveness          50650 non-null  float64
 11  valence           50650 non-null  float64
 12  tempo             50650 non-null  float64
 13  duration_ms       50650 non-null  int64  
 14  time_signature    50650 non-null  int64  
 15  chorus_hit        50650 non-null  float64
 16  sections          50650 non-null  int64 

Create pipe line

In [4]:
X0, y0 = df_00.select_dtypes(exclude=['object']), df_00['Weeks.on.chart']

In [5]:
# train and test split
X0_train, X0_test, y0_train, y0_test = train_test_split(X0, y0, test_size=0.2, 
                                                    random_state=27)

In [6]:
pipeline = Pipeline([ ('imputer', SimpleImputer(strategy = 'median')), 
                     ('std_scaler', StandardScaler()), 
                     ('dt_clf', DecisionTreeClassifier(random_state = 42))])

# Train the pipeline (tranformations & predictor)
# fit -> fit with all ml and transform data, but final estimator will not create prediction
pipeline.fit(X0_train, y0_train)

# Predict using the pipeline (includes the transfomers & trained predictor)
predicted = pipeline.predict(X0_test)

# find best estimator score
score = pipeline.score(X0_test, y0_test)

print(f"""
predicted: {predicted}
score: {score}
""")


predicted: [13 10  4 ...  5 14  9]
score: 0.9998025666337611



the score is very high using decision tree classifier

create pipeline with grid search <br>
used lecture: model_tuning_and_pipelines as reference

In [7]:
pipeline1 = Pipeline([ ('imputer', SimpleImputer(strategy = 'median')), 
                     ('std_scaler', StandardScaler()), 
                     ('knn', KNeighborsClassifier())
                    ])

# Train the pipeline (tranformations & predictor)
# fit -> fit with all ml and transform data, but final estimator will not create prediction
pipeline1.fit(X0_train, y0_train)

# Predict using the pipeline (includes the transfomers & trained predictor)
predicted = pipeline1.predict(X0_test)

# find best estimator score
score = pipeline1.score(X0_test, y0_test)

print(f"""
predicted: {predicted}
score: {score}
""")


predicted: [10  6  3 ...  3 10  6]
score: 0.0613030602171767



In [8]:
pipe_grid = {'knn__n_neighbors' : [1,7, 9], 'knn__p': [2,3,4]}

# estimator is not KN_neighbors, 
gs_pipe = GridSearchCV(estimator = pipeline1 , param_grid = pipe_grid, cv = 5)

In [9]:
gs_pipe.fit(X0_train, y0_train)



GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('imputer',
                                        SimpleImputer(strategy='median')),
                                       ('std_scaler', StandardScaler()),
                                       ('knn', KNeighborsClassifier())]),
             param_grid={'knn__n_neighbors': [1, 7, 9], 'knn__p': [2, 3, 4]})

score is lower with using k-fold

In [10]:
pipe_svc = make_pipeline(SimpleImputer(),StandardScaler(),PCA(n_components=2),SVC(random_state=1))

param_range = [0.001,0.01,0.1,1,10,100,1000]

param_grid = {'svc__C': [0.001,0.01,0.1,1,10,100,1000], 'svc__kernel': ['linear', 'rbf'],
              'svc__gamma': [0.001,0.01,0.1,1,10,100,1000]}

cv = StratifiedKFold(n_splits=5)

gs = GridSearchCV(estimator=pipe_svc,param_grid=param_grid, scoring='accuracy', cv = cv,
                  return_train_score=True)

In [None]:
gs.fit(X0_train, y0_train)



In [None]:
# print all results from gridsearch

print("Best Estimator: \n{}\n".format(gs.best_estimator_))
print("Best Parameters: \n{}\n".format(gs.best_params_))
print("Best Test Score: \n{}\n".format(gs.best_score_))
print("Best Training Score: \n{}\n".format(gs.cv_results_['mean_train_score'][gs.best_index_]))
print("All Training Scores: \n{}\n".format(gs.cv_results_['mean_train_score']))
print("All Test Scores: \n{}\n".format(gs.cv_results_['mean_test_score']))

In [8]:
df_00_tmp = df_00.drop(columns = ['Date', 'Features', 'Weeks.on.chart'])
y = df_00['Weeks.on.chart']

X_train, X_test, y_train, y_test = train_test_split(
    df_00_tmp, y, test_size=0.3, random_state=42)

# # clean train set
# X_train_nums = X_train.select_dtypes('float64')
# X_train_cat = X_train.select_dtypes('object')

# # clean test set
# X_test_nums = X_test.select_dtypes('float64')
# X_test_cat = X_test.select_dtypes('object')

In [9]:
# split between numerical vs categorical

X_train_nums = X_train.select_dtypes('float64')

ss = StandardScaler()

ss.fit(X_train_nums)
nums_df = pd.DataFrame(ss.transform(X_train_nums),
                      index=X_train_nums.index)

In [10]:
X_train_cat = X_train.select_dtypes('object')

ohe = OneHotEncoder(
    drop='first',
    sparse=False)

dums = ohe.fit_transform(X_train_cat)
dums_df = pd.DataFrame(dums,
                       columns=ohe.get_feature_names(),
                       index=X_train_cat.index)

In [20]:
numerical_pipeline = Pipeline(steps = [('ss', StandardScaler())])

categorical_pipeline = Pipeline(steps = [
                                            ('ohe', OneHotEncoder(drop = 'first', sparse = False))
                                        ])

transformColumns = ColumnTransformer(transformers = [
    ('numerical', numerical_pipeline, X_train_nums.columns), 
    ('categorical', categorical_pipeline, X_train_cat.columns)
])

In [21]:
model_pipe = Pipeline(steps = [
    ('transformColumns', transformColumns),
    ('knn', KNeighborsClassifier())
])

In [22]:
model_pipe.fit(X_train, y_train)
# takes some times to run

Pipeline(steps=[('transformColumns',
                 ColumnTransformer(transformers=[('numerical',
                                                  Pipeline(steps=[('ss',
                                                                   StandardScaler())]),
                                                  Index(['danceability', 'energy', 'loudness', 'speechiness', 'acousticness',
       'instrumentalness', 'liveness', 'valence', 'tempo', 'chorus_hit'],
      dtype='object')),
                                                 ('categorical',
                                                  Pipeline(steps=[('ohe',
                                                                   OneHotEncoder(handle_unknown='ignore'))]),
                                                  Index(['track', 'artist', 'Week', 'Genre'], dtype='object'))])),
                ('knn', KNeighborsClassifier())])

In [23]:
model_pipe.score(X_test, y_test)

0.006633761105626851

In [29]:
pipe_grid = {'knn__n_neighbors' : [1,7, 9], 'knn__p': [2,3,4]}

# estimator is not KN_neighbors, 
gs_pipe = GridSearchCV(estimator = model_pipe , param_grid = pipe_grid, cv = 5)

In [30]:
gs_pipe.fit(X_train, y_train)

Traceback (most recent call last):
  File "C:\Users\saryun\Anaconda3\envs\learn-env\lib\site-packages\sklearn\model_selection\_validation.py", line 531, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\saryun\Anaconda3\envs\learn-env\lib\site-packages\sklearn\pipeline.py", line 335, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "C:\Users\saryun\Anaconda3\envs\learn-env\lib\site-packages\sklearn\neighbors\_base.py", line 1157, in fit
    return self._fit(X)
  File "C:\Users\saryun\Anaconda3\envs\learn-env\lib\site-packages\sklearn\neighbors\_base.py", line 419, in _fit
    raise ValueError("Metric '%s' not valid for sparse input. "
ValueError: Metric 'minkowski' not valid for sparse input. Use sorted(sklearn.neighbors.VALID_METRICS_SPARSE['brute']) to get valid options. Metric can also be a callable function.

Traceback (most recent call last):
  File "C:\Users\saryun\Anaconda3\envs\learn-env\lib\site-packages\sklearn\mode

Traceback (most recent call last):
  File "C:\Users\saryun\Anaconda3\envs\learn-env\lib\site-packages\sklearn\model_selection\_validation.py", line 531, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\saryun\Anaconda3\envs\learn-env\lib\site-packages\sklearn\pipeline.py", line 335, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "C:\Users\saryun\Anaconda3\envs\learn-env\lib\site-packages\sklearn\neighbors\_base.py", line 1157, in fit
    return self._fit(X)
  File "C:\Users\saryun\Anaconda3\envs\learn-env\lib\site-packages\sklearn\neighbors\_base.py", line 419, in _fit
    raise ValueError("Metric '%s' not valid for sparse input. "
ValueError: Metric 'minkowski' not valid for sparse input. Use sorted(sklearn.neighbors.VALID_METRICS_SPARSE['brute']) to get valid options. Metric can also be a callable function.

Traceback (most recent call last):
  File "C:\Users\saryun\Anaconda3\envs\learn-env\lib\site-packages\sklearn\mode

Traceback (most recent call last):
  File "C:\Users\saryun\Anaconda3\envs\learn-env\lib\site-packages\sklearn\model_selection\_validation.py", line 531, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\saryun\Anaconda3\envs\learn-env\lib\site-packages\sklearn\pipeline.py", line 335, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "C:\Users\saryun\Anaconda3\envs\learn-env\lib\site-packages\sklearn\neighbors\_base.py", line 1157, in fit
    return self._fit(X)
  File "C:\Users\saryun\Anaconda3\envs\learn-env\lib\site-packages\sklearn\neighbors\_base.py", line 419, in _fit
    raise ValueError("Metric '%s' not valid for sparse input. "
ValueError: Metric 'minkowski' not valid for sparse input. Use sorted(sklearn.neighbors.VALID_METRICS_SPARSE['brute']) to get valid options. Metric can also be a callable function.

Traceback (most recent call last):
  File "C:\Users\saryun\Anaconda3\envs\learn-env\lib\site-packages\sklearn\mode

Traceback (most recent call last):
  File "C:\Users\saryun\Anaconda3\envs\learn-env\lib\site-packages\sklearn\model_selection\_validation.py", line 531, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\saryun\Anaconda3\envs\learn-env\lib\site-packages\sklearn\pipeline.py", line 335, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "C:\Users\saryun\Anaconda3\envs\learn-env\lib\site-packages\sklearn\neighbors\_base.py", line 1157, in fit
    return self._fit(X)
  File "C:\Users\saryun\Anaconda3\envs\learn-env\lib\site-packages\sklearn\neighbors\_base.py", line 419, in _fit
    raise ValueError("Metric '%s' not valid for sparse input. "
ValueError: Metric 'minkowski' not valid for sparse input. Use sorted(sklearn.neighbors.VALID_METRICS_SPARSE['brute']) to get valid options. Metric can also be a callable function.

Traceback (most recent call last):
  File "C:\Users\saryun\Anaconda3\envs\learn-env\lib\site-packages\sklearn\mode

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('transformColumns',
                                        ColumnTransformer(transformers=[('numerical',
                                                                         Pipeline(steps=[('ss',
                                                                                          StandardScaler())]),
                                                                         Index(['danceability', 'energy', 'loudness', 'speechiness', 'acousticness',
       'instrumentalness', 'liveness', 'valence', 'tempo', 'chorus_hit'],
      dtype='object')),
                                                                        ('categorical',
                                                                         Pipeline(steps=[('ohe',
                                                                                          OneHotEncoder(handle_unknown='ignore'))]),
                                                               

In [32]:
pd.DataFrame(gs_pipe.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_knn__n_neighbors,param_knn__p,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.091212,0.036696,4.769631,0.244317,1,2,"{'knn__n_neighbors': 1, 'knn__p': 2}",0.001777,0.001974,0.001579,0.001579,0.001579,0.001698,0.000158,3
1,0.061709,0.014401,0.0,0.0,1,3,"{'knn__n_neighbors': 1, 'knn__p': 3}",,,,,,,,4
2,0.069898,0.015607,0.0,0.0,1,4,"{'knn__n_neighbors': 1, 'knn__p': 4}",,,,,,,,5
3,0.0763,0.014681,5.92432,0.261726,7,2,"{'knn__n_neighbors': 7, 'knn__p': 2}",0.012833,0.013228,0.014413,0.009477,0.009082,0.011807,0.002132,2
4,0.056616,0.008338,0.0,0.0,7,3,"{'knn__n_neighbors': 7, 'knn__p': 3}",,,,,,,,6
5,0.061278,0.011092,0.0,0.0,7,4,"{'knn__n_neighbors': 7, 'knn__p': 4}",,,,,,,,7
6,0.076394,0.015336,6.044566,0.203976,9,2,"{'knn__n_neighbors': 9, 'knn__p': 2}",0.013031,0.015005,0.011649,0.01382,0.011056,0.012912,0.001432,1
7,0.067492,0.012848,0.0,0.0,9,3,"{'knn__n_neighbors': 9, 'knn__p': 3}",,,,,,,,8
8,0.070128,0.017695,0.0,0.0,9,4,"{'knn__n_neighbors': 9, 'knn__p': 4}",,,,,,,,9


In [33]:
gs_pipe.best_estimator_.score(X_test, y_test)

0.009608423823626193

In [34]:
gs_pipe.best_params_

{'knn__n_neighbors': 9, 'knn__p': 2}

### the score is almost 0, it is not working