In [1]:
import pandas as pd 
import numpy as np
#import math

In [2]:
X_train = pd.read_csv('X_train.csv')
X_valid = pd.read_csv('X_valid.csv')
y_train = pd.read_csv('y_train.csv')
y_valid = pd.read_csv('y_valid.csv')

In [3]:
#concatenate training + validation sets 
X = pd.concat([X_train, X_valid])
y = pd.concat([y_train, y_valid])

In [4]:
X = X.drop(['Unnamed: 0', 'Unnamed: 0.1'], axis = 1)
y = y.drop('Unnamed: 0', axis = 1)

In [5]:
X.head()

Unnamed: 0,item_type,utc_date,country_code,item_price,currency,numTracks,dateModified,datePublished,albumRelease,Genre,inAlbum,tags
0,a,1599956000.0,au,2.184,AUD,6.0,1305590000.0,1305518000.0,1,rock,is,other
1,t,1600334000.0,nz,1.3,GBP,1.0,1574759000.0,1574760000.0,1,electronic,in,house
2,a,1600598000.0,de,0.0,USD,3.0,1376506000.0,1359491000.0,1,electronic,is,80s
3,a,1601296000.0,nz,0.0,USD,25.0,1606288000.0,1567251000.0,1,experimental,is,disco
4,a,1601310000.0,fr,0.0,USD,6.0,1586739000.0,1413672000.0,1,metal,is,other


In [6]:
import datetime

In [7]:
utc_datetime = [datetime.datetime.utcfromtimestamp(x) for x in X['utc_date']]
day_month_year = [y.strftime("%a/%d") for y in utc_datetime]

In [8]:
#Bandcamp friday: first friday of the week
fridays = ['Fri/01', 'Fri/02', 'Fri/03', 'Fri/04', 'Fri/05', 'Fri/06', 'Fri/07']

In [9]:
BC_fridays = [1 if day in fridays else 0 for day in day_month_year]

In [10]:
X['BC_fridays'] = BC_fridays
X = X.drop(['utc_date', 'dateModified', 'datePublished'], axis = 1)

In [11]:
X.columns

Index(['item_type', 'country_code', 'item_price', 'currency', 'numTracks',
       'albumRelease', 'Genre', 'inAlbum', 'tags', 'BC_fridays'],
      dtype='object')

In [None]:
num_cols = ['item_price', 'numTracks', 'albumRelease']

In [13]:
for c in num_cols:
    X[c] = (X[c] - np.mean(X[c])) / np.std(X[c]) 

In [14]:
X.head()

Unnamed: 0,item_type,country_code,item_price,currency,numTracks,albumRelease,Genre,inAlbum,tags,BC_fridays
0,a,au,-0.486443,AUD,-0.02968,-0.37912,rock,is,other,0
1,t,nz,-0.725651,GBP,-0.515869,-0.37912,electronic,in,house,0
2,a,de,-1.077429,USD,-0.321394,-0.37912,electronic,is,80s,0
3,a,nz,-1.077429,USD,1.817836,-0.37912,experimental,is,disco,0
4,a,fr,-1.077429,USD,-0.02968,-0.37912,metal,is,other,0


In [15]:
X_d = pd.get_dummies(X)

In [16]:
X_d.shape, y.shape

((483876, 258), (483876, 1))

In [17]:
y = y['amount_paid_usd']
y = (y - np.mean(y))/ np.std(y)

#### random forest

In [18]:
from sklearn.ensemble import RandomForestRegressor

In [19]:
from sklearn.model_selection import cross_val_score, GridSearchCV

In [20]:
rfr = RandomForestRegressor(random_state = 42)

In [27]:
parameters = {'n_estimators' : np.array([25, 50, 100, 200, 500]),
              'min_samples_split': np.array([1, 5, 10]),
              'max_features':np.array([0.3, 0.5, 0.8, 1])}

In [28]:
hyp_search = GridSearchCV(estimator = rfr, 
                          param_grid = parameters, 
                          cv = 5, 
                         verbose = 10,
                         scoring = 'neg_root_mean_squared_error')

In [None]:
%%time
hyp_search.fit(X_d, y)

Fitting 5 folds for each of 60 candidates, totalling 300 fits
[CV 1/5; 1/60] START max_features=0.3, min_samples_split=1, n_estimators=25.....
[CV 1/5; 1/60] END max_features=0.3, min_samples_split=1, n_estimators=25;, score=-0.538 total time= 1.8min
[CV 2/5; 1/60] START max_features=0.3, min_samples_split=1, n_estimators=25.....
[CV 2/5; 1/60] END max_features=0.3, min_samples_split=1, n_estimators=25;, score=-0.538 total time= 1.9min
[CV 3/5; 1/60] START max_features=0.3, min_samples_split=1, n_estimators=25.....
[CV 3/5; 1/60] END max_features=0.3, min_samples_split=1, n_estimators=25;, score=-0.535 total time= 1.8min
[CV 4/5; 1/60] START max_features=0.3, min_samples_split=1, n_estimators=25.....
[CV 4/5; 1/60] END max_features=0.3, min_samples_split=1, n_estimators=25;, score=-0.539 total time= 1.8min
[CV 5/5; 1/60] START max_features=0.3, min_samples_split=1, n_estimators=25.....
[CV 5/5; 1/60] END max_features=0.3, min_samples_split=1, n_estimators=25;, score=-0.539 total time= 

In [None]:
hyp_search.best_params_

hyp_search.best_score_

In [None]:
hyp_search.best_estimator_

In [None]:
hyp_search.best_index_

In [None]:
hyp_search.scorer_(squared = True)