In [None]:
import numpy as np
import pandas as pd
from sklearn.metrics import r2_score,make_scorer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.tree import DecisionTreeRegressor
from metrics import partsMetrics,allMetrics

In [2]:
# Load the preprocessed dataset; the path is relative to the notebook's
# working directory.
df = pd.read_csv('processed.csv')
print('Data shape:', df.shape)

Data shape: (467, 7)


In [3]:
# Every column except the last is a feature; the last column is the target.
feature_frame = df.iloc[:, :-1]
target_series = df.iloc[:, -1]
X = feature_frame.values
y = target_series.values
print(X.shape, y.shape, sep='\n')

(467, 6)
(467,)


In [4]:
# Hyper-parameter search space for DecisionTreeRegressor, consumed by
# train_model() through GridSearchCV.
param_grid = dict(
    ccp_alpha=np.logspace(-1, -3, 3),  # 1e-1, 1e-2, 1e-3
    criterion=["squared_error", "friedman_mse", "absolute_error", "poisson"],
    max_depth=[None, 5, 6, 7, 8, 9, 10, 20, 40, 100, 200],
    max_features=[None, 1, 2, 3, 4, 'sqrt', 'log2'],
    min_samples_leaf=range(1, 7),  # 1..6 inclusive
    min_samples_split=[2, 5, 10],
    splitter=['best', 'random'],
)

# Hold out 10% of the rows as a test set (fixed seed for reproducibility).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, random_state=0
)

# Fit the scaler on the training split only, then apply the same
# transformation to both splits so no test-set statistics are used.
sc = StandardScaler().fit(X_train)
X_train, X_test = sc.transform(X_train), sc.transform(X_test)

def train_model(X_train, y_train, grid_params=None, cv=5, n_jobs=2):
    """Grid-search a ``DecisionTreeRegressor`` and return the fitted search.

    Parameters
    ----------
    X_train, y_train
        Training features and targets, passed straight to ``GridSearchCV.fit``.
    grid_params : dict, optional
        Hyper-parameter grid to search. When omitted, the module-level
        ``param_grid`` is used (the original behaviour).
    cv : int, default 5
        Number of cross-validation folds.
    n_jobs : int, default 2
        Number of parallel workers used by the search.

    Returns
    -------
    GridSearchCV
        The fitted search object. Its ``best_params_`` and ``best_score_``
        (R^2 scorer) are also printed.
    """
    if grid_params is None:
        # Fall back to the notebook-level grid so existing calls are unchanged.
        grid_params = param_grid

    reg = DecisionTreeRegressor(random_state=0)
    grid = GridSearchCV(
        reg,
        grid_params,
        cv=cv,
        scoring=make_scorer(r2_score),
        n_jobs=n_jobs,
    )
    grid.fit(X_train, y_train)
    print(grid.best_params_)
    print(grid.best_score_)
    return grid

In [190]:
# Exhaustive search: param_grid spans 33,264 combinations, each fitted on
# 5 folds, so this cell is expensive to re-run.
model = train_model(X_train, y_train)

{'ccp_alpha': 0.001, 'criterion': 'absolute_error', 'max_depth': 20, 'max_features': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'splitter': 'best'}
0.8649856189755049


In [5]:
# Best hyper-parameters, hard-coded as literals (they match the values
# printed by the grid-search cell) so this cell does not need `model`.
best_params = {
    'ccp_alpha': 0.001,
    'criterion': 'absolute_error',
    'max_depth': 20,
    'max_features': None,
    'min_samples_leaf': 1,
    'min_samples_split': 2,
    'splitter': 'best',
}
reg = DecisionTreeRegressor(random_state=0, **best_params)
# NOTE(review): partsMetrics receives the raw DataFrame; how it splits or
# scales the data is defined in metrics.py and is not visible here.
partsMetrics(df, reg)

train rmse: 0.20000487758602553
val rmse: 4.788249319701205
test rmse: 5.6048297941397465

train si: 0.01836001662986378
val si: 0.4002397608339896
test si: 0.5460079598817893

train r2: 0.9996178352865215
val r2: 0.8273664602916038
test r2: 0.7493853625690461

train mape: 1.5059937736284719
val mape: 6.304281613588829
test mape: 149.20595684602034


In [6]:
# NOTE(review): these scores are close to the train metrics reported by
# partsMetrics; presumably allMetrics fits and scores on the full dataset.
# Confirm in metrics.py before treating them as generalisation estimates.
allMetrics(df, reg)

all rmse: 0.2534553736997746
all si: 0.022993920705763858
all r2: 0.9994266519093994
all mape: 2.7028900608241666
