# Training the Dataset  🏃🏻

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
import math
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

In [2]:
train = pd.read_csv('./output/diamond_train_numeric.csv', index_col=0)
train.head()

Unnamed: 0_level_0,carat,cut,color,clarity,depth,table,x,y,z,price
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,0.3,3,1,1,62.8,56.0,4.29,4.31,2.7,605
1,0.34,4,2,2,62.6,55.0,4.46,4.49,2.8,565
2,0.4,3,3,2,60.3,62.0,4.7,4.75,2.85,720
3,0.4,5,4,3,61.8,59.2,4.72,4.74,2.92,793
4,0.9,3,3,2,61.0,63.0,6.1,6.13,3.73,4381


In [3]:
train.shape

(40455, 10)

In [4]:
test = pd.read_csv('./output/diamond_test_numeric.csv', index_col=0)
test.head()

Unnamed: 0_level_0,carat,cut,color,clarity,depth,table,x,y,z
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0,2.36,4,7,4,60.8,54.0,8.68,8.57,5.24
1,2.04,4,4,4,62.0,56.0,8.18,8.23,5.09
2,0.51,4,7,2,61.7,54.0,5.18,5.19,3.2
3,0.3,4,7,2,61.3,56.0,4.32,4.33,2.65
4,0.96,1,4,1,68.8,56.0,6.11,5.98,4.16


In [5]:
test.shape

(13485, 9)

--------

---------

## Checking out the RMSE of each model

In [6]:
X = train.drop(columns=["price"])
y = train["price"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [7]:
models = {
    "forest200": RandomForestRegressor(n_estimators=200),
    "forest400": RandomForestRegressor(n_estimators=400),
    "boosting_450": GradientBoostingRegressor(n_estimators=450),
    "boosting_750": GradientBoostingRegressor(n_estimators=750),
    "boosting_950": GradientBoostingRegressor(n_estimators=950)
}

In [9]:
for name, model  in models.items():
    print(f"Starting training")
    print(f"Training {name}...")
    model.fit(X_train, y_train)
    print(f"Training complete")

Starting training
Training forest200...
Train complete
Starting training
Training forest400...
Train complete
Starting training
Training boosting_450...
Train complete
Starting training
Training boosting_750...
Train complete
Starting training
Training boosting_950...
Train complete


In [10]:
printMetric = lambda label,value:print(f"\t {label}: {round(value,4)}")

for name,m in models.items():
    y_pred = m.predict(X_test)
    print(f"Evaluating model {name}")
    printMetric("RMSE",math.sqrt(mean_squared_error(y_test, y_pred)))

Evaluating model forest200
	 RMSE: 569.3948
Evaluating model forest400
	 RMSE: 571.5887
Evaluating model boosting_450
	 RMSE: 627.7155
Evaluating model boosting_750
	 RMSE: 603.3193
Evaluating model boosting_950
	 RMSE: 594.4404


-------

-------

## Hyperparameter optimization with GridSearchCV

#### RandomForest Hiperparameters

In [14]:
parameters = {'n_estimators': [200, 600],  
 'max_features': ['auto', 'sqrt']}

In [None]:
hyperGrid = GridSearchCV(estimator=RandomForestClassifier(), param_grid=parameters, verbose=1)
hyperFit = hyperGrid.fit(X_train, y_train)

Fitting 5 folds for each of 4 candidates, totalling 20 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


In [1]:
print(hyperFit.best_params_)

NameError: name 'hyperFit' is not defined

-------

-----

### Pred N1 w/ GradientBoostingRegressor950

In [13]:
X = train.drop(columns=["price"])
y = train["price"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [14]:
model = GradientBoostingRegressor(n_estimators=950)
fitted_model = model.fit(X, y)

In [15]:
y_pred = fitted_model.predict(X_test)

In [16]:
pred = pd.DataFrame(y_pred).reset_index()
pred = pred.rename(columns={"index": "id", 0: "price"})
pred.head()

Unnamed: 0,id,price
0,0,6541.127269
1,1,4454.568225
2,2,792.163608
3,3,475.962968
4,4,5231.745767


In [17]:
pred.shape

(8091, 2)

In [19]:
y_pred_final = fitted_model.predict(test)
y_pred_final

array([14950.78916486, 15958.48030685,  1182.00126183, ...,
        2351.993423  ,  1073.8631075 ,  2539.38868582])

In [22]:
pred = pd.DataFrame(y_pred_final).reset_index()
pred = pred.rename(columns={"index": "id", 0: "price"})
pred.head()

Unnamed: 0,id,price
0,0,14950.789165
1,1,15958.480307
2,2,1182.001262
3,3,426.088792
4,4,3480.987319


In [31]:
pred.shape

(13485, 2)

In [23]:
pred.to_csv('./output/pred1.csv', header=True, index=False)

--------

### Pred N2 w/ RandomForestRegressor200

models = {
    "linealReg": LinearRegression(),
    "forest100": RandomForestRegressor(n_estimators=100),
    "forest200": RandomForestRegressor(n_estimators=200),
    "tree": DecisionTreeRegressor(random_state=0),
    "neigbor":KNeighborsRegressor(),
    "boosting": GradientBoostingRegressor(n_estimators=500),
}