In [2]:
import nbimporter
from Cleaning_Dataset import clean_diamonds, standard

Importing Jupyter notebook from Cleaning_Dataset.ipynb


In [31]:
import pandas as pd
from matplotlib import pyplot as plt
from sklearn.base import clone
from sklearn.ensemble import RandomForestRegressor, GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import learning_curve, GridSearchCV
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import StandardScaler

In [18]:
# Load the labelled training data (path is relative to this notebook's folder).
original_train = pd.read_csv("../Data/train.csv")

In [19]:
# Load the test data that the final predictions are made for.
original_test = pd.read_csv("../Data/test.csv")

In [20]:
# Run the cleaning step imported from the Cleaning_Dataset notebook on the training frame.
# NOTE(review): clean_diamonds is defined outside this notebook; judging by the sample
# row shown further down it yields integer-coded COLOR/CUT/CLARITY columns — confirm
# against its source.
original_train_cleaned = clean_diamonds(original_train)

In [21]:
# Pass the test frame through the same cleaning function as the training frame.
original_test_cleaned = clean_diamonds(original_test)

In [22]:
# Peek at one cleaned row; the fixed seed keeps the displayed row stable across re-runs.
original_train_cleaned.sample(1, random_state=123)

Unnamed: 0,carat,depth,table,x,y,z,price,COLOR,CUT,CLARITY
15556,0.73,61.6,56.0,5.79,5.84,3.58,7.954,3,5,6


### Model Building

In [23]:
# Separate the target column from the predictors, then hold out 20% of the rows
# for a final check of the tuned model (fixed seed so the split is repeatable).
# NOTE(review): the sample row above shows an "Unnamed: 0" column that stays in X
# as a feature — it looks like a saved row index; confirm it should be trained on.
y = original_train_cleaned["price"]
X = original_train_cleaned.drop(columns="price")
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=123
)

In [24]:
# Coarse, exhaustive search over the main random-forest hyper-parameters.
# The estimator is seeded so that the cross-validation scores and the selected
# parameters are reproducible on a fresh kernel (an unseeded forest gives
# slightly different results on every run).
model = RandomForestRegressor(random_state=123)
parameter_space = {'n_estimators': [100, 300, 1000],
                   'max_features': ['sqrt', 0.5, None],
                   'max_depth': [None, 10, 30, 100],
                   'min_samples_leaf': [1, 3, 10]}

# 3 * 3 * 4 * 3 = 108 candidates, each fitted on 5 folds (540 fits). This is a
# long-running cell: the recorded run took over two hours on 4 workers.
grid_search = GridSearchCV(model,
                           param_grid=parameter_space,
                           verbose=1,
                           n_jobs=-1,
                           cv=5)

grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 108 candidates, totalling 540 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  5.4min
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed: 28.7min
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed: 116.7min
[Parallel(n_jobs=-1)]: Done 540 out of 540 | elapsed: 136.2min finished


GridSearchCV(cv=5, estimator=RandomForestRegressor(), n_jobs=-1,
             param_grid={'max_depth': [None, 10, 30, 100],
                         'max_features': ['sqrt', 0.5, None],
                         'min_samples_leaf': [1, 3, 10],
                         'n_estimators': [100, 300, 1000]},
             verbose=1)

In [25]:
# Mean cross-validated score of the best parameter combination found by the search.
grid_search.best_score_

0.9916824773434962

In [26]:
# Take the winning estimator from the search and score it on the held-out split,
# which was not passed to grid_search.fit.
best_rf = grid_search.best_estimator_
best_rf.score(X_test, y_test)

0.9914990298584199

In [27]:
# Hyper-parameters of the winning candidate from the coarse search.
grid_search.best_params_

{'max_depth': None,
 'max_features': None,
 'min_samples_leaf': 1,
 'n_estimators': 1000}

In [28]:
# Second pass: keep the tree-level parameters chosen by the first search fixed
# (as single-value lists) and vary only the number of trees.
params = {
    'n_estimators': [1000, 2000, 3000, 4000],
    **{name: [grid_search.best_params_[name]]
       for name in ('max_depth', 'min_samples_leaf', 'max_features')},
}
# Rebinding grid_search replaces the first search; its results are no longer
# reachable after this cell runs.
grid_search = GridSearchCV(
    model,
    param_grid=params,
    cv=5,
    n_jobs=-1,
    verbose=1,
)
grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 4 candidates, totalling 20 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed: 43.5min finished


GridSearchCV(cv=5, estimator=RandomForestRegressor(), n_jobs=-1,
             param_grid={'max_depth': [None], 'max_features': [None],
                         'min_samples_leaf': [1],
                         'n_estimators': [1000, 2000, 3000, 4000]},
             verbose=1)

In [33]:
# Same hold-out check as before, now for the winner of the n_estimators search.
best_rf = grid_search.best_estimator_
best_rf.score(X_test, y_test)

0.9915002524917815

In [29]:
# Hyper-parameters selected by the second (n_estimators-only) search.
grid_search.best_params_

{'max_depth': None,
 'max_features': None,
 'min_samples_leaf': 1,
 'n_estimators': 3000}

Now that we have selected the parameters for our model, let's train it on all of the training data.

In [34]:
# Train the final model on every labelled row. A fresh clone (same hyper-parameters,
# no fitted state) is fitted instead of refitting in place, so that
# grid_search.best_estimator_ keeps its X_train-only fit and scoring it on the
# hold-out split remains meaningful if an earlier cell is re-run.
best_rf = clone(best_rf).fit(X, y)
# Display the fitted estimator as the cell output.
best_rf

RandomForestRegressor(max_features=None, n_estimators=3000)

### Real test

In [39]:
# Predict the target for the cleaned test set with the forest trained on all labelled rows.
# NOTE(review): assumes original_test_cleaned has the same feature columns, in the
# same order, as X — confirm.
# NOTE(review): the cleaned sample row shows price = 7.954, which looks like a
# transformed (possibly log) price; predictions are on that same scale — confirm
# this is what the submission expects.
prediction = best_rf.predict(original_test_cleaned)

In [40]:
# Sanity check: number of predictions produced for the test set.
len(prediction)

13485

In [41]:
# Wrap the predictions in a one-column frame named after the target.
RF_df = pd.DataFrame({"price": prediction})

In [42]:
# Use the row position (the default 0..n-1 index) as the submission id.
RF_df = RF_df.assign(id=RF_df.index)

In [43]:
# Put the id column first, followed by the predicted price.
RF_df = RF_df.loc[:, ["id", "price"]]

In [44]:
# Write the submission file straight to the predictions folder. This replaces the
# previous "write locally, then `!mv`" pair of cells, which depended on a Unix
# shell and left a stray file behind if the move failed.
RF_df.to_csv("../Predictions/RF_df.csv", index=False)