In [2]:
import pandas as pd
import numpy as np

In [4]:
df = pd.read_csv("Data/clean1.csv")
df.head()

Unnamed: 0,id,carat,cut,color,clarity,depth,table,price
0,0,1.14,5,4,9,61.0,56.0,9013
1,1,0.76,5,3,7,62.7,57.0,2692
2,2,0.84,5,4,8,61.4,56.0,4372
3,3,1.55,5,3,8,62.0,57.0,13665
4,4,0.3,5,4,5,61.9,57.0,422


In [6]:
from sklearn.model_selection import train_test_split

In [7]:
y=df["price"]
X=df.drop(columns=["price","id"])
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2)

In [8]:
X.shape

(40455, 6)

In [9]:
y.shape

(40455,)

In [13]:
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error


In [14]:
models={
    "Linear" : LinearRegression(),
    "Dec Tree" : DecisionTreeRegressor(),
    "KNeighb" : KNeighborsRegressor(),
    "Grad" : GradientBoostingRegressor(),
    "Random Forest" : RandomForestRegressor()
}

In [15]:
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print(f"------{name}------")
    print('RMSE : ', np.sqrt(mean_squared_error(y_test, y_pred)))
    print("_")

------Linear------
RMSE :  1255.545434743393
_
------Dec Tree------
RMSE :  712.3997269616142
_
------KNeighb------
RMSE :  1930.56842562406
_
------Grad------
RMSE :  608.8667594139549
_
------Random Forest------
RMSE :  547.297739118158
_


In [17]:
for i in range (1,15):
    tree = DecisionTreeRegressor(max_depth=i)
    tree.fit(X_train,y_train)
    y_pred = tree.predict(X_test)
    print(f"Max Depth = {i}  -->  RMSE: ", np.sqrt(mean_squared_error(y_test,y_pred)))

Max Depth = 1  -->  RMSE:  2486.4103301309638
Max Depth = 2  -->  RMSE:  1671.1426815066864
Max Depth = 3  -->  RMSE:  1356.8185339228803
Max Depth = 4  -->  RMSE:  1160.658175869778
Max Depth = 5  -->  RMSE:  992.1395677973164
Max Depth = 6  -->  RMSE:  853.8736702550822
Max Depth = 7  -->  RMSE:  759.3035752391564
Max Depth = 8  -->  RMSE:  695.7495157169509
Max Depth = 9  -->  RMSE:  635.0020444432281
Max Depth = 10  -->  RMSE:  607.6057344393325
Max Depth = 11  -->  RMSE:  609.110152394459
Max Depth = 12  -->  RMSE:  621.7984716243036
Max Depth = 13  -->  RMSE:  638.9194150697227
Max Depth = 14  -->  RMSE:  657.0538014765027


In [18]:
for i in [1000,1100,1200]:
    grad = GradientBoostingRegressor(n_estimators=i)
    grad.fit(X_train,y_train)
    y_pred = grad.predict(X_test)
    print(f"n_estimators = {i}  -->  RMSE: ", np.sqrt(mean_squared_error(y_test,y_pred)))

n_estimators = 1000  -->  RMSE:  546.7668605579724
n_estimators = 1100  -->  RMSE:  546.5015696793434
n_estimators = 1200  -->  RMSE:  547.6642706716153


In [19]:
for i in range(12,25):
    forest = RandomForestRegressor(max_depth=i)
    forest.fit(X_train,y_train)
    y_pred = forest.predict(X_test)
    print(f"Max Depth = {i}  -->  RMSE: ", np.sqrt(mean_squared_error(y_test,y_pred)))

Max Depth = 12  -->  RMSE:  537.6456610231786
Max Depth = 13  -->  RMSE:  537.7821738974774
Max Depth = 14  -->  RMSE:  536.6811311655384
Max Depth = 15  -->  RMSE:  540.0028741255622
Max Depth = 16  -->  RMSE:  538.0733309111358
Max Depth = 17  -->  RMSE:  542.2606078698295
Max Depth = 18  -->  RMSE:  542.2831443566719
Max Depth = 19  -->  RMSE:  547.444899199351
Max Depth = 20  -->  RMSE:  545.5679644682621
Max Depth = 21  -->  RMSE:  543.1452749605526
Max Depth = 22  -->  RMSE:  546.0383866661322
Max Depth = 23  -->  RMSE:  546.4278180448731
Max Depth = 24  -->  RMSE:  544.6579838580791


In [25]:
for i in [80,90,100,110,120,130,140]:
    forest = RandomForestRegressor(max_depth=14,n_estimators=i)
    forest.fit(X_train,y_train)
    y_pred = forest.predict(X_test)
    print(f"Max Depth=14, n_stimators = {i}  -->  RMSE: ", np.sqrt(mean_squared_error(y_test,y_pred)))

Max Depth=14, n_stimators = 80  -->  RMSE:  543.333522718814
Max Depth=14, n_stimators = 90  -->  RMSE:  536.7783099126824
Max Depth=14, n_stimators = 100  -->  RMSE:  535.6205954105397
Max Depth=14, n_stimators = 110  -->  RMSE:  538.6089973791865
Max Depth=14, n_stimators = 120  -->  RMSE:  538.4180741463022
Max Depth=14, n_stimators = 130  -->  RMSE:  537.9727293413971
Max Depth=14, n_stimators = 140  -->  RMSE:  539.3897830357942


#### Best RMSE comes with max_depth=14 and n_estimators=100

## Apply model to the dataset

In [27]:
ranforest = RandomForestRegressor()

In [28]:
ranforest.fit(X_train, y_train)

RandomForestRegressor()

In [29]:
y_pred = ranforest.predict(X_test)

In [33]:
from sklearn import metrics
print('RMSE: ', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))

RMSE:  548.0354922408554


In [36]:
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

In [49]:
params = {'max_depth': [15,16],
 'bootstrap': [True, False],
 'max_features': ['auto', 'sqrt'],
 'min_samples_leaf': [3],
 'n_estimators': [100]}

In [105]:
params2 = {'max_depth': [14,15,16,17],
 'bootstrap': [True, False],
 'max_features': ['auto', 'sqrt'],
 'min_samples_leaf': [2,3],
 'n_estimators': [100]}

In [126]:
params3 = {'max_depth': [14,15],
 'max_features': ['auto'],
 'min_samples_leaf': [3,4,5],
 'n_estimators': [100]}

In [146]:
params4 = {'max_depth': [15,200],
 'max_features': ['auto'],
 'min_samples_leaf': [5,6,7],
 'n_estimators': [100]}

In [50]:
grid = GridSearchCV(forest, params, verbose=1)

In [106]:
grid2 = GridSearchCV(forest, params2, verbose=1)

In [127]:
grid3 = GridSearchCV(forest, params3, verbose=1)

In [147]:
grid4 = GridSearchCV(forest, params4, verbose=1)

In [51]:
grid.fit(X_train, y_train)

Fitting 5 folds for each of 8 candidates, totalling 40 fits


GridSearchCV(estimator=RandomForestRegressor(max_depth=14, n_estimators=140),
             param_grid={'bootstrap': [True, False], 'max_depth': [15, 16],
                         'max_features': ['auto', 'sqrt'],
                         'min_samples_leaf': [3], 'n_estimators': [100]},
             verbose=1)

In [108]:
grid2.fit(X_train, y_train)

Fitting 5 folds for each of 32 candidates, totalling 160 fits
[CV] END bootstrap=True, max_depth=14, max_features=auto, min_samples_leaf=2, n_estimators=100; total time=  14.8s
[CV] END bootstrap=True, max_depth=14, max_features=auto, min_samples_leaf=2, n_estimators=100; total time=  14.7s
[CV] END bootstrap=True, max_depth=14, max_features=auto, min_samples_leaf=2, n_estimators=100; total time=  14.6s
[CV] END bootstrap=True, max_depth=14, max_features=auto, min_samples_leaf=2, n_estimators=100; total time=  14.6s
[CV] END bootstrap=True, max_depth=14, max_features=auto, min_samples_leaf=2, n_estimators=100; total time=  15.2s
[CV] END bootstrap=True, max_depth=14, max_features=auto, min_samples_leaf=3, n_estimators=100; total time=  14.1s
[CV] END bootstrap=True, max_depth=14, max_features=auto, min_samples_leaf=3, n_estimators=100; total time=  14.1s
[CV] END bootstrap=True, max_depth=14, max_features=auto, min_samples_leaf=3, n_estimators=100; total time=  14.7s
[CV] END bootstrap

[CV] END bootstrap=True, max_depth=17, max_features=sqrt, min_samples_leaf=2, n_estimators=100; total time=  13.9s
[CV] END bootstrap=True, max_depth=17, max_features=sqrt, min_samples_leaf=2, n_estimators=100; total time=   9.4s
[CV] END bootstrap=True, max_depth=17, max_features=sqrt, min_samples_leaf=2, n_estimators=100; total time=   8.2s
[CV] END bootstrap=True, max_depth=17, max_features=sqrt, min_samples_leaf=2, n_estimators=100; total time=   8.2s
[CV] END bootstrap=True, max_depth=17, max_features=sqrt, min_samples_leaf=3, n_estimators=100; total time=   8.6s
[CV] END bootstrap=True, max_depth=17, max_features=sqrt, min_samples_leaf=3, n_estimators=100; total time=   9.0s
[CV] END bootstrap=True, max_depth=17, max_features=sqrt, min_samples_leaf=3, n_estimators=100; total time=   7.4s
[CV] END bootstrap=True, max_depth=17, max_features=sqrt, min_samples_leaf=3, n_estimators=100; total time=   7.5s
[CV] END bootstrap=True, max_depth=17, max_features=sqrt, min_samples_leaf=3, n_

[CV] END bootstrap=False, max_depth=17, max_features=auto, min_samples_leaf=2, n_estimators=100; total time=  37.7s
[CV] END bootstrap=False, max_depth=17, max_features=auto, min_samples_leaf=2, n_estimators=100; total time=  36.4s
[CV] END bootstrap=False, max_depth=17, max_features=auto, min_samples_leaf=2, n_estimators=100; total time=  38.0s
[CV] END bootstrap=False, max_depth=17, max_features=auto, min_samples_leaf=3, n_estimators=100; total time=  41.4s
[CV] END bootstrap=False, max_depth=17, max_features=auto, min_samples_leaf=3, n_estimators=100; total time=  46.0s
[CV] END bootstrap=False, max_depth=17, max_features=auto, min_samples_leaf=3, n_estimators=100; total time=  39.6s
[CV] END bootstrap=False, max_depth=17, max_features=auto, min_samples_leaf=3, n_estimators=100; total time=  34.9s
[CV] END bootstrap=False, max_depth=17, max_features=auto, min_samples_leaf=3, n_estimators=100; total time=  40.2s
[CV] END bootstrap=False, max_depth=17, max_features=sqrt, min_samples_l

GridSearchCV(estimator=RandomForestRegressor(bootstrap=15, max_depth=15,
                                             min_samples_leaf=3),
             param_grid={'bootstrap': [True, False],
                         'max_depth': [14, 15, 16, 17],
                         'max_features': ['auto', 'sqrt'],
                         'min_samples_leaf': [2, 3], 'n_estimators': [100]},
             verbose=2)

In [128]:
grid3.fit(X_train, y_train)

Fitting 5 folds for each of 6 candidates, totalling 30 fits


GridSearchCV(estimator=RandomForestRegressor(bootstrap=15, max_depth=15,
                                             min_samples_leaf=3),
             param_grid={'max_depth': [14, 15], 'max_features': ['auto'],
                         'min_samples_leaf': [3, 4, 5], 'n_estimators': [100]},
             verbose=1)

In [148]:
grid4.fit(X_train, y_train)

Fitting 5 folds for each of 6 candidates, totalling 30 fits


GridSearchCV(estimator=RandomForestRegressor(bootstrap=15, max_depth=15,
                                             min_samples_leaf=3),
             param_grid={'max_depth': [15, 200], 'max_features': ['auto'],
                         'min_samples_leaf': [5, 6, 7], 'n_estimators': [100]},
             verbose=1)

In [52]:
print(grid.best_params_)

{'bootstrap': True, 'max_depth': 15, 'max_features': 'auto', 'min_samples_leaf': 3, 'n_estimators': 100}


In [109]:
print(grid2.best_params_)

{'bootstrap': True, 'max_depth': 14, 'max_features': 'auto', 'min_samples_leaf': 3, 'n_estimators': 100}


In [129]:
print(grid3.best_params_)

{'max_depth': 14, 'max_features': 'auto', 'min_samples_leaf': 3, 'n_estimators': 100}


In [150]:
print(grid4.best_params_)

{'max_depth': 200, 'max_features': 'auto', 'min_samples_leaf': 5, 'n_estimators': 100}


In [53]:
forest = RandomForestRegressor(bootstrap = 15, max_features = 'auto',max_depth = 15, min_samples_leaf = 3, n_estimators = 100)

In [110]:
forest2 = RandomForestRegressor(bootstrap = 14, max_features = 'auto',max_depth = 15, min_samples_leaf = 3, n_estimators = 100)

In [131]:
forest3 = RandomForestRegressor(bootstrap = True, max_features = 'auto',max_depth = 14, min_samples_leaf = 3, n_estimators = 100)

In [151]:
forest4 = RandomForestRegressor(bootstrap = True, max_features = 'auto',max_depth = 200, min_samples_leaf = 5, n_estimators = 100)

In [54]:
forest.fit(X_train, y_train)

RandomForestRegressor(bootstrap=15, max_depth=15, min_samples_leaf=3)

In [111]:
forest2.fit(X_train, y_train)

RandomForestRegressor(bootstrap=14, max_depth=15, min_samples_leaf=3)

In [132]:
forest3.fit(X_train, y_train)

RandomForestRegressor(max_depth=14, min_samples_leaf=3)

In [152]:
forest4.fit(X_train, y_train)

RandomForestRegressor(max_depth=200, min_samples_leaf=5)

In [55]:
y_pred = forest.predict(X_test)

In [112]:
y_pred2 = forest2.predict(X_train)

In [133]:
y_pred3 = forest3.predict(X_train)

In [165]:
y_pred4 = forest4.predict(X_train)

In [56]:
print('RMSE: ', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))

RMSE:  537.8211553504631


In [57]:
model2 = forest.fit(X,y)

In [113]:
model4 = forest2.fit(X,y)

In [134]:
model6 = forest3.fit(X,y)

In [166]:
model8 = forest4.fit(X,y)

In [93]:
test = pd.read_csv("Data/cleantest1.csv")

In [135]:
test3 = pd.read_csv("Data/cleantest1.csv")

In [167]:
test4 = pd.read_csv("Data/cleantest1.csv")

In [116]:
test.head()

Unnamed: 0_level_0,carat,cut,color,clarity,depth,table,price
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,1.0,1,1,5,56.3,64.0,3303.188375
1,0.83,4,4,6,62.3,58.0,3069.334328
2,1.0,1,6,5,67.0,53.0,3329.018462
3,1.0,1,3,5,66.5,62.0,3191.589884
4,1.2,3,2,6,62.6,57.0,5356.717846


In [136]:
test3.shape

(13485, 6)

In [115]:
test.shape

(13485, 7)

In [118]:
test.drop(columns=["price"], inplace=True)

In [97]:
price = model2.predict(test)

In [119]:
price2 = model4.predict(test)

In [138]:
price3 = model4.predict(test3)

In [168]:
price4 = model8.predict(test4)

In [120]:
test['price'] = price

In [139]:
test3['price'] = price3

In [169]:
test4['price'] = price4

In [121]:
test.head()

Unnamed: 0_level_0,carat,cut,color,clarity,depth,table,price
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,1.0,1,1,5,56.3,64.0,3303.188375
1,0.83,4,4,6,62.3,58.0,3069.334328
2,1.0,1,6,5,67.0,53.0,3329.018462
3,1.0,1,3,5,66.5,62.0,3191.589884
4,1.2,3,2,6,62.6,57.0,5356.717846


In [140]:
test3.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,price
0,1.0,1,1,5,56.3,64.0,3296.457004
1,0.83,4,4,6,62.3,58.0,3046.889142
2,1.0,1,6,5,67.0,53.0,3340.810663
3,1.0,1,3,5,66.5,62.0,3162.066993
4,1.2,3,2,6,62.6,57.0,5398.056514


In [170]:
test4.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,price
0,1.0,1,1,5,56.3,64.0,3292.95228
1,0.83,4,4,6,62.3,58.0,2982.091263
2,1.0,1,6,5,67.0,53.0,3379.307379
3,1.0,1,3,5,66.5,62.0,3132.649582
4,1.2,3,2,6,62.6,57.0,5356.77139


In [122]:
finaldf = test.drop(['carat', 'cut', 'color', 'clarity', 'depth', 'table'], axis=1)

In [123]:
finaldf.index.rename('id', inplace=True)

In [124]:
finaldf.sample(5)

Unnamed: 0_level_0,price
id,Unnamed: 1_level_1
7773,1651.770598
12882,6140.017452
4077,3524.152348
8439,15458.718443
12710,12917.058928


In [144]:
finaldf.to_csv('Data/finaldf1.csv')

In [141]:
finaldf3 = test3.drop(['carat', 'cut', 'color', 'clarity', 'depth', 'table'], axis=1)

In [142]:
finaldf3.index.rename('id', inplace=True)

In [143]:
finaldf3.sample(5)

Unnamed: 0_level_0,price
id,Unnamed: 1_level_1
6761,1548.657602
9393,17047.45641
8870,915.21445
2483,2649.245311
11021,877.772913


In [145]:
finaldf3.to_csv('Data/finaldf3.csv')

In [171]:
finaldf4 = test4.drop(['carat', 'cut', 'color', 'clarity', 'depth', 'table'], axis=1)

In [172]:
finaldf4.index.rename('id', inplace=True)

In [173]:
finaldf4.sample(5)

Unnamed: 0_level_0,price
id,Unnamed: 1_level_1
10547,2801.701136
3282,5508.986261
8332,1654.191833
8610,6602.817733
8471,2283.928861


In [174]:
finaldf4.to_csv('Data/finaldf4.csv')