In [61]:
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestRegressor,GradientBoostingRegressor
from sklearn.metrics import mean_squared_error,r2_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split

In [2]:
# Read the training set from disk and preview its first rows.
train_csv_path = "../input_diamonds/diamonds_train.csv"
diamonds = pd.read_csv(train_csv_path)
diamonds.head()

Unnamed: 0.1,Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z,price
0,0,1.21,Ideal,H,VS2,63.0,57.0,6.73,6.7,4.23,6134
1,1,0.28,Very Good,D,VVS2,64.0,56.0,4.14,4.17,2.66,532
2,2,0.42,Premium,F,VS1,61.2,58.0,4.86,4.82,2.96,1103
3,3,0.26,Ideal,H,IF,61.1,57.0,4.16,4.12,2.53,600
4,4,1.1,Good,G,SI1,63.4,57.0,6.52,6.55,4.14,4997


**Convert categorical to numerical:**

In [3]:
# Count how many rows carry each `cut` label.
cut_counts = diamonds["cut"].value_counts()
cut_counts

Ideal        16090
Premium      10339
Very Good     9036
Good          3694
Fair          1186
Name: cut, dtype: int64

In [4]:
# Encode the `cut` labels as integer codes.
# The result is assigned back to the column instead of calling
# `.replace(..., inplace=True)` on `diamonds["cut"]`: that chained in-place call
# acts on a column selection and, under pandas Copy-on-Write, does not update
# `diamonds` itself.
# NOTE(review): the codes are the ones the notebook already used; they are not
# ordered by cut quality — confirm that is intended.
cut_codes = {"Premium": 1, "Ideal": 2, "Very Good": 3, "Good": 4, "Fair": 5}
# `replace` leaves already-encoded integers untouched, so re-running the cell is
# harmless; `astype` makes the dtype explicit and fails loudly if a label was
# left unmapped.
diamonds["cut"] = diamonds["cut"].replace(cut_codes).astype("int64")
diamonds["cut"].value_counts()

2    16090
1    10339
3     9036
4     3694
5     1186
Name: cut, dtype: int64

In [5]:
# Count how many rows carry each `color` label.
color_counts = diamonds["color"].value_counts()
color_counts

G    8499
E    7351
F    7130
H    6234
D    5074
I    4023
J    2034
Name: color, dtype: int64

In [6]:
# Encode the `color` labels as integer codes.
# The result is assigned back to the column instead of calling
# `.replace(..., inplace=True)` on `diamonds["color"]`: that chained in-place
# call acts on a column selection and, under pandas Copy-on-Write, does not
# update `diamonds` itself.
# NOTE(review): the codes follow the frequency ranking printed by the previous
# cell (G is the most frequent label), not an alphabetical or grading order —
# confirm that is intended.
color_codes = {"G": 1, "E": 2, "F": 3, "H": 4, "D": 5, "I": 6, "J": 7}
# `replace` leaves already-encoded integers untouched, so re-running the cell is
# harmless; `astype` makes the dtype explicit and fails loudly if a label was
# left unmapped.
diamonds["color"] = diamonds["color"].replace(color_codes).astype("int64")
diamonds["color"].value_counts()

1    8499
2    7351
3    7130
4    6234
5    5074
6    4023
7    2034
Name: color, dtype: int64

In [7]:
# Count how many rows carry each `clarity` label.
clarity_counts = diamonds["clarity"].value_counts()
clarity_counts

SI1     9751
VS2     9178
SI2     6828
VS1     6101
VVS2    3858
VVS1    2714
IF      1362
I1       553
Name: clarity, dtype: int64

In [8]:
# Encode the `clarity` labels as integer codes.
# The result is assigned back to the column instead of calling
# `.replace(..., inplace=True)` on `diamonds["clarity"]`: that chained in-place
# call acts on a column selection and, under pandas Copy-on-Write, does not
# update `diamonds` itself.
# NOTE(review): the codes follow the frequency ranking printed by the previous
# cell (SI1 is the most frequent label), not a clarity grading order — confirm
# that is intended.
clarity_codes = {
    "SI1": 1,
    "VS2": 2,
    "SI2": 3,
    "VS1": 4,
    "VVS2": 5,
    "VVS1": 6,
    "IF": 7,
    "I1": 8,
}
# `replace` leaves already-encoded integers untouched, so re-running the cell is
# harmless; `astype` makes the dtype explicit and fails loudly if a label was
# left unmapped.
diamonds["clarity"] = diamonds["clarity"].replace(clarity_codes).astype("int64")
diamonds["clarity"].value_counts()

1    9751
2    9178
3    6828
4    6101
5    3858
6    2714
7    1362
8     553
Name: clarity, dtype: int64

In [9]:
# Keep only the rows whose x, y and z values are all different from zero.
nonzero_dims = (diamonds[["x", "y", "z"]] != 0).all(axis=1)
diamonds = diamonds[nonzero_dims]
diamonds

Unnamed: 0.1,Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z,price
0,0,1.21,2,4,2,63.0,57.0,6.73,6.70,4.23,6134
1,1,0.28,3,5,5,64.0,56.0,4.14,4.17,2.66,532
2,2,0.42,1,3,4,61.2,58.0,4.86,4.82,2.96,1103
3,3,0.26,2,4,7,61.1,57.0,4.16,4.12,2.53,600
4,4,1.10,4,1,1,63.4,57.0,6.52,6.55,4.14,4997
...,...,...,...,...,...,...,...,...,...,...,...
40340,40340,1.55,1,4,2,61.3,61.0,7.46,7.39,4.55,11708
40341,40341,0.36,2,5,1,60.6,56.0,4.58,4.63,2.79,619
40342,40342,0.57,3,6,2,62.2,55.0,5.33,5.34,3.32,1267
40343,40343,1.01,3,3,7,59.6,62.0,6.47,6.56,3.88,9965


**Remove correlated columns:**

In [10]:
# Pairwise correlation between all columns, rendered as a colour-graded table.
corr = diamonds.corr()
corr_heatmap = corr.style.background_gradient(cmap="coolwarm")
corr_heatmap

Unnamed: 0.1,Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z,price
Unnamed: 0,1.0,0.005632,0.001839,0.006175,-0.002281,0.008125,-0.007072,0.003929,0.006146,0.005053,0.007359
carat,0.005632,1.0,0.017064,0.188677,-0.16141,0.029475,0.181157,0.978398,0.94717,0.97656,0.921889
cut,0.001839,0.017064,1.0,0.018488,-0.008137,0.307625,0.068872,0.004019,0.011763,0.046539,-0.030065
color,0.006175,0.188677,0.018488,1.0,-0.112289,0.024641,0.043802,0.172107,0.166353,0.173704,0.088605
clarity,-0.002281,-0.16141,-0.008137,-0.112289,1.0,-0.023917,-0.10029,-0.187997,-0.179998,-0.187972,-0.085002
depth,0.008125,0.029475,0.307625,0.024641,-0.023917,1.0,-0.288893,-0.023214,-0.027701,0.097394,-0.009283
table,-0.007072,0.181157,0.068872,0.043802,-0.10029,-0.288893,1.0,0.194667,0.181107,0.155177,0.127265
x,0.003929,0.978398,0.004019,0.172107,-0.187997,-0.023214,0.194667,1.0,0.9671,0.990589,0.886668
y,0.006146,0.94717,0.011763,0.166353,-0.179998,-0.027701,0.181107,0.9671,1.0,0.965549,0.860555
z,0.005053,0.97656,0.046539,0.173704,-0.187972,0.097394,0.155177,0.990589,0.965549,1.0,0.88113


In [11]:
# This time not every correlated column is removed: only x, z and the
# "Unnamed: 0" column are dropped.
diamonds = diamonds.drop(columns=["x", "z", "Unnamed: 0"])

In [12]:
# Separate the target (`price`) from the feature columns.
y = diamonds["price"]
X = diamonds.drop(columns="price")

In [14]:
# Project the feature matrix onto 7 principal components and wrap the result
# in a DataFrame.
# NOTE(review): no later cell visible in this notebook reads `XReduced` — the
# split and both models use `X` directly. Confirm whether this step is still
# needed.
pca = PCA(n_components=7)
principal_components = pca.fit_transform(X)
XReduced = pd.DataFrame(principal_components)

In [15]:
# Hold out 20% of the rows for evaluation.
# random_state is fixed so the same rows land in the hold-out set on every run;
# without it each execution shuffles differently and the scores reported
# further down cannot be compared between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(32262, 7) (8066, 7) (32262,) (8066,)


# RandomForest:

In [54]:
# Random forest regressor fitted on the training split.
# random_state pins the forest's own randomness so that refitting on the same
# data yields the same model.
model = RandomForestRegressor(n_estimators=500, random_state=42).fit(X_train, y_train)
y_pred = model.predict(X_test)

# cross_val_score clones the estimator and refits it on each fold, so the first
# printed number is a 3-fold R2 estimate on the training split rather than a
# score of the already-fitted `model`.
print(np.mean(cross_val_score(model, X_train, y_train, scoring='r2', cv=3, n_jobs=2)))
# Hold-out R2 and root of the mean squared error of `model` on the test split.
print("r2_score", r2_score(y_test,y_pred))
print("RSME", (mean_squared_error(y_test,y_pred)**0.5))

# Results noted from earlier, unseeded runs
# (mean CV R2 / hold-out R2 / hold-out root mean squared error):
#   n_estimators=1500: 0.9786474761729842 / 0.9790276359720973 / 583.9631151711887
#   n_estimators=100:  0.9785539470544492 / 0.979053221526376  / 583.6067991253947

0.9783305822469474
r2_score 0.9786958535541257
RSME 588.5641325112016


In [34]:
# Read the unlabeled test set from disk and preview its first rows.
test_csv_path = "../input_diamonds/diamonds_test.csv"
diamonds_test = pd.read_csv(test_csv_path)
diamonds_test.head()

Unnamed: 0.1,Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z
0,0,0.3,Ideal,H,SI2,60.0,56.0,4.41,4.43,2.65
1,1,0.34,Ideal,D,IF,62.1,57.0,4.52,4.46,2.79
2,2,1.57,Very Good,I,VS2,60.3,58.0,7.58,7.55,4.56
3,3,0.31,Ideal,H,VS2,61.8,57.0,4.32,4.36,2.68
4,4,1.51,Good,I,VVS1,64.0,60.0,7.26,7.21,4.63


In [35]:
# Remove x, z and "Unnamed: 0" from the test frame, the same columns that were
# dropped from the training frame.
diamonds_test = diamonds_test.drop(columns=["x", "z", "Unnamed: 0"])

In [36]:
# Encode the categorical columns of the test frame with the same label -> code
# tables that were applied to the training frame.
# Each result is assigned back to its column instead of calling
# `.replace(..., inplace=True)` on a column selection: that chained in-place
# call does not update `diamonds_test` under pandas Copy-on-Write.
test_category_codes = {
    "cut": {"Premium": 1, "Ideal": 2, "Very Good": 3, "Good": 4, "Fair": 5},
    "color": {"G": 1, "E": 2, "F": 3, "H": 4, "D": 5, "I": 6, "J": 7},
    "clarity": {
        "SI1": 1,
        "VS2": 2,
        "SI2": 3,
        "VS1": 4,
        "VVS2": 5,
        "VVS1": 6,
        "IF": 7,
        "I1": 8,
    },
}
for column_name, codes in test_category_codes.items():
    # `astype` makes the dtype explicit and fails loudly if a label was left
    # unmapped.
    diamonds_test[column_name] = (
        diamonds_test[column_name].replace(codes).astype("int64")
    )

In [37]:
# Predict prices for the test set with the random forest (`model`).
# The feature columns are selected by name from the training split so the
# matrix passed to `predict` has exactly the columns, in the same order, that
# the model was fitted on; a missing column raises a KeyError here.
price_pred = model.predict(diamonds_test[X_train.columns])

In [38]:
# Put the predictions in a one-column frame whose index is labelled `id`.
df_price = pd.DataFrame(price_pred, columns=['price'])
df_price.index.name = 'id'
df_price.head()

Unnamed: 0_level_0,price
id,Unnamed: 1_level_1
0,428.172
1,1821.57
2,9859.576
3,545.9
4,9287.8


In [40]:
# Write the random-forest predictions to disk.
# The output directory is created first because `to_csv` does not create
# missing parent directories and would otherwise raise an OSError.
rf_submission_path = Path("./output/test4.csv")
rf_submission_path.parent.mkdir(parents=True, exist_ok=True)
df_price.to_csv(rf_submission_path)

# GradientBoosting

In [53]:
# Gradient boosting regressor fitted on the training split.
# random_state pins the estimator's internal randomness so that refitting on
# the same data yields the same model.
model2 = GradientBoostingRegressor(n_estimators=2000, random_state=42).fit(X_train, y_train)
y_pred = model2.predict(X_test)

# cross_val_score clones the estimator and refits it on each fold, so the first
# printed number is a 3-fold R2 estimate on the training split rather than a
# score of the already-fitted `model2`.
print(np.mean(cross_val_score(model2, X_train, y_train, scoring='r2', cv=3, n_jobs=2)))
# Hold-out R2 and root of the mean squared error of `model2` on the test split.
print("r2_score", r2_score(y_test,y_pred))
print("RSME", (mean_squared_error(y_test,y_pred)**0.5))

# Values noted from earlier runs, keyed by n_estimators
# (presumably the hold-out error printed last — TODO confirm):
#   700  -> 631.92
#   1000 -> 628
#   1500 -> 619

0.9770999909911752
r2_score 0.9765974173560159
RSME 616.8699529634123


In [64]:
# Predict prices for the test set with the gradient boosting model (`model2`).
# The feature columns are selected by name from the training split so the
# matrix passed to `predict` has exactly the columns, in the same order, that
# the model was fitted on; a missing column raises a KeyError here.
# NOTE(review): this rebinds `price_pred`, which the random-forest section also
# assigns — the cells must be run in order for each export to hold the
# intended predictions.
price_pred = model2.predict(diamonds_test[X_train.columns])

In [59]:
# Put the current `price_pred` values in a one-column frame whose index is
# labelled `id`.
df2_price = pd.DataFrame(price_pred, columns=['price'])
df2_price.index.name = 'id'
df2_price.head()

Unnamed: 0_level_0,price
id,Unnamed: 1_level_1
0,272.5689
1,1949.103375
2,9909.539672
3,593.909027
4,10780.397148


In [60]:
# Write the gradient-boosting predictions to disk.
# The output directory is created first because `to_csv` does not create
# missing parent directories and would otherwise raise an OSError.
gb_submission_path = Path("./output/test5.csv")
gb_submission_path.parent.mkdir(parents=True, exist_ok=True)
df2_price.to_csv(gb_submission_path)