In [72]:
%matplotlib inline

import pandas as pd
import seaborn as sns
import numpy as np

In [73]:
data = pd.read_csv('../kaggle-competition/Input/data.csv')

# CLEANING

In [74]:
columns = ['carat','depth', 'table', 'x', 'y', 'z']
for col in columns:
    data[col] = (data[col] - np.mean(data[col])) / np.std(data[col]) 
data.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z,price
0,3.081321,Ideal,G,SI2,0.10617,-0.205764,2.411028,2.276274,2.339667,12831
1,3.439708,Very Good,H,SI2,1.013798,-0.205764,2.51787,2.397719,2.607386,16170
2,0.003405,Premium,F,SI2,-0.522188,-0.205764,0.265287,0.237737,0.183828,2797
3,-0.839859,Ideal,F,I1,1.083616,1.138849,-0.936684,-0.950686,-0.830684,630
4,-1.029593,Ideal,G,VS2,-0.103283,-1.102173,-1.194885,-1.184901,-1.182945,698


In [75]:
data.clarity.unique()

array(['SI2', 'I1', 'VS2', 'SI1', 'VS1', 'VVS2', 'IF', 'VVS1'],
      dtype=object)

In [76]:
data.cut = data.cut.replace({'Ideal':1, 'Premium':2, 'Very Good':3, 'Good':4, 'Fair':5})
data.color = data.color.replace({'D':1, 'E':2, 'F':3, 'G':4, 'H':5, 'I':6, 'J':7})
data.clarity = data.clarity.replace({'IF':1,'VVS1':2,'VVS2':3, 'VS1':4, 'VS2':5, 'SI1':6, 'SI2':7, 'I1':8})

In [77]:
data

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z,price
0,3.081321,1,4,7,0.106170,-0.205764,2.411028,2.276274,2.339667,12831
1,3.439708,3,5,7,1.013798,-0.205764,2.517870,2.397719,2.607386,16170
2,0.003405,2,3,7,-0.522188,-0.205764,0.265287,0.237737,0.183828,2797
3,-0.839859,1,3,8,1.083616,1.138849,-0.936684,-0.950686,-0.830684,630
4,-1.029593,1,4,5,-0.103283,-1.102173,-1.194885,-1.184901,-1.182945,698
...,...,...,...,...,...,...,...,...,...,...
40450,0.656935,2,5,6,0.734528,1.587054,0.799497,0.714842,0.846079,5315
40451,-0.144166,1,3,5,0.594893,-0.653969,0.033797,0.003522,0.085195,2762
40452,0.973160,3,6,4,-1.778904,1.138849,1.209057,1.113874,0.902441,6855
40453,-0.165247,1,4,7,-0.242918,-0.653969,0.024893,0.081594,0.028833,2297


# LINEAR REGRESSION

In [78]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

In [79]:
columns = ['carat', 'depth', 'table','cut', 'color','clarity', 'x', 'y', 'z']
X = data[columns]
y = data['price']

In [80]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2)

# RANDOM FOREST

In [81]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.datasets import make_classification

In [82]:
clf = RandomForestRegressor(n_estimators=250)

In [83]:
clf.fit(X_train, y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=250,
                      n_jobs=None, oob_score=False, random_state=None,
                      verbose=0, warm_start=False)

In [84]:
y_predr = clf.predict(X_test)

In [85]:
r2_score(y_test, y_predr)

0.9808254271274726

In [86]:
#0.9798191813341308

In [87]:
from sklearn.metrics import mean_squared_error
from math import sqrt

In [88]:
rmse = sqrt(mean_squared_error(y_test, y_predr))
rmse

565.4929966367157

# TEST

In [42]:
test = pd.read_csv('../kaggle-competition/Input/test.csv')

In [43]:
columns = ['carat','depth', 'table', 'x', 'y', 'z']
for col in columns:
    test[col] = (test[col] - np.mean(test[col])) / np.std(test[col]) 
test.head()

Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z
0,0,-0.500241,Ideal,I,VS2,0.241489,-1.537773,-0.383443,-0.360777,-0.340759
1,1,0.535739,Ideal,G,VS2,0.101972,-0.646693,0.744052,0.71177,0.740676
2,2,-0.627096,Premium,E,VS2,-0.177064,-0.646693,-0.553462,-0.595116,-0.585884
3,3,0.007178,Ideal,F,VS1,0.729801,-0.646693,0.162408,0.207042,0.279264
4,4,-0.542526,Ideal,G,VS1,-0.246823,-0.201152,-0.446082,-0.477947,-0.48495


In [44]:
test.cut = test.cut.replace({'Ideal':1, 'Premium':2, 'Very Good':3, 'Good':4, 'Fair':5})
test.color = test.color.replace({'G':1, 'H':2, 'F':3, 'D':4,'E':5, 'I':6, 'J':7})
test.clarity = test.clarity.replace({'IF':1,'VVS1':2,'VVS2':3, 'VS1':4, 'VS2':5, 'SI1':6, 'SI2':7, 'I1':8})

In [45]:
columns = ['carat', 'cut', 'color','clarity', 'depth', 'table']
Xt = test[columns]

In [46]:
y_predra = clf.predict(Xt)

In [47]:
predictions = pd.DataFrame({
    'id': test['id'],
    'price': y_predra
})

In [48]:
predictions

Unnamed: 0,id,price
0,0,2417.285133
1,1,8084.266000
2,2,1965.657000
3,3,3883.596000
4,4,2266.080333
...,...,...
13480,13480,3354.655333
13481,13481,2446.512200
13482,13482,14961.876000
13483,13483,865.566600


In [49]:
predictions.to_csv('prueba5.csv', index=False)