In [3]:
import pandas as pd
import numpy as np
import seaborn as sns

In [4]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [5]:
diamond = train.copy()
diamond.head()


Unnamed: 0,Id,carat,cut,color,clarity,depth,table,price,x,y,z
0,1,1.06,Ideal,I,SI2,61.8,57.0,4270,6.57,6.6,4.07
1,2,1.51,Premium,G,VVS2,60.9,58.0,15164,7.38,7.42,4.51
2,3,0.32,Ideal,F,VS2,61.3,56.0,828,4.43,4.41,2.71
3,4,0.53,Ideal,G,VS2,61.2,56.0,1577,5.19,5.22,3.19
4,5,0.7,Premium,H,VVS2,61.0,57.0,2596,5.76,5.72,3.5


In [6]:
from sklearn.model_selection import train_test_split
train_set_2, test_set_2 = train_test_split(train, test_size=0.2, random_state=40)

In [7]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder
num_pipeline = Pipeline([
('std_scaler', StandardScaler()),
])

In [8]:
np.array(diamond.columns)

array(['Id', 'carat', 'cut', 'color', 'clarity', 'depth', 'table',
       'price', 'x', 'y', 'z'], dtype=object)

In [9]:
diamond_num = [item for item in np.array(diamond.columns) if item not in ["cut", "clarity", "color", "price", "Id"]]
diamond_price_train = train_set_2[["price"]]
diamond_price_train

Unnamed: 0,price
18091,9306
7298,2604
8450,12109
34990,11942
16697,4030
...,...
11532,2511
27640,4808
14501,725
30727,761


In [10]:
from sklearn.compose import ColumnTransformer
num_attribs = list(diamond_num)
cat_attribs = ["cut", "clarity", "color"]
full_pipeline = ColumnTransformer([
("num", num_pipeline, num_attribs),
("cat", OrdinalEncoder(), cat_attribs),
])

In [11]:
def remove_strings(main_array, strings_to_remove):
    return np.array([item for item in main_array if item not in strings_to_remove])
train_set_2.head()

Unnamed: 0,Id,carat,cut,color,clarity,depth,table,price,x,y,z
18091,18092,1.5,Very Good,J,VVS2,62.2,57.0,9306,7.25,7.31,4.53
7298,7299,0.74,Ideal,G,VS2,61.3,55.0,2604,5.82,5.89,3.59
8450,8451,1.53,Ideal,F,SI1,61.6,56.0,12109,7.39,7.34,4.54
34990,34991,1.51,Ideal,D,SI1,61.9,57.0,11942,7.42,7.35,4.57
16697,16698,0.71,Ideal,E,VVS2,61.2,57.0,4030,5.73,5.8,3.53


In [12]:
diamond_prepared = full_pipeline.fit_transform(train_set_2.drop(["Id", "price"], axis=1))
diamond_prepared = pd.DataFrame(data=diamond_prepared, columns=remove_strings(train_set_2.columns, ["Id", "price"]))
diamond_prepared.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z
0,1.48823,0.31776,-0.204541,1.358771,1.364213,1.431487,4.0,7.0,6.0
1,-0.118217,-0.312544,-1.104763,0.083266,0.137679,0.07925,2.0,5.0,3.0
2,1.551643,-0.102443,-0.654652,1.483645,1.390126,1.445872,2.0,2.0,2.0
3,1.509368,0.107659,-0.204541,1.510404,1.398764,1.489029,2.0,2.0,0.0
4,-0.181629,-0.382578,-0.204541,0.002989,0.059941,-0.007063,2.0,7.0,1.0


In [13]:
X=diamond_prepared
y=train_set_2["price"]
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVC
# desc_tree = LinearRegression()
# desc_tree = SVC(kernel='linear', random_state=40) 
desc_tree = RandomForestRegressor(random_state=42)
desc_tree.fit(X, y)

In [17]:
diamond_prepared_test = full_pipeline.fit_transform(test_set_2.drop(["Id", "price"], axis=1))
diamond_prepared_test = pd.DataFrame(data=diamond_prepared_test, columns=test.drop("Id", axis=1).columns)
# price = desc_tree.predict( diamond_prepared_test.drop(["price"], axis=1))
price = desc_tree.predict( diamond_prepared_test)
id=test_set_2["Id"]
diamond_prepared_test.head()


Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z
0,2.596176,-0.034657,-0.207731,2.102999,2.173561,1.934754,2.0,2.0,6.0
1,-0.747795,0.03359,-0.207731,-0.72328,-0.766684,-0.676924,2.0,7.0,3.0
2,-0.537483,-0.580636,-0.207731,-0.41123,-0.389038,-0.426309,2.0,2.0,0.0
3,0.82955,-0.102905,-1.522389,1.006368,0.959698,0.87953,2.0,5.0,6.0
4,-0.537483,-0.853625,0.668707,-0.375567,-0.425004,-0.452689,3.0,2.0,6.0


In [18]:


len(price)

8631

In [19]:
data = {
    'Id': id,
    'price': price,
}
result = pd.DataFrame(data)
result

Unnamed: 0,Id,price
25284,25285,12302.317500
2035,2036,1174.800000
18066,18067,1703.650000
38796,38797,5138.980000
13421,13422,1041.990000
...,...,...
23098,23099,7823.410000
845,846,651.360000
17825,17826,1746.210000
42890,42891,4026.030000


In [20]:
from sklearn.metrics import mean_squared_error
# housing_predictions = lin_reg.predict(housing_prepared)
lin_mse = mean_squared_error(price, test_set_2["price"])
lin_rmse = np.sqrt(lin_mse)
lin_rmse

591.9144000915304

### Fine-Tuning

In [21]:

full_pipeline_2 = Pipeline([
("random_forest", RandomForestRegressor())
])

In [22]:
from sklearn.model_selection import GridSearchCV


param_grid = [
    {'random_forest__max_features': [10, 50, 100]}
]
grid_search = GridSearchCV(full_pipeline_2, param_grid, cv=3, scoring='neg_root_mean_squared_error')
grid_search.fit(diamond_prepared, train_set_2["price"])

In [23]:
grid_search.best_params_

{'random_forest__max_features': 10}

In [24]:
diamond_prepared_test = full_pipeline.fit_transform(test_set_2.drop(["Id", "price"], axis=1))
diamond_prepared_test = pd.DataFrame(data=diamond_prepared_test, columns=test.drop("Id", axis=1).columns)
# price = desc_tree.predict( diamond_prepared_test.drop(["price"], axis=1))
price = grid_search.predict( diamond_prepared_test)
id=test_set_2["Id"]
diamond_prepared_test.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z
0,2.596176,-0.034657,-0.207731,2.102999,2.173561,1.934754,2.0,2.0,6.0
1,-0.747795,0.03359,-0.207731,-0.72328,-0.766684,-0.676924,2.0,7.0,3.0
2,-0.537483,-0.580636,-0.207731,-0.41123,-0.389038,-0.426309,2.0,2.0,0.0
3,0.82955,-0.102905,-1.522389,1.006368,0.959698,0.87953,2.0,5.0,6.0
4,-0.537483,-0.853625,0.668707,-0.375567,-0.425004,-0.452689,3.0,2.0,6.0


### Cross Validation 

In [25]:
from sklearn.model_selection import cross_val_score
scores = cross_val_score(grid_search, diamond_prepared_test, test_set_2["price"],
scoring="neg_mean_squared_error", cv=10)
tree_rmse_scores = np.sqrt(-scores)

In [26]:
def display_scores(scores):
    print("Scores:", scores)
    print("Mean:", scores.mean())
    print("Standard deviation:", scores.std())

display_scores(tree_rmse_scores)

Scores: [623.24554311 707.226615   643.53796162 663.27607802 689.44092548
 484.09994091 584.29860602 581.32154797 586.16170262 630.12959901]
Mean: 619.2738519764783
Standard deviation: 61.11441490526957


In [27]:
result.to_csv("forth_version.csv", index=False)

In [28]:
data = {
    'Id': id,
    'price': price,
}
result = pd.DataFrame(data)
result

Unnamed: 0,Id,price
25284,25285,12310.399000
2035,2036,1153.860000
18066,18067,1700.640000
38796,38797,5135.710000
13421,13422,1046.090000
...,...,...
23098,23099,7844.600000
845,846,640.760000
17825,17826,1751.460000
42890,42891,3930.870000


In [29]:
from sklearn.metrics import mean_squared_error
# housing_predictions = lin_reg.predict(housing_prepared)
lin_mse = mean_squared_error(price, test_set_2["price"])
lin_rmse = np.sqrt(lin_mse)
lin_rmse

589.74333343794