In [34]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import numpy as np
from sklearn.linear_model import LinearRegression
import os
from sys import platform
from sklearn.ensemble import RandomForestRegressor
from sklearn import metrics

 Table

# Instructions

1. Load the `train.csv` file
2. Explore the data, understand it
3. Process it for future training
4. Do train, test, split for your `train.csv` file
5. `fit/train` a model from your cleaned_train_df
-----
5. Load the `test.csv` file
6. Apply the same processing you did to `train.csv` into `test.csv`
7. `predict` the price for that file
8. Only keep the columns you need
9. Export
-----
10. Repeat! 🚀🔥

# Import the csv files

In [2]:
df_train = pd.read_csv("train.csv")
df_test = pd.read_csv("test.csv")

In [3]:
df_train.sample()

Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z,price
12639,12639,0.76,Premium,E,VVS2,60.6,59.0,5.94,5.85,3.57,8.234


In [4]:
df_test.sample()

Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z
7466,7466,1.7,Good,G,VS1,63.5,56.0,7.53,7.64,4.82


In [5]:
df_train.shape

(40455, 11)

In [6]:
df_test.shape

(13485, 10)

In [7]:
#SUBMISSION -> test

In [8]:
print(df_train.shape[0])
df_train.sample()

40455


Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z,price
2440,2440,0.51,Premium,E,SI1,61.8,58.0,5.15,5.11,3.17,7.274


In [9]:
print(df_test.shape[0])
df_test.sample()

13485


Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z
6969,6969,1.0,Fair,F,VS2,68.4,58.0,5.97,6.1,4.13


In [5]:
#According to this website: https://www.washingtondiamond.com/pages/diamond-price-calculator the most important aspects to determine
#The price of the diamond are: Shape (we don't have it), Carat, Color, Clarity and finally the depth will tell you how well is cut

# Cleaning, processing, feature selection, etc

In [22]:
# Processing is necessary, otherwise we won't be able to fit a model
# For the sake of the example, we'll just drop categorical columns

df_train_cleaned = df_train.drop(columns = ['id', 'x', 'y', 'z', 'price'])
df_train_cleaned[['Ideal', 'Premium', 'Good', 'Very Good', 'Fair']] = pd.get_dummies(df_train_cleaned.cut, dtype="int")
df_train_cleaned[['F', 'E', 'G', 'H', 'J', 'I', 'D']] = pd.get_dummies(df_train_cleaned.color, dtype="int")
df_train_cleaned[['SI1', 'VS2', 'VVS1', 'VVS2', 'SI2', 'VS1', 'I1', 'IF']] = pd.get_dummies(df_train_cleaned.clarity, dtype="int")
df_train_cleaned = df_train_cleaned.drop(columns = ['cut', 'color', 'clarity'])
df_train_cleaned

Unnamed: 0,carat,depth,table,Ideal,Premium,Good,Very Good,Fair,F,E,...,I,D,SI1,VS2,VVS1,VVS2,SI2,VS1,I1,IF
0,0.30,60.2,56.0,0,0,1,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
1,0.51,61.5,58.0,0,0,0,1,0,0,1,...,0,0,0,0,1,0,0,0,0,0
2,1.59,62.1,58.0,0,0,0,1,0,0,0,...,0,0,0,0,1,0,0,0,0,0
3,0.70,62.6,57.0,0,0,1,0,0,0,1,...,0,0,0,0,0,0,0,1,0,0
4,1.01,63.1,59.0,0,1,0,0,0,0,1,...,0,0,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40450,0.50,61.9,58.0,0,0,0,1,0,0,0,...,0,0,0,0,0,1,0,0,0,0
40451,1.07,62.2,57.0,0,0,1,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
40452,1.42,62.7,55.0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,1,0,0,0
40453,0.42,61.3,56.0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [115]:
df_train_cleaned_2 = df_train.drop(columns = ['id','price'])
df_train_cleaned_2[['Ideal', 'Premium', 'Good', 'Very Good', 'Fair']] = pd.get_dummies(df_train_cleaned_2.cut, dtype="int")
df_train_cleaned_2[['F', 'E', 'G', 'H', 'J', 'I', 'D']] = pd.get_dummies(df_train_cleaned_2.color, dtype="int")
df_train_cleaned_2[['SI1', 'VS2', 'VVS1', 'VVS2', 'SI2', 'VS1', 'I1', 'IF']] = pd.get_dummies(df_train_cleaned_2.clarity, dtype="int")
df_train_cleaned_2 = df_train_cleaned_2.drop(columns = ['cut', 'color', 'clarity'])
df_train_cleaned_2

Unnamed: 0,carat,depth,table,x,y,z,Ideal,Premium,Good,Very Good,...,I,D,SI1,VS2,VVS1,VVS2,SI2,VS1,I1,IF
0,0.30,60.2,56.0,4.36,4.41,2.64,0,0,1,0,...,0,0,0,0,1,0,0,0,0,0
1,0.51,61.5,58.0,5.14,5.11,3.15,0,0,0,1,...,0,0,0,0,1,0,0,0,0,0
2,1.59,62.1,58.0,7.45,7.43,4.62,0,0,0,1,...,0,0,0,0,1,0,0,0,0,0
3,0.70,62.6,57.0,5.63,5.68,3.54,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0
4,1.01,63.1,59.0,6.31,6.34,3.99,0,1,0,0,...,0,0,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40450,0.50,61.9,58.0,5.12,5.09,3.16,0,0,0,1,...,0,0,0,0,0,1,0,0,0,0
40451,1.07,62.2,57.0,6.59,6.56,4.09,0,0,1,0,...,0,0,0,0,1,0,0,0,0,0
40452,1.42,62.7,55.0,7.11,7.17,4.48,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
40453,0.42,61.3,56.0,4.81,4.82,2.95,0,0,1,0,...,0,0,0,0,0,0,0,0,0,1


In [116]:
X = df_train_cleaned_2
y = df_train['price']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [117]:
grad = GradientBoostingRegressor(n_estimators=4000)
grad.fit(X_train, y_train)
y_pred = grad.predict(X_test)

# MAE, MSE, RMSE, R2
mae = metrics.mean_absolute_error(y_pred, y_test)
mse = metrics.mean_squared_error(y_pred, y_test)
rmse = np.sqrt(mse)
r2 = metrics.r2_score(y_pred, y_test)

print(f"MAE: {mae}")
print(f"MSE: {mse}")
print(f"RMSE: {rmse}")
print(f"r2: {r2}")

MAE: 0.06418457985562881
MSE: 0.00795351736593273
RMSE: 0.08918249472812885
r2: 0.9922187240628632


In [118]:
df_test_cleaned_2 = df_test.drop(columns = ['id'])
df_test_cleaned_2[['Ideal', 'Premium', 'Good', 'Very Good', 'Fair']] = pd.get_dummies(df_test_cleaned_2.cut, dtype="int")
df_test_cleaned_2[['F', 'E', 'G', 'H', 'J', 'I', 'D']] = pd.get_dummies(df_test_cleaned_2.color, dtype="int")
df_test_cleaned_2[['SI1', 'VS2', 'VVS1', 'VVS2', 'SI2', 'VS1', 'I1', 'IF']] = pd.get_dummies(df_test_cleaned_2.clarity, dtype="int")
df_test_cleaned_2 = df_test_cleaned_2.drop(columns = ['cut', 'color', 'clarity'])
df_test_cleaned_2

Unnamed: 0,carat,depth,table,x,y,z,Ideal,Premium,Good,Very Good,...,I,D,SI1,VS2,VVS1,VVS2,SI2,VS1,I1,IF
0,0.51,63.3,57.0,5.07,5.10,3.22,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
1,0.36,59.5,56.0,4.63,4.68,2.77,0,0,1,0,...,0,0,0,0,1,0,0,0,0,0
2,0.70,60.5,56.0,5.73,5.80,3.49,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
3,1.05,59.0,59.0,6.71,6.68,3.95,0,0,0,1,...,0,0,0,0,0,0,0,1,0,0
4,0.50,59.4,60.0,5.21,5.16,3.08,0,0,0,1,...,0,0,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13480,0.61,60.5,57.0,5.52,5.54,3.35,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
13481,1.22,61.8,58.0,6.84,6.91,4.25,0,0,0,1,...,0,0,0,0,0,0,0,1,0,0
13482,0.33,61.5,60.0,4.41,4.44,2.72,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
13483,2.17,60.4,56.0,8.36,8.43,5.07,0,0,1,0,...,1,0,0,0,1,0,0,0,0,0


In [119]:
y_pred = grad.predict(df_test_cleaned_2)
y_pred

array([7.54885798, 6.54790618, 7.79731678, ..., 6.40552245, 9.74480831,
       7.32182384])

In [120]:
df_test_cleaned_2['price'] = y_pred # Adding the predicted price
df_test_cleaned_2

Unnamed: 0,carat,depth,table,x,y,z,Ideal,Premium,Good,Very Good,...,D,SI1,VS2,VVS1,VVS2,SI2,VS1,I1,IF,price
0,0.51,63.3,57.0,5.07,5.10,3.22,0,0,0,0,...,0,0,0,0,0,1,0,0,0,7.548858
1,0.36,59.5,56.0,4.63,4.68,2.77,0,0,1,0,...,0,0,0,1,0,0,0,0,0,6.547906
2,0.70,60.5,56.0,5.73,5.80,3.49,0,0,0,0,...,0,0,0,1,0,0,0,0,0,7.797317
3,1.05,59.0,59.0,6.71,6.68,3.95,0,0,0,1,...,0,0,0,0,0,0,1,0,0,8.627821
4,0.50,59.4,60.0,5.21,5.16,3.08,0,0,0,1,...,0,0,0,0,1,0,0,0,0,7.080221
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13480,0.61,60.5,57.0,5.52,5.54,3.35,0,0,0,0,...,0,0,0,0,0,0,0,0,1,7.661067
13481,1.22,61.8,58.0,6.84,6.91,4.25,0,0,0,1,...,0,0,0,0,0,0,1,0,0,8.992244
13482,0.33,61.5,60.0,4.41,4.44,2.72,0,0,0,0,...,0,0,0,1,0,0,0,0,0,6.405522
13483,2.17,60.4,56.0,8.36,8.43,5.07,0,0,1,0,...,0,0,0,1,0,0,0,0,0,9.744808


In [121]:
df_test_cleaned_2['id'] = df_test['id']
df_test_cleaned_2

Unnamed: 0,carat,depth,table,x,y,z,Ideal,Premium,Good,Very Good,...,SI1,VS2,VVS1,VVS2,SI2,VS1,I1,IF,price,id
0,0.51,63.3,57.0,5.07,5.10,3.22,0,0,0,0,...,0,0,0,0,1,0,0,0,7.548858,0
1,0.36,59.5,56.0,4.63,4.68,2.77,0,0,1,0,...,0,0,1,0,0,0,0,0,6.547906,1
2,0.70,60.5,56.0,5.73,5.80,3.49,0,0,0,0,...,0,0,1,0,0,0,0,0,7.797317,2
3,1.05,59.0,59.0,6.71,6.68,3.95,0,0,0,1,...,0,0,0,0,0,1,0,0,8.627821,3
4,0.50,59.4,60.0,5.21,5.16,3.08,0,0,0,1,...,0,0,0,1,0,0,0,0,7.080221,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13480,0.61,60.5,57.0,5.52,5.54,3.35,0,0,0,0,...,0,0,0,0,0,0,0,1,7.661067,13480
13481,1.22,61.8,58.0,6.84,6.91,4.25,0,0,0,1,...,0,0,0,0,0,1,0,0,8.992244,13481
13482,0.33,61.5,60.0,4.41,4.44,2.72,0,0,0,0,...,0,0,1,0,0,0,0,0,6.405522,13482
13483,2.17,60.4,56.0,8.36,8.43,5.07,0,0,1,0,...,0,0,1,0,0,0,0,0,9.744808,13483


In [122]:
df_for_submission = df_test_cleaned_2[["id", "price"]] # Modifying for subnmission

# Train on train.csv

![](https://builtin.com/sites/www.builtin.com/files/styles/ckeditor_optimize/public/inline-images/4_train-test-split.jpg)

## Train, test split

In [23]:
X = df_train_cleaned
y = df_train['price']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

## Fit

In [25]:
'''
regressor = LinearRegression()
regressor.fit(X_train, y_train)

# Just for feedback
if platform == "darwin":
    os.system("say I'm done training")
'''

'\nregressor = LinearRegression()\nregressor.fit(X_train, y_train)\n\n#\xa0Just for feedback\nif platform == "darwin":\n    os.system("say I\'m done training")\n'

In [77]:
rfr = RandomForestRegressor(n_estimators=200)

In [106]:
grad = GradientBoostingRegressor(n_estimators=4000)

In [107]:
grad.fit(X_train, y_train)
y_pred = grad.predict(X_test)

In [78]:
rfr.fit(X_train, y_train)
y_pred = rfr.predict(X_test)

In [108]:
# MAE, MSE, RMSE, R2
mae = metrics.mean_absolute_error(y_pred, y_test)
mse = metrics.mean_squared_error(y_pred, y_test)
rmse = np.sqrt(mse)
r2 = metrics.r2_score(y_pred, y_test)

print(f"MAE: {mae}")
print(f"MSE: {mse}")
print(f"RMSE: {rmse}")
print(f"r2: {r2}")

MAE: 0.07364375040848514
MSE: 0.009639915423609447
RMSE: 0.09818307096240902
r2: 0.9904301457185626


In [14]:
np.sqrt(mean_squared_error(y_pred, y_test))

0.2723706311520958

In [63]:
from sklearn.linear_model import LinearRegression as LinReg
from sklearn.linear_model import Ridge, Lasso
from sklearn.linear_model import SGDRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.svm import SVR

models = {
    "lr": LinReg(),
    "ridge": Ridge(), #lr similar
    "lasso": Lasso(), # lr similar 
    "sgd": SGDRegressor(),
    "knn": KNeighborsRegressor(),
    "grad": GradientBoostingRegressor(),
    "svr": SVR() #potato chip
}

for model in models.values():
    print(f"Training: {model}")
    model.fit(X_train, y_train)

Training: LinearRegression()
Training: Ridge()
Training: Lasso()
Training: SGDRegressor()
Training: KNeighborsRegressor()
Training: GradientBoostingRegressor()
Training: SVR()


In [64]:
for name, model in models.items():
    y_pred = model.predict(X_test)
    print(f"------------{name}------------\n")
    print(f"MAE, error: {metrics.mean_absolute_error(y_test, y_pred)}")
    print(f"MSE, error: {metrics.mean_squared_error(y_test, y_pred)}")
    print(f"RMSE, error: {np.sqrt(metrics.mean_squared_error(y_test, y_pred))}")
    print(f"r2: {metrics.r2_score(y_test, y_pred)}")
    print("\n")

------------lr------------

MAE, error: 0.26474893982125164
MSE, error: 0.11002242403296118
RMSE, error: 0.33169628281450664
r2: 0.8919268174251923


------------ridge------------

MAE, error: 0.2647663643753731
MSE, error: 0.11002423614080722
RMSE, error: 0.3316990143802167
r2: 0.8919250374220355


------------lasso------------

MAE, error: 0.870934783202614
MSE, error: 1.0182554745394754
RMSE, error: 1.0090864554335646
r2: -0.00021528133877435351


------------sgd------------

MAE, error: 111727239.09671853
MSE, error: 1.7159709375912836e+16
RMSE, error: 130995073.86124425
r2: -1.6855694833246756e+16


------------knn------------

MAE, error: 0.35316719812136943
MSE, error: 0.2502065386281054
RMSE, error: 0.5002064959875125
r2: 0.75422631187925


------------grad------------

MAE, error: 0.09558002942867772
MSE, error: 0.015020675367719094
RMSE, error: 0.12255886490874128
r2: 0.9852454424115752


------------svr------------

MAE, error: 0.3279918295724447
MSE, error: 0.17053193355774

-----
-----
-----
-----


# Applying same cleaning & processing to my `test.csv`

In [38]:
df_test

Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z
0,0,0.51,Very Good,D,VS1,63.3,57.0,5.07,5.10,3.22
1,1,0.36,Ideal,D,SI1,59.5,56.0,4.63,4.68,2.77
2,2,0.70,Very Good,F,SI1,60.5,56.0,5.73,5.80,3.49
3,3,1.05,Premium,H,VS2,59.0,59.0,6.71,6.68,3.95
4,4,0.50,Premium,E,SI2,59.4,60.0,5.21,5.16,3.08
...,...,...,...,...,...,...,...,...,...,...
13480,13480,0.61,Very Good,H,VVS2,60.5,57.0,5.52,5.54,3.35
13481,13481,1.22,Premium,G,VS2,61.8,58.0,6.84,6.91,4.25
13482,13482,0.33,Very Good,D,SI1,61.5,60.0,4.41,4.44,2.72
13483,13483,2.17,Ideal,I,SI1,60.4,56.0,8.36,8.43,5.07


In [109]:
df_test_cleaned = df_test.drop(columns = ['id', 'x', 'y', 'z'])
df_test_cleaned[['Ideal', 'Premium', 'Good', 'Very Good', 'Fair']] = pd.get_dummies(df_test_cleaned.cut, dtype="int")
df_test_cleaned[['F', 'E', 'G', 'H', 'J', 'I', 'D']] = pd.get_dummies(df_test_cleaned.color, dtype="int")
df_test_cleaned[['SI1', 'VS2', 'VVS1', 'VVS2', 'SI2', 'VS1', 'I1', 'IF']] = pd.get_dummies(df_test_cleaned.clarity, dtype="int")
df_test_cleaned = df_test_cleaned.drop(columns = ['cut', 'color', 'clarity'])
df_test_cleaned

Unnamed: 0,carat,depth,table,Ideal,Premium,Good,Very Good,Fair,F,E,...,I,D,SI1,VS2,VVS1,VVS2,SI2,VS1,I1,IF
0,0.51,63.3,57.0,0,0,0,0,1,1,0,...,0,0,0,0,0,0,1,0,0,0
1,0.36,59.5,56.0,0,0,1,0,0,1,0,...,0,0,0,0,1,0,0,0,0,0
2,0.70,60.5,56.0,0,0,0,0,1,0,0,...,0,0,0,0,1,0,0,0,0,0
3,1.05,59.0,59.0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,1,0,0
4,0.50,59.4,60.0,0,0,0,1,0,0,1,...,0,0,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13480,0.61,60.5,57.0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,1
13481,1.22,61.8,58.0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,1,0,0
13482,0.33,61.5,60.0,0,0,0,0,1,1,0,...,0,0,0,0,1,0,0,0,0,0
13483,2.17,60.4,56.0,0,0,1,0,0,0,0,...,1,0,0,0,1,0,0,0,0,0


# Predict on the `test.csv`

In [110]:
y_pred = grad.predict(df_test_cleaned)
y_pred

array([7.54242994, 6.63179263, 7.80439679, ..., 6.39885389, 9.5844856 ,
       7.32508193])

In [42]:
y_pred

array([7.5048375 , 6.69783625, 7.7567075 , ..., 6.448805  , 9.69405   ,
       7.33371086])

# DF with two columns

In [46]:
print(df_test_cleaned.shape[0])
df_test_cleaned.sample()

13485


Unnamed: 0,carat,depth,table,Ideal,Premium,Good,Very Good,Fair,F,E,...,D,SI1,VS2,VVS1,VVS2,SI2,VS1,I1,IF,price
6749,1.71,62.1,59.0,0,0,0,1,0,0,0,...,0,0,0,0,0,1,0,0,0,9.538145


In [111]:
df_test_cleaned['price'] = y_pred # Adding the predicted price
df_test_cleaned

Unnamed: 0,carat,depth,table,Ideal,Premium,Good,Very Good,Fair,F,E,...,D,SI1,VS2,VVS1,VVS2,SI2,VS1,I1,IF,price
0,0.51,63.3,57.0,0,0,0,0,1,1,0,...,0,0,0,0,0,1,0,0,0,7.542430
1,0.36,59.5,56.0,0,0,1,0,0,1,0,...,0,0,0,1,0,0,0,0,0,6.631793
2,0.70,60.5,56.0,0,0,0,0,1,0,0,...,0,0,0,1,0,0,0,0,0,7.804397
3,1.05,59.0,59.0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,1,0,0,8.594124
4,0.50,59.4,60.0,0,0,0,1,0,0,1,...,0,0,0,0,1,0,0,0,0,7.037796
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13480,0.61,60.5,57.0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,1,7.598519
13481,1.22,61.8,58.0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,1,0,0,8.966457
13482,0.33,61.5,60.0,0,0,0,0,1,1,0,...,0,0,0,1,0,0,0,0,0,6.398854
13483,2.17,60.4,56.0,0,0,1,0,0,0,0,...,0,0,0,1,0,0,0,0,0,9.584486


In [112]:
df_test_cleaned['id'] = df_test['id']
df_test_cleaned

Unnamed: 0,carat,depth,table,Ideal,Premium,Good,Very Good,Fair,F,E,...,SI1,VS2,VVS1,VVS2,SI2,VS1,I1,IF,price,id
0,0.51,63.3,57.0,0,0,0,0,1,1,0,...,0,0,0,0,1,0,0,0,7.542430,0
1,0.36,59.5,56.0,0,0,1,0,0,1,0,...,0,0,1,0,0,0,0,0,6.631793,1
2,0.70,60.5,56.0,0,0,0,0,1,0,0,...,0,0,1,0,0,0,0,0,7.804397,2
3,1.05,59.0,59.0,0,0,0,1,0,0,0,...,0,0,0,0,0,1,0,0,8.594124,3
4,0.50,59.4,60.0,0,0,0,1,0,0,1,...,0,0,0,1,0,0,0,0,7.037796,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13480,0.61,60.5,57.0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,1,7.598519,13480
13481,1.22,61.8,58.0,0,0,0,1,0,0,0,...,0,0,0,0,0,1,0,0,8.966457,13481
13482,0.33,61.5,60.0,0,0,0,0,1,1,0,...,0,0,1,0,0,0,0,0,6.398854,13482
13483,2.17,60.4,56.0,0,0,1,0,0,0,0,...,0,0,1,0,0,0,0,0,9.584486,13483


In [113]:
df_for_submission = df_test_cleaned[["id", "price"]] # Modifying for subnmission

In [50]:
print(df_for_submission.shape[0])
df_for_submission.sample()

13485


Unnamed: 0,id,price
1580,1580,9.333817


# Export (index=False)

In [123]:
df_for_submission.to_csv("my_submission_mariano_ver4.csv", index=False)

In [52]:
df_for_submission.head()

Unnamed: 0,id,price
0,0,7.504838
1,1,6.697836
2,2,7.756707
3,3,8.5366
4,4,7.09369


- Training & testing
    - Pre-process on df_train
    - Fit on df_train to get the model
    - Look at the error

- Predicting
    - Pre-process on df_test
    - Generate y_pred = price on df_test

- Submission
    - Create a DF with only ID & price