In [1]:
# imports

from collections import Counter

import numpy as np
import pandas as pd

from imblearn.over_sampling import SMOTE
from sklearn.datasets import make_classification
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import minmax_scale
from sklearn.preprocessing import MaxAbsScaler
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import Normalizer
from sklearn.preprocessing import QuantileTransformer
from sklearn.preprocessing import PowerTransformer

In [2]:
# Read the three CSV files (training set, test set, sample submission) from ../db.
diamonds_train, diamonds_test, sample_submission = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../db/diamonds_train.csv',
        '../db/diamonds_test.csv',
        '../db/sample_submission.csv',
    )
)

In [3]:
# Inspect column dtypes to see which features are categorical and which are numerical
diamonds_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40455 entries, 0 to 40454
Data columns (total 11 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   carat    40455 non-null  float64
 1   cut      40455 non-null  object 
 2   color    40455 non-null  object 
 3   clarity  40455 non-null  object 
 4   depth    40455 non-null  float64
 5   table    40455 non-null  float64
 6   price    40455 non-null  int64  
 7   x        40455 non-null  float64
 8   y        40455 non-null  float64
 9   z        40455 non-null  float64
 10  city     40455 non-null  object 
dtypes: float64(6), int64(1), object(4)
memory usage: 3.4+ MB


In [4]:
# Summarise each categorical column: its name, how many distinct values it
# takes in the training data, and the values themselves.
cols = ['cut', 'color', 'clarity', 'city']
cat_list = [
    {
        "categorical_variable": name,
        "number_of_possible_values": len(levels),
        "values": levels,
    }
    for name, levels in ((c, diamonds_train[c].unique()) for c in cols)
]

# Highest-cardinality variables first, with a clean 0..n-1 index.
categories = (
    pd.DataFrame(cat_list)
    .sort_values(by="number_of_possible_values", ascending=False)
    .reset_index(drop=True)
)
categories

Unnamed: 0,categorical_variable,number_of_possible_values,values
0,city,13,"[Dubai, Luxembourg, New York City, Antwerp, Ma..."
1,clarity,8,"[VS2, VVS2, SI1, VS1, SI2, I1, VVS1, IF]"
2,color,7,"[J, E, I, G, D, H, F]"
3,cut,5,"[Premium, Very Good, Fair, Good, Ideal]"


In [5]:
# Column roles used by every model below.
target = "price"
cat_features = ['cut', 'color', 'clarity', 'city']
num_features = ['carat', 'depth', 'table', 'x', 'y', 'z']

# Cast the categorical columns once per frame. The cast already covers all
# listed columns in a single call, so no per-column loop is needed.
diamonds_train[cat_features] = diamonds_train[cat_features].astype('category')
diamonds_test[cat_features] = diamonds_test[cat_features].astype('category')

# Training design matrix: one-hot dummies followed by the numeric columns.
train_df = pd.concat(
    [pd.get_dummies(diamonds_train[cat_features]), diamonds_train.loc[:, num_features]],
    axis=1,
)

# The training frame defines the feature set, since that is what the scalers
# and models are fitted on.
features = list(train_df.columns)

cat_df = pd.get_dummies(diamonds_test[cat_features])
num_df = diamonds_test.loc[:, num_features]
# Align the test frame to the training columns: a category level missing from
# the test data becomes an all-zero dummy column, and a level unseen during
# training is dropped, so both frames always share the same columns and order.
test_df = pd.concat([cat_df, num_df], axis=1).reindex(columns=features, fill_value=0)

In [6]:
# Model input columns: one-hot dummy names followed by the numeric columns.
features

['cut_Fair',
 'cut_Good',
 'cut_Ideal',
 'cut_Premium',
 'cut_Very Good',
 'color_D',
 'color_E',
 'color_F',
 'color_G',
 'color_H',
 'color_I',
 'color_J',
 'clarity_I1',
 'clarity_IF',
 'clarity_SI1',
 'clarity_SI2',
 'clarity_VS1',
 'clarity_VS2',
 'clarity_VVS1',
 'clarity_VVS2',
 'city_Amsterdam',
 'city_Antwerp',
 'city_Dubai',
 'city_Kimberly',
 'city_Las Vegas',
 'city_London',
 'city_Luxembourg',
 'city_Madrid',
 'city_New York City',
 'city_Paris',
 'city_Surat',
 'city_Tel Aviv',
 'city_Zurich',
 'carat',
 'depth',
 'table',
 'x',
 'y',
 'z']

In [7]:
# Regression target taken from the original training frame.
y = diamonds_train[target]

# Standardise every feature column, fitting the scaler on the full training frame.
scaler = StandardScaler()
scaled_data = scaler.fit_transform(train_df[features].to_numpy())
scaled_data

array([[-0.17611318, -0.3155308 , -0.81809533, ...,  0.97880679,
         0.92198533,  1.02265738],
       [-0.17611318, -0.3155308 , -0.81809533, ...,  0.5963938 ,
         0.53825404,  0.5635818 ],
       [-0.17611318, -0.3155308 , -0.81809533, ...,  0.11615424,
         0.05858994,  0.13319845],
       ...,
       [-0.17611318, -0.3155308 ,  1.22235144, ..., -1.27120451,
        -1.21470024, -1.21533607],
       [-0.17611318, -0.3155308 ,  1.22235144, ..., -1.40460439,
        -1.31063306, -1.41618164],
       [-0.17611318, -0.3155308 ,  1.22235144, ..., -1.20895123,
        -1.14493091, -1.21533607]])

In [8]:
# This is for when I have a single dataset and need to split it into X_train, X_test, y_train and y_test to evaluate my model. For this project it is not needed, because I have two separate datasets: one for training and one for validation.
#from sklearn.model_selection import train_test_split
#X_train, X_test, y_train, y_test = train_test_split(scaled_data_df,diamonds_train['price'], test_size=0.2, random_state=42)

In [9]:
# Baseline: linear regression fitted on the full standardised training set.
model = LinearRegression()
model.fit(scaled_data, y=y)

LinearRegression()

In [10]:
# Reuse the scaler fitted on the training frame (transform only, no refit)
# for the test features, then predict with the baseline model.
x_test = scaler.transform(test_df[features].to_numpy())
predictions = model.predict(X=x_test)

In [11]:
#submisions = pd.DataFrame({"id": diamonds_test["id"], "price": predictions})
#submisions.to_csv("../db/submision.csv", index=False)

In [12]:
## EXPERIMENTS ##

In [13]:
## SPLITTING MY DATASET ##

In [14]:
# Hold out 20% of the labelled training rows so candidate models can be scored
# locally. random_state is fixed so the split is the same on every run.
X_train_allcolumns, X_test_allcolumns, y_train, y_test = train_test_split(
    train_df,
    diamonds_train[target],
    test_size=0.2,
    random_state=42,
)

In [15]:
# Training part of the split feature frame (all columns).
X_train_allcolumns

Unnamed: 0,cut_Fair,cut_Good,cut_Ideal,cut_Premium,cut_Very Good,color_D,color_E,color_F,color_G,color_H,...,city_Paris,city_Surat,city_Tel Aviv,city_Zurich,carat,depth,table,x,y,z
32121,0,0,0,1,0,0,1,0,0,0,...,0,0,0,0,0.42,60.1,58.0,4.91,4.87,2.94
9831,0,0,0,1,0,0,0,0,1,0,...,0,0,0,0,1.82,62.7,58.0,7.75,7.68,4.84
33128,0,0,0,1,0,1,0,0,0,0,...,0,0,0,0,0.50,62.9,61.0,5.03,4.96,3.14
6199,0,0,0,1,0,0,0,0,1,0,...,0,0,0,0,0.71,62.0,60.0,5.68,5.64,3.51
19661,0,1,0,0,0,0,1,0,0,0,...,0,0,0,0,1.01,59.3,64.0,6.47,6.49,3.84
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6265,0,0,1,0,0,0,0,0,1,0,...,0,0,0,0,1.68,61.6,55.0,7.69,7.64,4.72
11284,0,0,0,0,1,0,0,1,0,0,...,0,0,0,0,1.00,59.8,58.0,6.49,6.53,3.89
38158,0,0,1,0,0,0,0,1,0,0,...,0,1,0,0,0.43,61.3,56.0,4.86,4.90,2.99
860,0,0,0,0,1,0,1,0,0,0,...,0,0,0,0,0.82,63.2,57.0,5.92,5.97,3.76


In [16]:
# Hold-out part of the split feature frame (all columns).
X_test_allcolumns

Unnamed: 0,cut_Fair,cut_Good,cut_Ideal,cut_Premium,cut_Very Good,color_D,color_E,color_F,color_G,color_H,...,city_Paris,city_Surat,city_Tel Aviv,city_Zurich,carat,depth,table,x,y,z
17775,0,0,0,1,0,0,0,0,1,0,...,0,0,1,0,1.04,61.3,58.0,6.52,6.43,3.97
13506,0,0,0,0,1,1,0,0,0,0,...,0,0,0,0,0.23,59.6,61.0,3.99,4.03,2.39
4325,0,0,0,0,1,0,0,0,0,1,...,0,0,0,0,1.30,61.9,56.0,6.97,7.05,4.34
37870,0,0,0,0,1,0,0,0,0,1,...,0,1,0,0,0.90,61.7,57.0,6.21,6.26,3.85
21321,0,0,0,0,1,0,0,1,0,0,...,0,0,0,0,0.72,63.4,57.0,5.68,5.70,3.61
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3781,0,1,0,0,0,1,0,0,0,0,...,0,0,0,0,0.70,63.6,60.0,5.59,5.51,3.51
26959,0,0,1,0,0,0,0,0,1,0,...,0,0,0,1,0.52,62.8,55.0,5.16,5.13,3.23
15529,0,0,0,1,0,0,0,0,1,0,...,0,0,1,0,1.30,62.2,58.0,6.92,6.98,4.32
36333,0,0,0,1,0,0,0,1,0,0,...,0,1,0,0,0.83,60.2,60.0,6.04,6.09,3.65


In [17]:
# Wrap the training target in a one-column frame named "price".
y_train_allcolumns = y_train.to_frame(name="price")
# Display only: reset_index() returns a new frame and the result is not stored,
# so y_train_allcolumns keeps its original row index.
y_train_allcolumns.reset_index()

Unnamed: 0,index,price
0,32121,838
1,9831,15162
2,33128,1845
3,6199,2388
4,19661,4399
...,...,...
32359,6265,12075
32360,11284,7710
32361,38158,739
32362,860,2741


In [18]:
# Hold-out target values.
y_test

17775    9039
13506     499
4325     7747
37870    3772
21321    3264
         ... 
3781     2512
26959    1694
15529    8854
36333    3557
31606    4676
Name: price, Length: 8091, dtype: int64

In [19]:
### MODEL 1 ###   LinearRegression, all columns, Scaler: StandardScaler

In [20]:
from sklearn import linear_model
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import Ridge
from sklearn.svm import SVR
from sklearn.linear_model import SGDRegressor
from sklearn.linear_model import LinearRegression

In [21]:
# Standard scaling, fitted on the training split only.
scaler1 = StandardScaler()
X_train_allcolumns_scaler1 = scaler1.fit_transform(X_train_allcolumns[features].to_numpy())

In [22]:
# Min-max scaling, fitted on the training split only.
scaler2 = MinMaxScaler()
X_train_allcolumns_scaler2 = scaler2.fit_transform(X_train_allcolumns[features].to_numpy())

In [23]:
# Max-abs scaling, fitted on the training split only.
scaler3 = MaxAbsScaler()
X_train_allcolumns_scaler3 = scaler3.fit_transform(X_train_allcolumns[features].to_numpy())

In [24]:
# Model 1: linear regression on the standard-scaled training split.
model1 = LinearRegression()
model1.fit(X=X_train_allcolumns_scaler1, y=y_train_allcolumns.loc[:, "price"])

LinearRegression()

In [25]:
# Scale the hold-out rows with the scaler fitted on the training split, then predict.
x_testmodelo1 = scaler1.transform(X_test_allcolumns[features].to_numpy())
predictions1 = model1.predict(X=x_testmodelo1)

In [26]:
# Model 1 predictions for the hold-out rows.
predictions1

array([7287.05071201, 1015.58323801, 8525.96517937, ..., 8736.3494841 ,
       4629.55861992, 4832.27990877])

In [27]:
# Side-by-side view of actual vs predicted price, plus the residual (actual - predicted).
check1 = pd.DataFrame({'Ground truth': y_test, 'Predictions': predictions1})
check1['Diff'] = check1['Ground truth'] - check1['Predictions']
check1

Unnamed: 0,Ground truth,Predictions,Diff
17775,9039,7287.050712,1751.949288
13506,499,1015.583238,-516.583238
4325,7747,8525.965179,-778.965179
37870,3772,3871.218258,-99.218258
21321,3264,3879.090878,-615.090878
...,...,...,...
3781,2512,2904.268218,-392.268218
26959,1694,2211.893134,-517.893134
15529,8854,8736.349484,117.650516
36333,3557,4629.558620,-1072.558620


In [28]:
# Hold-out RMSE for model 1 (square root of the mean squared error).
rmse1 = mean_squared_error(y_test, predictions1) ** 0.5
rmse1

1160.3777122737868

In [29]:
### MODEL 2 ###   ElasticNet, all columns, Scaler: StandardScaler

In [30]:
# Model 2: ElasticNet with default hyper-parameters on the standard-scaled training split.
model2 = ElasticNet()
model2.fit(X=X_train_allcolumns_scaler1, y=y_train_allcolumns.loc[:, "price"])

ElasticNet()

In [31]:
# Scale the hold-out rows with scaler1 (transform only), then predict with model 2.
x_testmodelo2 = scaler1.transform(X_test_allcolumns[features].to_numpy())
predictions2 = model2.predict(X=x_testmodelo2)

In [32]:
# Hold-out RMSE for model 2 (square root of the mean squared error).
rmse2 = mean_squared_error(y_test, predictions2) ** 0.5
rmse2

1606.0221983321887

In [33]:
### MODEL 3 ###   Ridge, all columns, Scaler: StandardScaler

In [34]:
# Model 3: Ridge with default hyper-parameters on the standard-scaled training split.
model3 = Ridge()
model3.fit(X=X_train_allcolumns_scaler1, y=y_train_allcolumns.loc[:, "price"])

Ridge()

In [35]:
# Scale the hold-out rows with scaler1 (transform only), then predict with model 3.
x_testmodelo3 = scaler1.transform(X_test_allcolumns[features].to_numpy())
predictions3 = model3.predict(X=x_testmodelo3)

In [36]:
# Hold-out RMSE for model 3 (square root of the mean squared error).
rmse3 = mean_squared_error(y_test, predictions3) ** 0.5
rmse3

1160.2958595969292

In [37]:
### MODEL 4 ###   SGDRegressor, all columns, Scaler: StandardScaler

In [38]:
# Model 4: SGD-trained linear regressor on the standard-scaled training split.
# SGD shuffles the training data, so the seed is fixed (same value as the
# train/test split) to make the fitted model and its RMSE reproducible.
model4 = SGDRegressor(random_state=42)
model4.fit(X_train_allcolumns_scaler1, y_train_allcolumns["price"])

SGDRegressor()

In [39]:
# Scale the hold-out rows with scaler1 (transform only), then predict with model 4.
x_testmodelo4 = scaler1.transform(X_test_allcolumns[features].to_numpy())
predictions4 = model4.predict(X=x_testmodelo4)

In [40]:
# Hold-out RMSE for model 4 (square root of the mean squared error).
rmse4 = mean_squared_error(y_test, predictions4) ** 0.5
rmse4

1166.5746827167702

In [41]:
### MODEL 5 ###   Ridge, all columns, Scaler: MinMaxScaler

In [42]:
# Model 5: Ridge with default hyper-parameters on the min-max-scaled training split.
model5 = Ridge()
model5.fit(X=X_train_allcolumns_scaler2, y=y_train_allcolumns.loc[:, "price"])

Ridge()

In [43]:
# Scale the hold-out rows with scaler2 (transform only), then predict with model 5.
x_testmodelo5 = scaler2.transform(X_test_allcolumns[features].to_numpy())
predictions5 = model5.predict(X=x_testmodelo5)

In [44]:
# Hold-out RMSE for model 5 (square root of the mean squared error).
rmse5 = mean_squared_error(y_test, predictions5) ** 0.5
rmse5

1159.1636378012686

In [45]:
### MODEL 6 ###   Ridge, all columns, Scaler: MaxAbsScaler

In [46]:
# NOTE(review): this cell is only a bare triple-quoted string, so none of the
# scaler calls inside it are executed. It looks like a leftover fragment of a
# scaler-comparison listing kept for reference; consider deleting it.
'''"Data after max-abs scaling", MaxAbsScaler().fit_transform(X)),
    (
        "Data after robust scaling",
        RobustScaler(quantile_range=(25, 75)).fit_transform(X),
    ),
    (
        "Data after power transformation (Yeo-Johnson)",
        PowerTransformer(method="yeo-johnson").fit_transform(X),
    ),
    (
        "Data after power transformation (Box-Cox)",
        PowerTransformer(method="box-cox").fit_transform(X),
    ),
    (
        "Data after quantile transformation (uniform pdf)",
        QuantileTransformer(output_distribution="uniform").fit_transform(X),
    ),
    (
        "Data after quantile transformation (gaussian pdf)",
        QuantileTransformer(output_distribution="normal").fit_transform(X),
    ),
    ("Data after sample-wise L2 normalizing", Normalizer().fit_transform(X)),
]'''

'"Data after max-abs scaling", MaxAbsScaler().fit_transform(X)),\n    (\n        "Data after robust scaling",\n        RobustScaler(quantile_range=(25, 75)).fit_transform(X),\n    ),\n    (\n        "Data after power transformation (Yeo-Johnson)",\n        PowerTransformer(method="yeo-johnson").fit_transform(X),\n    ),\n    (\n        "Data after power transformation (Box-Cox)",\n        PowerTransformer(method="box-cox").fit_transform(X),\n    ),\n    (\n        "Data after quantile transformation (uniform pdf)",\n        QuantileTransformer(output_distribution="uniform").fit_transform(X),\n    ),\n    (\n        "Data after quantile transformation (gaussian pdf)",\n        QuantileTransformer(output_distribution="normal").fit_transform(X),\n    ),\n    ("Data after sample-wise L2 normalizing", Normalizer().fit_transform(X)),\n]'

In [47]:
# Model 6: Ridge with default hyper-parameters on the max-abs-scaled training split.
model6 = Ridge()
model6.fit(X=X_train_allcolumns_scaler3, y=y_train_allcolumns.loc[:, "price"])

Ridge()

In [48]:
# Scale the hold-out rows with scaler3 (transform only), then predict with model 6.
x_testmodelo6 = scaler3.transform(X_test_allcolumns[features].to_numpy())
predictions6 = model6.predict(X=x_testmodelo6)

In [49]:
# Hold-out RMSE for model 6 (square root of the mean squared error).
rmse6 = mean_squared_error(y_test, predictions6) ** 0.5
rmse6

1159.2085225241472

In [50]:
### MODEL 7 ###   Ridge, columns = carat and clarity, Scaler: MinMaxScaler

In [51]:
# NOTE(review): this fit uses the all-column min-max-scaled matrix, i.e. the
# same inputs as model5, and `model7` is created again and refit on the
# carat/clarity subset in a later cell. This cell looks redundant; confirm
# and remove.
model7 = Ridge()
model7.fit(X_train_allcolumns_scaler2,y_train_allcolumns["price"])

Ridge()

In [52]:
# Reduced feature set: carat plus the eight clarity dummy columns.
carat_clarity = [
    "carat",
    'clarity_I1',
    'clarity_IF',
    'clarity_SI1',
    'clarity_SI2',
    'clarity_VS1',
    'clarity_VS2',
    'clarity_VVS1',
    'clarity_VVS2',
]
X_train_carat_clarity = X_train_allcolumns.loc[:, carat_clarity]
X_train_carat_clarity

Unnamed: 0,carat,clarity_I1,clarity_IF,clarity_SI1,clarity_SI2,clarity_VS1,clarity_VS2,clarity_VVS1,clarity_VVS2
32121,0.42,0,0,0,1,0,0,0,0
9831,1.82,0,0,1,0,0,0,0,0
33128,0.50,0,0,0,0,0,1,0,0
6199,0.71,0,0,1,0,0,0,0,0
19661,1.01,0,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...
6265,1.68,0,0,1,0,0,0,0,0
11284,1.00,0,0,0,0,1,0,0,0
38158,0.43,0,0,1,0,0,0,0,0
860,0.82,0,0,1,0,0,0,0,0


In [53]:
# Min-max scale the carat/clarity subset of the training split.
# NOTE(review): this refits the shared `scaler2` object, which was previously
# fitted on all feature columns. After this cell, `scaler2` only matches the
# 9-column subset, so re-running the model-5 test transform would no longer
# work as before. A dedicated scaler for this subset would avoid the hidden
# state; that needs a matching change where `scaler2.transform` is applied to
# the carat/clarity hold-out rows.
X_train_carat_clarity_scaler2 = scaler2.fit_transform(X_train_carat_clarity.loc[:,carat_clarity].values)

In [54]:
# Model 7: Ridge with default hyper-parameters on the scaled carat/clarity subset.
model7 = Ridge()
model7.fit(X=X_train_carat_clarity_scaler2, y=y_train_allcolumns.loc[:, "price"])

Ridge()

In [55]:
# Select the same carat/clarity columns from the hold-out rows, scale them with
# scaler2 (transform only), then predict with model 7.
X_test_carat_clarity = X_test_allcolumns.loc[:, carat_clarity]
x_testmodelo7 = scaler2.transform(X_test_carat_clarity.to_numpy())
predictions7 = model7.predict(X=x_testmodelo7)

In [56]:
# Hold-out RMSE for model 7 (square root of the mean squared error).
rmse7 = mean_squared_error(y_test, predictions7) ** 0.5
rmse7

1313.77565086706