In [26]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import numpy as np
from sklearn.linear_model import LinearRegression
import os
from sys import platform
from sklearn.preprocessing import MinMaxScaler
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from imblearn.under_sampling import TomekLinks
from sklearn.linear_model import Ridge, Lasso
from sklearn.linear_model import SGDRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression as LinReg
from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score

In [27]:
df_train = pd.read_csv("train.csv")
df_train.sample(10)

Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z,price
23377,23377,0.3,Premium,E,SI1,59.8,60.0,4.31,4.35,2.59,6.265
10080,10080,0.43,Very Good,F,SI2,60.5,59.0,4.88,4.91,2.96,6.521
26574,26574,0.76,Ideal,F,SI1,62.6,56.0,5.82,5.88,3.66,7.908
33510,33510,1.51,Very Good,E,VS1,59.5,59.0,7.48,7.54,4.47,9.688
27429,27429,0.31,Ideal,G,VVS2,60.9,56.0,4.4,4.43,2.69,6.532
1641,1641,0.38,Ideal,I,SI1,61.8,54.0,4.66,4.7,2.89,6.385
243,243,0.51,Good,H,SI1,63.7,56.0,5.07,5.1,3.24,7.02
11842,11842,1.02,Premium,D,SI1,62.5,59.0,6.4,6.34,3.98,8.699
183,183,0.57,Very Good,F,SI1,58.6,57.0,5.37,5.59,3.21,7.218
21717,21717,0.9,Fair,G,I1,65.1,58.0,6.05,5.99,3.92,7.57


In [28]:
print(df_train.cut.unique())
print(df_train.color.unique())
print(df_train.clarity.unique())

['Good' 'Very Good' 'Premium' 'Ideal' 'Fair']
['D' 'H' 'E' 'G' 'F' 'I' 'J']
['VS2' 'VVS2' 'VVS1' 'VS1' 'SI1' 'SI2' 'I1' 'IF']


In [29]:
# Función Get dummies de columnas categóricas:
def get_dummies(df, column):
    df[df[column].unique()] = pd.get_dummies(df[column], dtype = "int")
    df.drop(columns=column, inplace = True)
    return df

In [30]:
df_train = get_dummies(df_train, 'cut')
df_train = get_dummies(df_train, 'color')
df_train = get_dummies(df_train, 'clarity')
df_train

Unnamed: 0,id,carat,depth,table,x,y,z,price,Good,Very Good,...,I,J,VS2,VVS2,VVS1,VS1,SI1,SI2,I1,IF
0,0,1.02,63.2,58.0,6.36,6.40,4.03,8.928,0,1,...,0,0,0,0,0,0,0,1,0,0
1,1,0.35,61.0,57.0,4.54,4.57,2.77,6.477,0,0,...,0,0,0,0,0,0,0,0,0,1
2,2,0.31,60.5,58.0,4.43,4.40,2.67,6.810,0,0,...,0,0,0,0,0,0,0,0,1,0
3,3,0.38,61.4,56.0,4.66,4.69,2.87,6.824,0,0,...,0,0,0,0,0,0,1,0,0,0
4,4,1.64,61.8,56.0,7.59,7.60,4.69,9.776,0,0,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40450,40450,1.20,62.2,55.0,6.77,6.81,4.23,9.149,0,0,...,0,0,0,0,0,0,0,0,0,1
40451,40451,1.50,64.2,56.0,7.30,7.09,4.62,9.077,0,1,...,0,0,0,0,0,0,1,0,0,0
40452,40452,1.06,61.9,55.0,6.54,6.58,4.06,8.892,0,0,...,0,0,0,0,0,0,0,1,0,0
40453,40453,0.31,60.1,58.0,4.40,4.38,2.64,6.385,0,0,...,0,0,0,0,0,1,0,0,0,0


In [31]:
# Normalizamos "depth" y "table" ya que no tienen muchos outliers:

def normalizar(df, column):
    min_max_scaler = MinMaxScaler()
    df[column] = min_max_scaler.fit_transform(df[column].values.reshape(-1, 1))
    return df_train

In [32]:
df_train = normalizar(df_train, 'depth')
df_train = normalizar(df_train, 'table')

In [33]:
x_train = df_train.drop(columns='price')
print(x_train)
y_train = df_train.price
print(y_train)

          id  carat     depth     table     x     y     z  Good  Very Good  \
0          0   1.02  0.561111  0.288462  6.36  6.40  4.03     0          1   
1          1   0.35  0.500000  0.269231  4.54  4.57  2.77     0          0   
2          2   0.31  0.486111  0.288462  4.43  4.40  2.67     0          0   
3          3   0.38  0.511111  0.250000  4.66  4.69  2.87     0          0   
4          4   1.64  0.522222  0.250000  7.59  7.60  4.69     0          0   
...      ...    ...       ...       ...   ...   ...   ...   ...        ...   
40450  40450   1.20  0.533333  0.230769  6.77  6.81  4.23     0          0   
40451  40451   1.50  0.588889  0.250000  7.30  7.09  4.62     0          1   
40452  40452   1.06  0.525000  0.230769  6.54  6.58  4.06     0          0   
40453  40453   0.31  0.475000  0.288462  4.40  4.38  2.64     0          0   
40454  40454   2.00  0.494444  0.288462  7.99  8.07  4.88     0          0   

       Premium  ...  I  J  VS2  VVS2  VVS1  VS1  SI1  SI2  I1  

In [34]:
df_test = pd.read_csv('test.csv')
df_test

Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z
0,0,0.81,Ideal,F,SI1,61.5,57.0,6.01,6.06,3.71
1,1,0.50,Fair,F,I1,63.8,58.0,5.08,4.97,3.21
2,2,0.31,Ideal,D,VVS2,60.1,56.0,4.43,4.46,2.67
3,3,1.52,Fair,I,SI2,64.7,58.0,7.19,7.22,4.66
4,4,0.35,Premium,D,VVS1,60.8,58.0,4.55,4.53,2.76
...,...,...,...,...,...,...,...,...,...,...
13480,13480,0.52,Ideal,D,VVS2,61.4,56.0,5.23,5.20,3.20
13481,13481,1.01,Very Good,E,VS2,59.3,59.0,6.50,6.56,3.87
13482,13482,1.50,Premium,H,VS2,60.6,61.0,7.34,7.31,4.44
13483,13483,0.40,Ideal,E,VVS2,62.5,54.0,4.75,4.76,2.97


In [35]:
df_test = get_dummies(df_test, 'cut')
df_test = get_dummies(df_test, 'color')
df_test = get_dummies(df_test, 'clarity')

In [36]:
df_test = normalizar(df_test, 'depth')
df_test = normalizar(df_test, 'table')

In [37]:
x_test = df_test.drop(columns='price')
y_test = df_test.price

In [38]:
models = {
    "lr": LinReg(),
    "ridge": Ridge(), #lr similar
    "lasso": Lasso(), # lr similar 
    "sgd": SGDRegressor(),
    "knn": KNeighborsRegressor(),
    "grad": GradientBoostingRegressor(),
    "svr": SVR(), #potato chip
    "disc_analysis": LinearDiscriminantAnalysis(),
    "knc": KNeighborsClassifier(),
    "GaussianNB": GaussianNB(),
    "DecisionTreeClassifier": DecisionTreeClassifier(),
    "svc": SVC()
}

In [39]:
for name, model in models.items():
    print(f"------------------")
    print(model)
    model.fit(x_train, y_train)
    y_pred = model.predict(x_test)
    print(f"MAE, error: {metrics.mean_absolute_error(y_test, y_pred)}")
    print(f"MSE, error: {metrics.mean_squared_error(y_test, y_pred)}")
    print(f"RMSE, error: {np.sqrt(metrics.mean_squared_error(y_test, y_pred))}")
    print(f"r2: {metrics.r2_score(y_test, y_pred)}")
    try:
        print(f"Accuracy: {accuracy_score(y_test, y_pred)}")
        print(f"Precision: {precision_score(y_test, y_pred)}")
        print(f"Recall: {recall_score(y_test, y_pred)}")
        print(f"F1 score: {f1_score(y_test, y_pred)}")
    except:
        print("ValueError: continuous is not supported")
        pass

------------------
LinearRegression()
MAE, error: 0.10841626291295188
MSE, error: 0.027256239295410697
RMSE, error: 0.1650946373914389
r2: 0.9735045152536721
ValueError: continuous is not supported
------------------
Ridge()
MAE, error: 0.10851411075501294
MSE, error: 0.02725986231611022
RMSE, error: 0.16510560958401813
r2: 0.9735009933558549
ValueError: continuous is not supported
------------------
Lasso()
MAE, error: 0.8092177266860174
MSE, error: 0.8780698136360914
RMSE, error: 0.937053794419558
r2: 0.1464381750815028
ValueError: continuous is not supported
------------------
SGDRegressor()
MAE, error: 9.839571667493182e+16
MSE, error: 1.290860993976328e+34
RMSE, error: 1.1361606373996275e+17
r2: -1.2548315049937291e+34
ValueError: continuous is not supported
------------------
KNeighborsRegressor()


KeyboardInterrupt: 

In [95]:
regressor = GradientBoostingRegressor()
regressor.fit(x_train, y_train)
y_pred = regressor.predict(x_test)
y_pred
print(f"RMSE, error: {np.sqrt(metrics.mean_squared_error(y_test, y_pred))}")
print(f"r2: {metrics.r2_score(y_test, y_pred)}")

RMSE, error: 0.11941681403657392
r2: 0.9861376488234384


In [92]:
df_test

Unnamed: 0,id,carat,depth,table,x,y,z,price,Good,Very Good,...,J,VS2,VVS2,VVS1,VS1,SI1,SI2,I1,IF,y_pred
0,0,1.02,0.561111,0.288462,6.36,6.40,4.03,8.787268,0,1,...,0,0,0,0,0,0,1,0,0,8.787268
1,1,0.35,0.500000,0.269231,4.54,4.57,2.77,6.598849,0,0,...,0,0,0,0,0,0,0,0,1,6.598849
2,2,0.31,0.486111,0.288462,4.43,4.40,2.67,6.693001,0,0,...,0,0,0,0,0,0,0,1,0,6.693001
3,3,0.38,0.511111,0.250000,4.66,4.69,2.87,6.859630,0,0,...,0,0,0,0,0,1,0,0,0,6.859630
4,4,1.64,0.522222,0.250000,7.59,7.60,4.69,9.607236,0,0,...,0,0,0,0,0,0,0,0,1,9.607236
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40450,40450,1.20,0.533333,0.230769,6.77,6.81,4.23,9.149737,0,0,...,0,0,0,0,0,0,0,0,1,9.149737
40451,40451,1.50,0.588889,0.250000,7.30,7.09,4.62,9.133419,0,1,...,0,0,0,0,0,1,0,0,0,9.133419
40452,40452,1.06,0.525000,0.230769,6.54,6.58,4.06,8.815277,0,0,...,0,0,0,0,0,0,1,0,0,8.815277
40453,40453,0.31,0.475000,0.288462,4.40,4.38,2.64,6.320321,0,0,...,0,0,0,0,1,0,0,0,0,6.320321


In [93]:
df_for_submission = df_test['id']
df_for_submission['price'] = y_pred

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_for_submission['price'] = y_pred


In [94]:
df_test["price"] = y_pred

In [80]:
df_test

Unnamed: 0,id,carat,depth,table,x,y,z,price,Good,Very Good,...,J,VS2,VVS2,VVS1,VS1,SI1,SI2,I1,IF,y_pred
0,0,1.02,0.561111,0.288462,6.36,6.40,4.03,8.787268,0,1,...,0,0,0,0,0,0,1,0,0,8.787268
1,1,0.35,0.500000,0.269231,4.54,4.57,2.77,6.598849,0,0,...,0,0,0,0,0,0,0,0,1,6.598849
2,2,0.31,0.486111,0.288462,4.43,4.40,2.67,6.693001,0,0,...,0,0,0,0,0,0,0,1,0,6.693001
3,3,0.38,0.511111,0.250000,4.66,4.69,2.87,6.859630,0,0,...,0,0,0,0,0,1,0,0,0,6.859630
4,4,1.64,0.522222,0.250000,7.59,7.60,4.69,9.607236,0,0,...,0,0,0,0,0,0,0,0,1,9.607236
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40450,40450,1.20,0.533333,0.230769,6.77,6.81,4.23,9.149737,0,0,...,0,0,0,0,0,0,0,0,1,9.149737
40451,40451,1.50,0.588889,0.250000,7.30,7.09,4.62,9.133419,0,1,...,0,0,0,0,0,1,0,0,0,9.133419
40452,40452,1.06,0.525000,0.230769,6.54,6.58,4.06,8.815277,0,0,...,0,0,0,0,0,0,1,0,0,8.815277
40453,40453,0.31,0.475000,0.288462,4.40,4.38,2.64,6.320321,0,0,...,0,0,0,0,1,0,0,0,0,6.320321


In [87]:
df_new = df_test[["id", "y_pred"]]
df_new.rename(columns={"y_pred": "price"}, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_new.rename(columns={"y_pred": "price"}, inplace=True)


In [89]:
df_new

Unnamed: 0,id,price
0,0,8.787268
1,1,6.598849
2,2,6.693001
3,3,6.859630
4,4,9.607236
...,...,...
40450,40450,9.149737
40451,40451,9.133419
40452,40452,8.815277
40453,40453,6.320321


In [90]:
df_new.to_csv("prueba.csv", index=False)