In [326]:
import pandas as pd
import plotly.express as px
import numpy as np

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn import linear_model
from sklearn.impute import SimpleImputer
from sklearn import set_config
from sklearn.metrics import mean_squared_error, r2_score

In [327]:
# Location of the diamonds CSV; it is fetched over the network each time this cell runs.
url = (
    'https://raw.githubusercontent.com/Jacoposigno1999/'
    'interviews-data-science-assignment/main/datasets/diamonds/diamonds.csv'
)
df = pd.read_csv(url)
df

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,1.10,Ideal,H,SI2,62.0,55.0,4733,6.61,6.65,4.11
1,1.29,Ideal,H,SI1,62.6,56.0,6424,6.96,6.93,4.35
2,1.20,Premium,I,SI1,61.1,58.0,5510,6.88,6.80,4.18
3,1.50,Ideal,F,SI1,60.9,56.0,8770,7.43,7.36,4.50
4,0.90,Very Good,F,VS2,61.7,57.0,4493,6.17,6.21,3.82
...,...,...,...,...,...,...,...,...,...,...
4995,0.38,Ideal,H,VVS2,62.3,53.3,832,4.65,4.69,2.91
4996,0.33,Premium,G,VVS2,61.3,59.0,927,4.45,4.42,2.72
4997,1.25,Ideal,J,VVS2,62.1,56.0,5980,6.81,6.84,4.24
4998,0.31,Premium,F,VS2,62.9,58.0,802,4.31,4.27,2.70


## Custom Transformers

In [328]:
from sklearn.base import BaseEstimator, TransformerMixin

### Target transformation

In [329]:
class CustomTargetTransformer(BaseEstimator, TransformerMixin):
  """Log-transform a regression target and map values back with ``exp``.

  Used in this notebook as the ``transformer`` of a
  ``TransformedTargetRegressor``. The transformer keeps no state, so there is
  no need to implement ``__init__``.
  """

  def fit(self, y):
    """Nothing is learned from ``y``; return ``self``."""
    return self

  @staticmethod
  def _trace_second_value(values):
    """Print the second element (by position) for tracing, if there is one.

    Indexing through ``np.asarray`` is positional, so the trace cannot fail on
    a one-element target or on a pandas Series that has no index label ``1``.
    """
    if len(values) > 1:
      print(np.asarray(values)[1])

  def transform(self, y):
    """Return the natural logarithm of a copy of ``y``.

    ``y`` itself is not modified. Values that are not strictly positive are
    not checked here; ``np.log`` decides what they map to.
    """
    print('\n%%%%%%%%%%%%%%%custom_target_transform() called.\n')
    target = y.copy()
    self._trace_second_value(target)  # value before the transform
    target = np.log(target)
    self._trace_second_value(target)  # same position after the transform
    return target

  def inverse_transform(self, y):
    """Return ``exp`` of a copy of ``y``, undoing ``transform``."""
    print('\n%%%%%%%%%%%%%%%custom_inverse_target_transform() called.\n')
    target = y.copy()
    target = np.exp(target)
    return target

### Feature transformers

In [330]:
# Remove columns that are not needed downstream.
class FeatureDropper(BaseEstimator, TransformerMixin):
    """Transformer that drops a fixed set of columns from a DataFrame.

    Parameters
    ----------
    parameters : column label or list of column labels, default=None
        Columns to remove in ``transform``. ``None`` means "drop nothing".

    The ``print`` calls trace when each method is invoked.
    """

    def __init__(self, parameters = None):
        # Stored exactly as received: scikit-learn estimators are expected to
        # keep constructor arguments unmodified, so None is handled in transform().
        self.parameters = parameters
        print('\n>>>>>>>dropper: init() called.\n')

    def fit(self, X, y = None):
        """Nothing is learned; return ``self``."""
        print('\n>>>>>>>dropper: fit() called.\n')
        return self

    def transform(self, X, y = None):
        """Return a new DataFrame without the configured columns.

        ``X`` is left untouched. Labels missing from ``X`` make
        ``DataFrame.drop`` raise ``KeyError``.
        """
        print('\n>>>>>>>dropper: transform() called.\n')
        if self.parameters is None:
            # drop(None, axis=1) raises ValueError, so the default must
            # short-circuit to a plain copy.
            return X.copy()
        return X.drop(columns=self.parameters)
        
# Remove every row that contains at least one negative numeric value (data errors).
class RemoveError(BaseEstimator, TransformerMixin):
    """Row filter that discards rows holding invalid numeric values.

    Parameters
    ----------
    include_zero : bool, default=False
        When False, only strictly negative values mark a row as invalid.
        When True, zeros are treated as invalid too.

    Notes
    -----
    * The default flags negatives only. The earlier mask,
      ``df[df[num] <= 0].any(axis=1)``, tested the truthiness of the selected
      values, so zeros were never flagged; the default keeps that behaviour.
    * ``fit`` and ``transform`` default ``y`` to ``None``. Binding the default
      to a notebook-level ``y`` fails with ``NameError`` on a fresh kernel.
    * Only ``X`` is filtered and returned. When rows are actually removed
      inside a ``Pipeline`` or ``ColumnTransformer``, the result no longer
      lines up with ``y`` or with the other branches. Cleaning the data before
      the train/test split is the safer place for this.
    """

    def __init__(self, include_zero = False):
        self.include_zero = include_zero
        print("\n>>>Remove: init() called \n")

    def fit(self, X, y = None):
        """Nothing is learned; return ``self``."""
        print("\n>>>Remove: fit() called \n")
        return self

    def transform(self, X, y = None):
        """Return the rows of ``X`` that have no invalid numeric value.

        ``y`` is accepted for signature compatibility and ignored. The shape
        is printed before and after filtering as a trace.
        """
        print("\n>>>Remove: transform() called \n")
        print(X.shape)
        numeric = X.select_dtypes(include='number')
        # NaN compares False on both branches, so missing values never flag a row.
        invalid = (numeric <= 0) if self.include_zero else (numeric < 0)
        # A boolean mask removes exactly the flagged rows, even when index
        # labels are duplicated.
        kept = X.loc[~invalid.any(axis=1)]
        print(kept.shape)
        return kept

    

## Pipeline for numerical features


In [331]:
# Numeric branch: drop the "z" and "y" columns, run the row filter,
# then rescale the remaining columns with MinMaxScaler.
numeric_processor = Pipeline(
    steps=[
        ('dropper', FeatureDropper(parameters=["z", "y"])),
        ('remove_error', RemoveError()),
        ("scaler", MinMaxScaler()),
    ]
)

numeric_processor


>>>>>>>dropper: init() called.


>>>Remove: init() called 



## Pipeline for categorical data 

In [332]:
from sklearn.preprocessing import OneHotEncoder

# Categorical branch: fill missing values with the constant "missing",
# then one-hot encode; categories unseen during fit are ignored at transform time.
categorical_processor = Pipeline(
    steps=[
        ("imputation_consatnt", SimpleImputer(strategy="constant", fill_value="missing")),
        ("onehot", OneHotEncoder(handle_unknown="ignore")),
    ]
)

categorical_processor

## Final pipeline

In [333]:
## combine processing techniques
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline

In [334]:
# Numeric variables: columns routed to the numeric branch of the preprocessor.
Variabili_numeriche = ['carat', 'depth', 'table', 'x', 'y', 'z']
# Categorical variables: every object-dtype column of the loaded frame.
Variabili_categoriche = df.select_dtypes(include='object').columns.tolist()


In [335]:
# Route each group of columns to its own sub-pipeline; categorical output comes first.
preprocessor = ColumnTransformer(
    transformers=[
        ("categorical", categorical_processor, Variabili_categoriche),
        ("numerical", numeric_processor, Variabili_numeriche),
    ]
)

In [336]:
# Full estimator: preprocessing followed by an ordinary least-squares regression.
pipe = make_pipeline(
    preprocessor,
    linear_model.LinearRegression(),
)
pipe

### Model = pipeline + transformed target

In [337]:
from sklearn.compose import TransformedTargetRegressor

# Wrap the pipeline so it is trained on the transformed target and its
# predictions are mapped back through the transformer's inverse.
model = TransformedTargetRegressor(
    regressor=pipe,
    transformer=CustomTargetTransformer(),
    check_inverse=False,
)


### Train the model 


In [338]:
# Remove rows whose price is not strictly positive. The target goes through
# np.log in CustomTargetTransformer, so a zero price would become -inf and a
# negative one NaN.
# Author's note: this should be done by the RemoveError transformer, but
# removing rows from inside the pipeline does not work.
id_0 = df[df['price'] <= 0].index
df = df.drop(id_0)

# Target and predictors.
y = df['price']
X = df.drop('price', axis=1)

In [339]:
from sklearn.model_selection import train_test_split

# 75% of the rows go to training; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.75,
    random_state=6,
)



In [340]:
# Train on the training split and keep a handle on what fit() returns.
fitted = model.fit(X=X_train, y=y_train)


%%%%%%%%%%%%%%%custom_target_transform() called.

[15878]
[9.67268978]

>>>>>>>dropper: init() called.


>>>Remove: init() called 


>>>>>>>dropper: init() called.


>>>Remove: init() called 


>>>>>>>dropper: fit() called.


>>>>>>>dropper: transform() called.


>>>Remove: fit() called 


>>>Remove: transform() called 

(3742, 4)
(3742, 4)


In [341]:
# R2 on the training split.
model.score(X=X_train, y=y_train)





>>>>>>>dropper: transform() called.


>>>Remove: transform() called 

(3742, 4)
(3742, 4)

%%%%%%%%%%%%%%%custom_inverse_target_transform() called.



0.9483059691004698

In [325]:
# Hold-out evaluation: RMSE from explicit predictions, R2 from model.score
# (which predicts on X_test a second time).
y_pred = model.predict(X_test)
print("RMSE: ", np.sqrt(mean_squared_error(y_true=y_test, y_pred=y_pred)))
print("R2: ", model.score(X=X_test, y=y_test))
 


>>>>>>>dropper: transform() called.


>>>Remove: transform() called 

(1248, 4)
(1248, 4)

%%%%%%%%%%%%%%%custom_inverse_target_transform() called.

RMSE:  961.8733196069012

>>>>>>>dropper: transform() called.


>>>Remove: transform() called 

(1248, 4)
(1248, 4)

%%%%%%%%%%%%%%%custom_inverse_target_transform() called.

R2:  0.9411803243957957
