In [1]:
! pip install pandas



In [31]:
import pandas as pd
df = pd.read_csv('data/gemstone.csv')
df.head()

Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z,price
0,0,1.52,Premium,F,VS2,62.2,58.0,7.27,7.33,4.55,13619
1,1,2.03,Very Good,J,SI2,62.0,58.0,8.06,8.12,5.05,13387
2,2,0.7,Ideal,G,VS1,61.2,57.0,5.69,5.73,3.5,2772
3,3,0.32,Ideal,G,VS1,61.6,56.0,4.38,4.41,2.71,666
4,4,1.7,Premium,G,VS2,62.6,59.0,7.65,7.61,4.77,14453


In [32]:
df = df.drop(labels=['id'],axis=1)

In [33]:
## Independent and dependent value sseparation
X = df.drop(labels=['price'],axis=1)
y = df[['price']]

In [34]:
## Sseparate catgorical value and Numerical value
cat_columns = X.select_dtypes(include='object').columns
num_columns = X.select_dtypes(exclude='object').columns

In [35]:
## Define custom ranking for each ordinal variable
cut_categories=['Fair','Good','Very Good','Premium','Ideal']
color_categories = ['D','E','F','G','H','I','J']
clarity_categories = ['I1','SI2','SI1','VS2','VS1','VVS2','VVS1','IF']

In [37]:
from sklearn.impute import SimpleImputer            ## Handle missing Value
from sklearn.preprocessing import StandardScaler    ## Handle Feature Scaling
from sklearn.preprocessing import OrdinalEncoder    ## Ordinal Encoding 

## Build PipeLine
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [53]:
## Numerical Pipeline
num_pipeline=Pipeline(
    steps=[
    ('imputer',SimpleImputer(strategy='median')),
    ('scaler',StandardScaler())
    ]
)

# Categorigal Pipeline
cat_pipeline=Pipeline(
    steps=[
    ('imputer',SimpleImputer(strategy='most_frequent')),
    ('ordinalencoder',OrdinalEncoder(categories=[cut_categories,color_categories,clarity_categories])),
    ('scaler',StandardScaler())
    ]
)

## combine 2 pipeline
preprocessor=ColumnTransformer([
('num_pipeline',num_pipeline,num_columns),
('cat_pipeline',cat_pipeline,cat_columns)
])

In [54]:
## Train data
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y,random_state=42,test_size=0.3)

In [55]:
X_train = pd.DataFrame(preprocessor.fit_transform(X_train),columns=preprocessor.get_feature_names_out())
X_test = pd.DataFrame(preprocessor.transform(X_test),columns=preprocessor.get_feature_names_out())

In [57]:
X_train.head()

Unnamed: 0,num_pipeline__carat,num_pipeline__depth,num_pipeline__table,num_pipeline__x,num_pipeline__y,num_pipeline__z,cat_pipeline__cut,cat_pipeline__color,cat_pipeline__clarity
0,-0.823144,-1.129988,-0.641897,-0.780451,-0.835103,-0.876024,0.8741,-0.936747,1.350746
1,0.945023,-1.777823,0.921902,1.073226,1.166389,0.946633,-1.137644,0.910853,0.684455
2,1.958484,0.165682,0.400636,1.703116,1.755063,1.742237,-0.131772,0.910853,0.018164
3,-0.995648,-0.574701,-0.641897,-1.122391,-1.161138,-1.165334,0.8741,-0.32088,2.017037
4,-0.995648,0.25823,0.400636,-1.176382,-1.152082,-1.136403,-1.137644,1.52672,-0.648127


In [58]:
## Model Training
from sklearn.linear_model import LinearRegression,Lasso,Ridge,ElasticNet
from sklearn.metrics import r2_score,mean_absolute_error,mean_squared_error

In [76]:
import numpy as np
def evalute_model(y_test,y_pred):
    mae =mean_absolute_error(y_test,y_pred),
    mse = mean_squared_error(y_test,y_pred),
    rmse = np.sqrt(mse)
    r2_square = r2_score(y_test,y_pred),
    
    return mae,rmse,r2_square

In [74]:
## Train multiple values at_a_time
models = {
    'LinearRegression':LinearRegression(),
    'Lasso':Lasso(),
    'Ridge':Ridge(),
    'ElasticNet':ElasticNet()
}

trained_model_list=[]
model_list=[]
r2_list=[]

for i in range(len(list(models))):
    model=list(models.values())[i]
    model.fit(X_train,y_train)
    
    ##Predict
    y_pred = model.predict(X_test)
    mae,rmse,r2_square = evalute_model(y_test,y_pred)
    
    print("*** Model Training Performance ***")
    print(list(models.keys())[i])
    model_list.append(list(models.keys())[i])
    
    print("-"*35)
    print("RMSE : ",rmse)
    print("MAE : ",mae)
    print("R2_score : ",r2_square)
    
    r2_list.append(r2_square)
    print("="*35)
    print("\n")    

*** Model Training Performance ***
LinearRegression
-----------------------------------
RMSE :  [1014.62966304]
MAE :  (675.0758270067483,)
R2_score :  (0.9362906819996049,)


*** Model Training Performance ***
Lasso
-----------------------------------
RMSE :  [1014.65913028]
MAE :  (676.2421173665509,)
R2_score :  (0.9362869814082755,)


*** Model Training Performance ***
Ridge
-----------------------------------
RMSE :  [1014.63432335]
MAE :  (675.1077629781366,)
R2_score :  (0.9362900967491631,)


*** Model Training Performance ***
ElasticNet
-----------------------------------
RMSE :  [1533.35412459]
MAE :  (1060.9432977143008,)
R2_score :  (0.8544967219374031,)


