In [19]:
## model training
from sklearn.impute import SimpleImputer##handling missing values
from sklearn.preprocessing import StandardScaler##handling feature scaling
from sklearn.preprocessing import OrdinalEncoder
##pipelines
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression,Lasso,Ridge,ElasticNet
from sklearn.metrics import r2_score,mean_absolute_error,mean_squared_error
import numpy as np
## load the data, build the preprocessing pipelines, split, and fit a baseline
df = pd.read_csv('./data/gemstone.csv')
df.head()  # leftover interactive preview; the result is not assigned
df = df.drop(labels=['id'], axis=1)

# Features are every remaining column except the target.
X = df.drop(labels=['price'], axis=1)
Y = df[['price']]  # double brackets keep the target as a one-column DataFrame

# Route columns to a pipeline by dtype: object columns are treated as
# categorical, everything else as numerical.
categorical_cols = X.select_dtypes(include='object').columns
numerical_cols = X.select_dtypes(exclude='object').columns

# Explicit category orderings handed to the ordinal encoder.
# NOTE(review): presumably listed from lowest to highest grade -- confirm.
cut_categories = ['Fair', 'Good', 'Very Good', 'Premium', 'Ideal']
color_categories = ['D', 'E', 'F', 'G', 'H', 'I', 'J']
clarity_categories = [
    'I1', 'SI2', 'SI1', 'VS2', 'VS1', 'VVS2', 'VVS1', 'IF',
]

# Numerical columns: median imputation followed by standardization.
num_pipeline = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler()),
    ]
)

# Categorical columns: most-frequent imputation, ordinal encoding, then
# standardization of the encoded values.
# The category lists are matched to columns by position, so this assumes
# categorical_cols comes out in cut, color, clarity order -- TODO confirm
# against the CSV header.
cat_pipeline = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        (
            'ordinalencoder',
            OrdinalEncoder(
                categories=[
                    cut_categories,
                    color_categories,
                    clarity_categories,
                ]
            ),
        ),
        ('scaler', StandardScaler()),
    ]
)

preprocessor = ColumnTransformer(
    [
        ('num_pipeline', num_pipeline, numerical_cols),
        ('cat_pipeline', cat_pipeline, categorical_cols),
    ]
)

# 70/30 split with a fixed seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.30, random_state=30
)

# Fit the preprocessor on the training split only, then apply the fitted
# transform to the test split.
train_array = preprocessor.fit_transform(X_train)
X_train = pd.DataFrame(
    train_array, columns=preprocessor.get_feature_names_out()
)
test_array = preprocessor.transform(X_test)
X_test = pd.DataFrame(
    test_array, columns=preprocessor.get_feature_names_out()
)
X_train.head()  # leftover interactive preview; the result is not assigned

# Baseline: ordinary least squares on the preprocessed features.
regression = LinearRegression()
regression.fit(X_train, y_train)
regression.coef_  # inspected interactively; the value is discarded here
regression.intercept_  # inspected interactively; the value is discarded here
def evaluate_model(true, predicted):
    """Score predictions against the observed target values.

    Args:
        true: Observed target values.
        predicted: Predicted values to compare with ``true``.

    Returns:
        A ``(mae, rmse, r2_square)`` tuple: the mean absolute error, the
        square root of the mean squared error, and the R^2 score.
    """
    # RMSE is derived from the MSE rather than requested directly.
    root_mean_squared = np.sqrt(mean_squared_error(true, predicted))
    return (
        mean_absolute_error(true, predicted),
        root_mean_squared,
        r2_score(true, predicted),
    )
## train multiple models
# Candidate regressors, keyed by the display name printed in the report.
models = {
    'LinearRegression': LinearRegression(),
    'Lasso': Lasso(),
    'Ridge': Ridge(),
    'ElasticNet': ElasticNet(),
}
trained_model_list = []  # declared here but not populated in this block
model_list = []  # model names, in evaluation order
r2_list = []  # R^2 score (as a fraction) per model, parallel to model_list

# Iterate name/estimator pairs directly instead of rebuilding
# list(models.keys()) / list(models.values()) on every pass.
for name, model in models.items():
    model.fit(X_train, y_train)

    # Score on the held-out split.
    y_pred = model.predict(X_test)
    mae, rmse, r2_square = evaluate_model(y_test, y_pred)

    print(name)
    model_list.append(name)
    # NOTE(review): the label says "training", but the metrics below are
    # computed on X_test / y_test. The text is kept unchanged so the
    # printed report stays the same.
    print('model training performance')
    print('rmse', rmse)
    print('mae', mae)
    print('r2 square', r2_square * 100)  # reported as a percentage
    r2_list.append(r2_square)
    print('=' * 35)
    print('\n')




LinearRegression
model training performance
rmse 1013.9047094344003
mae 674.0255115796848
r2 square 93.68908248567512


Lasso
model training performance
rmse 1013.8784226767013
mae 675.071692336216
r2 square 93.68940971841704


Ridge
model training performance
rmse 1013.9059272771556
mae 674.0555800798325
r2 square 93.6890673250595


ElasticNet
model training performance
rmse 1533.4162456064046
mae 1060.7368759154729
r2 square 85.56494831165182


