In [2]:
#model training:
import pandas as pd
import numpy as np


In [3]:
df=pd.read_csv("./data/train.csv")
df.head()

Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z,price
0,0,1.52,Premium,F,VS2,62.2,58.0,7.27,7.33,4.55,13619
1,1,2.03,Very Good,J,SI2,62.0,58.0,8.06,8.12,5.05,13387
2,2,0.7,Ideal,G,VS1,61.2,57.0,5.69,5.73,3.5,2772
3,3,0.32,Ideal,G,VS1,61.6,56.0,4.38,4.41,2.71,666
4,4,1.7,Premium,G,VS2,62.6,59.0,7.65,7.61,4.77,14453


In [4]:
df=df.drop(labels=["id"],axis=1)

In [5]:
#dependent and independent features
X=df.drop(labels=["price"],axis=1)
Y=df[["price"]]

In [6]:
#here definig which coloumns will be ordinal encoded and wich will be scaled
numerical_coloumns=X.select_dtypes(exclude="object").columns
categorical_columns=X.select_dtypes(include="object").columns

In [7]:
#numerical columns and categorial columns:
numerical_col=['carat', 'depth', 'table', 'x', 'y', 'z']
categorical_col=['cut', 'color', 'clarity']

In [8]:
#here definging custom ranking for categorical features which are in ranking as per domain knowledge

cut_categories=['Fair', 'Good', 'Very Good', 'Premium', 'Ideal']
color_categories=['D', 'E', 'F', 'G', 'H', 'I', 'J']
clarity_categories=['I1', 'SI2', 'SI1', 'VS2', 'VS1', 'VVS2', 'VVS1', 'IF']

In [9]:
#handling missing values , feature scaling:FE:->encoding
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OrdinalEncoder

#pipelines: to combine multiple steps
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [10]:
#numerical pipeline

num_pipeline=Pipeline(
    steps=[
        ("imputer",SimpleImputer(strategy="median")),
        ("scaler",StandardScaler())
    ]
)

#categorical pipeline

cat_pipeline=Pipeline(
    steps=[
        ("imputer",SimpleImputer(strategy="most_frequent")),
        ("ordinalencoder",OrdinalEncoder(categories=[cut_categories,color_categories,clarity_categories])),
        ("scaler",StandardScaler())

    ]
)

#combine num and cat feat:
preprocessor=ColumnTransformer([
    ("num_pipeline",num_pipeline,numerical_coloumns),
    ("cat_pipeline",cat_pipeline,categorical_columns)
])

In [11]:
#train test splittrain_test_split
from sklearn.model_selection import train_test_split

X_train,X_test,Y_train,Y_test=train_test_split(X,Y,test_size=0.30,random_state=10)

In [12]:
# preprocessor.fit_transform(X_train)

X_train=pd.DataFrame(preprocessor.fit_transform(X_train),columns=preprocessor.get_feature_names_out())
X_test=pd.DataFrame(preprocessor.transform(X_test),columns=preprocessor.get_feature_names_out())

In [13]:
X_train.head()

Unnamed: 0,num_pipeline__carat,num_pipeline__depth,num_pipeline__table,num_pipeline__x,num_pipeline__y,num_pipeline__z,cat_pipeline__cut,cat_pipeline__color,cat_pipeline__clarity
0,-0.780167,0.719352,-0.637086,-0.843944,-0.808252,-0.760667,-1.139986,-1.550963,0.682694
1,1.793874,-1.307964,0.403474,1.681406,1.751577,1.530872,-0.134981,0.297649,0.017348
2,-0.758536,-1.584416,0.403474,-0.690619,-0.762865,-0.833184,-0.134981,-1.550963,0.682694
3,0.063426,-0.478607,-0.637086,0.301483,0.281037,0.254572,0.870024,0.297649,-1.313344
4,-0.563861,0.627201,-0.637086,-0.483179,-0.508697,-0.441592,0.870024,0.297649,2.013386


In [14]:
#model trainng
from sklearn.linear_model import LinearRegression,Ridge,Lasso,ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

from sklearn.metrics import mean_squared_error, mean_absolute_error,r2_score

In [15]:
#Linear regression
regressor=LinearRegression()
regressor.fit(X_train,Y_train)

In [16]:
regressor.coef_

array([[ 6440.66932618,  -132.51375221,   -67.07538923, -2026.48028881,
         -163.1836616 ,  -102.79649896,    69.51256777,  -466.62678412,
          651.53843703]])

In [17]:
regressor.intercept_

array([3967.1463015])

In [18]:
def evaluate_model(true, predicted):
    mae = mean_absolute_error(true, predicted)
    mse = mean_squared_error(true, predicted)
    rmse = np.sqrt(mean_squared_error(true, predicted))
    r2_square = r2_score(true, predicted)
    return mae, rmse, r2_square

In [19]:
## Train multiple models

models={
    'LinearRegression':LinearRegression(),
    'Lasso':Lasso(),
    'Ridge':Ridge(),
    'Elasticnet':ElasticNet(),
    "Decisiontree":DecisionTreeRegressor(),
    "RF":RandomForestRegressor()
}
trained_models={}
model_list=[]
r2_list=[]

for i in range(len(list(models))):
    model=list(models.values())[i]
    model.fit(X_train,Y_train)

    #Make Predictions
    y_pred=model.predict(X_test)

    mae, rmse, r2_square=evaluate_model(Y_test,y_pred)

    print(list(models.keys())[i])
    model_list.append(list(models.keys())[i])

    print('Model Training Performance')
    print("RMSE:",rmse)
    print("MAE:",mae)
    print("R2 score",r2_square*100)

    r2_list.append(r2_square)
    trained_models.update({list(models.keys())[i]:r2_square})
    
    print('='*35)
    print('\n')

LinearRegression
Model Training Performance
RMSE: 1014.8099241933487
MAE: 675.2796042191957
R2 score 93.7049390893374


Lasso
Model Training Performance
RMSE: 1014.94498711962
MAE: 676.3733606162151
R2 score 93.70326333527414


Ridge
Model Training Performance
RMSE: 1014.7955975981201
MAE: 675.3022121068898
R2 score 93.70511682932762


Elasticnet
Model Training Performance
RMSE: 1538.5127170586002
MAE: 1064.3039298403098
R2 score 85.53119358405031


Decisiontree
Model Training Performance
RMSE: 832.6479498367786
MAE: 424.48431257748996
R2 score 95.76207409423336




  return fit_method(estimator, *args, **kwargs)


RF
Model Training Performance
RMSE: 610.2475004195084
MAE: 310.47295516403284
R2 score 97.72363162350027




In [56]:
#models report on train data

trained_models_on_train_data={}
model_list=[]
r2_list=[]

for i in range(len(list(models))):
    model=list(models.values())[i]
    model.fit(X_train,Y_train)

    #Make Predictions
    y_pred=model.predict(X_train)

    mae, rmse, r2_square=evaluate_model(Y_train,y_pred)

    print(list(models.keys())[i])
    model_list.append(list(models.keys())[i])

    print('Model Training Performance')
    print("RMSE:",rmse)
    print("MAE:",mae)
    print("R2 score",r2_square*100)

    r2_list.append(r2_square)
    trained_models_on_train_data.update({list(models.keys())[i]:r2_square})
    
    print('='*35)
    print('\n')

LinearRegression
Model Training Performance
RMSE: 1014.9773260682595
MAE: 675.9637612557515
R2 score 93.6566658624559


Lasso
Model Training Performance
RMSE: 1015.0380788389137
MAE: 677.0532425647857
R2 score 93.65590646291521


Ridge
Model Training Performance
RMSE: 1014.9774353304513
MAE: 675.9890661462005
R2 score 93.65666449673743


Elasticnet
Model Training Performance
RMSE: 1537.2072000202595
MAE: 1063.849143990589
R2 score 85.44977438427065


Decisiontree
Model Training Performance
RMSE: 13.265317688162309
MAE: 0.7233722752353612
R2 score 99.99891647180158




  return fit_method(estimator, *args, **kwargs)


RF
Model Training Performance
RMSE: 228.68811418994528
MAE: 116.19447400481906
R2 score 99.6779731424412




In [69]:
model_report=trained_models

list_model=sorted(model_report.items(),key=lambda X:X[1],reverse=True)
best_model_score=round(list_model[0][1]*100)
Best_model=list_model[0][0]
print(f"Best model: {Best_model}: r2_score: {best_model_score}%")
# logging.info(f"Best model: {Best_model}: r2_score: {best_model_score}%")

Best model: RF: r2_score: 98%


In [20]:
model_list

['LinearRegression', 'Lasso', 'Ridge', 'Elasticnet', 'Decisiontree', 'RF']

In [21]:
r2_list

[0.937049390893374,
 0.9370326333527415,
 0.9370511682932762,
 0.8553119358405031,
 0.9576207409423336,
 0.9772363162350026]

In [54]:
list_model=sorted(trained_models.items(),key=lambda X:X[1],reverse=True)
#sorted trained models as per r2 score.
# best_model_score=max(sorted(trained_models.values()))
# print(best_model_score)
# best_model_name=list(trained_models.keys())[list(trained_models.values()).index(best_model_score)]
# print(best_model_name)

0.9772363162350026
RF


In [23]:
list_model

[('RF', 0.9772363162350026),
 ('Decisiontree', 0.9576207409423336),
 ('Ridge', 0.9370511682932762),
 ('LinearRegression', 0.937049390893374),
 ('Lasso', 0.9370326333527415),
 ('Elasticnet', 0.8553119358405031)]

In [35]:
best_model_score=round(list_model[0][1]*100)
Best_model=list_model[0][0]
print(f"Best model: {Best_model}: r2_score: {best_model_score}%")

Best model: RF: r2_score: 98%


In [55]:
models[Best_model]

In [75]:
len(list(models))

6

In [78]:
def evaluate_model(X_train,Y_train,X_test,Y_test,models):
        test_report={}
        for i in range(len(list(models))):
            model=list(models.values())[i] #get one by one model
            #Train model
            model.fit(X_train,Y_train)

            #make prediction
            y_test_prediction=model.predict(X_test)

            test_model_score=r2_score(Y_test,y_test_prediction)

            #updating report
            test_report[list(models.keys())[i]]=test_model_score

        return test_report
evaluate_model(X_train,Y_train,X_test,Y_test,models)

  return fit_method(estimator, *args, **kwargs)


{'LinearRegression': 0.937049390893374,
 'Lasso': 0.9370326333527415,
 'Ridge': 0.9370511682932762,
 'Elasticnet': 0.8553119358405031,
 'Decisiontree': 0.9572123372752287,
 'RF': 0.9772859215749542}