In [1]:
# Importing essential libraries
import pandas as pd
import joblib as jb
from sklearn import preprocessing, metrics, impute, model_selection, compose, linear_model, tree, svm, ensemble

In [2]:
# Reading data
# `car_ID` is dropped as part of loading (no in-place mutation), so re-running
# this cell always rebuilds `df` from the CSV in one step.
df = pd.read_csv('data.csv').drop(columns='car_ID')

# Split in feature and target for future purpose
features = df.drop(columns=['CarName','price'])
target = df['price']

# Names of numerical and categorical features
# `select_dtypes` is used instead of comparing each dtype with 'O': text columns
# stay categorical even when pandas stores them with a dedicated string dtype
# rather than `object`. Every column that is not numeric is treated as categorical.
numerical_features = features.select_dtypes(include='number').columns.tolist()
categorical_features = [col for col in features.columns if col not in numerical_features]

# Train test split
# Fixed seed so the same rows land in train/test on every run.
x_train,x_test,y_train,y_test = model_selection.train_test_split(features,target,random_state=32)

# Nominal features
nominal_features = ['fueltype','aspiration','doornumber','carbody','drivewheel','enginelocation','enginetype','cylindernumber','fuelsystem']

In [3]:
# Load the two persisted cleaning pipelines from disk. Judging by the variable
# names, one includes a scaler and the other does not -- TODO confirm against
# the notebook that saved them.
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code; only load artifacts that come from a trusted source.
cleaning_pipeline_with_scaler = jb.load('cleanpipeline1.jb')
cleaning_pipeline_without_scaler = jb.load('cleanpipeline2.jb')
# Backward-compatible alias: the original variable name was misspelled.
cleaning_pipeline_witthout_scaler = cleaning_pipeline_without_scaler

In [None]:
def compare_models_notree(x_train,x_test,y_train,y_test):
    """Fit five non-tree regressors and tabulate their test-set scores.

    Each estimator is created with its default settings, fitted on a copy of
    the training data and scored on a copy of the test data.

    Parameters
    ----------
    x_train, x_test : objects exposing ``.copy()``
        Training and test features, passed to ``fit`` / ``predict``.
    y_train, y_test : objects exposing ``.copy()``
        Training and test targets.

    Returns
    -------
    pandas.DataFrame
        One row per model, in the order the models are listed below, with the
        columns ``Classifiers`` (display label), ``r_square``, ``mae`` and
        ``mse``.
    """
    # Display label paired with its estimator; row order follows this list.
    candidates = [
        ('Linear Regression', linear_model.LinearRegression()),
        ('Ridge Regression', linear_model.Ridge()),
        ('Lasso Regression', linear_model.Lasso()),
        ('Support Vector Regression', svm.SVR()),
        ('Huber Regression', linear_model.HuberRegressor()),
    ]
    # Output column paired with the metric that fills it.
    scorers = (
        ('r_square', metrics.r2_score),
        ('mae', metrics.mean_absolute_error),
        ('mse', metrics.mean_squared_error),
    )

    report = {'Classifiers': [label for label, _ in candidates]}
    for column, _ in scorers:
        report[column] = []

    for _, estimator in candidates:
        estimator.fit(x_train.copy(), y_train.copy())
        predictions = estimator.predict(x_test.copy())
        for column, scorer in scorers:
            report[column].append(scorer(y_test.copy(), predictions))

    return pd.DataFrame(report)

# The pipeline is fitted on the training split only and then reused, already
# fitted, to transform the test split.
compare_models_notree(
    x_train=cleaning_pipeline_with_scaler.fit_transform(x_train),
    x_test=cleaning_pipeline_with_scaler.transform(x_test),
    y_train=y_train,
    y_test=y_test,
)

In [None]:
# Among the non-tree regressors, Huber Regression performs very well. Next, let's evaluate the tree-based regressors.

In [4]:
def compare_models_tree(x_train,x_test,y_train,y_test,random_state=32):
    """Fit three tree-based regressors and tabulate their test-set scores.

    Parameters
    ----------
    x_train, x_test
        Training and test features, passed to ``fit`` / ``predict``.
    y_train, y_test
        Training and test targets.
    random_state : int or None, default 32
        Seed handed to every estimator so that the score table is reproducible
        from run to run. Pass ``None`` to get the previous unseeded behaviour.

    Returns
    -------
    pandas.DataFrame
        One row per model, in the order the models are listed below, with the
        columns ``Classifiers`` (display label), ``r_square``, ``mae`` and
        ``mse``.
    """
    # Keyed by display label so the label column can never drift out of order
    # with the estimators (dicts preserve insertion order).
    models = {
        'Decision Tree': tree.DecisionTreeRegressor(random_state=random_state),
        'Random Forest': ensemble.RandomForestRegressor(random_state=random_state),
        'ADABoost': ensemble.AdaBoostRegressor(random_state=random_state),
    }
    score_table = {
        'Classifiers': list(models),
        'r_square':[],
        'mae':[],
        'mse': []
    }
    # No defensive copies: nothing in this function writes to the inputs.
    for model in models.values():
        model.fit(x_train,y_train)
        y_pred = model.predict(x_test)
        score_table['r_square'].append(metrics.r2_score(y_test,y_pred))
        score_table['mae'].append(metrics.mean_absolute_error(y_test,y_pred))
        score_table['mse'].append(metrics.mean_squared_error(y_test,y_pred))

    return pd.DataFrame(score_table)

# The pipeline is fitted on the training split only and then reused, already
# fitted, to transform the test split.
compare_models_tree(
    x_train=cleaning_pipeline_with_scaler.fit_transform(x_train),
    x_test=cleaning_pipeline_with_scaler.transform(x_test),
    y_train=y_train,
    y_test=y_test,
)



Unnamed: 0,Classifiers,r_square,mae,mse
0,Decision Tree,0.919416,1394.653846,3835633.0
1,Random Forest,0.960129,1064.201651,1897794.0
2,ADABoost,0.930558,1496.739938,3305297.0


In [5]:
# The tree-based regressors come out on top: Random Forest achieves the highest R² and the lowest MAE and MSE in the table above.