In [1]:
""" importing packages """

""" general computing packages """
import pandas as pd
import numpy as np

""" packages for data preperation and transformations"""
from sklearn.preprocessing import OneHotEncoder, PowerTransformer, RobustScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline

""" packages for machine learning algorithms """
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor
from xgboost import XGBRegressor

""" packages for model metrics and model tuning and selection """
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import train_test_split

import joblib
%matplotlib inline

In [2]:
""" Getting processed data """

# Load the already cleaned and imputed dataset (path is relative to the
# notebook's working directory).
data_imputed = pd.read_csv('cleaned_imputed_data.csv')

# Separate the feature columns from the regression target.
X = data_imputed.drop(columns=['selling_price'])
y = data_imputed['selling_price']

# Hold back 1% of the rows for the later model presentation; the remaining
# 99% (`features` / `label`) is what the final model is trained on.
# random_state pins the shuffle so that Restart & Run All always yields the
# same hold-out rows. Without it, rows exported as "unseen" on one run could
# be part of the training set on the next run.
features, X_saved, label, y_saved = train_test_split(
    X, y, train_size=0.99, random_state=42
)

In [8]:
# Put the held-out target back next to its held-out features (column-wise,
# aligned on the shared row index) so the hold-out set is a single table.
saved_data = pd.concat(objs=[X_saved, y_saved], axis=1)

In [10]:
# Persist the hold-out rows; index=False keeps the row index out of the file.
saved_data.to_csv(path_or_buf='Savedup_Data.csv', index=False)

In [11]:
"""
Column Transformer - categorical encoding and scaling and transformation of numeric variables

use for regression and any other distance based algorithms
"""

# Column-wise preprocessing that runs as the first step of the model pipeline.
#
# 'km_driven' is listed only under 'num_transform'. A RobustScaler with both
# with_centering=False and with_scaling=False returns its input unchanged, so
# using it as a "winsorization" step clips nothing and merely feeds the model
# a second, raw copy of 'km_driven'. If real outlier clipping is wanted it
# needs a dedicated clipping transformer - TODO confirm with the author.
#
# NOTE(review): OneHotEncoder's `sparse` argument was renamed `sparse_output`
# in scikit-learn 1.2 and removed in 1.4; rename it when the environment is
# upgraded.
pre_processing = ColumnTransformer(
    transformers=[
        # Power-transform the numeric columns.
        (
            'num_transform',
            PowerTransformer(copy=False),
            ['year', 'km_driven', 'mileage', 'engine_cc', 'max_power_bhp'],
        ),
        # One-hot encode multi-level categoricals, dropping the first level.
        (
            'categorical_enc',
            OneHotEncoder(drop='first', sparse=False),
            ['Company', 'fuel', 'owner'],
        ),
        # One-hot encode these columns, collapsing any two-level column into
        # a single 0/1 indicator.
        (
            'transmission_binary',
            OneHotEncoder(drop='if_binary', sparse=False),
            ['transmission', 'seller_type'],
        ),
    ],
    # Columns not named above are forwarded to the model untouched.
    remainder='passthrough',
    n_jobs=-1,
)


In [12]:
# Tree-based XGBoost regressor trained with the squared-error objective;
# n_jobs=-1 lets it use every available core.
xgboost = XGBRegressor(
    booster='gbtree',
    objective="reg:squarederror",
    n_estimators=400,
    max_depth=8,
    learning_rate=0.1,
    n_jobs=-1,
)

# Chain preprocessing and regressor into one estimator so the exact same
# transformations are applied at fit time and at predict time.
XgBoostPipe = make_pipeline(pre_processing, xgboost, verbose=False)

In [13]:
# Fit preprocessing + model on everything except the 1% hold-out; no separate
# test split is carved out of `features` here.
XgBoostPipe.fit(X=features, y=label)


In [14]:
# Serialize the fitted pipeline (preprocessing and regressor together) so it
# can be reloaded later with joblib.load.
joblib.dump(value=XgBoostPipe, filename='XGBRegression_model.joblib')

['XGBRegression_model.joblib']