# Import Libraries

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import pickle
import wandb

#regression models
from sklearn.ensemble import BaggingRegressor, RandomForestRegressor, HistGradientBoostingRegressor, ExtraTreesRegressor
from xgboost.sklearn import XGBRegressor
from lightgbm import LGBMRegressor

# Config

In [None]:
# Authenticate with Weights & Biases.
# SECURITY: an API key must never be committed with the notebook. With no
# explicit key, wandb.login() resolves credentials from the environment
# (e.g. the WANDB_API_KEY variable / a stored login) or prompts for them.
# NOTE(review): the key that used to be hard-coded here is exposed in version
# history and should be revoked and regenerated.
wandb.login()

# Import Preprocessed Data

In [None]:
# Load the preprocessed listings: comma-separated, Latin-1 encoded, with the
# column names on the first row.
data = pd.read_csv(
    '../input/naalaiya-thiran/Preprocessed/autos_preprocessed.csv',
    sep=',',
    header=0,
    encoding='Latin1',
)

# Label Encoding

In [None]:
# Categorical columns that are turned into integer codes.
labels = ['gearbox', 'notRepairedDamage', 'model', 'brand', 'fuelType', 'vehicleType']

# One fitted LabelEncoder per categorical column, keyed by column name.
mapper = {}
for i in labels:
    mapper[i] = LabelEncoder()
    # Learn the classes and encode the column in a single step.
    tr = mapper[i].fit_transform(data[i])
    # Persist the learned classes to 'classes<column>.npy'.
    np.save(f'classes{i}.npy', mapper[i].classes_)
    # Store the codes next to the raw column as '<column>_labels'.
    data[i + '_labels'] = tr

# Modelling frame: the target ('price') first, then the numeric columns,
# then the encoded categorical columns.
labeled = data[[
    'price',
    'yearOfRegistration',
    'powerPS',
    'kilometer',
    'monthOfRegistration',
    *(x + "_labels" for x in labels),
]]

print(labeled.columns)

# Score Evaluation

In [None]:
def find_scores(Y_actual, Y_pred, X_train):
    """Compute regression metrics for one set of predictions and log them to W&B.

    Logged keys: ``mae``, ``mse``, ``rmse``, ``rmsle``, ``r2`` and ``adj_r2``.

    Parameters
    ----------
    Y_actual : array-like
        True target values, either 1-D or a single column.
    Y_pred : array-like
        Predicted target values for the same samples.
    X_train : array-like with a ``shape`` attribute
        Feature matrix the model was fitted on; only its column count is used
        (number of predictors in the adjusted R^2).
    """
    mae = mean_absolute_error(Y_actual, Y_pred)
    mse = mean_squared_error(Y_actual, Y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(Y_actual, Y_pred)

    # RMSLE is the root of the mean squared difference of log1p-transformed
    # values, not the logarithm of the RMSE.
    actual_flat = np.asarray(Y_actual, dtype=float).ravel()
    pred_flat = np.asarray(Y_pred, dtype=float).ravel()
    if np.any(actual_flat < 0) or np.any(pred_flat < 0):
        # Undefined for negative values; report NaN instead of failing the
        # run and losing the other metrics.
        rmsle = float('nan')
    else:
        log_diff = np.log1p(pred_flat) - np.log1p(actual_flat)
        rmsle = np.sqrt(np.mean(log_diff ** 2))

    # Adjusted R^2: n must be the number of samples r2 was computed on (the
    # evaluated samples), k the number of predictors.
    n = len(Y_actual)
    k = X_train.shape[1]
    dof = n - k - 1
    if dof > 0:
        adj_r2_score = 1 - ((1 - r2) * (n - 1) / dof)
    else:
        # Too few samples for the correction to be defined.
        adj_r2_score = float('nan')

    wandb.log({"mae": mae, "mse": mse, 'rmse':rmse, 'rmsle':rmsle, 'r2':r2, 'adj_r2':adj_r2_score})

# Predictive Modeling

## Bagging Regressor

In [None]:
def bagging_regressor():
    """Fit a BaggingRegressor for one W&B run and log its hold-out scores.

    Hyperparameters are read from ``wandb.config`` after the run has been
    initialised with the defaults below. The first column of ``labeled`` is
    the target; every other column is a feature.
    """
    wandb.init(config={
        'n_estimators': 100,
        'max_samples': 0.4,
        'bootstrap': True,
        'random_state': 42,
    })
    config = wandb.config

    features = labeled.iloc[:, 1:].values
    target = labeled.iloc[:, 0].values.reshape(-1, 1)

    # 60 % of the rows for fitting, 40 % held out for scoring.
    X_train, X_test, Y_train, Y_test = train_test_split(
        features, target, test_size=0.4, random_state=42
    )

    hyperparams = {
        'n_estimators': config.n_estimators,
        'bootstrap': config.bootstrap,
        'max_samples': config.max_samples,
        'random_state': config.random_state,
    }
    model = BaggingRegressor(**hyperparams)

    predictions = model.fit(X_train, Y_train).predict(X_test)

    find_scores(Y_test, predictions, X_train)

In [None]:
# Grid sweep over the BaggingRegressor hyperparameters, ranked by the
# 'adj_r2' metric.
bagging_regressor_configs = dict(
    name='BaggingRegressor',
    method="grid",
    metric=dict(name="adj_r2", goal="maximize"),
    parameters=dict(
        n_estimators=dict(values=[100, 200, 300]),
        max_samples=dict(values=[0.4, 0.5, 0.6]),
    ),
)

# Register the sweep, then run the training function for it.
sweep_id = wandb.sweep(sweep=bagging_regressor_configs, project="car_resale_value")
wandb.agent(sweep_id=sweep_id, function=bagging_regressor)

## Bagging Regressor Results

<img src="./Bagging Regressor.png" alt="Bagging Regressor Results"/>

## RandomForest Regressor

In [None]:
def random_forest_regressor():
    """Fit a RandomForestRegressor for one W&B run and log its hold-out scores.

    The run is initialised with the defaults below and the hyperparameters
    are then read back from ``wandb.config``. Column 0 of ``labeled`` is the
    target; the remaining columns are the features.
    """
    defaults = dict(
        n_estimators=100,
        max_samples=0.4,
        criterion='squared_error',
        bootstrap=True,
        random_state=42,
    )
    wandb.init(config=defaults)
    config = wandb.config

    table = labeled
    X, Y = table.iloc[:, 1:].values, table.iloc[:, 0].values.reshape(-1, 1)

    # Hold out 40 % of the rows for evaluation.
    splits = train_test_split(X, Y, test_size=0.4, random_state=42)
    X_train, X_test, Y_train, Y_test = splits

    forest = RandomForestRegressor(
        n_estimators=config.n_estimators,
        criterion=config.criterion,
        bootstrap=config.bootstrap,
        max_samples=config.max_samples,
        random_state=config.random_state,
    )
    forest.fit(X_train, Y_train)

    find_scores(Y_test, forest.predict(X_test), X_train)

In [None]:
# Grid sweep definition for the random forest; runs are compared on 'adj_r2'.
random_forest_configs = {
    "name": 'RandomForestRegressor',
    "method": "grid",
    "metric": {"name": "adj_r2", "goal": "maximize"},
    "parameters": {
        param: {"values": candidates}
        for param, candidates in (
            ("n_estimators", [100, 200, 300]),
            ("max_samples", [0.4, 0.5, 0.6]),
        )
    },
}

# Create the sweep in the project and execute it with the training function.
sweep_id = wandb.sweep(sweep=random_forest_configs, project="car_resale_value")
wandb.agent(sweep_id=sweep_id, function=random_forest_regressor)

## Random Forest Regressor Results

<img src="./Random Forest Regressor.png" alt="Random Forest Regressor Results"/>

## HistGradient Boosting Regressor

In [None]:
def hist_gradient_boost_regressor():
    """Fit a HistGradientBoostingRegressor for one W&B run and log its scores.

    Hyperparameters come from ``wandb.config``, initialised with the defaults
    below. The target is the first column of ``labeled``; all other columns
    are used as features.
    """
    wandb.init(config={
        'loss': 'squared_error',
        'learning_rate': 0.1,
        'max_iter': 100,
        'random_state': 42,
    })
    config = wandb.config

    inputs = labeled.iloc[:, 1:].values
    prices = labeled.iloc[:, 0].values.reshape(-1, 1)

    # 40 % of the rows are kept aside for scoring.
    X_train, X_test, Y_train, Y_test = train_test_split(
        inputs,
        prices,
        test_size=0.4,
        random_state=42,
    )

    booster_args = {
        'loss': config.loss,
        'learning_rate': config.learning_rate,
        'max_iter': config.max_iter,
        'random_state': config.random_state,
    }
    model = HistGradientBoostingRegressor(**booster_args)
    model.fit(X_train, Y_train)

    Y_pred = model.predict(X_test)
    find_scores(Y_test, Y_pred, X_train)

In [None]:
# Grid sweep for the histogram-based gradient boosting model, ranked by
# 'adj_r2'. Each entry below is one swept parameter and its candidate values.
hist_gradient_boost_configs = {
    "name": 'HistGradientBoostingRegressor',
    "method": "grid",
    "metric": {"name": "adj_r2", "goal": "maximize"},
    "parameters": {
        param: {"values": candidates}
        for param, candidates in (
            ("loss", ['squared_error', 'absolute_error']),
            ("learning_rate", [0.01, 0.03, 0.05, 0.07]),
            ("max_iter", [100, 200, 300]),
            ("random_state", [42]),
        )
    },
}

# Register the sweep and run it with the training function.
sweep_id = wandb.sweep(sweep=hist_gradient_boost_configs, project="car_resale_value")
wandb.agent(sweep_id=sweep_id, function=hist_gradient_boost_regressor)

## HistGradient Boosting Regressor Results

<img src="./HOG Boosting Regressor.png" alt="HistGradient Boosting Regressor Results"/>

## Extra Trees Regressor

In [None]:
def extra_tree_regressor():
    """Fit an ExtraTreesRegressor for one W&B run and log its hold-out scores.

    The run starts from the defaults below; the hyperparameters actually used
    are read from ``wandb.config``. ``labeled`` supplies the target in its
    first column and the features in the rest.
    """
    defaults = dict(
        criterion='squared_error',
        max_samples=0.4,
        bootstrap=True,
        random_state=42,
    )
    wandb.init(config=defaults)
    config = wandb.config

    matrix = labeled.iloc[:, 1:].values
    response = labeled.iloc[:, 0].values.reshape(-1, 1)

    # Fit on 60 % of the rows, evaluate on the remaining 40 %.
    X_train, X_test, Y_train, Y_test = train_test_split(
        matrix, response, test_size=0.4, random_state=42
    )

    trees = ExtraTreesRegressor(
        criterion=config.criterion,
        bootstrap=config.bootstrap,
        max_samples=config.max_samples,
        random_state=config.random_state,
    )

    held_out_pred = trees.fit(X_train, Y_train).predict(X_test)

    find_scores(Y_test, held_out_pred, X_train)

In [None]:
# Grid sweep over split criterion and bootstrap sample fraction for the
# extra-trees model; runs are ranked by 'adj_r2'.
extra_tree_configs = dict(
    name='ExtraTreesRegressor',
    method="grid",
    metric={"name": "adj_r2", "goal": "maximize"},
    parameters={
        "criterion": dict(values=['squared_error', 'absolute_error']),
        "max_samples": dict(values=[0.4, 0.5, 0.6]),
    },
)

# Create the sweep and run it with the training function.
sweep_id = wandb.sweep(sweep=extra_tree_configs, project="car_resale_value")
wandb.agent(sweep_id=sweep_id, function=extra_tree_regressor)

## Extra Tree Regressor Results

<img src="./Extra Tree Regressor.png" alt="Extra Tree Regressor Results"/>

## XGB Regressor

In [None]:
def XGB_regressor():
    """Fit an XGBRegressor for one W&B run and log its hold-out scores.

    Hyperparameters are read from ``wandb.config``, initialised with the
    defaults below. The first column of ``labeled`` is the target; the other
    columns are the features.
    """
    # 'eta' is XGBoost's alias for 'learning_rate', so it is not a separate
    # hyperparameter: only 'learning_rate' is kept as a default and forwarded.
    # Any 'eta' value supplied by a sweep is deliberately ignored.
    config_defaults = {
                'learning_rate':0.1,
                'n_estimators': 500,
                'booster':'gbtree',
                'random_state':42
            }
    wandb.init(config=config_defaults)
    config = wandb.config

    X = labeled.iloc[:,1:].values
    Y = labeled.iloc[:,0].values.reshape(-1,1)

    # 60 % of the rows for fitting, 40 % held out for scoring.
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.4, random_state=42)

    # 'booster' used to be configured but never reached the model, so runs
    # that differed only in booster trained identical models.
    model = XGBRegressor(
      learning_rate=config.learning_rate,
      n_estimators = config.n_estimators,
      booster = config.booster,
      random_state = config.random_state)

    model.fit(X_train, Y_train)

    Y_pred = model.predict(X_test)

    find_scores(Y_test, Y_pred, X_train)

In [None]:
# Grid sweep for the XGBoost model, ranked by 'adj_r2'.
# Stored under its own name so it no longer overwrites the ExtraTrees sweep
# configuration.
# 'eta' is not swept: it is XGBoost's alias for 'learning_rate', so listing
# both only multiplied the number of grid runs without adding distinct models.
xgb_configs = {
    "name":'XGBRegressor',
    "method": "grid",
    "metric": {
        "name": "adj_r2",
        "goal": "maximize"
    },
    "parameters": {
        "learning_rate": {
            "values": [0.01, 0.03, 0.05, 0.07]
        },
        "n_estimators": {
            "values": [100,200,300]
        },
        "booster": {
            "values": ['gbtree','gblinear']
        }
    }
}

sweep_id = wandb.sweep(sweep=xgb_configs, project="car_resale_value")
wandb.agent(sweep_id=sweep_id, function=XGB_regressor)

## XGB Regressor Results

<img src="./XGB Regressor.png" alt="XGB Regressor Results"/>

## LGBM Regressor

In [None]:
def LGBM_regressor():
    """Fit an LGBMRegressor for one W&B run and log its hold-out scores.

    Hyperparameters are read from ``wandb.config``, initialised with the
    defaults below. The first column of ``labeled`` is the target; the other
    columns are the features.
    """
    # Every value read from the config below has a default here, so the
    # function also works when the sweep does not supply it.
    # learning_rate / n_estimators / boosting_type defaults mirror LightGBM's own.
    config_defaults = {
                'objective':'root_mean_squared_error',
                'boosting_type':'gbdt',
                'learning_rate':0.1,
                'n_estimators':100,
                'reg_sqrt': True,
                'metric':'rmse',
                'random_state':42
            }
    wandb.init(config=config_defaults)
    config = wandb.config

    X = labeled.iloc[:,1:].values
    Y = labeled.iloc[:,0].values.reshape(-1,1)

    # 60 % of the rows for fitting, 40 % held out for scoring.
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.4, random_state=42)

    # Forward every configured hyperparameter: objective, boosting_type,
    # reg_sqrt and metric used to be configured but never reached the model.
    model_params = {
        'objective': config.objective,
        'boosting_type': config.boosting_type,
        'learning_rate': config.learning_rate,
        'n_estimators': config.n_estimators,
        'reg_sqrt': config.reg_sqrt,
        'metric': config.metric,
        'random_state': config.random_state,
    }
    if config.boosting_type == 'rf':
        # LightGBM's random-forest mode refuses to train unless bagging is on.
        # NOTE(review): 0.8 / 1 are placeholder values — tune them or add them
        # to the sweep.
        model_params.update(subsample=0.8, subsample_freq=1)

    model = LGBMRegressor(**model_params)

    model.fit(X_train, Y_train)

    Y_pred = model.predict(X_test)

    find_scores(Y_test, Y_pred, X_train)

In [None]:
# Grid sweep for the LightGBM model, ranked by 'adj_r2'. Each entry below is
# one swept parameter and its candidate values.
lgbm_configs = {
    "name": 'LGBMRegressor',
    "method": "grid",
    "metric": dict(name="adj_r2", goal="maximize"),
    "parameters": {
        param: {"values": candidates}
        for param, candidates in (
            ("learning_rate", [0.01, 0.03, 0.05, 0.07]),
            ("objective", ['root_mean_squared_error']),
            ("boosting_type", ['gbdt', 'dart', 'goss', 'rf']),
            ("reg_sqrt", [True]),
            ("metric", ['rmse']),
            ("n_estimators", [100, 200, 300]),
            ("random_state", [42]),
        )
    },
}

# Register the sweep and run it with the training function.
sweep_id = wandb.sweep(sweep=lgbm_configs, project="car_resale_value")
wandb.agent(sweep_id=sweep_id, function=LGBM_regressor)

## LGBM Regressor Results

<img src="./LGBM Regressor.png" alt="LGBM Regressor Results"/>