In [None]:
import lightgbm as lgb
import numpy as np
import pandas as pd
from sklearn.model_selection import cross_val_score
import math
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold, StratifiedKFold
import types
from sklearn.model_selection import GridSearchCV

# Plotting the importance of features

In [None]:
def plot_feature_importance(dataset, model_bst, top_n=20):
    """Print and plot the highest-ranked feature importances of a fitted model.

    Parameters
    ----------
    dataset : object exposing ``columns``
        Its ``columns`` provide the feature names; they are paired
        positionally with ``model_bst.feature_importances_``.
    model_bst : object exposing ``feature_importances_``
        Fitted estimator whose importances are ranked.
    top_n : int, default 20
        How many of the most important features to print and plot.

    Raises
    ------
    ValueError
        If ``top_n`` is smaller than 1. pandas also raises ``ValueError``
        when the number of columns and importances differ.
    """
    if top_n < 1:
        raise ValueError("top_n must be a positive integer")

    importance_table = pd.DataFrame({
        'feature_name': list(dataset.columns),
        'importance': list(model_bst.feature_importances_),
    })
    top_features = importance_table.sort_values(by='importance', ascending=False).head(top_n)
    print(top_features)

    # Open a fresh figure: the pyplot calls below act on the *current* figure,
    # so without this the curve would be drawn on top of any figure a caller
    # created just before (e.g. a tree plot).
    plt.figure()
    positions = range(len(top_features))
    plt.xticks(positions, top_features['feature_name'], rotation=90, fontsize=8)
    plt.plot(positions, top_features['importance'])
    plt.xlabel("Feature name")
    plt.ylabel("Importance")
    plt.title("The importance of features")
    plt.show()

## Generating the required dataset

In [None]:
# Columns fed to the model. The training frame keeps the same columns plus the
# target, so the second list is derived from the first instead of repeated.
index = ["depreciation", "coe", "dereg_value", "arf", "reg_date", "power", "omv", "engine_cap", "make", "transmission",
         "no_of_owners",
         "model", "category", "mileage", "road_tax"]
train_index = index + ["price"]

# Columns stored as pandas categoricals in both frames.
categorical_dtypes = {name: "category"
                      for name in ("model", "category", "transmission", "no_of_owners", "make")}


def _years_since_registration(reg_date, reference_year=2021):
    """Convert a registration-date string into ``reference_year - year``.

    The year is read from the last four characters of the string.
    Non-string values (e.g. NaN for a missing date) map to 0.
    """
    if not isinstance(reg_date, str):
        return 0
    # Slice exactly the four year digits. A five-character slice also picks up
    # the separator in front of the year; if that separator is '-', int()
    # parses a negative year and the result becomes reference_year + year.
    return reference_year - int(reg_date[-4:])


ori_test = pd.read_csv("../data/test.csv")[index].astype(categorical_dtypes)
ori_train = pd.read_csv("../data/train.csv")[train_index].astype(categorical_dtypes)

ori_test['reg_date'] = ori_test['reg_date'].map(_years_since_registration)
ori_train['reg_date'] = ori_train['reg_date'].map(_years_since_registration)

train_x = ori_train[index]
train_y = ori_train['price']

## Parameter searching


In [None]:
# Stage 1 - tree shape: grid-search max_depth and num_leaves with the
# remaining settings held fixed.
# Recorded outcome: max_depth=4, num_leaves=16.
# NOTE(review): 16 is not among the num_leaves candidates below - confirm
# which grid produced it.
params_test1 = {
    'max_depth': range(4, 9),
    'num_leaves': range(5, 25, 5),
}

model_lgb = lgb.LGBMRegressor(
    boosting_type='gbdt',
    objective='regression',
    n_estimators=1500,
    learning_rate=0.05,
    zero_as_missing=True,
    metric='rmse',
)

# 5-fold CV scored by negated RMSE (higher is better).
gsearch1 = GridSearchCV(
    model_lgb,
    params_test1,
    scoring='neg_root_mean_squared_error',
    n_jobs=4,
    cv=5,
    verbose=1,
)
gsearch1.fit(train_x, train_y)

# Report every candidate's mean score, the candidates, then the winner.
cv_summary = gsearch1.cv_results_
print('gsearch1.grid_scores_', cv_summary['mean_test_score'])
print(cv_summary['params'])
for label, value in (
    ('gsearch1.best_params_', gsearch1.best_params_),
    ('gsearch1.best_score_', gsearch1.best_score_),
):
    print(label, value)

In [None]:
# Stage 2 - boosting schedule: grid-search n_estimators and learning_rate
# with the tree shape fixed at num_leaves=16, max_depth=4.
# Recorded outcome: learning_rate=0.05, n_estimators=1500.
params_test1 = {
    'n_estimators': [800, 1000, 1200, 1500],
    'learning_rate': [0.02, 0.05, 0.1, 0.01],
}

model_lgb = lgb.LGBMRegressor(
    objective='regression',
    max_depth=4,
    num_leaves=16,
    zero_as_missing=True,
    metric='rmse',
)

# 5-fold CV scored by negated RMSE (higher is better).
gsearch1 = GridSearchCV(
    model_lgb,
    params_test1,
    scoring='neg_root_mean_squared_error',
    n_jobs=4,
    cv=5,
    verbose=1,
)
gsearch1.fit(train_x, train_y)

# Report every candidate's mean score, the candidates, then the winner.
cv_summary = gsearch1.cv_results_
print('gsearch1.grid_scores_', cv_summary['mean_test_score'])
print(cv_summary['params'])
for label, value in (
    ('gsearch1.best_params_', gsearch1.best_params_),
    ('gsearch1.best_score_', gsearch1.best_score_),
):
    print(label, value)

In [None]:
# Stage 3 - leaf constraints: grid-search min_child_samples and
# min_child_weight on top of the settings chosen so far.
# Recorded outcome: min_child_samples=20, min_child_weight=0.
params_test3 = {
    'min_child_samples': [18, 19, 20, 21, 22],
    'min_child_weight': [0, 0.001, 0.002],
}

model_lgb = lgb.LGBMRegressor(
    objective='regression',
    max_depth=4,
    num_leaves=16,
    n_estimators=1500,
    learning_rate=0.05,
    zero_as_missing=True,
    metric='rmse',
)

# 5-fold CV scored by negated RMSE (higher is better).
gsearch1 = GridSearchCV(
    model_lgb,
    params_test3,
    scoring='neg_root_mean_squared_error',
    n_jobs=4,
    cv=5,
    verbose=1,
)
gsearch1.fit(train_x, train_y)

# Report every candidate's mean score, the candidates, then the winner.
cv_summary = gsearch1.cv_results_
print('gsearch1.grid_scores_', cv_summary['mean_test_score'])
print(cv_summary['params'])
for label, value in (
    ('gsearch1.best_params_', gsearch1.best_params_),
    ('gsearch1.best_score_', gsearch1.best_score_),
):
    print(label, value)

In [None]:
# Stage 4 - sampling: grid-search feature_fraction and bagging_fraction.
# Recorded outcome: feature_fraction=0.6, bagging_fraction=1.0.
# bagging_freq is held at 5 here; a [3, 5, 10] sweep over it was left disabled.
params_test4 = {
    'feature_fraction': [0.6, 0.7, 0.5],
    'bagging_fraction': [0.8, 0.9, 1.0],
}

model_lgb = lgb.LGBMRegressor(
    objective='regression',
    max_depth=4,
    num_leaves=16,
    n_estimators=1500,
    learning_rate=0.05,
    min_child_samples=20,
    min_child_weight=0,
    bagging_freq=5,
    zero_as_missing=True,
    metric='rmse',
)

# 5-fold CV scored by negated RMSE (higher is better).
gsearch1 = GridSearchCV(
    model_lgb,
    params_test4,
    scoring='neg_root_mean_squared_error',
    n_jobs=4,
    cv=5,
    verbose=1,
)
gsearch1.fit(train_x, train_y)

# Report every candidate's mean score, the candidates, then the winner.
cv_summary = gsearch1.cv_results_
print('gsearch1.grid_scores_', cv_summary['mean_test_score'])
print(cv_summary['params'])
for label, value in (
    ('gsearch1.best_params_', gsearch1.best_params_),
    ('gsearch1.best_score_', gsearch1.best_score_),
):
    print(label, value)

In [None]:
# Stage 5 - regularisation: grid-search reg_alpha and reg_lambda with all
# previously chosen settings fixed.
# Recorded outcome: reg_alpha=0, reg_lambda=0.001.
params_test6 = {
    'reg_alpha': [0, 0.05, 0.1],
    'reg_lambda': [0, 0.001],
}

model_lgb = lgb.LGBMRegressor(
    objective='regression',
    max_depth=4,
    num_leaves=16,
    n_estimators=1500,
    learning_rate=0.05,
    min_child_samples=20,
    min_child_weight=0,
    bagging_freq=5,
    bagging_fraction=1,
    feature_fraction=0.6,
    zero_as_missing=True,
    metric='rmse',
)

# 5-fold CV scored by negated RMSE (higher is better).
gsearch1 = GridSearchCV(
    model_lgb,
    params_test6,
    scoring='neg_root_mean_squared_error',
    n_jobs=4,
    cv=5,
    verbose=1,
)
gsearch1.fit(train_x, train_y)

# Report every candidate's mean score, the candidates, then the winner.
cv_summary = gsearch1.cv_results_
print('gsearch1.grid_scores_', cv_summary['mean_test_score'])
print(cv_summary['params'])
for label, value in (
    ('gsearch1.best_params_', gsearch1.best_params_),
    ('gsearch1.best_score_', gsearch1.best_score_),
):
    print(label, value)

## Generating submission file

In [None]:
# Fit the final model on the full training set with the tuned settings.
model_lgb = lgb.LGBMRegressor(
    objective='regression',
    metric='rmse',
    max_depth=4,
    num_leaves=16,
    n_estimators=1500,
    learning_rate=0.05,
    min_child_samples=20,
    min_child_weight=0,
    bagging_freq=5,
    bagging_fraction=1,
    feature_fraction=0.6,
    reg_alpha=0,
    reg_lambda=0.001,
    zero_as_missing=True,
)
model_lgb.fit(train_x, train_y)

# Save a rendering of one tree. figsize is given in inches: the previous
# (1024, 4096) is roughly 102400x409600 px at 100 dpi, which cannot be
# rendered, so a size that is actually drawable is used instead.
lgb.plot_tree(model_lgb, figsize=(40, 20))
plt.savefig("lgb_tree")
# Flush the tree figure before the next plot; otherwise it stays the current
# figure and the feature-importance curve is drawn on top of it.
plt.show()

plot_feature_importance(train_x, model_bst=model_lgb)

# Predict on the test set, rounded to one decimal place.
y_pred = np.around(model_lgb.predict(ori_test), 1).tolist()

# Ids are the row positions, so they are unique by construction and need no
# de-duplication before writing.
t_id_list = list(range(len(y_pred)))
res_df = pd.DataFrame({"Id": t_id_list, "Predicted": y_pred})
res_df.to_csv("submit_lgbm_simple_1106_2021.csv", index=False)