In [1]:
import numpy as np
import pandas as pd
import lightgbm as lgb
from scipy.special import boxcox, inv_boxcox
import warnings
import re
from sklearn import preprocessing
# Install a global filter that ignores every Python warning raised after this
# point in the session.
# NOTE(review): this also hides pandas warnings (e.g. about assigning into a
# filtered frame or dtype upcasting) that the later cells may trigger —
# consider narrowing the filter to specific categories.
warnings.filterwarnings('ignore')

# Loading and processing data

In [2]:
# Read the raw train/test tables, parsing `timestamp` as datetimes.
# The files are opened in `with` blocks so the handles are closed as soon as
# pandas has finished reading (previously they were left open for the GC).
with open('train.csv') as train_file:
    train = pd.read_csv(train_file, parse_dates=['timestamp'])
with open('test.csv') as test_file:
    test = pd.read_csv(test_file, parse_dates=['timestamp'])

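Removing price outliers (prices of 1e6 or below, and prices of exactly 2e6 or 3e6), then scaling down prices for selected product types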

In [3]:
# Multipliers applied to the training target below.
# NOTE(review): the notebook does not explain where 0.9 and 0.969 come from —
# presumably hand-tuned corrections; confirm before changing them.
OLD_INVESTMENT_PRICE_FACTOR = 0.9
NON_INVESTMENT_PRICE_FACTOR = 0.969

# Keep rows whose price is above 1e6 and is not exactly 2e6 or 3e6.
valid_price = (
    (train.price_doc > 1e6)
    & (train.price_doc != 2e6)
    & (train.price_doc != 3e6)
)
# .copy() gives an independent frame, so the .loc assignments below modify
# `train` itself rather than a possible view of the unfiltered data.
train = train[valid_price].copy()

# Make the target float up front: the scaling below produces fractional values,
# and writing floats into an integer column relies on implicit upcasting.
train['price_doc'] = train['price_doc'].astype(float)

# Investment purchases of buildings built before 2000 (rows with a missing
# build_year compare False and are left unscaled).
old_investment = (train.product_type == 'Investment') & (train.build_year < 2000)
train.loc[old_investment, 'price_doc'] *= OLD_INVESTMENT_PRICE_FACTOR

# Every row whose product type is not 'Investment'.
train.loc[train.product_type != 'Investment', 'price_doc'] *= NON_INVESTMENT_PRICE_FACTOR

In [4]:
# The model is fitted on the natural log of the sale price.
y_train = np.log(train['price_doc'])

# Remove the target from `train` so it cannot leak into the feature matrix.
train.drop(columns=['price_doc'], inplace=True)

# Stack train rows on top of test rows so both receive identical feature
# engineering; original row labels from each frame are kept as-is.
X = pd.concat([train, test], axis=0)

# Per-row count of missing values, used as an additional feature.
X['na_counts'] = X.isna().sum(axis='columns')
# Names of all string-typed (object dtype) columns, in their original order.
# These are replaced by one-hot indicator columns and removed from X.
remove = [col for col in X.columns if X[col].dtype == 'object']

# Build the indicator columns for every categorical column and join them with
# a single concat. Concatenating inside the loop would re-copy the growing
# frame on every iteration. `df_categorical` stays None when X has no object
# columns, which the following cell checks for.
if remove:
    df_categorical = pd.concat(
        [pd.get_dummies(X[col], prefix=col) for col in remove],
        axis=1,
    )
else:
    df_categorical = None

# Drop the raw string columns; their encoded form lives in df_categorical.
X = X.drop(columns=remove)

In [5]:
# Indicator columns that are set in fewer than this many rows are discarded.
MIN_DUMMY_COUNT = 200

remove = []
if df_categorical is not None:
    # Number of rows in which each indicator column is set.
    # (Named `dummy_counts` rather than `sum` so the builtin sum() is not
    # shadowed for the rest of the notebook.)
    dummy_counts = df_categorical.sum(axis=0)
    remove = dummy_counts[dummy_counts < MIN_DUMMY_COUNT].index.values

    # Keep only the sufficiently frequent indicator columns.
    df_categorical = df_categorical.loc[:, df_categorical.columns.difference(remove)]

    # Attach the surviving indicator columns to the numeric features.
    # NOTE(review): re-running this cell appends the same columns again.
    X = pd.concat([X, df_categorical], axis=1)

# Applying LightGBM Regressor Model

In [6]:
# X was built as train rows followed by test rows, so the first
# `num_train` positions are the training set and the rest are the test set.
num_train = len(train)

# `timestamp` and `id` are dropped so they are not passed to the model.
non_feature_cols = ['timestamp', 'id']
x_train = X.iloc[:num_train].drop(columns=non_feature_cols)
x_test = X.iloc[num_train:].drop(columns=non_feature_cols)

In [7]:
# Number of boosting rounds. It is defined once and passed to lgb.train();
# it used to be given both as the 'num_rounds' entry in `params` and as the
# train() argument, which makes LightGBM warn and ignore the argument.
NUM_BOOST_ROUNDS = 1500

params = {
    'objective': 'regression',
    'metric': 'rmse',
    'boosting': 'gbdt',
    'learning_rate': 0.01,
    'verbose': 0,
    'num_leaves': 32,
    'bagging_fraction': 0.95,
    'bagging_freq': 1,
    'bagging_seed': 1,
    'feature_fraction': 0.7,
    'feature_fraction_seed': 1,
    'max_bin': 100,
    'max_depth': 7,
}

# Fit on the full training set; y_train is the log of the sale price.
lgtrain = lgb.Dataset(x_train, y_train)
model = lgb.train(params, lgtrain, num_boost_round=NUM_BOOST_ROUNDS)

# No validation set or early stopping is configured, so there is no meaningful
# "best iteration"; predict() with its defaults uses the trained rounds.
pred = model.predict(x_test)

print("LightGBM Training Completed.")

You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.


In [None]:
# The model was trained on log(price), so exponentiate to get prices back.
transformed_pred = np.exp(pred)

# Two-column frame: the test ids alongside the predicted prices.
my_submission = test[['id']].assign(price_doc=transformed_pred)

# Write without the row index so the file contains only `id` and `price_doc`.
my_submission.to_csv('submission.csv', index=False)