In [1]:
import numpy as np
import pandas as pd
import lightgbm as lgbm
from sklearn import preprocessing
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, make_scorer, mean_squared_log_error
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.feature_selection import VarianceThreshold
from sklearn.model_selection import cross_validate, train_test_split

In [2]:
data = pd.read_csv('train.csv')
data['timestamp'] = pd.to_datetime(data['timestamp'])
data['Year'] = data['timestamp'].apply(lambda x: int(str(x)[:4]))
data['Month'] = data['timestamp'].apply(lambda x: int(str(x)[5:7]))
data['weekday'] = data['timestamp'].apply(lambda x: int(str(x)[8:10]))

target = data['price_doc']
data = data.drop(['timestamp', 'id', 'price_doc'], axis=1)

In [3]:
#numerical
num = data.select_dtypes(include=np.number)

#categorical
cat = data.select_dtypes(exclude=np.number)
for f in cat.columns:
    lbl = preprocessing.LabelEncoder()
    lbl.fit(list(cat[f].values))
    cat[f] = lbl.transform(list(cat[f].values))


train = pd.concat([cat, num], axis=1)
train = train.fillna(train.mean())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [4]:
var = VarianceThreshold()
var.fit(train, target)

variances = dict(zip(train.columns, var.variances_))
sorted_vars = sorted(variances, key = variances.get)
corr = train.corr()

threshold = 0.8
low_vars = sorted_vars[:27]
corr_groups = []

for f in low_vars:
    high = corr[f][corr[f] >= threshold]
    corr_groups.append(high)

sorted_corr_groups = sorted(corr_groups, key=lambda x: x.size, reverse=True)
feat_list = sorted_corr_groups[0].index

In [51]:
def simple_model_validation(train, target):
    clf = lgbm.LGBMRegressor(boosting_type = 'gbdt')

    avg_err = 0
    for i in range(10):
        X_train, X_test, y_train, y_test = train_test_split(train, target)
        clf.fit(X_train, y_train)
        pred = clf.predict(X_test)

        error = mean_squared_log_error(pred, y_test)

        avg_err += error
    return avg_err.mean()


def special_model_validation(train, target, feat_list):
    clf = lgbm.LGBMRegressor(boosting_type = 'gbdt')
    reg = LinearRegression()

    avg_err = 0
    for i in range(10):
        X_train, X_test, y_train, y_test = train_test_split(train, target)
        for_reg_x = X_train.loc[:, feat_list]
        for_tree_x = X_train.drop(for_reg_x, axis=1)
        reg.fit(for_reg_x, y_train)
        reg_pred = reg.predict(for_reg_x)

        for_tree_x['reg'] = reg_pred
        clf.fit(for_tree_x, y_train)

        for_reg_x = X_test.loc[:, feat_list]
        for_tree_x = X_test.drop(for_reg_x, axis=1)
        reg_pred = reg.predict(for_reg_x)
        for_tree_x['reg'] = reg_pred
        pred = clf.predict(for_tree_x)

        error = mean_squared_log_error(pred, y_test)

        avg_err += error
    return avg_err.mean()

In [53]:
err1 = special_model_validation(train, target, feat_list)
err2 = simple_model_validation(train, target)
print("Model with regression : error =", err1)
print("Model without regression: error =", err2)

Model with regression : error = 2.2082528524403973
Model without regression: error = 2.19655884298606


In [7]:
sorted_vars

['mosque_count_500',
 'oil_chemistry_raion',
 'indust_part',
 'mosque_count_1000',
 'big_road1_1line',
 'nuclear_reactor_raion',
 'railroad_1line',
 'green_zone_part',
 'cafe_count_500_price_high',
 'railroad_terminal_raion',
 'mosque_count_1500',
 'thermal_power_plant_raion',
 'culture_objects_top_25',
 'incineration_raion',
 'water_1line',
 'mosque_count_2000',
 'big_market_raion',
 'green_zone_km',
 'detention_facility_raion',
 'cafe_count_1000_price_high',
 'school_education_centers_top_20_raion',
 'leisure_count_500',
 'market_count_500',
 'water_km',
 'university_top_20_raion',
 'mosque_count_3000',
 'radiation_raion',
 'product_type',
 'mosque_count_5000',
 'state',
 'cafe_count_500_price_4000',
 'num_room',
 'industrial_km',
 'market_count_1000',
 'church_synagogue_km',
 'catering_km',
 'cafe_count_1500_price_high',
 'Year',
 'build_count_foam',
 'market_count_1500',
 'big_church_count_500',
 'ecology',
 'material',
 'trc_count_500',
 'public_transport_station_km',
 'big_road1_

In [8]:
full_sq	life_sq	floor area_m, build_year

Unnamed: 0,full_sq,life_sq,floor,max_floor,material,build_year,num_room,kitch_sq,state,product_type,...,cafe_count_5000_price_high,big_church_count_5000,church_count_5000,mosque_count_5000,leisure_count_5000,sport_count_5000,market_count_5000,Year,Month,weekday
0,43,27.0,4.0,,,,,,,Investment,...,0,13,22,1,0,52,4,2011,8,20
1,34,19.0,3.0,,,,,,,Investment,...,0,15,29,1,10,66,14,2011,8,23
2,43,29.0,2.0,,,,,,,Investment,...,0,11,27,0,4,67,10,2011,8,27
3,89,50.0,9.0,,,,,,,Investment,...,1,4,4,0,0,26,3,2011,9,1
4,77,77.0,4.0,,,,,,,Investment,...,17,135,236,2,91,195,14,2011,9,5
5,67,46.0,14.0,,,,,,,Investment,...,1,53,78,1,20,113,17,2011,9,6
6,25,14.0,10.0,,,,,,,Investment,...,3,38,80,1,27,127,8,2011,9,8
7,44,44.0,5.0,,,,,,,Investment,...,0,11,18,1,0,47,4,2011,9,9
8,42,27.0,5.0,,,,,,,Investment,...,1,18,34,1,3,85,11,2011,9,10
9,36,21.0,9.0,,,,,,,Investment,...,0,10,20,1,3,67,1,2011,9,13


In [9]:
new_data = train.loc[:, ['full_sq', 'life_sq', 'area_m', 'build_year']]

In [11]:
clf = lgbm.LGBMRegressor(boosting_type = 'gbdt')

In [13]:
clf.fit(new_data, target)

LGBMRegressor(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
       learning_rate=0.1, max_depth=-1, min_child_samples=20,
       min_child_weight=0.001, min_split_gain=0.0, n_estimators=100,
       n_jobs=-1, num_leaves=31, objective=None, random_state=None,
       reg_alpha=0.0, reg_lambda=0.0, silent=True, subsample=1.0,
       subsample_for_bin=200000, subsample_freq=0)

In [16]:
pickle.dump(clf, open('model.pkl', 'wb'))

In [15]:
import pickle

In [17]:
clf.predict(new_data) 5490770.50179226

array([ 5490770.50179226,  5413002.39070529,  5881708.0251505 , ...,
        5052983.86562188, 11574274.87576037,  6087462.46702136])

In [40]:
new_data['build_year'][30442]

2015.0

In [25]:
d = data['build_year'][data['build_year'].notna()]

In [30]:
d.values

20052009.0

In [31]:
d.values.

array([1907., 1980., 2014., ..., 1935., 2003., 1968.])