In [279]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_log_error

## Read data

In [200]:
train = pd.read_csv('train-2.csv')
train.head()

Unnamed: 0,house_id,dt,n_bedrooms,n_bathrooms,n_floors,S_above,S_basement,S_living,S_lot,lat,long,year_built,status,price_target
0,101826,2014-07-17,2,1.75,1.5,1740,0,1740,6620,47.526,-121.828,2002,3,350000.0
1,105715,2015-04-21,2,1.0,1.5,1090,0,1090,5265,47.6638,-122.292,1947,4,577000.0
2,118631,2014-09-26,3,2.0,1.0,1310,0,1310,7000,47.303,-122.383,1979,4,196500.0
3,116653,2014-05-02,3,2.5,3.0,1600,170,1770,1235,47.6965,-122.342,2007,3,436110.0
4,119014,2015-03-12,3,3.25,2.0,1090,190,1280,1730,47.7032,-122.36,2005,3,375000.0


In [201]:
test = pd.read_csv('test-2.csv')
test.head()

Unnamed: 0,house_id,dt,n_bedrooms,n_bathrooms,n_floors,S_above,S_basement,S_living,S_lot,lat,long,year_built,status
0,121076,2015-04-01,3,2.25,1.0,1930,440,2370,38639,47.771,-122.099,1978,3
1,107763,2015-03-30,3,2.5,2.0,2420,920,3340,70131,47.2666,-122.015,1994,3
2,115852,2014-12-12,3,1.0,1.0,1090,0,1090,10296,47.7743,-122.26,1950,4
3,107325,2014-10-15,4,2.5,1.0,1560,880,2440,9350,47.5614,-122.13,1976,4
4,119391,2014-11-13,2,1.5,2.0,840,140,980,1296,47.7075,-122.336,2001,3


In [202]:
X = train.drop(['price_target', 'house_id'], axis=1)
y = train['price_target']

In [101]:
X_test = test.drop(['house_id'], axis=1)

In [240]:
train['S_basement'].apply(lambda x: 1 if x==0 else 0).value_counts()

1    6066
0    3934
Name: S_basement, dtype: int64

## Preproc

In [241]:
def preproc(df):
    temp = df.copy()
    
    temp['dt'] = pd.to_datetime(temp['dt'] , errors='coerce')
    
    temp['1_bedrooms'] = temp['n_bedrooms'].apply(lambda x: 1 if x==1 else 0)
    temp['2_bedrooms'] = temp['n_bedrooms'].apply(lambda x: 1 if x==2 else 0)
    temp['3_bedrooms'] = temp['n_bedrooms'].apply(lambda x: 1 if x==3 else 0)
    temp['4_bedrooms'] = temp['n_bedrooms'].apply(lambda x: 1 if x==4 else 0)
    temp['5_bedrooms'] = temp['n_bedrooms'].apply(lambda x: 1 if x==5 else 0)
    temp['6_bedrooms'] = temp['n_bedrooms'].apply(lambda x: 1 if x==6 else 0)
    
    temp['1_floors']   = temp['n_floors'].apply(lambda x: 1 if x==1 else 0)
    temp['2_floors']   = temp['n_floors'].apply(lambda x: 1 if x==2 else 0)
    temp['3_floors']   = temp['n_floors'].apply(lambda x: 1 if x==3 else 0)
    temp['1_5_floors'] = temp['n_floors'].apply(lambda x: 1 if x==1.5 else 0)
    temp['2_5_floors'] = temp['n_floors'].apply(lambda x: 1 if x==2.5 else 0)
    temp['3_5_floors'] = temp['n_floors'].apply(lambda x: 1 if x==3.5 else 0)
    
    temp['2014_year'] = temp['dt'].apply(lambda x: 1 if x.year==2014 else 0)
    
    temp['winter'] = temp['dt'].apply(lambda x: 1 if x.month in [12,1,2] else 0)
    temp['spring'] = temp['dt'].apply(lambda x: 1 if x.month in [3,4,5] else 0)
    temp['summer'] = temp['dt'].apply(lambda x: 1 if x.month in [6,7,8] else 0)
    
    temp['2_status'] = temp['status'].apply(lambda x: 1 if x==2 else 0)
    temp['3_status'] = temp['status'].apply(lambda x: 1 if x==3 else 0)
    temp['4_status'] = temp['status'].apply(lambda x: 1 if x==4 else 0)
    temp['5_status'] = temp['status'].apply(lambda x: 1 if x==5 else 0)
    
    temp['lt_1930_year']   = temp['year_built'].apply(lambda x: 1 if x <= 1930 else 0)
    temp['1930_1960_year'] = temp['year_built'].apply(lambda x: 1 if x > 1930 and x <= 1960 else 0)
    temp['1960_1990_year'] = temp['year_built'].apply(lambda x: 1 if x > 1960 and x <= 1990 else 0)
    
    cols_to_drop = ['n_bedrooms', 'n_floors', 'dt', 'status', 'year_built']
    temp.drop(cols_to_drop, axis=1, inplace=True)
    
    
    return temp

In [242]:
X_preproc = preproc(X) 
X_test_preproc = preproc(X_test) 

In [243]:
scaler = StandardScaler()
cols_to_scale = ['S_above', 'S_basement', 'S_living', 'S_lot']

scaler.fit(X_preproc[cols_to_scale])

X_scaled = scaler.transform(X_preproc[cols_to_scale])
X_test_scaled  = scaler.transform(X_test_preproc[cols_to_scale])

  return self.partial_fit(X, y)
  
  import sys


In [244]:
X_preproc[cols_to_scale] = X_scaled
X_test_preproc[cols_to_scale] = X_test_scaled

In [245]:
X_preproc.head()

Unnamed: 0,n_bathrooms,S_above,S_basement,S_living,S_lot,lat,long,1_bedrooms,2_bedrooms,3_bedrooms,...,winter,spring,summer,2_status,3_status,4_status,5_status,lt_1930_year,1930_1960_year,1960_1990_year
0,1.75,-0.074934,-0.666084,-0.389301,-0.209437,47.526,-121.828,0,1,0,...,0,0,1,0,1,0,0,0,0,0
1,1.0,-0.852226,-0.666084,-1.096983,-0.241988,47.6638,-122.292,0,1,0,...,0,1,0,0,0,1,0,0,1,0
2,2.0,-0.589142,-0.666084,-0.85746,-0.200308,47.303,-122.383,0,0,1,...,0,0,0,0,0,1,0,0,0,1
3,2.5,-0.242351,-0.282118,-0.356639,-0.338801,47.6965,-122.342,0,0,1,...,0,1,0,0,1,0,0,0,0,0
4,3.25,-0.852226,-0.236946,-0.890122,-0.32691,47.7032,-122.36,0,0,1,...,0,1,0,0,1,0,0,0,0,0


## KNN-feats

In [300]:
X_preproc['lat'].max() - X_preproc['lat'].min()

0.621699999999997

In [301]:
X_preproc['long'].max() - X_preproc['long'].min()

1.2040000000000077

In [287]:
from sklearn.neighbors import KNeighborsRegressor

In [288]:
knn = KNeighborsRegressor(n_neighbors=3, weights='uniform', metric='euclidean')

In [302]:
knn.fit(X_train, y_train)

KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='euclidean',
          metric_params=None, n_jobs=None, n_neighbors=3, p=2,
          weights='uniform')

In [307]:
knn_pred_valid = knn.predict(X_preproc)

In [321]:
knn_pred_test = knn.predict(X_test_preproc)

In [309]:
mean_squared_log_error(knn_pred_valid, y)

0.08623302006419582

In [322]:
X_preproc['knn_pred'] = knn_pred_valid
X_test_preproc['knn_pred'] = knn_pred_test

## Train

In [311]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import StratifiedKFold, KFold

In [312]:
def train_test_split(X, y, test_size, random_state=1):
    
    random_gen = np.random.RandomState(random_state)
    size = X.shape[0]
    batch_size = round(size*test_size)
    
    rand_indices = list(random_gen.choice(size, batch_size))
   
    return X.drop(rand_indices, axis=0), X.loc[rand_indices], y.drop(rand_indices, axis=0), y.loc[rand_indices]

In [313]:
X_train, X_valid, y_train, y_valid = train_test_split(pd.DataFrame(X_preproc), y, test_size=0.3, random_state=1)

In [314]:
print('train: {}'.format(X_train.shape))
print('valid: {}'.format(X_valid.shape))

train: (7423, 31)
valid: (3000, 31)


In [315]:
lr = Lasso()

In [316]:
lr.fit(X_train, y_train)



Lasso(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=False, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False)

## Predict

In [317]:
pred_valid = lr.predict(X_valid)

In [318]:
pred_valid_upd = [x if x > 0 else 0 for x in pred_valid]

In [319]:
mean_squared_log_error(pred_valid_upd, y_valid)

0.12890121134115357

In [323]:
pred_test = lr.predict(X_test_preproc)

In [325]:
pred_test_upd = [x if x > 0 else 0 for x in pred_test]

In [334]:
to_kaggle = pd.DataFrame([])
to_kaggle['house_id'] = test['house_id']
to_kaggle['price_target'] = [round(x) for x in pred_test_upd]

In [335]:
to_kaggle.to_csv('submission_1.csv', index=None)