In [1]:
import pandas as pd
import numpy as np
import zipfile
import matplotlib.pyplot as plt
from sklearn import ensemble
from sklearn import linear_model
from sklearn import svm
from collections import Counter
import sklearn.cross_validation
import math
import time
from scipy import stats
from sklearn.linear_model import RandomizedLasso
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression
import sys
%matplotlib inline

['data_sci_snippet.csv']


In [3]:
filename = "data_sci_snippet.csv.zip"
zf = zipfile.ZipFile(filename, 'r')
print zf.namelist()

def time_preprocessing(df, column_name):
    
    times = []
    count = 0
    start = time.time()
    for i in df[column_name]:
        count += 1
        if count % 100000 == 0:
            time_elapsed = time.time() - start
            #print "Count = %r, Time Elapsed = %r" %(count, time_elapsed)
        times.append(time.strptime(i, "%Y-%m-%d"))

    year = []
    month = []
    day = []
    day_of_week = []
    day_in_year = []

    for i in times:
        year.append(i[0])
        month.append(i[1])
        day.append(i[2])
        day_of_week.append(i[6])
        day_in_year.append(i[7])
    df[column_name + ' year'] = year
    df[column_name + ' month'] = month
    df[column_name + ' day'] = day
    df[column_name + ' day_of_week'] = day_of_week
    df[column_name + ' day_in_year'] = day_in_year
    df.drop([column_name], axis=1)
    return df

data = pd.read_csv(zf.open('data_sci_snippet.csv'))

one_hot_columns = ['Pool','ListingStatus','DwellingType']
count = 0
for i in one_hot_columns:
    dummies = pd.get_dummies(data[i])
    for j in dummies:
        data[j] = dummies[j]
        count += 1

data = time_preprocessing(data, 'ListDate')
data = data.dropna()

# Note that we won't have closedate in our test data
# Could still build a model for closing date and use that as a feature
#data = time_preprocessing(data, 'CloseDate')
data = data.drop(['CloseDate'], 1)

# Remove Outliers
data = data[(data['ClosePrice'] > 10000)]
data = data[(data['ClosePrice'] < 500000)]
data = data[(data['ListPrice'] <= 7000000)]

data = data.drop(['Pool', 'ListingStatus', 'DwellingType', 'ListDate', 'PublicRemarks', 'ListDate year'], axis=1)

data['ListPrice'] = np.log(data['ListPrice'])
data['ClosePrice'] = np.log(data['ClosePrice'])
preprocessing_array = []
for i in data:
    if i == 'ClosePrice':
        closeprice_mean = data[i].mean()
        closeprice_std = data[i].std()
    preprocessing_array.append([i, data[i].mean(), data[i].std()])
    data[i] = data[i] - data[i].mean()
    data[i] = data[i]/float(data[i].std())

def unprocess(data, mean = closeprice_mean, std = closeprice_std):
    return np.exp(data*closeprice_std+closeprice_mean)

y = data['ClosePrice']
x = data.drop(['ClosePrice'], 1)
columns = data.columns
x_train, x_test, y_train, y_test = sklearn.cross_validation.train_test_split(
    x, y, test_size=0.02, random_state=14)

data = pd.DataFrame(x_train, columns=columns[:-1])
data['ClosePrice'] = y_train
test_data = pd.DataFrame(x_test, columns=columns[:-1])
test_data['ClosePrice'] = y_test

full = pd.DataFrame()
for_test = pd.DataFrame()
for i in data:
    for j in data:
        if i != 'ClosePrice' and j != 'ClosePrice':
            full[i + '*' + j] = data[i]*data[j]
            for_test[i + '*' + j] = test_data[i]*test_data[j]
full['ClosePrice'] = data['ClosePrice']
correlations = full.corrwith(full['ClosePrice'])
columns = full.columns
for i in range(len(correlations)):
    if abs(list(correlations)[i]) > .3:
        data[columns[i]] = full[columns[i]]
        if columns[i] != 'ClosePrice':
            test_data[columns[i]] = for_test[columns[i]]

stacking_data = data[:len(data)/2]
training_data = data[len(data)/2:]

x_train_stacking = stacking_data[[
                    'LivingArea',
                    'NumBedrooms',
                    'NumBaths',
                    'ExteriorStories',
                    'ListPrice',
                    'GeoLat',
                    'GeoLon',
                    'Both Private & Community',
                    'Private',
                    'Apartment Style/Flat',
                    'Gemini/Twin Home',
                    'Single Family - Detached',
                    'Townhouse',
#                 'ListPrice*ListPrice',
#                  'ListPrice*Single Family - Detached',
#                  'Community*None',
#                  'None*Community',
#                  'Private*Private',
#                  'Apartment Style/Flat*Mfg/Mobile Housing',
#                  'Apartment Style/Flat*Townhouse',
#                  'Mfg/Mobile Housing*Apartment Style/Flat',
#                  'Mfg/Mobile Housing*Townhouse',
#                  'Single Family - Detached*ListPrice',
#                  'Single Family - Detached*Single Family - Detached',
#                  'Townhouse*Apartment Style/Flat',
#                  'Townhouse*Mfg/Mobile Housing'
               ]]
y_train_stacking = stacking_data['ClosePrice']

x_train_training = training_data[[
                    'LivingArea',
                    'NumBedrooms',
                    'NumBaths',
                    'ExteriorStories',
                    'ListPrice',
                    'GeoLat',
                    'GeoLon',
                    'Both Private & Community',
                    'Private',
                    'Apartment Style/Flat',
                    'Gemini/Twin Home',
                    'Single Family - Detached',
                    'Townhouse',
#                      'ListPrice*ListPrice',
#                      'ListPrice*Single Family - Detached',
#                      'Community*None',
#                      'None*Community',
#                      'Private*Private',
#                      'Apartment Style/Flat*Mfg/Mobile Housing',
#                      'Apartment Style/Flat*Townhouse',
#                      'Mfg/Mobile Housing*Apartment Style/Flat',
#                      'Mfg/Mobile Housing*Townhouse',
#                      'Single Family - Detached*ListPrice',
#                      'Single Family - Detached*Single Family - Detached',
#                      'Townhouse*Apartment Style/Flat',
#                      'Townhouse*Mfg/Mobile Housing'
               ]]
y_train_training = training_data['ClosePrice']

x_test_listprice = test_data[[
                    'LivingArea',
                    'NumBedrooms',
                    'NumBaths',
                    'ExteriorStories',
                    'ListPrice',
                    'GeoLat',
                    'GeoLon',
                    'Both Private & Community',
                    'Private',
                    'Apartment Style/Flat',
                    'Gemini/Twin Home',
                    'Single Family - Detached',
                    'Townhouse',
#                      'ListPrice*ListPrice',
#                      'ListPrice*Single Family - Detached',
#                      'Community*None',
#                      'None*Community',
#                      'Private*Private',
#                      'Apartment Style/Flat*Mfg/Mobile Housing',
#                      'Apartment Style/Flat*Townhouse',
#                      'Mfg/Mobile Housing*Apartment Style/Flat',
#                      'Mfg/Mobile Housing*Townhouse',
#                      'Single Family - Detached*ListPrice',
#                      'Single Family - Detached*Single Family - Detached',
#                      'Townhouse*Apartment Style/Flat',
#                      'Townhouse*Mfg/Mobile Housing'
                   ]]
y_test = test_data['ClosePrice']

['data_sci_snippet.csv']


In [None]:
x_train = data[[
                    'LivingArea',
                    'NumBedrooms',
                    'NumBaths',
                    'ExteriorStories',
                    'ListPrice',
                    'GeoLat',
                    'GeoLon',
                    'Both Private & Community',
                    'Private',
                    'Apartment Style/Flat',
                    'Gemini/Twin Home',
                    'Single Family - Detached',
                    'Townhouse'
        ]]
y_train = data['ClosePrice']
    

In [7]:
for i in range(1,40):
    bayesian_ridge = linear_model.Ridge(alpha = i/2.0)
    bayesian_ridge.fit(x_train, y_train)
    bayesian_ridge_predictions = bayesian_ridge.predict(x_test_listprice)
    median = np.median(abs((unprocess(bayesian_ridge_predictions))-unprocess(y_test)))
    print(median, i/2.0)

(3327.7889718110964, 0.5)
(3327.1239794666326, 1.0)
(3326.4590014815913, 1.5)
(3325.7940378605708, 2.0)
(3325.1290886081842, 2.5)
(3325.8945005880669, 3.0)
(3329.749725203088, 3.5)
(3333.6045259448583, 4.0)
(3335.2371835059894, 4.5)
(3332.8939118921407, 5.0)
(3330.5509187112038, 5.5)
(3328.2082039088418, 6.0)
(3325.8657674308051, 6.5)
(3323.5236092231353, 7.0)
(3321.1817292311171, 7.5)
(3318.8401274004718, 8.0)
(3317.1768408731441, 8.5)
(3319.7007466014184, 9.0)
(3323.1485003193666, 9.5)
(3326.5958636398136, 10.0)
(3330.0428366351698, 10.5)
(3333.4894193774089, 11.0)
(3336.9356119397853, 11.5)
(3340.3814143933996, 12.0)
(3343.8268268110405, 12.5)
(3347.2718492651184, 13.0)
(3350.7164818275487, 13.5)
(3354.1607245702762, 14.0)
(3354.7009008867317, 14.5)
(3355.1294737665157, 15.0)
(3355.5579501924512, 15.5)
(3355.9863301907026, 16.0)
(3356.4146137880743, 16.5)
(3356.8428010101488, 17.0)
(3357.2708918836724, 17.5)
(3357.6988864342857, 18.0)
(3358.126784689026, 18.5)
(3358.068211809441, 19

In [14]:
for i in range(0,40):
    bayesian_ridge = linear_model.Lasso(alpha = 1-i/20.0)
    bayesian_ridge.fit(x_train, y_train)
    bayesian_ridge_predictions = bayesian_ridge.predict(x_test_listprice)
    median = np.median(abs((unprocess(bayesian_ridge_predictions))-unprocess(y_test)))
    print(median, 1-i/20.0)

(61046.770500969229, 1.0)
(58751.434876406682, 0.95)
(56030.296179051162, 0.9)
(52184.324801374678, 0.85)
(48578.219325049009, 0.8)
(45393.772775971302, 0.75)
(42402.992651826484, 0.7)
(39323.185830956252, 0.65)
(36055.276695751643, 0.6)
(32403.051240164001, 0.55)
(29501.484379176633, 0.5)
(26394.122422166693, 0.44999999999999996)
(23391.193424511701, 0.4)
(20264.940001637442, 0.35)
(17349.547580657207, 0.30000000000000004)
(14365.188999693215, 0.25)
(11500.495800816105, 0.19999999999999996)
(8777.9564921767596, 0.15000000000000002)
(6449.768017185299, 0.09999999999999998)
(4367.9523624748981, 0.050000000000000044)
(3328.4539785103989, 0.0)
(11332.758809624138, -0.050000000000000044)
(22263.762925524105, -0.10000000000000009)
(32200.843657212594, -0.1499999999999999)
(42183.248053808464, -0.19999999999999996)
(52475.359479470964, -0.25)
(58885.376959582696, -0.30000000000000004)
(67683.086293300119, -0.3500000000000001)
(75758.547532147728, -0.3999999999999999)
(82361.160252564237, -0.

  app.launch_new_instance()


In [20]:
x_train[:5]

Unnamed: 0,LivingArea,NumBedrooms,NumBaths,ExteriorStories,ListPrice,GeoLat,GeoLon,Both Private & Community,Private,Apartment Style/Flat,Gemini/Twin Home,Single Family - Detached,Townhouse
0,-0.744707,0.904503,-0.080494,-0.426973,-0.323824,0.661435,-0.354584,-0.592229,-0.039833,-0.085409,-0.027647,-0.276699,-0.777024
1,0.183095,-0.221713,-0.038193,-0.426973,0.53467,0.038596,-1.172749,-0.592229,-0.039833,-0.085409,-0.027647,-0.276699,-0.777024
2,0.115537,0.904503,-0.038193,-0.426973,0.43143,0.003389,-1.185919,-0.592229,-0.039833,-0.085409,-0.027647,-0.276699,0.984511
3,1.3451,2.030719,0.131012,0.843059,1.178876,0.005641,1.160108,1.688488,-0.039833,-0.085409,-0.027647,-0.276699,-0.777024
4,-0.477476,-0.221713,-0.038193,-0.426973,-0.449216,0.60369,-1.353673,-0.592229,-0.039833,-0.085409,-0.027647,-0.276699,1.571689


In [46]:
def ridge_regression(x_train, y_train, x_test, lam):
    x_train = np.array(x_train)
    y_train = np.array(y_train)
    x_test = np.array(x_test)
    
    x = np.column_stack((np.ones(len(x_train)), x_train))
    y = y_train
    
    #((XtX + lambda I)^-1 * Xt*y)
    
    xt = np.transpose(x)
    product = np.dot(xt, x)
    lambda_identity = lam*np.identity(len(xt))
    inverse = np.linalg.inv(product + lambda_identity)
    weights = np.dot(np.dot(inverse, xt), y)
    
    predictions = []
    for i in x_test:
        evaluated = sum(i*weights[1:])
        predictions.append(evaluated + weights[0])
    
    return predictions
    
ridge_predictions = ridge_regression(x_train, y_train, x_test_listprice, 8)

In [47]:
median = np.median(abs((unprocess(np.array(ridge_predictions)))-unprocess(y_test)))
median

3318.8369380600052

In [30]:
lasso = linear_model.Lasso(alpha = .01)
lasso.fit(x_train, y_train)
lasso_predictions = lasso.predict(x_test_listprice)
median = np.median(abs((unprocess(lasso_predictions))-unprocess(y_test)))
print(median)
print lasso.coef_

3498.31285086
[ 0.          0.00263044  0.         -0.          0.9840766  -0.         -0.
 -0.         -0.         -0.          0.         -0.          0.        ]


In [32]:
bayesian_ridge = linear_model.Ridge(alpha = 8)
bayesian_ridge.fit(x_train, y_train)
bayesian_ridge_predictions = bayesian_ridge.predict(x_test_listprice)
median = np.median(abs((unprocess(bayesian_ridge_predictions))-unprocess(y_test)))
bayesian_ridge.coef_

array([ -7.19259979e-03,   1.05397289e-02,   8.92793863e-04,
        -8.58661699e-04,   9.95361344e-01,  -5.54372162e-03,
        -6.33059272e-03,  -1.67210647e-03,  -5.20435984e-03,
        -1.12831245e-03,   4.23844344e-04,  -2.49722753e-03,
         5.54819505e-04])

In [5]:
bayesian_ridge = linear_model.Ridge()
bayesian_ridge.fit(x_train_stacking, y_train_stacking)
bayesian_ridge_predictions_test = bayesian_ridge.predict(x_test_listprice)

bayesian_ridge = linear_model.Ridge()
bayesian_ridge.fit(x_train_stacking, y_train_stacking)
bayesian_ridge_predictions_stacking = bayesian_ridge.predict(x_train_training)

x_train_training['Bayesian_Ridge_Predictions'] = bayesian_ridge_predictions_stacking
x_test_listprice['Bayesian_Ridge_Predictions'] = bayesian_ridge_predictions_test

booster = ensemble.GradientBoostingRegressor(n_estimators = 110)
booster.fit(x_train_training, y_train_training)
booster_predictions = booster.predict(x_test_listprice)

forest = ensemble.RandomForestRegressor(n_estimators = 100)
forest.fit(x_train_training, y_train_training)
forest_predictions = booster.predict(x_test_listprice)

median = np.median(abs((unprocess(booster_predictions)+unprocess(forest_predictions))/2.0-unprocess(y_test)))
print "Median = %r" %median

Median = 3280.039948568301


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
