In [2]:
import pandas as pd
import numpy as np
import zipfile
import matplotlib.pyplot as plt
from sklearn import ensemble
from sklearn import linear_model
from sklearn import svm
from collections import Counter
import sklearn.cross_validation
import math
import time
from scipy import stats
from sklearn.linear_model import RandomizedLasso
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression
import sys
%matplotlib inline

In [3]:
filename = "data_sci_snippet.csv.zip"
zf = zipfile.ZipFile(filename, 'r')
print zf.namelist()

['data_sci_snippet.csv']


In [4]:
def time_preprocessing(df, column_name):
    
    times = []
    count = 0
    start = time.time()
    for i in df[column_name]:
        count += 1
        if count % 100000 == 0:
            time_elapsed = time.time() - start
            #print "Count = %r, Time Elapsed = %r" %(count, time_elapsed)
        times.append(time.strptime(i, "%Y-%m-%d"))

    year = []
    month = []
    day = []
    day_of_week = []
    day_in_year = []

    for i in times:
        year.append(i[0])
        month.append(i[1])
        day.append(i[2])
        day_of_week.append(i[6])
        day_in_year.append(i[7])
    df[column_name + ' year'] = year
    df[column_name + ' month'] = month
    df[column_name + ' day'] = day
    df[column_name + ' day_of_week'] = day_of_week
    df[column_name + ' day_in_year'] = day_in_year
    df.drop([column_name], axis=1)
    return df

In [5]:
data = pd.read_csv(zf.open('data_sci_snippet.csv'))

one_hot_columns = ['Pool','ListingStatus','DwellingType']
count = 0
for i in one_hot_columns:
    dummies = pd.get_dummies(data[i])
    for j in dummies:
        data[j] = dummies[j]
        count += 1

data = time_preprocessing(data, 'ListDate')
data = data.dropna()

# Note that we won't have closedate in our test data
# Could still build a model for closing date and use that as a feature
#data = time_preprocessing(data, 'CloseDate')
data = data.drop(['CloseDate'], 1)

# Remove Outliers
data = data[(data['ClosePrice'] > 10000)]
data = data[(data['ClosePrice'] < 500000)]
data = data[(data['ListPrice'] <= 7000000)]

data = data.drop(['Pool', 'ListingStatus', 'DwellingType', 'ListDate', 'PublicRemarks', 'ListDate year'], axis=1)

data['ListPrice'] = np.log(data['ListPrice'])
data['ClosePrice'] = np.log(data['ClosePrice'])
preprocessing_array = []
for i in data:
    if i == 'ClosePrice':
        closeprice_mean = data[i].mean()
        closeprice_std = data[i].std()
    preprocessing_array.append([i, data[i].mean(), data[i].std()])
    data[i] = data[i] - data[i].mean()
    data[i] = data[i]/float(data[i].std())

def unprocess(data, mean = closeprice_mean, std = closeprice_std):
    return np.exp(data*closeprice_std+closeprice_mean)

y = data['ClosePrice']
x = data.drop(['ClosePrice'], 1)
columns = data.columns
x_train, x_test, y_train, y_test = sklearn.cross_validation.train_test_split(
    x, y, test_size=0.02, random_state=14)

data = pd.DataFrame(x_train, columns=columns[:-1])
data['ClosePrice'] = y_train
test_data = pd.DataFrame(x_test, columns=columns[:-1])
test_data['ClosePrice'] = y_test

full = pd.DataFrame()
for_test = pd.DataFrame()
for i in data:
    for j in data:
        if i != 'ClosePrice' and j != 'ClosePrice':
            full[i + '*' + j] = data[i]*data[j]
            for_test[i + '*' + j] = test_data[i]*test_data[j]
full['ClosePrice'] = data['ClosePrice']
correlations = full.corrwith(full['ClosePrice'])
columns = full.columns
for i in range(len(correlations)):
    if abs(list(correlations)[i]) > .3:
        data[columns[i]] = full[columns[i]]
        if columns[i] != 'ClosePrice':
            test_data[columns[i]] = for_test[columns[i]]

stacking_data = data[:len(data)/2]
training_data = data[len(data)/2:]

x_train_stacking = stacking_data[[
                    'LivingArea',
                    'NumBedrooms',
                    'NumBaths',
                    'ExteriorStories',
                    'ListPrice',
                    'GeoLat',
                    'GeoLon',
                    'Both Private & Community',
                    'Private',
                    'Apartment Style/Flat',
                    'Gemini/Twin Home',
                    'Single Family - Detached',
                    'Townhouse',
                'ListPrice*ListPrice',
                 'ListPrice*Single Family - Detached',
                 'Community*None',
                 'None*Community',
                 'Private*Private',
                 'Apartment Style/Flat*Mfg/Mobile Housing',
                 'Apartment Style/Flat*Townhouse',
                 'Mfg/Mobile Housing*Apartment Style/Flat',
                 'Mfg/Mobile Housing*Townhouse',
                 'Single Family - Detached*ListPrice',
                 'Single Family - Detached*Single Family - Detached',
                 'Townhouse*Apartment Style/Flat',
                 'Townhouse*Mfg/Mobile Housing'
               ]]
y_train_stacking = stacking_data['ClosePrice']

x_train_training = training_data[[
                    'LivingArea',
                    'NumBedrooms',
                    'NumBaths',
                    'ExteriorStories',
                    'ListPrice',
                    'GeoLat',
                    'GeoLon',
                    'Both Private & Community',
                    'Private',
                    'Apartment Style/Flat',
                    'Gemini/Twin Home',
                    'Single Family - Detached',
                    'Townhouse',
                     'ListPrice*ListPrice',
                     'ListPrice*Single Family - Detached',
                     'Community*None',
                     'None*Community',
                     'Private*Private',
                     'Apartment Style/Flat*Mfg/Mobile Housing',
                     'Apartment Style/Flat*Townhouse',
                     'Mfg/Mobile Housing*Apartment Style/Flat',
                     'Mfg/Mobile Housing*Townhouse',
                     'Single Family - Detached*ListPrice',
                     'Single Family - Detached*Single Family - Detached',
                     'Townhouse*Apartment Style/Flat',
                     'Townhouse*Mfg/Mobile Housing'
               ]]
y_train_training = training_data['ClosePrice']

x_test_listprice = test_data[[
                    'LivingArea',
                    'NumBedrooms',
                    'NumBaths',
                    'ExteriorStories',
                    'ListPrice',
                    'GeoLat',
                    'GeoLon',
                    'Both Private & Community',
                    'Private',
                    'Apartment Style/Flat',
                    'Gemini/Twin Home',
                    'Single Family - Detached',
                    'Townhouse',
                     'ListPrice*ListPrice',
                     'ListPrice*Single Family - Detached',
                     'Community*None',
                     'None*Community',
                     'Private*Private',
                     'Apartment Style/Flat*Mfg/Mobile Housing',
                     'Apartment Style/Flat*Townhouse',
                     'Mfg/Mobile Housing*Apartment Style/Flat',
                     'Mfg/Mobile Housing*Townhouse',
                     'Single Family - Detached*ListPrice',
                     'Single Family - Detached*Single Family - Detached',
                     'Townhouse*Apartment Style/Flat',
                     'Townhouse*Mfg/Mobile Housing'
                   ]]
y_test = test_data['ClosePrice']

bayesian_ridge = linear_model.Ridge()
bayesian_ridge.fit(x_train_stacking, y_train_stacking)
bayesian_ridge_predictions_test = bayesian_ridge.predict(x_test_listprice)

bayesian_ridge = linear_model.Ridge()
bayesian_ridge.fit(x_train_stacking, y_train_stacking)
bayesian_ridge_predictions_stacking = bayesian_ridge.predict(x_train_training)

x_train_training['Bayesian_Ridge_Predictions'] = bayesian_ridge_predictions_stacking
x_test_listprice['Bayesian_Ridge_Predictions'] = bayesian_ridge_predictions_test

booster = ensemble.GradientBoostingRegressor(n_estimators = 110)
booster.fit(x_train_training, y_train_training)
booster_predictions = booster.predict(x_test_listprice)

forest = ensemble.RandomForestRegressor(n_estimators = 100)
forest.fit(x_train_training, y_train_training)
forest_predictions = booster.predict(x_test_listprice)

median = np.median(abs((unprocess(booster_predictions)+unprocess(forest_predictions))/2.0-unprocess(y_test)))
print "Median = %r" %median

Median = 3280.039948568301


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
