In [2]:
# Database connect
import psycopg2

# DataFrames
import pandas as pd
pd.set_option('display.max_columns', 500)

# Plotting
import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use('ggplot')

# Models
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

# Cross validation
from sklearn.cross_validation import train_test_split, cross_val_score
from sklearn.ensemble.partial_dependence import plot_partial_dependence
from sklearn.model_selection import KFold, ShuffleSplit


# Preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.utils import shuffle

In [17]:
db_info = {
    'user':'app_connect',
    'database':'real_estate_data',
    'host' : 'localhost',
    'port' : 5432,
    'password' : 'flying_horse536'
}
conn = psycopg2.connect(**db_info)
cur = conn.cursor()

In [18]:
def execute_query(q):
    conn.rollback()
    cur.execute(q)
    conn.commit()
    return cur.fetchall(), cur.description

In [19]:
query = '''
    SELECT *
    FROM sales_info s
    LEFT JOIN property_info p
    ON s.major = p.major AND s.minor = p.minor
    ;
'''

In [24]:
query_result, description = execute_query(query)
column_names = [d.name for d in description]

In [25]:
df = pd.DataFrame(query_result, columns=column_names)

In [26]:
# Change 'DocumentDate' to DateTime
df['document_date'] = pd.to_datetime(df['document_date'])
# Get SaleYear and SaleMonth columns out of the DateTime object
df['sale_year'] = pd.DatetimeIndex(df['document_date']).year
df['sale_year'] -= df['sale_year'].min()
df['sale_month'] = pd.DatetimeIndex(df['document_date']).month

In [27]:
def filter_df(keep_values, col_name, df):
    mask = df[col_name].isin(keep_values)
    return df[mask]

In [28]:
'''
PrincipalUse value of 6 represents Residential buildings
Keep only sales with PrincipalUse value of 6
'''
principal_use_keep_values = [6]
df = filter_df(keep_values=principal_use_keep_values, col_name='principal_use', df=df)

'''
PropertyType value of 1 represents Land Only
PropertyType value of 2 represents Land with New Building
PropertyType value of 3 represents Land with Previously Used Building
Keep only sales with PropertyType value in [1, 2, 3]
'''
property_type_keep_values = [1, 2, 3]
df = filter_df(property_type_keep_values, 'property_type', df)

'''
SaleInstrument value of 3 represents a Statutory Warranty Deed
By using this deed, the seller promises the buyer 
1. The seller is the owner of the property and has the right to sell it
2. No one else is possessing the property
3. There are no encumbrances against the property
4. No one with a better claim to the property will interfere with the transferee’s rights
5. The seller will defend certain claims regarding title to the property
'''
sale_instrument_keep_values = [3]
df = filter_df(sale_instrument_keep_values, 'sale_instrument', df)

In [29]:
# Shuffle dataframe
df = shuffle(df)
df.fillna(0, inplace=True)

In [30]:
# Setting up features for model:

# Features:
sales_feature = [
    'property_type',
    'sale_reason',
    'property_class',
    'sale_year',
    'sale_month'
]

property_features = [
    'nbr_living_units', 
    'stories',
    'bldg_grade',
    'bldg_grade_var', 
    'sq_ft_1st_floor',
    'sq_ft_half_floor',
    'sq_ft_2nd_floor',
    'sq_ft_upper_floor',
    'sq_ft_unfin_full',
    'sq_ft_unfin_half',
    'sq_ft_tot_living',
    'sq_ft_tot_basement',
    'sq_ft_fin_basement',
    'fin_basement_grade',
    'sq_ft_garage_basement',
    'sq_ft_garage_attached'
]

combined_features = sales_feature + property_features

features_as_is = [
    'sale_year',
    'sale_month'
]

features_to_scale = [
    'sq_ft_1st_floor',
    'sq_ft_half_floor',
    'sq_ft_2nd_floor',
    'sq_ft_upper_floor',
    'sq_ft_unfin_full',
    'sq_ft_unfin_half',
    'sq_ft_tot_living',
    'sq_ft_tot_basement',
    'sq_ft_fin_basement',
    'sq_ft_garage_basement',
    'sq_ft_garage_attached'
]

dummy_features = [
    'property_type',
    'sale_reason',
    'property_class',
    'nbr_living_units', 
    'stories',
    'bldg_grade',
    'bldg_grade_var', 
]


# Standardize the dataframe
scalar = StandardScaler().fit(df[features_to_scale])
df[features_to_scale] = scalar.transform(df[features_to_scale])

final_df = df[features_as_is + features_to_scale + ['sale_price']]

# Get dummy cols
dummies = pd.get_dummies(df[dummy_features].applymap(str))
final_df.append(dummies)


# Target value: SalePrice
y = final_df.pop('sale_price')

X = final_df



In [34]:
# Train Test Split Score
X_train, X_test, y_train, y_test = train_test_split(X, y)

# Random Forest
model_RF = RandomForestRegressor()
model_RF.fit(X_train, y_train)
print("Random Forest Score: {}".format(model_RF.score(X_test, y_test)))

# Gradient Booster
model_GB = GradientBoostingRegressor()
model_GB.fit(X_train, y_train)
print("Gradient Booster Score: {}".format(model_GB.score(X_test, y_test)))

Random Forest Score: 0.392971122328106
Gradient Booster Score: 0.08498579234056958


In [None]:
#### Cross Val Score
model_cv_RF = RandomForestRegressor(n_jobs=-1)
print(cross_val_score(model_cv_RF, X, y, cv=3).mean())