In [79]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import datetime
%matplotlib inline

In [94]:
def create_dummies(df, list_fields):
    df_new = df
    for field in list_fields:
        df_field = pd.get_dummies(df[field],prefix=field)
        df_new = pd.concat([df_new, df_field], axis=1)
        df_new.drop(field,axis=1,inplace=True)
    return df_new

def convert_timestamp(df):
    df['timestamp'] = pd.to_datetime(df['timestamp'])
    df['timestamp'] = (df['timestamp']-df['timestamp'].min()).dt.days
    return df

In [96]:
train = pd.read_csv('train.csv')
df_new = create_dummies(train,['sub_area','product_type','ecology'])
df_new = convert_timestamp(df_new)

test = pd.read_csv('test.csv')
df_new_test = create_dummies(test,['sub_area','product_type','ecology'])
df_new_test = convert_timestamp(df_new_test)

columns_to_keep = np.intersect1d(df_new.columns,df_new_test.columns)

In [97]:
df_new = df_new[np.append(columns_to_keep,'price_doc')]
df_new_test = df_new_test[columns_to_keep]

In [98]:
#Rough cleaning
cleaned = df_new.fillna(df_new.mean())

In [99]:
X_train = cleaned.drop('price_doc',axis=1)
y_train = cleaned['price_doc']

In [100]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.grid_search import GridSearchCV
from sklearn.metrics import make_scorer
# Import libraries necessary for this project
import numpy as np
import pandas as pd
from sklearn.cross_validation import ShuffleSplit


In [101]:
from sklearn.metrics import r2_score

def performance_metric(y_true, y_predict):
    """ Calculates and returns the performance score between 
        true and predicted values based on the metric chosen. """
    
    # TODO: Calculate the performance score between 'y_true' and 'y_predict'
    score = r2_score(y_true,y_predict)
    
    # Return the score
    return score

In [102]:
def fit_model(X, y):
    """ Performs grid search over the 'max_depth' parameter for a 
        decision tree regressor trained on the input data [X, y]. """
    
    # Create cross-validation sets from the training data
    # sklearn version 0.18: ShuffleSplit(n_splits=10, test_size=0.1, train_size=None, random_state=None)
    # sklearn versiin 0.17: ShuffleSplit(n, n_iter=10, test_size=0.1, train_size=None, random_state=None)
    cv_sets = ShuffleSplit(X.shape[0], n_iter = 10, test_size = 0.20, random_state = 0)

    # TODO: Create a decision tree regressor object
    regressor = DecisionTreeRegressor()

    # TODO: Create a dictionary for the parameter 'max_depth' with a range from 1 to 10
    params = {'max_depth': range(1,11)}

    # TODO: Transform 'performance_metric' into a scoring function using 'make_scorer' 
    scoring_fnc = make_scorer(performance_metric)

    # TODO: Create the grid search object
    grid = GridSearchCV(estimator=regressor,param_grid=params,scoring=scoring_fnc,cv=cv_sets)

    # Fit the grid search object to the data to compute the optimal model
    grid = grid.fit(X, y)

    # Return the optimal model after fitting the data
    return grid.best_estimator_

In [103]:
# Fit the training data to the model using grid search
reg = fit_model(X_train, y_train)

# Produce the value for 'max_depth'
print "Parameter 'max_depth' is {} for the optimal model.".format(reg.get_params()['max_depth'])

Parameter 'max_depth' is 7 for the optimal model.


In [104]:
X_test = df_new_test.fillna(df_new_test.mean())

In [105]:
predictions = reg.predict(X_test)
test['price_doc'] = predictions

In [106]:
submission = test[['id','price_doc']]
submission.set_index('id',inplace=True)
submission.head()

Unnamed: 0_level_0,price_doc
id,Unnamed: 1_level_1
30474,5495062.0
30475,6554423.0
30476,5495062.0
30477,5974426.0
30478,5495062.0


In [107]:
submission.to_csv('submission1.csv')

In [70]:
predictions[0]

3927348.1556256572

In [18]:
print cleaned.columns

Index([u'id', u'full_sq', u'life_sq', u'floor', u'max_floor', u'material',
       u'build_year', u'num_room', u'kitch_sq', u'state',
       ...
       u'cafe_count_5000_price_4000', u'cafe_count_5000_price_high',
       u'big_church_count_5000', u'church_count_5000', u'mosque_count_5000',
       u'leisure_count_5000', u'sport_count_5000', u'market_count_5000',
       u'price_doc', u'Unnamed: 292'],
      dtype='object', length=289)


In [108]:
#RMLSE
predictions2 = reg.predict(X_train)
epsilon = 0
n=len(y_train)
for i in range(0,n):
    epsilon += (np.log(predictions2[i]+1)-np.log(y_train.iloc[i]+1))**2
epsilon = epsilon/n
epsilon = np.sqrt(epsilon)
print(epsilon)

0.480068737987
