In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import datetime
%matplotlib inline

In [2]:
def create_dummies(df, list_fields):
    df_new = df
    for field in list_fields:
        df_field = pd.get_dummies(df[field],prefix=field)
        df_new = pd.concat([df_new, df_field], axis=1)
        df_new.drop(field,axis=1,inplace=True)
    return df_new

def convert_timestamp(df):
    df['timestamp'] = pd.to_datetime(df['timestamp'])
    df['year'] = df['timestamp'].dt.year
    df['month'] = df['timestamp'].dt.month
    df['day'] = df['timestamp'].dt.day
    df['timestamp'] = (df['timestamp']-df['timestamp'].min()).dt.days
    return df

In [21]:
train = pd.read_csv('train.csv')
df_new = create_dummies(train,['product_type','ecology'])
df_new.drop('sub_area',axis=1,inplace=True)
df_new = convert_timestamp(df_new)

test = pd.read_csv('test.csv')
df_new_test = create_dummies(test,['product_type','ecology'])
df_new_test.drop('sub_area',axis=1,inplace=True)
df_new_test = convert_timestamp(df_new_test)

columns_to_keep = np.intersect1d(df_new.columns,df_new_test.columns)

In [22]:
df_new = df_new[np.append(columns_to_keep,'price_doc')]
df_new_test = df_new_test[columns_to_keep]

In [43]:
#Rough cleaning
cleaned = df_new.fillna(df_new.mean())[10000:]

In [53]:
from sklearn.model_selection import train_test_split

X_cleaned = cleaned.drop('price_doc',axis=1)
y_cleaned = cleaned['price_doc']

X_train, X_test, y_train, y_test = train_test_split(X_cleaned, y_cleaned, test_size=0.2, random_state=0)

In [38]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.grid_search import GridSearchCV
from sklearn.metrics import make_scorer
# Import libraries necessary for this project
import numpy as np
import pandas as pd
from sklearn.cross_validation import ShuffleSplit


In [39]:
from sklearn.metrics import mean_squared_error, r2_score

def performance_metric(y_true, y_predict):
    """ Calculates and returns the performance score between 
        true and predicted values based on the metric chosen. """
    
    # Compute the RMSLE 
    score = np.sqrt(np.mean((np.log(y_true+1)-np.log(y_predict+1))**2))
    
    # Return the score
    return score

In [54]:
def fit_model(X, y):
    """ Performs grid search over the 'max_depth' parameter for a 
        decision tree regressor trained on the input data [X, y]. """
    
    # Create cross-validation sets from the training data
    cv_sets = ShuffleSplit(X.shape[0], n_iter = 10, test_size = 0.20, random_state = 0)
    
    regressor = RandomForestRegressor(n_estimators=20)

    # Create grid parameters dictionary
    #params = {'n_estimators': [5, 10, 50, 100], 'max_depth': range(1,11), 'min_samples_split': [2, 3, 5]}
    params = {'max_depth': range(11,15)}
    
    # Transform 'performance_metric' into a scoring function
    scoring_fnc = make_scorer(performance_metric, greater_is_better=False)

    # Create the grid search object
    grid = GridSearchCV(estimator=regressor,param_grid=params,scoring=scoring_fnc,cv=cv_sets)

    # Fit the grid search object to the data to compute the optimal model
    grid = grid.fit(X, y)

    # Return the optimal model after fitting the data
    return grid.best_estimator_

In [55]:
# Fit the training data to the model using grid search
reg = fit_model(X_train, y_train)

# Produce the value for 'max_depth'
print "Parameter 'max_depth' is {} for the optimal model.".format(reg.get_params()['max_depth'])

Parameter 'max_depth' is 13 for the optimal model.


In [57]:
X_test_final = df_new_test.fillna(df_new_test.mean())

In [58]:
predictions = reg.predict(X_test_final)
test['price_doc'] = predictions

In [59]:
submission = test[['id','price_doc']]
submission.set_index('id',inplace=True)
submission.head()

Unnamed: 0_level_0,price_doc
id,Unnamed: 1_level_1
30474,5417854.0
30475,8222329.0
30476,5991031.0
30477,6713476.0
30478,5044535.0


In [52]:
submission.to_csv('submission1.csv')

In [55]:
predictions[0]

5449646.19298077

In [None]:
print cleaned.columns

In [60]:
# RMLSE
predictions2 = reg.predict(X_test)
epsilon = np.sqrt(np.mean((np.log(predictions2+1)-np.log(y_test+1))**2))
print(epsilon)

0.459370433866


In [49]:
from sklearn.decomposition import PCA

pca = PCA(n_components=200)
pca.fit(X_train)

X_train_pca = pca.transform(X_train)
reg = fit_model(X_train_pca, y_train)
print "Parameter 'max_depth' is {} for the optimal model.".format(reg.get_params()['max_depth'])

Parameter 'max_depth' is 10 for the optimal model.


In [50]:
# RMLSE
X_test_pca = pca.transform(X_test)
predictions2 = reg.predict(X_test_pca)
epsilon = np.sqrt(np.mean((np.log(predictions2+1)-np.log(y_test+1))**2))
print(epsilon)

0.470218760232
