# Importing libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import sklearn 

# Import data and data preprocessing

In [2]:
df = pd.read_csv('kc_house_data.csv')
df=pd.DataFrame(df)
df['date']  = pd.to_datetime(df['date'])
year = df['date'].dt.year
for x in range(21613): 
    if df['yr_renovated'][x] == 0:
        yr_actual_point = df['yr_built']
    else:
        yr_actual_point = df['yr_renovated']
df['age'] = year - yr_actual_point 
df['is_renovated']=df['yr_renovated'].apply(lambda x : 0 if x==0 else 1)
df = df[['sqft_living', 'floors', 'bedrooms', 'bathrooms','grade','is_renovated','waterfront','view','zipcode','age','price']]
df = pd.get_dummies(df,columns=['zipcode'])
X = df[df.columns.difference(['price','zipcode_98001'])]
y = df['price']

# train and test split

In [3]:
from sklearn.cross_validation import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20,random_state=0)



# Linear Regression

In [4]:
from sklearn.linear_model import LinearRegression
regressor = LinearRegression()
regressor.fit(X_train,y_train)
y_pred = regressor.predict(X_test)
from sklearn.metrics import mean_squared_error
mse = mean_squared_error(y_test,y_pred)
np.sqrt(mse)

152160.96783104105

# Decision Tree

In [5]:
from sklearn.tree import DecisionTreeRegressor
dt = DecisionTreeRegressor()
dt.fit(X_train,y_train)
y_pred=dt.predict(X_test)
mse = mean_squared_error(y_test,y_pred)
np.sqrt(mse)

182232.62158711778

# Random Forest

In [6]:
from sklearn.ensemble import RandomForestRegressor
rf = RandomForestRegressor(criterion='mse',n_jobs=1)
rf.fit(X_train,y_train)
y_pred=rf.predict(X_test)
mse = mean_squared_error(y_test,y_pred)
np.sqrt(mse)

139991.565802109

# XGBoost

In [7]:
from xgboost import XGBRegressor
xg= XGBRegressor()
xg.fit(X_train,y_train)
y_pred=xg.predict(X_test)
mse = mean_squared_error(y_test,y_pred)
np.sqrt(mse)

146613.83602468873

In [8]:
# # A host of Scikit-learn models
# from sklearn.linear_model import LinearRegression
# # from sklearn.tree import DecisionTreeRegressor
# from sklearn.ensemble import RandomForestRegressor
# from xgboost import XGBRegressor
# # from sklearn.neighbors import KNeighborsClassifier
# # from sklearn.neural_network import MLPClassifier
# # from sklearn.kernel_approximation import Nystroem
# # from sklearn.kernel_approximation import RBFSampler
# # from sklearn.pipeline import make_pipeline 

In [9]:
# def get_models():
#     """Generate a library of base learners."""
# #     SEED = 123
# #     nb = GaussianNB()
# #     svc = SVC(C=100, probability=True)
# #     knn = KNeighborsClassifier(n_neighbors=3)
# #     lr = LogisticRegression(C=100, random_state=SEED)
# #     nn = MLPClassifier((80, 10), early_stopping=False, random_state=SEED)
# #     gb = GradientBoostingClassifier(n_estimators=100, random_state=SEED)
# #     rf = RandomForestClassifier(n_estimators=10, max_features=3, random_state=SEED)
#     regressor = LinearRegression()
#     rf = RandomForestRegressor(max_depth=15)    
#     xg= XGBRegressor()
    
    
    
    
#     models = {'linear regression' : regressor,
#              'random forest': rf,
#              'xgboost': xg}
        
# #     models = {'svm': svc,
# #               'knn': knn,
# #               'naive bayes': nb,
# #               'mlp-nn': nn,
# #               'random forest': rf,
# #               'gbm': gb,
# #               'logistic': lr,
# #               }

#     return models


# def train_predict(model_list):
#     """Fit models in list on training set and return preds"""
#     P = np.zeros((y_test.shape[0], len(model_list)))
#     P = pd.DataFrame(P)

#     print("Fitting models.")
#     cols = list()
#     for i, (name, m) in enumerate(models.items()):
#         print("%s..." % name, end=" ", flush=False)
#         m.fit(X_train, y_train)
#         P.iloc[:, i] = m.predict(X_test)
#         cols.append(name)
#         print("done")

#     P.columns = cols
#     print("Done.\n")
#     return P


# def score_models(P, y):
#     """Score model in prediction DF"""
#     print("Scoring models.")
#     for m in P.columns:
#         score = np.sqrt(mean_squared_error(y, P.loc[:, m]))
#         print("%-26s: %.3f" % (m, score))
#     print("Done.\n")

In [10]:
# models = get_models()
# P = train_predict(models)
# score_models(P, y_test)

# Gradient Boost

In [14]:
from sklearn.ensemble import GradientBoostingRegressor
gb = GradientBoostingRegressor(alpha= 0.9,learning_rate= 0.1,loss='ls',max_depth=15,n_estimators=100,min_samples_split=30,max_features=10,random_state=0)
gb.fit(X_train,y_train)
y_pred = gb.predict(X_test)
from sklearn.metrics import mean_squared_error
mse = mean_squared_error(y_test,y_pred)
np.sqrt(mse)

119030.4120771039

# k- fold

In [12]:
from sklearn.model_selection import cross_val_score
accuracies = cross_val_score(estimator = gb, X = X, y = y, cv = 10)
accuracies.mean()
# accuracies.std()

0.8540109306988176

# Grid Search

In [None]:
# from sklearn.model_selection import GridSearchCV
# parameters = [{'loss':['ls'],'learning_rate':[0.1],'n_estimators':[100],
#               'max_depth':[10],'alpha':[0.9]},
#              {'loss':['ls'],'learning_rate':[0.1],'n_estimators':[100],
#               'max_depth':[15],'alpha':[0.9]}]
# grid_search = GridSearchCV(estimator = gb,
#                            param_grid = parameters,
#                            cv = 10,
#                            n_jobs = -1)
# grid_search = grid_search.fit(X_train, y_train)
# best_accuracy = grid_search.best_score_
# best_parameters = grid_search.best_params_


In [None]:
# best_accuracy

In [None]:
# best_parameters