In [None]:
# Notebook that runs 5 different models on the transformed data and saves each model's predictions to CSV
# TODO: convert to a single script

In [2]:
# # Create the actual training data
# import transform

# transform.process_data('data/train.csv','data/test.csv')

In [19]:
# Libraries
import sys
import pandas as pd
import numpy as np

# ML
from sklearn.model_selection import KFold
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.ensemble import GradientBoostingRegressor

# View all cols in jupyter notebook - CAN DROP
from IPython.display import display
pd.options.display.max_columns = None

In [47]:
# Read the three pre-processed CSV files; each frame is re-keyed by its 'Id' column.

# Features of the training rows
train = pd.read_csv('p_train.csv').set_index('Id')        # replace this with sys.argv[1]

# Targets for the training rows (the cells below read the log_SalePrice column)
train_prices = pd.read_csv('prices.csv').set_index('Id')

# Features of the rows to predict - no sale price is available for these
to_guess = pd.read_csv('p_test.csv').set_index('Id')      # replace this with sys.argv[2]

In [None]:
# Rebuild every estimator from its pickled hyper-parameter dictionary
import pickle


def _load_params(path):
    """Return the object unpickled from `path`.

    The result is unpacked with ** into an estimator constructor below, so each
    file is expected to hold a dict of keyword arguments.
    """
    # NOTE: pickle.load can execute arbitrary code - only open files that were
    # produced by this project, never files from an untrusted source.
    with open(path, 'rb') as f:
        return pickle.load(f)


# Linear
param_dict_linear = _load_params('linear_params.pkl')
model_linear = LinearRegression(**param_dict_linear)

# RF1
param_dict_rf1 = _load_params('rf1_params.pkl')
model_rf1 = RandomForestRegressor(**param_dict_rf1)

# RF2
param_dict_rf2 = _load_params('rf2_params.pkl')
model_rf2 = RandomForestRegressor(**param_dict_rf2)

# XGBoost - must be an XGBRegressor; building a RandomForestRegressor here would
# train a third random forest (or reject the XGBoost-specific keyword arguments)
param_dict_xgb = _load_params('xgb_params.pkl')
model_xgb = XGBRegressor(**param_dict_xgb)

# GradientBoosting - same reasoning: use the estimator the parameters belong to
param_dict_gb = _load_params('gb_params.pkl')
model_gb = GradientBoostingRegressor(**param_dict_gb)

In [52]:
# Partition the training rows into 5 folds
kf = KFold(n_splits=5)

# Materialise the splits once, then separate them into two parallel lists:
# entry k of each list holds the row positions for the k-th split.
fold_pairs = list(kf.split(train))
train_indicies = [fit_rows for fit_rows, _ in fold_pairs]
test_indicies = [holdout_rows for _, holdout_rows in fold_pairs]

# The arrays contain row positions, so use them with .iloc:
# train.iloc[train_indicies[0]]         -> features used to fit split 0
# train_prices.iloc[train_indicies[0]]  -> labels used to fit split 0
# train.iloc[test_indicies[0]]          -> held-out features of split 0
# train_prices.iloc[test_indicies[0]]   -> held-out labels of split 0

In [153]:
# Linear Regression
# Collect the out-of-fold predictions (model1..model5) and the full-fit
# predictions for the rows in to_guess (model6) next to the known target,
# then save everything to model_outputs/linear_results.csv.

# Results frame keyed by Id, seeded with the known target
linear_results = pd.DataFrame({"Id":train_prices.index, "actual":train_prices.log_SalePrice})
linear_results.set_index('Id',inplace=True)

# One model per fold: fit on the fold's training rows, predict its held-out rows
for fold, (fit_rows, holdout_rows) in enumerate(zip(train_indicies, test_indicies), start=1):
    # Fit on the log_SalePrice Series (the same column stored in 'actual') so the
    # predictions come back one-dimensional, like the other model cells
    model_linear.fit(train.iloc[fit_rows], train_prices.iloc[fit_rows].log_SalePrice)
    pred = model_linear.predict(train.iloc[holdout_rows])
    # The fold arrays hold row positions, not Id labels: look the labels up so the
    # merge attaches each prediction to the row it was made for
    temp_df = pd.DataFrame({'Id': train.index[holdout_rows], "model" + str(fold): pred})
    temp_df.set_index('Id', inplace=True)
    linear_results = linear_results.merge(temp_df, on='Id', how='left') # add to linear_results DF

# create a 6th model - train on all training, predict on all test ('to_guess')
model_linear.fit(train, train_prices.log_SalePrice)
pred = model_linear.predict(to_guess)
temp_df = pd.DataFrame({'Id':to_guess.index, "model6":pred})
temp_df.set_index('Id', inplace=True)
# outer merge: the to_guess Ids are not in the frame yet, so they are added as new rows
linear_results = linear_results.merge(temp_df, on='Id', how='outer')

# reorder cols (actual at the end)
linear_results = linear_results[['model1','model2','model3','model4','model5','model6','actual']]

# output to a csv
linear_results.to_csv('model_outputs/linear_results.csv')

In [180]:
# Random Forest 1
# Collect the out-of-fold predictions (model1..model5) and the full-fit
# predictions for the rows in to_guess (model6) next to the known target,
# then save everything to model_outputs/rf1_results.csv.

# Results frame keyed by Id, seeded with the known target
rf1_results = pd.DataFrame({"Id":train_prices.index, "actual":train_prices.log_SalePrice})
rf1_results.set_index('Id',inplace=True)

# One model per fold: fit on the fold's training rows, predict its held-out rows
for fold, (fit_rows, holdout_rows) in enumerate(zip(train_indicies, test_indicies), start=1):
    model_rf1.fit(train.iloc[fit_rows], train_prices.iloc[fit_rows].log_SalePrice)
    pred = model_rf1.predict(train.iloc[holdout_rows])
    # The fold arrays hold row positions, not Id labels: look the labels up so the
    # merge attaches each prediction to the row it was made for
    temp_df = pd.DataFrame({'Id': train.index[holdout_rows], "model" + str(fold): pred})
    temp_df.set_index('Id', inplace=True)
    rf1_results = rf1_results.merge(temp_df, on='Id', how='left') # add to rf1_results DF

# create a 6th model - train on all training, predict on all test ('to_guess')
model_rf1.fit(train, train_prices.log_SalePrice)
pred = model_rf1.predict(to_guess)
temp_df = pd.DataFrame({'Id':to_guess.index, "model6":pred})
temp_df.set_index('Id', inplace=True)
# outer merge: the to_guess Ids are not in the frame yet, so they are added as new rows
rf1_results = rf1_results.merge(temp_df, on='Id', how='outer')

# reorder cols (actual at the end)
rf1_results = rf1_results[['model1','model2','model3','model4','model5','model6','actual']]

# output to a csv
rf1_results.to_csv('model_outputs/rf1_results.csv')

In [182]:
# Random Forest 2
# Collect the out-of-fold predictions (model1..model5) and the full-fit
# predictions for the rows in to_guess (model6) next to the known target,
# then save everything to model_outputs/rf2_results.csv.

# Results frame keyed by Id, seeded with the known target
rf2_results = pd.DataFrame({"Id":train_prices.index, "actual":train_prices.log_SalePrice})
rf2_results.set_index('Id',inplace=True)

# One model per fold: fit on the fold's training rows, predict its held-out rows
for fold, (fit_rows, holdout_rows) in enumerate(zip(train_indicies, test_indicies), start=1):
    model_rf2.fit(train.iloc[fit_rows], train_prices.iloc[fit_rows].log_SalePrice)
    pred = model_rf2.predict(train.iloc[holdout_rows])
    # The fold arrays hold row positions, not Id labels: look the labels up so the
    # merge attaches each prediction to the row it was made for
    temp_df = pd.DataFrame({'Id': train.index[holdout_rows], "model" + str(fold): pred})
    temp_df.set_index('Id', inplace=True)
    rf2_results = rf2_results.merge(temp_df, on='Id', how='left') # add to rf2_results DF

# create a 6th model - train on all training, predict on all test ('to_guess')
model_rf2.fit(train, train_prices.log_SalePrice)
pred = model_rf2.predict(to_guess)
temp_df = pd.DataFrame({'Id':to_guess.index, "model6":pred})
temp_df.set_index('Id', inplace=True)
# outer merge: the to_guess Ids are not in the frame yet, so they are added as new rows
rf2_results = rf2_results.merge(temp_df, on='Id', how='outer')

# reorder cols (actual at the end)
rf2_results = rf2_results[['model1','model2','model3','model4','model5','model6','actual']]

# output to a csv
rf2_results.to_csv('model_outputs/rf2_results.csv')

In [184]:
# XGBoost
# Collect the out-of-fold predictions (model1..model5) and the full-fit
# predictions for the rows in to_guess (model6) next to the known target,
# then save everything to model_outputs/xgb_results.csv.

# Results frame keyed by Id, seeded with the known target
xgb_results = pd.DataFrame({"Id":train_prices.index, "actual":train_prices.log_SalePrice})
xgb_results.set_index('Id',inplace=True)

# One model per fold: fit on the fold's training rows, predict its held-out rows
for fold, (fit_rows, holdout_rows) in enumerate(zip(train_indicies, test_indicies), start=1):
    model_xgb.fit(train.iloc[fit_rows], train_prices.iloc[fit_rows].log_SalePrice)
    pred = model_xgb.predict(train.iloc[holdout_rows])
    # The fold arrays hold row positions, not Id labels: look the labels up so the
    # merge attaches each prediction to the row it was made for
    temp_df = pd.DataFrame({'Id': train.index[holdout_rows], "model" + str(fold): pred})
    temp_df.set_index('Id', inplace=True)
    xgb_results = xgb_results.merge(temp_df, on='Id', how='left') # add to xgb_results DF

# create a 6th model - train on all training, predict on all test ('to_guess')
model_xgb.fit(train, train_prices.log_SalePrice)
pred = model_xgb.predict(to_guess)
temp_df = pd.DataFrame({'Id':to_guess.index, "model6":pred})
temp_df.set_index('Id', inplace=True)
# outer merge: the to_guess Ids are not in the frame yet, so they are added as new rows
xgb_results = xgb_results.merge(temp_df, on='Id', how='outer')

# reorder cols (actual at the end)
xgb_results = xgb_results[['model1','model2','model3','model4','model5','model6','actual']]

# output to a csv
xgb_results.to_csv('model_outputs/xgb_results.csv')

In [186]:
# GradientBoost
# Collect the out-of-fold predictions (model1..model5) and the full-fit
# predictions for the rows in to_guess (model6) next to the known target,
# then save everything to model_outputs/gb_results.csv.

# Results frame keyed by Id, seeded with the known target
gb_results = pd.DataFrame({"Id":train_prices.index, "actual":train_prices.log_SalePrice})
gb_results.set_index('Id',inplace=True)

# One model per fold: fit on the fold's training rows, predict its held-out rows
for fold, (fit_rows, holdout_rows) in enumerate(zip(train_indicies, test_indicies), start=1):
    model_gb.fit(train.iloc[fit_rows], train_prices.iloc[fit_rows].log_SalePrice)
    pred = model_gb.predict(train.iloc[holdout_rows])
    # The fold arrays hold row positions, not Id labels: look the labels up so the
    # merge attaches each prediction to the row it was made for
    temp_df = pd.DataFrame({'Id': train.index[holdout_rows], "model" + str(fold): pred})
    temp_df.set_index('Id', inplace=True)
    gb_results = gb_results.merge(temp_df, on='Id', how='left') # add to gb_results DF

# create a 6th model - train on all training, predict on all test ('to_guess')
model_gb.fit(train, train_prices.log_SalePrice)
pred = model_gb.predict(to_guess)
temp_df = pd.DataFrame({'Id':to_guess.index, "model6":pred})
temp_df.set_index('Id', inplace=True)
# outer merge: the to_guess Ids are not in the frame yet, so they are added as new rows
gb_results = gb_results.merge(temp_df, on='Id', how='outer')

# reorder cols (actual at the end)
gb_results = gb_results[['model1','model2','model3','model4','model5','model6','actual']]

# output to a csv
gb_results.to_csv('model_outputs/gb_results.csv')