In [67]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import math
import pickle

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import accuracy_score, balanced_accuracy_score, r2_score, mean_squared_log_error
from pandas.plotting import scatter_matrix
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.impute import SimpleImputer
from sklearn import preprocessing
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV

from sklearn.feature_selection import SelectKBest, f_regression, chi2, f_classif

In [187]:
def regress_feature_selection_transformation(X, y, target, ordinal_feature, route):
    
    def train_num_feature_selection(X_num):
        # saleprice correlation matrix
        k_num = round(len(X_num.columns) / 2)
        corrmat = X_num.corr()
        X_num_fs = corrmat.nlargest(k_num, target)[target].index
    
        # check for multicollinearity
        # if two features are strongly correlated with each other (>= 0.7) 
        # the feature with the lower correlation with the target variable is dropped
        multicorr = {}
        k = len(corrmat)
        for feature in corrmat:
            i = 1
            if feature != target:
                while i < k - 1:
                    if corrmat[feature][i] >= 0.7 and feature != corrmat.index[i]:
                        multicorr[feature] = corrmat.index[i], corrmat[feature][i]
                    i = i + 1
        
        # delete duplicates
        corr_scores = []
        for feature in list(multicorr.keys()):
            if multicorr[feature][1] in corr_scores:
                del multicorr[feature]
            else:
                corr_scores.append(multicorr[feature][1])
        
        dropped_features = []
        # remove the feature with the lower correlation coefficient (pearson) 
        for feature1, feature2 in multicorr.items():
            if corrmat[target][feature1] < corrmat[target][feature2[0]]:
                dropped_features.append(feature1)
            else:
                dropped_features.append(feature2[0])
    
        # drop the features from X_num dataframe
        for feature in X_num:
            if feature in dropped_features:
                X_num = X_num.drop(feature, axis = 1)
        drop_corr_features = X_num.columns.difference(X_num_fs)
        X_num.drop(X_num.columns.difference(X_num_fs), 1, inplace = True)
        
        return (X_num, dropped_features, drop_corr_features)
    
    def train_cat_feature_selection(X_cat, X_cat_enc):
        # feature selection on categorical data
        k_cat = round(len(X_cat.columns) / 2)
        fs = SelectKBest(f_classif, k_cat)
        fs.fit(X_cat_enc, y) # save!!
        X_cat_fs = fs.transform(X_cat_enc)
        X_cat_enc = pd.DataFrame(X_cat_fs)
        
        return (X_cat_enc, fs)
    
    def predict_num_feature_selection(X_num):
        with open('fs_values.pkl', 'rb') as file:
            drop_corr_features, drop_multicoll_features = pickle.load(file)[:2]
            
        X_num = X_num.drop(drop_multicoll_features, axis = 1) 
        X_num.drop(drop_corr_features, 1, inplace = True)
        
        return X_num
    
    def predict_cat_feature_selection(X_cat_enc):
        with open('fs_values.pkl', 'rb') as file:
            selected_cat_features, dummy = pickle.load(file)[2:4]
            
        X_cat_fs = selected_cat_features.transform(X_cat_enc)
        X_cat_enc = pd.DataFrame(X_cat_fs)
        
        return X_cat_enc
    
    # split features
    num_features = []
    cat_features = []
    for feature in X:
        if X[feature].dtypes == np.int or X[feature].dtypes == np.float:
            num_features.append(feature)
        else:
            cat_features.append(feature)
    
    X = X.reset_index()

    # impute using only numerical features
    if num_features:
        imp = IterativeImputer(max_iter = 10, random_state = 42)
        imp.fit(X[num_features])
        X[num_features] = imp.transform(X[num_features])
        X_num = X.drop(cat_features, axis = 1)
    
    # impute using only categorical features
    if cat_features:
        imp = SimpleImputer(missing_values = np.nan, strategy = 'most_frequent')
        X[cat_features] = imp.fit_transform(X[cat_features].astype(str))
        X_cat = X.drop(num_features, axis = 1)
        
    # get column count
    if num_features:
        num_shape = X_num.shape[1]
    
    if cat_features:
        cat_shape = X_cat.shape[1]
    
    # feature selection numerical features
    if route == '/train':
        if num_features and not num_shape <= 10:
            X_num, drop_multicoll_features, drop_corr_features = train_num_feature_selection(X_num)
        else:
            drop_multicoll_features = []
            drop_corr_features = []
    elif route == '/predict':
        if num_features and not num_shape <= 10:
            X_num = predict_num_feature_selection(X_num)
        
    # encode ordinal features (dummy variables)
    if ordinal_feature is not None and ordinal_feature in X_num:
        ord_data = [ordinal_feature]
        X_num = pd.get_dummies(X_num, columns = ord_data, drop_first = True)

    # encode categorical features
    if cat_features:
        enc = OrdinalEncoder()
        enc.fit(X_cat)
        X_cat_enc = enc.transform(X_cat)
    
    # feature selection caterorical features
    if route == '/train':
        if cat_features and not cat_shape <= 10:
            X_cat_enc, selected_cat_features = train_cat_feature_selection(X_cat, X_cat_enc)
        else:
            selected_cat_features = None
    elif route == '/predict':
        if cat_features and not cat_shape <= 10:
            X_cat_enc = predict_cat_feature_selection(X_cat_enc)
    
    # concatenate numerical and categorical features
    if cat_features and num_features:
        df_cat = pd.DataFrame(X_cat_enc, index = list(range(len(X.index))))
        df_num = pd.DataFrame(X_num, index = list(range(len(X.index))))
        X = pd.concat([df_cat, df_num], axis = 1, sort = False)
        if route == '/train':
            X = X.drop([target], axis = 1)
    elif cat_features:
        X = pd.DataFrame(X_cat_enc)
    elif num_features:
        X = pd.DataFrame(X_num)
    
    if route == '/train':
        # serialize feature selection values
        fs_values = [drop_corr_features, drop_multicoll_features, selected_cat_features, ordinal_feature, target]
        with open('fs_values.pkl', 'wb') as file:
            pickle.dump(fs_values, file)
        
    return X

In [127]:
def predict_randomforestregress(X, y):
    # build model
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42)
    
    rf = RandomForestRegressor(n_estimators = 800, min_samples_split = 2, min_samples_leaf = 1, 
      max_features = 'log2', max_depth = 70, bootstrap = False)
    rf.fit(X_train, y_train)
    
    # quantify quality of prediction
    y_predict = rf.predict(X_test)
    r_2_score = r2_score(y_test, y_predict)
    rmsle = math.sqrt(mean_squared_log_error(y_test, y_predict))
    
    ret_stmt = 'R^2 Score: ' + str(r_2_score) + '\n' + 'RMSLE: ' + str(rmsle)
    
    # save model on disk
    with open('model.pkl', 'wb') as file:
        pickle.dump(rf, file)

    return ret_stmt

In [191]:
def train():
    # train_data = pd.read_csv(os.path.expanduser("~/Desktop/Projects/api/data/house-prices-advanced-regression-techniques" +
    #                                           "/train.csv"))
    train_data = pd.read_csv(os.path.expanduser("~/Desktop/Projects/api/data/miscellaneous/train_Xnum_ynum.csv"))
    
    # select features and target variable
    target = 'SalePrice'
    ordinal_feature = 'OrdinalQual'
    route = '/train'
    
    features = list(train_data)
    X = train_data[features]
    y = train_data[target]

    feature_engineering = regress_feature_selection_transformation(X, y, target, ordinal_feature, route)
    prediction = predict_randomforestregress(feature_engineering, y)
    return prediction

In [192]:
train()

'R^2 Score: 0.9878842911019027\nRMSLE: 0.045640564820385025'

In [189]:
def predict():
    # test_data = pd.read_csv(os.path.expanduser("~/Desktop/Projects/api/data/house-prices-advanced-regression-techniques" +
    #                                            "/test.csv"))
    test_data = pd.read_csv(os.path.expanduser("~/Desktop/Projects/api/data/miscellaneous/test_Xnum_ynum.csv"))
    
    route = '/predict'
    
    with open('fs_values.pkl', 'rb') as file:
        ordinal_feature, target = pickle.load(file)[3:]
    with open('model.pkl', 'rb') as file:
        model = pickle.load(file)

    feature_engineering = regress_feature_selection_transformation(test_data, None, None, ordinal_feature, route)
    y_predict = model.predict(feature_engineering)
    
    df_features = pd.DataFrame(test_data)
    df_prediction = pd.DataFrame({target: y_predict})
    output = pd.concat([df_features, df_prediction], axis = 1, sort = False)
    output.to_csv('prediction.csv', index = False)

    return output

In [193]:
predict()

Unnamed: 0.1,Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,...,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SalePrice,SalePrice.1,SalePrice.2
0,0,1,60,65.0,8450,7,5,2003,2003,196.0,...,0,0,0,0,0,2,2008,208500,208500,208500.0
1,1,2,20,80.0,9600,6,8,1976,1976,0.0,...,0,0,0,0,0,5,2007,181500,181500,181500.0
2,2,3,60,68.0,11250,7,5,2001,2002,162.0,...,0,0,0,0,0,9,2008,223500,223500,223500.0
3,3,4,70,60.0,9550,7,5,1915,1970,0.0,...,272,0,0,0,0,2,2006,140000,140000,140000.0
4,4,5,60,84.0,14260,8,5,2000,2000,350.0,...,0,0,0,0,0,12,2008,250000,250000,250000.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,1455,1456,60,62.0,7917,6,5,1999,2000,0.0,...,0,0,0,0,0,8,2007,175000,175000,175000.0
1456,1456,1457,20,85.0,13175,6,6,1978,1988,119.0,...,0,0,0,0,0,2,2010,210000,210000,210000.0
1457,1457,1458,70,66.0,9042,7,9,1941,2006,0.0,...,0,0,0,0,2500,5,2010,266500,266500,266500.0
1458,1458,1459,20,68.0,9717,5,6,1950,1996,0.0,...,112,0,0,0,0,4,2010,142125,142125,142125.0
