# Import Libraries

In [None]:
import pandas as pd
import numpy as np
import xgboost as xgb
from xgboost.sklearn import XGBClassifier
from xgboost.sklearn import XGBRegressor
from sklearn import cross_validation, metrics
from sklearn.grid_search import GridSearchCV

import matplotlib.pylab as plt
%matplotlib inline
from matplotlib.pylab import rcParams
rcParams['figure.figsize'] = 12, 4

# Load Data

In [None]:
# Load the pre-processed training and test sets from CSV.
# The commented-out reads are alternative input files kept for reference.
#train = pd.read_csv('/home/garv/MachineLearning/IRage/train_modified.csv')
#train = pd.read_csv('/home/garv/MachineLearning/IRage/RMSE/train_modified_cut1.csv')
train = pd.read_csv('/home/IRage/RMSE/train_modified_cut1.csv')
#test = pd.read_csv('/home/garv/MachineLearning/IRage/test_modified.csv')
#test = pd.read_csv('/home/garv/MachineLearning/IRage/RMSE/test_modified.csv')
test = pd.read_csv('/home/IRage/RMSE/test_modified_cut1.csv')


In [None]:
# Display the dtype of every column in the training frame.
train.dtypes

In [None]:
# Display (rows, columns) for the training and test frames.
train.shape, test.shape

In [None]:
# Name of the column the model is trained to predict.
target = 'Best Buy Price Modified'
# Name of the exchange-timestamp column (the same column name modelfit
# merges its test predictions on).
Timestamp = 'Exchange timestamp in milliseconds'

## Define a function for modeling and cross-validation

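This function will do the following:
1. fit the model
2. report training-set regression metrics (explained variance, MAE, MSE, R2)
3. predict on the test set and merge the predictions with the reference test data
4. report test-set regression metrics (explained variance, MAE, MSE, R2)
5. update n_estimators with cv function of xgboost package (done before fitting, when useTrainCV is set)
6. plot Feature Importance and save it to FeatureImportance.png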

In [None]:
# Reference copy of the test data: modelfit merges its test predictions onto
# this frame by timestamp and scores them against the target column.
# The commented-out reads are alternative input files kept for reference.
#test_results = pd.read_csv('test_results.csv')
#test_results = pd.read_csv('/home/garv/MachineLearning/IRage/test_modified.csv')
#test_results = pd.read_csv('/home/garv/MachineLearning/IRage/RMSE/test_modified.csv')
test_results = pd.read_csv('/home/IRage/RMSE/test_modified_cut1.csv')
def _print_regression_report(split_name, y_true, y_pred):
    """Print explained variance, MAE, MSE and R2 for one data split.

    Args:
        split_name: Label shown in parentheses in each line (e.g. "Train").
        y_true: Observed target values.
        y_pred: Model predictions aligned with ``y_true``.
    """
    print("Explained Variance Score (%s): %f" % (split_name, metrics.explained_variance_score(y_true, y_pred)))
    print("Mean Absolute Error (%s): %f" % (split_name, metrics.mean_absolute_error(y_true, y_pred)))
    print("Mean Squared Error (%s): %f" % (split_name, metrics.mean_squared_error(y_true, y_pred)))
    print("R2 Score (%s): %f" % (split_name, metrics.r2_score(y_true, y_pred)))


def modelfit(alg, dtrain, dtest, predictors,useTrainCV=True, cv_folds=5, early_stopping_rounds=50):
    """Tune the tree count by CV, fit the regressor and report its quality.

    Steps, in execution order:
      1. If ``useTrainCV`` is set, run ``xgb.cv`` (RMSE, early stopping) on the
         training data and set ``alg``'s ``n_estimators`` to the number of
         rounds the CV result contains.
      2. Fit ``alg`` on ``dtrain[predictors]`` against the ``target`` column.
      3. Print regression metrics on the training set.
      4. Predict on ``dtest``, merge the predictions onto the module-level
         ``test_results`` frame by the ``Timestamp`` column and print the same
         metrics for the test set.
      5. Plot the booster's feature scores and save them to
         ``FeatureImportance.png``.

    Args:
        alg: An xgboost scikit-learn style regressor; it is modified in place
            (``n_estimators`` may be updated, and it is fitted).
        dtrain: Training DataFrame containing ``predictors`` and ``target``.
        dtest: Test DataFrame containing ``predictors`` and the timestamp
            column. A ``'pred'`` column is written into it in place.
        predictors: List of feature column names.
        useTrainCV: Whether to tune ``n_estimators`` with cross-validation.
        cv_folds: Number of CV folds passed to ``xgb.cv``.
        early_stopping_rounds: Early-stopping patience passed to ``xgb.cv``.

    Returns:
        None. Results are printed and the importance plot is saved to disk.
    """
    if useTrainCV:
        xgb_param = alg.get_xgb_params()
        xgtrain = xgb.DMatrix(dtrain[predictors].values, label=dtrain[target].values)
        cvresult = xgb.cv(
            xgb_param,
            xgtrain,
            num_boost_round=alg.get_params()['n_estimators'],
            nfold=cv_folds,
            metrics='rmse',
            early_stopping_rounds=early_stopping_rounds,
            verbose_eval=False)
        # The CV result has one row per boosting round that was kept, so its
        # row count is used as the tuned number of trees.
        alg.set_params(n_estimators=cvresult.shape[0])

    # Fit the algorithm on the data.
    alg.fit(dtrain[predictors], dtrain[target], eval_metric='rmse')

    # Predict training set.
    dtrain_predictions = alg.predict(dtrain[predictors])

    # Print model report.
    print("\nModel Report")
    _print_regression_report("Train", dtrain[target], dtrain_predictions)

    # Predict on testing data. The prediction column is stored on the caller's
    # frame (kept from the original behaviour so later cells can read it).
    dtest['pred'] = alg.predict(dtest[predictors])
    # NOTE(review): the merge key is the timestamp; if timestamps are not
    # unique the merge will multiply rows and skew the test metrics - confirm.
    results = test_results.merge(dtest[[Timestamp, 'pred']], on=Timestamp)
    _print_regression_report("Test", results[target], results['pred'])

    # Newer xgboost exposes the trained booster via get_booster(); older
    # releases only have booster(). Support both.
    get_booster = getattr(alg, 'get_booster', None)
    booster = get_booster() if callable(get_booster) else alg.booster()
    feat_imp = pd.Series(booster.get_fscore()).sort_values(ascending=False)
    feat_imp.plot(kind='bar', title='Feature Importances')
    plt.ylabel('Feature Importance Score')
    plt.savefig("FeatureImportance.png", bbox_inches='tight')

In [None]:
# Every training column except the target is used as a feature.
predictors = [x for x in train.columns if x != target]
xgb1 = XGBRegressor(
        learning_rate=0.1,
        # Upper bound on trees; modelfit lowers it using CV with early stopping.
        n_estimators=1000,
        max_depth=5,
        min_child_weight=1,
        gamma=0,
        subsample=0.8,
        colsample_bytree=0.8,
        objective='reg:linear',
        # scale_pos_weight is a class-balancing multiplier for positive labels.
        # It is kept at the library default of 1 for this regression task; the
        # previous value of 0 could null out the weight of rows xgboost treats
        # as "positive".
        scale_pos_weight=1,
        seed=27)
modelfit(xgb1, train, test, predictors)