	
# regressor.py 
This Python notebook uses a few different methods to build regressors for the data provided by Dr. Chawla's challenge assignment.  Based on how the regression functions perform under various validation frameworks, it chooses the best regression function to predict values for the data included as testing data.  It writes the predictions, one per line in a single-column format, to a file called "regression_test.predictions".
### Margaret Thomann & Michael McRoskey	
#### May 7 2018

In [70]:
from sklearn import linear_model
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor 
from sklearn.model_selection import cross_val_score
from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.svm import SVR
from sklearn.feature_selection import SelectFromModel
import pandas as pd
from IPython.display import HTML, display
import tabulate

### Defining the training, testing and prediction files

In [71]:
# Locations of the challenge data files, given as relative paths under ../data.
# Training data: read into the df_train data frame.
TRAIN_FILE = "../data/regression_train.data"
# Testing data: read into the df_test data frame.
TEST_FILE = "../data/regression_test.test"
# Output file: receives the final predictions, one value per line.
PREDICTION_FILE = "../data/regression_test.predictions"

### Reading training and testing data into data frames

In [72]:
# Column names. The test file is read with the 21 feature columns only; the
# training file is read with the same 21 columns followed by "Value", the
# regression target.
features_test = [
    "f01", "f02", "f03", "f04", "f05", "f06", "f07",
    "f08", "f09", "f10", "f11", "f12", "f13", "f14",
    "f15", "f16", "f17", "f18", "f19", "f20", "f21",
]
features_train = features_test + ["Value"]

# Load both files into pandas data frames, naming the columns explicitly
df_train = pd.read_csv(TRAIN_FILE, names=features_train)
df_test = pd.read_csv(TEST_FILE, names=features_test)

# Target values
y = df_train['Value']

# Feature matrices (training data without the target column).
#     x is the untouched reference copy; x_a, x_b and x_c start out identical
#     to it, and the x_* arrays will be transformed later in "Feature Selection".
x, x_a, x_b, x_c = (df_train.drop(['Value'], axis=1) for _copy in range(4))

### Feature Selection
* **a** : x_a is the non-transformed data; no feature selection has been performed 
* **b** : x_b is the transformed data using the ExtraTreesClassifier 
* **c** : x_c is the transformed data using SelectKBest (chi-squared scoring, keeping the 19 best features) 

In [73]:
# Tree-based feature selection (x_b): fit an extra-trees ensemble on the full
# training data, then let SelectFromModel keep the features it retains from the
# fitted model's importances.
# random_state is pinned so the selected subset (and therefore x_b's shape) is
# the same on every run instead of changing with each kernel restart.
# NOTE(review): this fits a *classifier* on the regression target 'Value' --
# confirm that treating the target values as class labels is intended.
clf = ExtraTreesClassifier(random_state=0)
clf = clf.fit(x, y)
model = SelectFromModel(clf, prefit=True)
x_b = model.transform(x)

# Univariate feature selection (x_c): keep the 19 features with the highest
# chi-squared score against y.
x_c = SelectKBest(chi2, k=19).fit_transform(x, y)

# Report the resulting shapes. Written as single-argument print() calls so the
# output is the same under Python 2 and Python 3.
print("x_a shape: %s" % (x_a.shape,))
print("x_b shape: %s" % (x_b.shape,))
print("x_c shape: %s" % (x_c.shape,))

x_a shape: (5460, 21)
x_b shape: (5460, 12)
x_c shape: (5460, 19)


### Building & validating regression function

In [74]:


print("---Building Regression---")

# Regression functions to evaluate, in the column order used by the metrics
# tables. Each entry is (progress message, zero-argument factory returning a
# fresh, unfitted estimator). The stochastic ensembles get a fixed random_state
# so the cross-validation scores are reproducible from run to run.
REGRESSOR_SPECS = [
    ("Linear Regression Model built",
     linear_model.LinearRegression),
    ("Random Forest Regression built",
     lambda: RandomForestRegressor(random_state=0)),
    ("Gradient Boosting Regression built",
     lambda: GradientBoostingRegressor(random_state=0)),
]

# Number of regression functions, used for the performance metrics later.
# Derived from the list above so the two can never drift apart.
NUM_REGRESSORS = len(REGRESSOR_SPECS)

# Evaluate every regressor on every version of the feature matrix.
# results / results_accuracy are flat lists ordered feature-set-major:
#   [a-linear, a-forest, a-boosting, b-linear, b-forest, ...]
x_array = [x_a, x_b, x_c]
results = []
results_accuracy = []
for i, features in enumerate(x_array):
    for built_message, make_regressor in REGRESSOR_SPECS:
        regressor = make_regressor()
        # cross_val_score fits clones of the estimator on each fold, so no
        # separate fit() on the full data is needed before scoring.
        results.append(
            cross_val_score(regressor, features, y, cv=10,
                            scoring="neg_mean_squared_error"))
        # With no `scoring` argument the estimator's own score() is used,
        # which for these regressors is the R^2 coefficient of determination.
        results_accuracy.append(
            cross_val_score(regressor, features, y, cv=10))
        print("\t * %s for iteration:  %d" % (built_message, i))
    print("\n")

print("---COMPLETE:  Building Regression---")



---Building Regression---
	 * Linear Regression Model built for iteration:  0
	 * Random Forest Regression built for iteration:  0
	 * Gradient Boosting Regression built for iteration:  0


	 * Linear Regression Model built for iteration:  1
	 * Random Forest Regression built for iteration:  1
	 * Gradient Boosting Regression built for iteration:  1


	 * Linear Regression Model built for iteration:  2
	 * Random Forest Regression built for iteration:  2
	 * Gradient Boosting Regression built for iteration:  2


---COMPLETE:  Building Regression---


### Regression performance metrics

In [75]:
def _metrics_table(scores, row_labels, header, per_row):
    """Build the rows of a summary table from a flat list of CV score arrays.

    Parameters
    ----------
    scores : sequence of array-like
        Cross-validation scores, ordered row by row with `per_row`
        consecutive entries belonging to each row.
    row_labels : sequence of str
        Label placed in the first cell of each data row; one row is
        produced per label.
    header : sequence of str
        Cells of the header row (copied, not modified).
    per_row : int
        Number of score entries (regressors) per data row.

    Returns
    -------
    list of list of str
        The header row followed by one row per label, each score rendered
        as "mean (+/- 2*std)" with two decimals.
    """
    rows = [list(header)]
    for row_index, label in enumerate(row_labels):
        start = row_index * per_row
        cells = ["%0.2f (+/- %0.2f)" % (fold_scores.mean(), fold_scores.std() * 2)
                 for fold_scores in scores[start:start + per_row]]
        rows.append([label] + cells)
    return rows


# Row labels (one per feature-selection variant) and header shared by both tables
column_names = ["a. No Feature Selection", "b. Extra Trees Feature Selection", "c. Select 19 Best Feature Selection"]
header_row = ["", "Linear Regression", "Random Forest Regression", "Gradient Boosting Regression"]

print("---Performance Metrics with Neg_Mean_Squared_Error---")
table = _metrics_table(results, column_names, header_row, NUM_REGRESSORS)
display(HTML(tabulate.tabulate(table, tablefmt='html')))

# NOTE: results_accuracy was produced by cross_val_score without a `scoring`
# argument, i.e. each regressor's default score() -- R^2 for regressors, not a
# classification accuracy. The printed heading is kept as in the original.
print("---Performance Metrics with Accuracy---")
table = _metrics_table(results_accuracy, column_names, header_row, NUM_REGRESSORS)
display(HTML(tabulate.tabulate(table, tablefmt='html')))

---Performance Metrics with Neg_Mean_Squared_Error---


0,1,2,3
,Linear Regression,Random Forest Regression,Gradient Boosting Regression
a. No Feature Selection,-91.17 (+/- 48.96),-6.82 (+/- 1.47),-5.99 (+/- 2.57)
b. Extra Trees Feature Selection,-95.08 (+/- 51.27),-7.02 (+/- 1.11),-6.68 (+/- 2.69)
c. Select 19 Best Feature Selection,-91.54 (+/- 48.09),-6.85 (+/- 2.36),-6.13 (+/- 2.88)


---Performance Metrics with Accuracy---


0,1,2,3
,Linear Regression,Random Forest Regression,Gradient Boosting Regression
a. No Feature Selection,0.72 (+/- 0.12),0.98 (+/- 0.01),0.98 (+/- 0.01)
b. Extra Trees Feature Selection,0.71 (+/- 0.12),0.98 (+/- 0.01),0.98 (+/- 0.01)
c. Select 19 Best Feature Selection,0.72 (+/- 0.11),0.98 (+/- 0.01),0.98 (+/- 0.01)


### Predicting using regression function on test data
It is clear from the tables above that Gradient Boosting Regression with no feature selection performs the best.

In [76]:
# Train on the complete training set using every feature column (no feature
# selection), with 'Value' as the target.
x_train = df_train.drop(['Value'], axis=1)
y_train = df_train['Value']

# Fit Model
# random_state is pinned so the written predictions are reproducible on re-run.
final_regressor = GradientBoostingRegressor(random_state=0)
final_regressor.fit(x_train, y_train)

# Predict on the test data and write one prediction per line.
# The file is only written, never read back, so plain "w" mode is sufficient.
predictions = final_regressor.predict(df_test)
with open(PREDICTION_FILE, "w") as f:
    f.writelines(str(number) + "\n" for number in predictions)