In [1]:
from sklearn.feature_selection import RFE

In [2]:
# Recursive Feature Elimination (RFE) demo on the iris dataset.
from sklearn import datasets
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

# Load the iris dataset (4 numeric features, 3 classes).
dataset = datasets.load_iris()

# Base classifier whose fitted coefficients RFE uses to decide which
# attribute to drop at each elimination round.
model = LogisticRegression()

# Keep 3 attributes. `n_features_to_select` is passed by keyword: recent
# scikit-learn releases make every RFE argument after the estimator
# keyword-only, so the positional form `RFE(model, 3)` raises a TypeError.
rfe = RFE(model, n_features_to_select=3)
rfe = rfe.fit(dataset.data, dataset.target)

# Summarize the selection: boolean mask of kept attributes, then the
# ranking of every attribute (kept attributes are ranked 1).
print(rfe.support_)
print(rfe.ranking_)

[False  True  True  True]
[2 1 1 1]


In [11]:
import pandas as pd
import numpy as np
import time
import os  # Used to create folders
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.linear_model import LinearRegression, ElasticNet
from sklearn.kernel_ridge import KernelRidge
from sklearn.metrics import mean_squared_error, make_scorer
from sklearn.ensemble import RandomForestRegressor
from shutil import copyfile	 # Used to copy parameters file to directory
from sklearn.utils import resample
from read_parameter_file import get_parameters
from sklearn.feature_selection import RFE
# from xgboost import xgboost


parameters = "../../../Data/parameters.txt"  # Parameters file
d = get_parameters(parameters)

# Results are logged under "../0. Results/<user>/model/...".
model_dir = r"../0. Results/" + d["user"] + "/model/"
if d["user"] == "Kieron":
    # Optional extra sub-folder level; the value "n" means "no sub-folder".
    if d["specify_subfolder"] != "n":
        model_dir += d["specify_subfolder"] + "/"
    # A single strftime call so the date and time folders are taken from the
    # same instant (two separate calls can disagree across midnight).
    newpath = model_dir + time.strftime("%Y.%m.%d/%H.%M.%S/")  # Log file location
else:
    newpath = model_dir + time.strftime("%Y.%m.%d/")  # Log file location

# exist_ok=True creates the folder only when needed, without the race that a
# separate "exists?" check followed by "create" has.
os.makedirs(newpath, exist_ok=True)

np.random.seed(int(d["seed"]))  # Set seed

# The CSV file stem depends on the user: "Kieron" reads <file_name>.csv,
# everyone else reads vw_Incident_cleaned<file_name>.csv.
csv_stem = d["file_name"] if d["user"] == "Kieron" else "vw_Incident_cleaned" + d["file_name"]
df = pd.read_csv(d["file_location"] + csv_stem + ".csv", encoding='latin-1', low_memory=False)

seed = int(d["seed"])

# Optionally draw n_samples rows from the frame before modelling.
if d["resample"] == "y":
    df = resample(df, n_samples=int(d["n_samples"]), random_state=seed)

# "TimeTaken" is the regression target; every other column is a feature.
y = df["TimeTaken"]
X = df.drop("TimeTaken", axis=1)

# Train/test split using train_test_split's default proportions
# (no test_size is passed, so this is not an explicit 50/50 split).
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=seed)


# Candidate models in evaluation order. The first element is both the
# parameter-file flag that enables the model (value "y") and the name it is
# reported under; the second builds a fresh, default-configured estimator.
# Hyper-parameter grids are not used in this cell, and xgboost stays disabled.
candidate_models = [
    ("LinearRegression", LinearRegression),
    ("ElasticNet", ElasticNet),
    ("KernelRidge", lambda: KernelRidge(kernel='rbf', gamma=0.1)),
    ("RandomForestRegressor", RandomForestRegressor),
]

regressors = []
alg_names = []
for flag, build in candidate_models:
    if d[flag] == "y":
        regressors.append(build())
        alg_names.append(flag)

# In the parameter file the value "y" means "use the default":
#   top_k_features -> None (passed to RFE as n_features_to_select)
#   step           -> 1
# Any other value is parsed as an integer.
k = None if d["top_k_features"] == "y" else int(d["top_k_features"])
step = 1 if d["step"] == "y" else int(d["step"])
    
# Run recursive feature elimination once per enabled regressor and report
# which columns were kept, followed by the train and test scores.
for regressor, alg_name in zip(regressors, alg_names):
    print("# %s Recursive Feature Elimination for top k = %s features (step = %s):\n" % (alg_name, k, step))
    regr = RFE(regressor, n_features_to_select=k, step=step)
    regr.fit(X_train, y_train.values.ravel())

    supports = regr.support_   # boolean mask: True for columns RFE kept
    rankings = regr.ranking_   # per-column rank (kept columns are ranked 1)

    # The previous `kX_train = pd.Dataframe()` line was removed: the attribute
    # name is a typo (pandas exposes `DataFrame`), so it raised AttributeError
    # before anything was printed, and the frame was never used in this loop.
    for column, ranking, support in zip(X_train.columns, rankings, supports):
        print(column, ranking, support)

    print(regr.score(X_train, y_train))
    print(regr.score(X_test, y_test))
    
    
    

#     print(regr.score(y_test))
    
#     print("Best parameters set found on development set:")
#     print("\t", regr.best_params_)

#     best_train = regr.cv_results_["mean_train_score"][regr.best_index_]
#     best_train_std = regr.cv_results_["std_train_score"][regr.best_index_]
#     best_test = regr.cv_results_["mean_test_score"][regr.best_index_]
#     best_test_std = regr.cv_results_["std_test_score"][regr.best_index_]
#     print("\n\t R2 Train: %0.5f (+/- %0.05f)" % (best_train, best_train_std * 2))
#     print("\t R2 Test: %0.5f (+/- %0.05f)" % (best_test, best_test_std * 2))

#     y_train_pred = regr.predict(X_train)
#     mse_train = mean_squared_error(y_train, y_train_pred)
#     y_test_pred = regr.predict(X_test)
#     mse_test = mean_squared_error(y_test, y_test_pred)
#     print("\t RMSE Train: %s" % mse_train)
#     print("\t RMSE Test: %s" % mse_test)

#     with open(newpath + "best_params_%s" % alg_name + time.strftime("%H.%M.%S.txt"), "w") as f:
#         f.write("Best parameters set found on development set:\n")
#         f.write("\t"+str(regr.best_params_) + "\n")
#         f.write("\tTrain: %0.5f (+/- %0.05f)\n" % (best_train, best_train_std * 2))
#         f.write("\tTest: %0.5f (+/- %0.05f)\n" % (best_test, best_test_std * 2))
#         f.write("\t RMSE Train: %s\n" % mse_train)
#         f.write("\t RMSE Test: %s\n" % mse_test)

#         print("\nGrid R2 scores on development set:")
#         f.write("\nGrid R2 scores on development set:\n")
#         means = regr.cv_results_['mean_test_score']
#         stds = regr.cv_results_['std_test_score']
#         for mean, std, params in zip(means, stds, regr.cv_results_['params']):
#             f.write("\t%0.5f (+/-%0.05f) for %r\n" % (mean, std * 2, params))
#             print("\t%0.5f (+/-%0.05f) for %r" % (mean, std * 2, params))

# copyfile(parameters, newpath + "/" + time.strftime("%H.%M.%S") + "_parameters.txt")  # Save parameters


# LinearRegression Recursive Feature Elimination for top k = None features (step = 1):

Reason_3PP - Identify Payment 1 True
Reason_3PP - Other 1 True
Reason_3PP - Payment Status 1 True
Reason_3PP - Pending Payment 1 True
Reason_3rd Party Hold 1 True
Reason_A - Missing information from customer - Confirmation 1 True
Reason_A - Missing information from customer - Missing information 1 True
Reason_APOC - Waiting for document review 1 True
Reason_Affiliates Section Error 1 True
Reason_Agreement  Number Error 1 True
Reason_Agreement - Bill To/Ship To Participant 1 True
Reason_Agreement - Compliance 1 True
Reason_Agreement - Paperwork Validation 1 True
Reason_Agreement /Enrollment Number Error 1 True
Reason_Agreement/Enrollment Number Error 1 True
Reason_Agreement/Enrollment Number Issue 1 True
Reason_Agreement/Enrollment Number/Date Issue 1 True
Reason_Agreement/Enrollment Number/Name  Issue 1 True
Reason_Amendment - Amendment Description 1 True
Reason_Amendment - Other 1 True
Reason_Amend

#### Goal: be able to call a function in model.py that takes in the regressor, runs it once with RFE, and then runs it again using only the k selected features

In [20]:
from sklearn.feature_selection import RFE
def RFE_modeling(regressor, k, step, d, X_train, y_train, X_test, y_test):
    """Select features with RFE, then refit ``regressor`` on the kept columns.

    The function fits an ``RFE`` wrapper around ``regressor``, prints its
    train/test scores and the per-column selection result, then fits
    ``regressor`` itself on the selected columns only and prints those scores.

    Parameters
    ----------
    regressor : estimator
        Passed to ``RFE`` and afterwards fitted in place on the selected
        columns (the caller's object is modified by that second fit).
    k, step :
        Kept for call compatibility. Both are overwritten from ``d`` below,
        so the values passed in are ignored.
    d : mapping
        Must provide ``"top_k_features"`` and ``"step"``. The value ``"y"``
        means "use the default" (``None`` and ``1`` respectively); anything
        else is parsed with ``int``.
    X_train, X_test : pandas.DataFrame
        Feature frames; ``X_test`` must contain the columns of ``X_train``.
    y_train, y_test :
        Targets. ``y_train`` must expose ``.values`` (e.g. a pandas Series).

    Returns
    -------
    (kX_train, kX_test) : tuple of pandas.DataFrame
        ``X_train`` and ``X_test`` restricted to the columns RFE selected.
    """
    # The parameter mapping is the source of truth for k and step.
    k = None if d["top_k_features"] == "y" else int(d["top_k_features"])
    step = 1 if d["step"] == "y" else int(d["step"])

    regr = RFE(regressor, n_features_to_select=k, step=step)
    regr.fit(X_train, y_train.values.ravel())

    # NOTE(review): per the scikit-learn docs RFE.score reduces X to the
    # selected features first, so the "(all features)" label may be
    # misleading -- confirm before relying on it. Text kept unchanged.
    print("train score (all features): ", regr.score(X_train, y_train))
    print("test score (all features): ", regr.score(X_test, y_test), "\n")

    supports = regr.support_
    rankings = regr.ranking_

    for column, ranking, support in zip(X_train.columns, rankings, supports):
        print("Used: %s, Ranking: %s, Column name: %s"% (support, ranking, column))

    # Select every kept column in one label-based lookup instead of inserting
    # columns one at a time into an empty frame (slow on wide frames and
    # breaks when a column label is duplicated).
    selected = X_train.columns[supports]
    kX_train = X_train.loc[:, selected].copy()
    kX_test = X_test.loc[:, selected].copy()

    regressor.fit(kX_train, y_train.values.ravel())

    if k is None:
        print("\ntrain score (k = half features and step = %s): " % step, regressor.score(kX_train, y_train))
        print("test score: (k = half features and step = %s): " % step, regressor.score(kX_test, y_test))
    else:
        print("\ntrain score (k = %s features and step = %s): " % (k, step), regressor.score(kX_train, y_train))
        print("test score: (k = %s features and step = %s): " % (k, step), regressor.score(kX_test, y_test))
    return kX_train, kX_test
    
# Bind the returned frames to names so the cell does not dump two complete
# DataFrames as its output and the reduced feature sets can be reused later.
kX_train, kX_test = RFE_modeling(LinearRegression(), k, step, d, X_train, y_train, X_test, y_test)

train score (all features):  0.991741710103
test score (all features):  -4.11291971115 

Used: True, Ranking: 1, Column name: Reason_3PP - Identify Payment
Used: True, Ranking: 1, Column name: Reason_3PP - Other
Used: True, Ranking: 1, Column name: Reason_3PP - Payment Status
Used: True, Ranking: 1, Column name: Reason_3PP - Pending Payment
Used: True, Ranking: 1, Column name: Reason_3rd Party Hold
Used: True, Ranking: 1, Column name: Reason_A - Missing information from customer - Confirmation
Used: True, Ranking: 1, Column name: Reason_A - Missing information from customer - Missing information
Used: True, Ranking: 1, Column name: Reason_APOC - Waiting for document review
Used: True, Ranking: 1, Column name: Reason_Affiliates Section Error
Used: True, Ranking: 1, Column name: Reason_Agreement  Number Error
Used: True, Ranking: 1, Column name: Reason_Agreement - Bill To/Ship To Participant
Used: True, Ranking: 1, Column name: Reason_Agreement - Compliance
Used: True, Ranking: 1, Column