## LassoFeatureSelection

In [97]:
import pandas as pd
import numpy as np
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, roc_auc_score, log_loss
from sklearn.linear_model import SGDRegressor, Lasso
from sklearn.preprocessing import StandardScaler, scale
from sklearn.model_selection import RepeatedKFold
from sklearn.decomposition import PCA
import sys
sys.path.append("../src/")
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest
from pandas import DataFrame, Series
from sklearn.metrics import mean_squared_error, r2_score
from numpy import sqrt, mean

##### Read in data and one-hot-encode int dtype variables

In [98]:
# Load data
df = pd.read_csv("../data/Series3_6.15.17_padel.csv", index_col=0)
# Eliminate features without variance
df = df.loc[:, (df.std() > 0).values]
# Seperate Series 3 test when IC50 is null
test_index = df.IC50.isnull()
test_df = df.loc[test_index]
df = df.loc[~test_index]
# Remove columns with missing data
df = df.dropna(axis=1)
# Transform discrete with one-hot-encoding
int_cols = df.columns[df.dtypes == 'int64']
float_cols = df.columns[df.dtypes == 'float64']
one_hot_df = pd.get_dummies(df[int_cols].astype('O'))
df = pd.merge(df[float_cols], one_hot_df, left_index=True, right_index=True)
# Split x, y
y_data = df.pop("IC50")
x_data = df.copy()
columns = x_data.columns.values
# Ensure no (+/-) inf or nan due to improper transformation
x_data.replace([np.inf, -np.inf], np.nan, inplace=True)
assert not sum(x_data.isna().sum()), "Unexpected nulls found"

##### Perform non-linear transformations to Series 3

# Perform feature engineering on float columns
for feat in x_data.columns[x_data.dtypes == 'float64']:
    feature_df = x_data.loc[:, feat]
    if feature_df.min() > 0:  # Avoid 0 or negative
        x_data.loc[:, feat + "_log"] = feature_df.apply(np.log)  # log
        x_data.loc[:, feat + "_log2"] = feature_df.apply(np.log2)  # log2
        x_data.loc[:, feat + "_log10"] = feature_df.apply(np.log10)  # log10
        x_data.loc[:, feat + "_cubert"] = feature_df.apply(
            lambda x: np.power(x, 1 / 3))  # cube root
        x_data.loc[:, feat + "_sqrt"] = feature_df.apply(np.sqrt)  # square root
    # Avoid extremely large values, keep around 1M max
    if feature_df.max() < 13:
        x_data.loc[:, feat + "_exp"] = feature_df.apply(np.exp)  # exp
    if feature_df.max() < 20:
        x_data.loc[:, feat + "_exp2"] = feature_df.apply(np.exp2)  # exp2
    if feature_df.max() < 100:
        x_data.loc[:, feat + "_cube"] = feature_df.apply(
            lambda x: np.power(x, 3))  # cube
    if feature_df.max() < 1000:
        x_data.loc[:, feat + "_sq"] = feature_df.apply(np.square)  # square

##### Normalize data

In [99]:
x_scaler = StandardScaler()
y_scaler = StandardScaler()
x_data = x_scaler.fit_transform(x_data)
x_data =pd.DataFrame(x_data)
y_data.loc[:] = np.squeeze(y_scaler.fit_transform(y_data.values.reshape(-1, 1)))

In [147]:
lasso = Lasso(alpha=0.15)
from sklearn.feature_selection import SelectFromModel
select_from_lasso = SelectFromModel(lasso)
select_from_lasso.fit(x_data, y_data)
n_features = select_from_lasso.transform(x_data)


[[-1.90255391  0.69200132  0.1175471  ... -0.21081851 -0.26111648
  -0.21081851]
 [ 0.77015275  2.56513217 -1.22197365 ...  4.74341649 -0.26111648
  -0.21081851]
 [ 0.50327655 -0.57088223 -1.69089872 ... -0.21081851 -0.26111648
  -0.21081851]
 ...
 [-0.252076   -0.31534062 -0.05148966 ... -0.21081851 -0.26111648
  -0.21081851]
 [-0.74967585 -0.1591759  -0.30767016 ... -0.21081851 -0.26111648
  -0.21081851]
 [-0.84699787 -0.69564664 -0.92627181 ... -0.21081851 -0.26111648
  -0.21081851]]


In [148]:
coefficients_lasso = pd.concat([pd.Series(columns),pd.Series(np.transpose(select_features_model.coef_))], axis = 1)
print(coefficients_lasso.values)
selected_features_lasso = []
for index, row in coefficients_lasso.iterrows():
    if row[1]!=0:
        selected_features_lasso.append(row)     
#selected_features_lasso = set([x[0] for x in coefficients_lasso.values if x[1] != 0])  
if selected_features_lasso[0] in x_data.columns:
    print('yes')
#x_data = x_data.loc[:, selected_features_lasso[0]].copy()


[['apol' 0.0]
 ['ATS0m' -0.0]
 ['ATS1m' 0.0]
 ...
 ['Zagreb_156' -0.0]
 ['Zagreb_180' -0.0]
 ['Zagreb_182' -0.0]]


TypeError: 'Series' objects are mutable, thus they cannot be hashed

def variance_scorer(x, y):
    """    
    Get the variance for each column of X.
    
    Because principal components have decreasing variance
    (i.e. PC4 has less variance than PC3 which has less variance
    than PC2 etc.), we can use this function in SelectKBest to select
    only the top X number of principal components.
    
    """
    scores = [np.var(column) for column in x.T]
    return scores, np.array([np.NaN]*len(scores))

In [17]:
model = Pipeline(steps=[
            ('feature_selection', SelectFromModel(Lasso(alpha=0.15))),
            ('regress', SGDRegressor(random_state=0))
        ])

model.set_params(regress__loss='huber', regress__penalty='l1', 
                 regress__alpha=0.15, regress__l1_ratio=0.30, regress__max_iter=10)
model.fit(x_data,y_data)

Pipeline(memory=None,
     steps=[('select', SelectKBest(k=5, score_func=<function variance_scorer at 0x7feb3d71b158>)), ('regress', SGDRegressor(alpha=0.15, average=False, epsilon=0.1, eta0=0.01,
       fit_intercept=True, l1_ratio=0.3, learning_rate='invscaling',
       loss='huber', max_iter=10, n_iter=None, penalty='l1', power_t=0.25,
       random_state=0, shuffle=True, tol=None, verbose=0, warm_start=False))])

##### Validate Model Performance

In [78]:
def score_regressor(x_data, y_data, model, add_train_data=None, verbose=1, pos_split=10):
    """
    Model validation for producing comparable model evaluation. Uses Stratified K-Fold LOOCV adapted
    for regression with the positive equivalent <10 IC50, producing 5 folds.
    :param x_data: Pandas DataFrame object, Series 3 with 47 examples
    :param y_data: Pandas DataFrame or Series object, float datatype, target variables for Series 3
    :param model: must have fit and predict method, use sklearn or wrapper
    :param add_train_data: Additional data to be evenly spread across train splits
    :param verbose: If 0, return dictionary only, if 1 printed results
    :param pos_split: cutoff for positive class in StratifiedKFold (y<pos_split)
    :return: dictionary
    """
    assert isinstance(x_data, DataFrame), "x_data must be a pandas DataFrame"
    assert isinstance(y_data, DataFrame) or isinstance(y_data, Series), "y_data must be pandas DataFrame or Series"
    assert y_data.dtypes == "float", "Expected y_data to be float dtype and received {}".format(y_data.dtypes)

    if add_train_data is not None:
        raise NotImplementedError

    # create logging dictionary to track scores
    scoring_dict = {"r2_score": [], "rmse": []}
    # create y_class series for Stratified K-Fold split at pos_split
    y_class = Series(data=[int(y < pos_split) for y in y_data])
    # num_splits count number of positive examples
    num_splits = sum(y_class.values)
    scoring_dict["num_splits"] = num_splits
    # create splits using stratified kfold
    rskf = RepeatedStratifiedKFold(n_splits=num_splits, n_repeats=10, random_state=36851234)
    # loop through splits
    for train, test in rskf.split(x_data, y_class):
        x_train, x_test = x_data.iloc[train, :], x_data.iloc[test, :]
        y_train, y_test = y_data.iloc[train], y_data.iloc[test]
        # train model, test model with all scoring parameters
        model.fit(x_train, y_train)
        y_ = model.predict(x_test)
        # append scores to logging dictionary
        scoring_dict["r2_score"].append(r2_score(y_test, y_))
        scoring_dict["rmse"].append(sqrt(mean_squared_error(y_test, y_)))
    if verbose == 1:
        # Print contents of dictionary except confusion matrix
        print("with {} splits and {} repeats".format(num_splits, 10))
        for metric in scoring_dict:
            if metric == "num_splits":
                continue
            else:
                print("average {}: {}".format(metric, mean(scoring_dict[metric])))
    return scoring_dict


In [79]:
results = score_regressor(x_data= x_data_pca, y_data = y_data, model = grid.best_estimator_)
                                   

with 5 splits and 10 repeats
average r2_score: -0.20301890116385365
average rmse: 29.567772540388663
