In [1]:
import pandas as pd
import numpy as np
import csv
import statsmodels.api as sm

# Load the dataset (a UTF-8 CSV; one-hot encoded per its filename) into the
# notebook-level DataFrame that the cells below read.
df = pd.read_csv('dataset_declassify_onehot.csv', encoding='utf-8')

In [2]:
# Base column names of interest. A column of `df` is selected when its name
# begins with one of these strings (the encoded columns appear to be named
# '<base>=<value>', so a prefix match picks up every level of a variable).
include_columns = [
    'Primary Site',
    'Histologic Type 2',
    'Grade',
    'CS tumor size (2004+)',
    'CS extension (2004+)',
    'CS mets at dx (2004+)',
    'Age at diagnosis',
    'Lung - Pleural/Elastic Layer Invasion (PL) by H and E or Elastic Stain',
    'Lung - Separate Tumor Nodules - Ipsilateral Lung',
    'Lung - Surgery to Primary Site (1988-2015)',
    'Lung - Surgery to Other Regional/Distant Sites (1998+)',
    'LCCS',
    'Survival months',
    'OS'
]

# Every column of df whose name starts with at least one base name, kept in
# df's own column order and listed at most once.
onehot_encoded_columns = [
    name
    for name in df.columns.values
    if any(name.startswith(prefix) for prefix in include_columns)
]

In [3]:
def extract_input(data=None):
    """
    Split a frame into the predictor matrix and the 'LCCS' target.

    Arguments:
        data - pandas.DataFrame containing the columns 'LCCS',
               'Survival months' and 'OS'. When omitted, the notebook-level
               `df` is used, looked up at call time so that re-loading `df`
               in an earlier cell is picked up without redefining this
               function.
    Returns: tuple (X, y) where
        X - `data` without the 'LCCS', 'Survival months' and 'OS' columns
        y - numpy array of shape (n_rows, 1) holding the 'LCCS' column
    """
    if data is None:
        # Late binding: a `data=df` default would freeze whichever frame
        # existed when this cell was executed.
        data = df
    # Double brackets keep y two-dimensional, i.e. shape (n_rows, 1).
    y = np.array(data[['LCCS']])
    X = data.drop(['LCCS', 'Survival months', 'OS'], axis=1)
    return X, y

In [6]:
def stepwise_selection(X, y,
                       initial_list=None,
                       threshold_in=0.01,
                       threshold_out=0.05,
                       verbose=True):
    """
    Perform a forward-backward feature selection
    based on p-value from statsmodels.api.OLS

    Arguments:
        X - pandas.DataFrame with candidate features
        y - list-like with the target
        initial_list - features to start with (column names of X). Either a
            mapping {feature: p-value}, which is copied as-is, or a plain
            iterable of column names, whose p-values are recorded as NaN
            because they were never computed here. None means "start empty".
        threshold_in - include a feature if its p-value < threshold_in
        threshold_out - exclude a feature if its p-value > threshold_out
        verbose - whether to print the sequence of inclusions and exclusions
    Returns: dict mapping each selected feature to the p-value it had at the
        moment it was added (insertion order = order of inclusion).
    Always set threshold_in < threshold_out to avoid infinite looping.
    See https://en.wikipedia.org/wiki/Stepwise_regression for the details
    """
    if initial_list is None:
        included = {}
    elif hasattr(initial_list, 'keys'):
        # Mapping-like input: copy so the caller's object is never mutated.
        included = dict(initial_list)
    else:
        included = {name: float('nan') for name in initial_list}

    while True:
        changed = False

        # forward step: fit one model per remaining candidate and record the
        # p-value that candidate gets alongside the current selection.
        # Iterating X.columns (instead of a set difference) keeps the
        # candidate order, and therefore tie-breaking, deterministic.
        excluded = [col for col in X.columns if col not in included]
        # An explicit float dtype avoids the object-dtype Series that an
        # index-only constructor yields on current pandas.
        new_pval = pd.Series(index=excluded, dtype=float)
        for new_column in excluded:
            design = sm.add_constant(X[list(included) + [new_column]])
            model = sm.OLS(y, design).fit()
            new_pval.loc[new_column] = model.pvalues[new_column]
        best_pval = new_pval.min()  # NaN when there is no candidate left
        if best_pval < threshold_in:
            best_feature = new_pval.idxmin()
            included[best_feature] = best_pval
            changed = True
            if verbose:
                print('Add  {:30} with p-value {:.6}'.format(best_feature, best_pval))

        # backward step: refit on the current selection and drop the least
        # significant feature if it exceeds threshold_out.
        model = sm.OLS(y, sm.add_constant(X[list(included)])).fit()
        # Remove the intercept by label rather than by position, so that no
        # real feature is skipped in case add_constant did not prepend a
        # 'const' column.
        pvalues = model.pvalues.drop('const', errors='ignore')
        worst_pval = pvalues.max()  # NaN if pvalues is empty
        if worst_pval > threshold_out:
            changed = True
            # idxmax returns the label; Series.argmax returns an integer
            # position on pandas >= 1.0 and would not be a key of `included`.
            worst_feature = pvalues.idxmax()
            del included[worst_feature]
            if verbose:
                print('Drop {:30} with p-value {:.6}'.format(worst_feature, worst_pval))

        if not changed:
            break
    return included

In [5]:
# Split df into predictors X and the 'LCCS' target y, then run the
# forward-backward selection with its default thresholds.
X, y = extract_input(df)
stepwise_results = stepwise_selection(X, y)

# stepwise_results is a dict: selected feature -> p-value recorded when added.
print('resulting features:')
print(stepwise_results)

Add  CS tumor size (2004+)          with p-value 1.08985e-109
Add  CS mets at dx (2004+)=0        with p-value 1.40825e-71
Add  CS extension (2004+)=100       with p-value 3.24128e-36
Add  Grade=well                     with p-value 1.38944e-26
Add  Lung - Surgery to Primary Site (1988-2015)=None with p-value 7.31406e-19
Add  Grade=Moderately               with p-value 4.39514e-14
Add  Lung - Pleural/Elastic Layer Invasion (PL) by H and E or Elastic Stain=PL3 with p-value 1.97039e-12
Add  Age at diagnosis               with p-value 4.50768e-12
Add  Lung - Surgery to Primary Site (1988-2015)=peumonectomy with p-value 2.44321e-13
Add  Lung - Separate Tumor Nodules - Ipsilateral Lung=No separate tumor nodules noted with p-value 2.05681e-08
Add  CS mets at dx (2004+)=40       with p-value 2.66277e-09
Add  CS extension (2004+)=740       with p-value 1.6774e-05
Add  CS extension (2004+)=465       with p-value 3.78931e-05
Add  CS extension (2004+)=700       with p-value 6.35264e-05
Add  Lung 

In [6]:
def dict_to_csv(my_dict, filename='stepwise_results.csv'):
    """
    Write a {feature: p_value} mapping to a two-column CSV file.

    Arguments:
        my_dict - mapping whose items are written one per row
        filename - destination path; an existing file is overwritten
    The first row is the header 'feature,p_value'. I/O errors are reported
    with a printed message instead of being raised (best-effort export).
    """
    try:
        # newline='' is what the csv module requires of its file objects;
        # without it, platforms that translate '\n' emit an extra blank line
        # after every row. UTF-8 matches the encoding the dataset is read
        # with, so non-ASCII feature names are written consistently.
        with open(filename, 'w', newline='', encoding='utf-8') as csvfile:
            writer = csv.writer(csvfile)
            writer.writerow(['feature', 'p_value'])
            writer.writerows(my_dict.items())
    except IOError as e:
        print("I/O error: {}".format(e))

In [7]:
# Export the selected features and their p-values to the default
# 'stepwise_results.csv' in the working directory.
dict_to_csv(stepwise_results)