# Feature Assessment 

In [None]:
from scipy.stats.stats import pearsonr

import numpy as np
import pandas as pd 
import statsmodels.api as sm
import statsmodels.formula.api as smf

In [None]:
np.set_printoptions(precision=4, suppress=True)

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

First, we import the dataset we made in our previous notebook.

In [None]:
# Load the cleaned dataset produced by the previous notebook.
# NOTE(review): this is an absolute, machine-specific path; the notebook only runs
# where this exact file exists. Consider a configurable data directory.
df = pd.read_csv('/Users/desert/desert_workspace/desert_data/clean_data.csv')
# Drop the 'Unnamed: 0' column (presumably the index written by an earlier to_csv — confirm).
df = df.drop('Unnamed: 0', axis=1)
# Replace every missing value with 0.
# NOTE(review): zero-filling affects the distribution plots, normality tests and
# correlations computed below — confirm 0 is a meaningful value for every column.
df = df.fillna(0)
df.head()

In [None]:
df.rename(columns={'Influenza Death (<65 years of age)':'senior_flu_deaths'}, inplace=True)
df.rename(columns={'Varicella Hospitalizations':'varicella_hospitalizations'}, inplace=True)

In [None]:
cols=df.columns.tolist()
cols = [col for col in cols if col != 'County']
len(cols)

In [None]:
df[['pop2010_in_des','des_percent']].describe()
df['at_risk'] = (df['pop2010_in_des'] >= 21217) & (df['des_percent']>=0.079662)
df['at_risk'] =df['at_risk'].astype(int)

In [None]:
import scipy.stats as stats
def draw_histograms(frame, variables, n_rows, n_cols):
    """Plot a density-normalised histogram with a Gaussian KDE overlay per variable.

    Parameters
    ----------
    frame : pandas.DataFrame
        Table holding the columns to plot.
    variables : sequence of str
        Column names of ``frame``; one subplot is drawn for each.
    n_rows, n_cols : int
        Shape of the subplot grid; it must offer at least ``len(variables)`` cells.
    """
    fig = plt.figure(figsize=(16, 12))
    for i, var_name in enumerate(variables):
        values = frame[var_name]
        ax = fig.add_subplot(n_rows, n_cols, i + 1)
        # `normed` was removed from matplotlib's hist(); `density` is its replacement.
        values.hist(bins=10, ax=ax, density=True, alpha=0.3)
        # gaussian_kde raises on a constant column (singular covariance), so the
        # overlay is only drawn when the column actually varies.
        if np.min(values) != np.max(values):
            kde = stats.gaussian_kde(values)
            xx = np.linspace(np.min(values), np.max(values), 1000)
            ax.plot(xx, kde(xx))
        ax.set_title(var_name+" Distribution")
    fig.tight_layout()
    plt.show()

draw_histograms(df, cols[:20], 5, 4)

In [None]:
draw_histograms(df, cols[20:40], 5, 4)

In [None]:
draw_histograms(df, cols[40:60], 5, 4)

In [None]:
draw_histograms(df, cols[60:], 5, 3)

## Testing for Normality
normaltest returns a 2-tuple of the chi-squared statistic, and the associated p-value. Given the null hypothesis that x came from a normal distribution, the p-value represents the probability that a chi-squared statistic that large (or larger) would be seen.

If the p-val is very small, it means it is unlikely that the data came from a normal distribution. (http://stackoverflow.com/questions/12838993/scipy-normaltest-how-is-it-used)

In [None]:
# xcols=X.columns.tolist()
# xcols[0]

def assess_normality_of_feats(frame, var):
    """Tell whether column ``var`` of ``frame`` passes scipy's normaltest at the 5% level.

    Returns a truthy value when the test's p-value is at least 0.05, i.e. when
    normality is NOT rejected.
    """
    p_value = stats.normaltest(frame[var])[-1]
    return p_value >= 0.05

def obtain_normal_feats(frame, var_lst):
    """Return, in input order, the names in ``var_lst`` whose columns pass the normality check."""
    return [name for name in var_lst if assess_normality_of_feats(frame, name)]
    
#norm_no_mc = ['Chlamydia','PCT_65OVER','PCT_HSPNC','PCT_OTHER','PCT_UNDER5','PCT_UNDR18','PCT_WHITE','cnty_dm_pct_adj','cnty_inactive_pct_adj','opiods_rx_1000','readm_30_cabg']

norm_cols = obtain_normal_feats(frame=df, var_lst=cols)
norm_cols

In [None]:
from sklearn.preprocessing import StandardScaler
# Build the design matrix: every column except the identifier, the target and
# the columns listed in `exclude`.
observations = len(df)
cols = [col for col in cols if col != 'County']
target = 'n_food_des'
exclude = ['County','n_food_des','rural_des','urban_des','pop2010_in_des','LILATracts_1And10','des_percent','cnty_obesity_pct','cnty_inactive_pct','cnty_dm_pct', 'high_food_des_prev']
predictors = [column for column in cols if column not in exclude]
X = df[predictors]
y  = df[target].values

# Standardize the predictors and keep the fitted means / scales so the
# coefficients can be mapped back to the original units later.
standardization = StandardScaler()
Xst = standardization.fit_transform(X)
original_means = standardization.mean_
# `std_` was a deprecated alias that has since been removed from StandardScaler;
# `scale_` holds the same per-feature scaling factors.
# (The misspelt name `originanal_stds` is kept because later cells read it.)
originanal_stds = standardization.scale_
# Append a column of ones so the last weight acts as the bias term.
Xst = np.column_stack((Xst,np.ones(observations)))

In [None]:
import random

def random_w( p ):
    """Return ``p`` initial weights drawn independently from a standard normal."""
    return np.array([np.random.normal() for j in range(p)])

def hypothesis(X,w):
    """Linear prediction ``X . w``."""
    return np.dot(X,w)

def loss(X,w,y):
    """Residuals (prediction minus target) for every observation."""
    return hypothesis(X,w) - y

def squared_loss(X,w,y):
    """Element-wise squared residuals."""
    return loss(X,w,y)**2

def gradient(X,w,y):
    """Return the list of partial derivatives of the halved mean squared loss.

    The residual vector does not depend on ``j``, so it is computed once
    instead of once per coefficient.
    """
    n = float(len( y ))
    residuals = loss(X,w,y)
    return [np.sum(residuals * X[:,j]) / n for j in range(len(w))]

def update(X,w,y, alpha=0.01):
    """Take one gradient-descent step of size ``alpha``; returns the new weights as a list."""
    return [t - alpha*g for t, g in zip(w, gradient(X,w,y))]

def optimize(X,y, alpha=0.01, eta = 10**-12, iterations = 1000):
    """Fit linear-regression weights by batch gradient descent.

    Parameters
    ----------
    X : 2-D array, one row per observation (include a column of ones for a bias).
    y : 1-D array of targets.
    alpha : learning rate.
    eta : stop early once the sum of squared losses changes by at most ``eta``
        between two consecutive iterations (only checked from the 6th iteration on).
    iterations : maximum number of gradient steps.

    Returns
    -------
    (w, path) : the final weights and the sum of squared losses recorded at
        roughly 20 evenly spaced checkpoints (plus the value at early stop).
    """
    w = random_w(X.shape[1])
    path = list()
    # Integer checkpoint spacing, never below 1: `iterations / 20` is 0 under
    # integer division when iterations < 20 (modulo by zero) and a float under
    # true division (checkpoints then land on arbitrary iterations).
    checkpoint_every = max(1, iterations // 20)
    for k in range(iterations):
        SSL = np.sum(squared_loss(X,w,y))
        new_w = update(X,w,y, alpha=alpha)
        new_SSL = np.sum(squared_loss(X,new_w,y))
        w = new_w
        if k>=5 and abs(new_SSL - SSL) <= eta:
            path.append(new_SSL)
            return w, path
        if k % checkpoint_every == 0:
            path.append(new_SSL)
    return w, path

alpha = 0.02
w, path = optimize(Xst, y, alpha, eta = 10**-12, iterations = 20000)
print ("These are our final standardized coefficients: " + ', '.join(map(lambda x: "%0.4f" % x, w)))



### Unstandardizing coefficients:

In [None]:
# Map the weights learned on standardized inputs back to the original units.
# With z_j = (x_j - mean_j) / std_j, a weight w_j on z_j equals w_j / std_j on x_j.
unstandardized_betas = w[:-1] / originanal_stds
# The last weight multiplies the column of ones (bias); the mean shifts of all
# predictors are folded back into it.
unstandardized_bias  = w[-1]-np.sum((original_means / originanal_stds) * w[:-1])
print ('%8s: %8.4f' % ('bias', unstandardized_bias))
# Pair each coefficient with its column name; assumes `X` is still the
# DataFrame of predictors built before standardization — TODO confirm on re-run.
variables = X.columns.tolist()
for beta,varname in zip(unstandardized_betas, variables):
    print ('%8s: %8.4f' % (varname, beta))

### Assessing feature importance using standardized coefficients

#### First let's look at the coefficients obtained without standardization below

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LinearRegression

# list.append returns None and mutates `predictors` in place; build the
# combined list without touching `predictors`.
all_vars = predictors + [target]

predictors = X.columns.tolist()

# `dataset` holds predictors only (the target is not a column of it).
dataset = df[predictors]#[all_vars]
linear_regression = LinearRegression(normalize=False,fit_intercept=True)
standardization = StandardScaler()
Stand_coef_linear_reg = make_pipeline(standardization,linear_regression)
linear_regression.fit(X,y)
# One coefficient per column of `dataset`; slicing the names with [:-1] made
# zip() silently drop the last predictor from the listing.
for coef, var in sorted(zip(map(abs,linear_regression.coef_), dataset.columns), reverse=True):
    print ("%6.3f %s" % (coef,var))

#### Now let's look at the coefficients following standardization:

"Having all the predictors on a similar scale now, we can easily provide a more realistic interpretation of each coefficient. Clearly, it appears that a unit change has more impact when it involves the variables POP2010, Adolescent_births, PCT_UNDER5, OHU2010, readm_30_stk, num_urban, FFR12, PCT_UNDR18, n_hospitals, PCT_65OVER, PCT_18_64, readm_30_copd, and NUMGQTRS. The order of the features below shows their relevancy when standardized for predicting the number of food deserts present within a county" (Linear Regression, p.83).

In [None]:
# Refit through the pipeline so the regression sees standardized predictors.
Stand_coef_linear_reg.fit(X,y)
# `dataset` has exactly one column per coefficient; the previous [:-1] slice
# dropped the last predictor from the ranking.
for coef, var in sorted(zip(map(abs,Stand_coef_linear_reg.steps[1][1].coef_), dataset.columns), reverse=True):
    print ("%6.3f %s" % (coef,var))

In [None]:
from sklearn.linear_model import lars_path

print("Computing regularization path using the LARS ...")
# Lasso regularization path via LARS; `coefs` has one column per step of the path.
# NOTE(review): `X.values` requires X to still be a DataFrame here; the features
# are passed unscaled — confirm that is intended.
alphas, _, coefs = lars_path(X.values, y, method='lasso', verbose=True)

# x-axis: L1 norm of the coefficient vector at each step, scaled by its value
# at the last step so the axis ends at 1.
xx = np.sum(np.abs(coefs.T), axis=1)
xx /= xx[-1]

# One line per feature; dashed verticals mark every step of the path.
plt.plot(xx, coefs.T)
ymin, ymax = plt.ylim()
plt.vlines(xx, ymin, ymax, linestyle='dashed')
plt.xlabel('|coef| / max|coef|')
plt.ylabel('Coefficients')
plt.title('LASSO Path')
plt.axis('tight')
plt.show()

## Lasso

In [None]:
from sklearn.linear_model import Lasso
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X = scaler.fit_transform(dataset)#.values)
names = dataset.columns.tolist()
  
lasso = Lasso(alpha=.3)
lasso.fit(X, y)

#A helper method for pretty-printing linear models
def pretty_print_linear(coefs, names = None, sort = False):
    """Format a linear model as ``coef * name`` terms joined by ``"\\n + "``.

    Parameters
    ----------
    coefs : sequence of numbers, the model coefficients.
    names : optional sequence of labels (list, ndarray or pandas Index);
        defaults to ``X0, X1, ...``.
    sort : when True, terms are ordered by decreasing absolute coefficient.

    Returns
    -------
    str with every coefficient rounded to 3 decimals.
    """
    # Identity check: `names == None` is evaluated element-wise for an ndarray
    # or Index of names and then raises inside the `if`.
    if names is None:
        names = ["X%s" % x for x in range(len(coefs))]
    lst = zip(coefs, names)
    if sort:
        lst = sorted(lst,  key = lambda x:-np.abs(x[0]))
    return "\n + ".join("%s * %s" % (round(coef, 3), name)
                                   for coef, name in lst)
  
print "Lasso model: ", pretty_print_linear(lasso.coef_, names, sort = True)


In [None]:
#good predictors
sns.lmplot(x='n_rural', y=target, data=df)
sns.lmplot(x='HUNVFlag', y=target, data=df)
sns.lmplot(x='HIV', y=target, data=df)
sns.lmplot(x='PCT_BLACK', y=target, data=df)
sns.lmplot(x='PCT_UNDER5', y=target, data=df)
sns.lmplot(x='HIV', y=target, data=df)

# bad predictor
sns.lmplot(x='FFR12', y=target, data=df)


In [None]:
pd.DataFrame([xx, coefs.T])
coefs
X

In [None]:
from sklearn.metrics import r2_score
linear_regression = LinearRegression(normalize=False,fit_intercept=True)

def r2_est(X,y):
    """In-sample R^2 of the module-level `linear_regression` refit on (X, y)."""
    return r2_score(y,linear_regression.fit(X,y).predict(X))

print ('Baseline R2: %0.3f' %  r2_est(X,y))


# `X` is a DataFrame in some cells and a scaled ndarray after the Lasso cell;
# np.asarray accepts both, whereas `X.values` fails on an ndarray.
X_matrix = np.asarray(X)
# The baseline fit does not depend on the dropped column, so compute it once.
baseline_r2 = r2_est(X_matrix, y)
r2_impact = list()
for j in range(X_matrix.shape[1]):
    # Refit without column j and record how much R^2 is lost.
    selection = [i for i in range(X_matrix.shape[1]) if i!=j]
    r2_impact.append((baseline_r2 - r2_est(X_matrix[:,selection],y), dataset.columns[j]))
for imp, varname in sorted(r2_impact, reverse=True):
    print ('%6.3f %s' %  (imp, varname))

Let's take another look at our dataset by calculating some summary statistics, and making yet another correlation matrix.

In [None]:
df.describe()

In [None]:
# calculate the correlation matrix
corr_dataframe = df.corr()

# compute hierarchical cluster on both rows and columns for correlation matrix and plot heatmap 
def corr_heatmap(corr_dataframe):
    """Draw a correlation heatmap with rows and columns reordered by hierarchical clustering.

    Parameters
    ----------
    corr_dataframe : pandas.DataFrame
        Square correlation matrix whose columns name the variables.

    The variables are clustered (single linkage, correlation distance) and the
    matrix is permuted to the dendrogram leaf order before plotting.
    """
    import scipy.cluster.hierarchy as sch

    corr_matrix = np.array(corr_dataframe)
    col_names = corr_dataframe.columns

    linkage = sch.linkage(corr_matrix, 'single', 'correlation')
    leaf_order = sch.dendrogram(linkage, color_threshold=0, no_plot=True)['leaves']
    # Apply the same permutation to rows, columns and labels.
    corr_matrix = corr_matrix[leaf_order, :]
    corr_matrix = corr_matrix[:, leaf_order]
    col_names = col_names[leaf_order]
    plt.imshow(corr_matrix, interpolation='nearest', aspect='auto', cmap='bwr')
    plt.colorbar()
    plt.xticks(range(corr_matrix.shape[0]), col_names, rotation='vertical', fontsize=4)
    # imshow draws row k at y == k, so the y labels follow the same order as the
    # rows; reversing them attached every label to the wrong row.
    plt.yticks(range(corr_matrix.shape[0]), col_names, fontsize=4)
    
# plot
corr_heatmap(corr_dataframe)

In [None]:
def remove_high_corr(corr_dataframe, thresh = 0.9):
    '''Iteratively drop predictors until no pairwise |correlation| reaches ``thresh``.

    Parameters
    ----------
    corr_dataframe : pandas.DataFrame, square correlation matrix.
    thresh : pairs with absolute correlation >= thresh trigger a removal.

    Returns
    -------
    list of the column names that remain. The removed names are printed in
    removal order. ``corr_dataframe`` itself is not modified.
    '''
    # np.array makes a writable copy (DataFrame.as_matrix() no longer exists).
    abs_corr = np.abs(np.array(corr_dataframe, dtype=float))
    # Undefined correlations (e.g. constant columns) are treated as 0; a NaN
    # would otherwise make np.max return NaN and end the loop silently.
    abs_corr = np.nan_to_num(abs_corr)
    col_names = list(corr_dataframe.columns)

    # set up diagonal to 0 so a variable is never paired with itself
    np.fill_diagonal(abs_corr, 0)

    print("Removed predictors (in order): \n")
    while abs_corr.size and np.max(abs_corr) >= thresh:
        i, j = np.unravel_index(abs_corr.argmax(), abs_corr.shape) # find maximum element
        rdx = which_to_remove(i, j, abs_corr)
        # remove corresponding predictor
        print(col_names.pop(rdx))
        abs_corr = np.delete(abs_corr, rdx, 0)
        abs_corr = np.delete(abs_corr, rdx, 1)

    return col_names

def which_to_remove(i, j, abs_corr):
    '''Return whichever of ``i`` / ``j`` has the higher mean |correlation| with the other predictors.

    The mean is taken over each row with its own diagonal entry left out.
    (The earlier ``== 0`` mask averaged only zero entries, so both means were
    0 and ``j`` was always returned.)
    '''
    others_i = np.delete(abs_corr[i, :], i)
    others_j = np.delete(abs_corr[j, :], j)
    i_absmean = np.mean(others_i) if others_i.size else 0.0
    j_absmean = np.mean(others_j) if others_j.size else 0.0

    return i if i_absmean > j_absmean else j

# remained predictors
col_remained = remove_high_corr(corr_dataframe)
data=df[col_remained]
corr_dataframe = data.corr()
corr_heatmap(corr_dataframe)

The correlation matrix below shows the dataset WITHOUT the highly correlated features identified above.

In [None]:
corrmat = corr_dataframe

# Set up the matplotlib figure
f, ax = plt.subplots(figsize=(16, 12))

# Draw the heatmap using seaborn, and add a title to the plot
sns.heatmap(corrmat, vmax=.8, square=True)
ax.set_title('CA Food Desert Data Correlations')
f.tight_layout()

Whereas this correlation matrix shows the dataset with all original features (WITH the highly correlated features identified above). 

In [None]:
corrmat = df.corr()

# Set up the matplotlib figure
f, ax = plt.subplots(figsize=(16, 12))

# Draw the heatmap using seaborn, and add a title to the plot
sns.heatmap(corrmat, vmax=.8, square=True)
ax.set_title('CA Food Desert Data Correlations')
f.tight_layout()

## Univariate Feature Selection:
#### Pearson Correlation Coefficient
* The Pearson Correlation measures the linear relationship between X (the predictor variable) and Y (the response variable). Values can range from -1 (which represents a perfect negative correlation) to 1 (which represents a perfect positive correlation). A negative correlation suggests that as the predictor variable, X, increases in value, we are likely to see a decrease in the value of our target variable, Y. A positive correlation suggests that as the predictor variable increases in value, we are likely to see an increase in the target variable as well. A pearson correlation of 0 suggests there is no relationship between the two variables, and they are of no use in predicting the value of one another. 

______
Now that we have an understanding of what the Pearson Correlation Coefficient represents, let's use the functions defined below to determine what features are correlated with 'n_food_des,' or the number of food deserts in California counties.

In [None]:

def order(frame,var):
    """Return ``frame`` with the columns listed in ``var`` moved to the front.

    ``var`` must be a list; the remaining columns keep their relative order.
    """
    trailing = [name for name in frame.columns if name not in var]
    return frame[var + trailing]

def covariance(variable_1, variable_2, bias=0):
    """Covariance of two equally long variables.

    Divides by ``n`` when ``bias`` is 0 and by ``n - 1`` when ``bias`` is 1 or more.
    """
    n_obs = float(len(variable_1))
    deviations_1 = variable_1 - np.mean(variable_1)
    deviations_2 = variable_2 - np.mean(variable_2)
    denominator = n_obs - min(bias,1)
    return np.sum(deviations_1 * deviations_2)/denominator

def standardize(variable):
    """Centre ``variable`` on its mean and scale it by its (population) standard deviation."""
    centred = variable - np.mean(variable)
    return centred / np.std(variable)

def correlation(var1,var2,bias=0):
    """Pearson correlation computed as the covariance of the standardized variables."""
    z_1 = standardize(var1)
    z_2 = standardize(var2)
    return covariance(z_1, z_2, bias)

In [None]:
cols = df.columns.tolist()
target = 'n_food_des' 
exclude = ['County','n_food_des','rural_des','urban_des','pop2010_in_des','LILATracts_1And10','des_percent','cnty_obesity_pct','cnty_inactive_pct','cnty_dm_pct', 'high_food_des_prev']
predictors = [column for column in cols if column not in exclude]
names = predictors
X = df[predictors].values
Y = df[target].values

In [None]:
def assess_feats_correlation(predictor_vars, target_var, dataset):
    """Correlate every predictor with the target using the notebook's `correlation` helper.

    Parameters
    ----------
    predictor_vars : iterable of column names in ``dataset``.
    target_var : name of the response column in ``dataset``.
    dataset : pandas.DataFrame holding predictors and response.

    Returns
    -------
    pandas.DataFrame indexed by predictor name with columns
    ``target_var`` (the response name) and ``correlation`` (float).
    """
    feats_corr = {}
    for feature in predictor_vars:
        # Record the response actually used (the `target_var` argument); the
        # earlier code stored whatever the global `target` happened to be.
        feats_corr[feature] = (target_var, correlation(dataset[feature], dataset[target_var]))
    feats_corr = pd.DataFrame(feats_corr).T
    feats_corr.columns = ['target_var', 'correlation']
    feats_corr['correlation'] = feats_corr['correlation'].astype(float)
    return feats_corr
        

# target = 'pop2010_in_des'
# pop_des_corr = assess_feats_correlation(predictors, target, df)
# pop_des_corr['correlation'].plot(kind='bar')
# plt.show()

target = 'n_food_des' 
n_food_des_corr = assess_feats_correlation(predictors, target, df)

n_food_des_corr['correlation'].plot(kind='bar')
n_food_des_corr.sort_values('correlation')


In [None]:
n_food_des_corr
#sns.distplot(df['HIV'])
#sns.distplot(standardize(df['HIV']))
pearsonr(df['HIV'], df['n_food_des'])

def assess_feats_correlation(predictor_vars, target_var, dataset):
    """Pearson correlation and p-value of every predictor against the target.

    Returns a DataFrame indexed by predictor name with float columns
    ``correlation`` and ``pval`` (as reported by ``pearsonr``).
    Side effect: sets numpy's print options to 3 decimals without
    scientific notation.
    """
    np.set_printoptions(precision=3, suppress=True)
    stats_by_feature = {
        name: pearsonr(dataset[name], dataset[target_var])
        for name in predictor_vars
    }
    table = pd.DataFrame(stats_by_feature).T
    table.columns = ['correlation', 'pval']
    for column in ('correlation', 'pval'):
        table[column] = table[column].astype(float)
    return table

target = 'n_food_des' 
n_food_des_corr = assess_feats_correlation(predictors, target, df)
n_food_des_corr[n_food_des_corr['pval'] < 0.05]

In [None]:
keepers = list(n_food_des_corr[n_food_des_corr['pval'] < 0.05].index)
len(df[keepers].columns.tolist())

## Accounting for Collinearity

#### Definitions:
   * Collinearity: Shared variance between two variables
   * Multi-collinearity: Shared variance among three or more variables

In the previous cells, we assessed each predictor variable's correlation with our target variable using Pearsons Correlation Coefficient. It is important to note however, this approach is only valid when measuring a relationship between two independent variables. 

Therefore, if we want to include multiple features in our linear regression model, we must assess our features for collinearity. Meaning we must determine whether "the relation between the variance of the predictor and that of the target is due to unique or shared variance" (Linear Regression, p72).

This can be done by evaluating the partial correlation of each feature we plan to utilize in our model. This value "represents the exclusive contribution of a variable in predicting the response," and will help us avoid misinterpreting collinear features as significant predictors (Linear Regression, p73).

Let's assess our features using a correlation matrix.

In [None]:
# The predictor variables that were shown to have significant correlation with our target 
# variable independently.
X = df[keepers] 
correlation_matrix = X.corr()
print correlation_matrix

In [None]:
def visualize_correlation_matrix(data, hurdle = 0.0):
    """Plot the correlation matrix of ``data`` as a heatmap.

    Parameters
    ----------
    data : pandas.DataFrame, one column per variable.
    hurdle : correlations whose absolute value is below this threshold are
        set to 0 before plotting, so only stronger associations stand out.
    """
    import matplotlib as mpl
    variables = data.columns.tolist()
    R = np.corrcoef(data, rowvar=0)
    R[np.where(np.abs(R)<hurdle)] = 0.0
    heatmap = plt.pcolor(R, cmap=mpl.cm.coolwarm, alpha=0.8)
    heatmap.axes.set_frame_on(False)
    # Ticks sit at the centre of each cell.
    heatmap.axes.set_yticks(np.arange(R.shape[0]) + 0.5, minor=False)
    heatmap.axes.set_xticks(np.arange(R.shape[1]) + 0.5, minor=False)
    heatmap.axes.set_xticklabels(variables, minor=False)
    plt.xticks(rotation=90)
    heatmap.axes.set_yticklabels(variables, minor=False)
    # Booleans rather than the strings 'off': current matplotlib treats any
    # non-empty string as truthy, which would leave the tick marks on.
    plt.tick_params(axis='both', which='both', bottom=False,
                    top=False, left=False, right=False)
    plt.colorbar()
    plt.show()

visualize_correlation_matrix(X, hurdle=0.5)


Having a cut at 0.5 correlation (which translates into a 25% shared variance), the heat map immediately reveals how senior_flu_deaths and PSYCH_R are not so related to other predictors.

"An even more automatic way to detect such associations (and figure out numerical problems in a matrix inversion) is to use eigenvectors. Explained in layman's terms, eigenvectors are a very smart way to recombine the variance among the variables, creating new features accumulating all the shared variance. Such recombination
can be achieved using the NumPy linalg.eig function, resulting in a vector of eigenvalues (representing the amount of recombined variance for each new variable) and eigenvectors (a matrix telling us how the new variables relate to the old ones)" (Regression Analysis, p.74):

In [None]:
corr = np.corrcoef(X, rowvar=0)
eigenvalues, eigenvectors = np.linalg.eig(corr)

"After extracting the eigenvalues, we print them in descending order and look for any element whose value is near to zero or small compared to the others. Near zero values can represent a real problem for normal equations and other optimization methods based on matrix inversion. Small values represent a high but not critical source of multicollinearity. If you spot any of these low values, keep a note of their index in the list."(Regression Analysis, p.74)

In [None]:
def id_near_zero_eigenvalues(eig_vals):
    """Return the positions of the eigenvalues that are at most 0.1.

    Small eigenvalues of a correlation matrix flag directions of shared
    variance, i.e. potential multicollinearity. The function now inspects the
    ``eig_vals`` argument; it previously ignored it and read the global
    ``eigenvalues``.
    """
    return [idx for idx, value in enumerate(eig_vals) if value <= .1]

print eigenvalues
possible_multicollinear_evals = id_near_zero_eigenvalues(eigenvalues)

In [None]:
variables = X.columns.tolist()

Using their index position in the list of eigenvalues, we can recall their specific vector from eigenvectors, which contains all the variable loadings—that is, the
level of association with the original variables. Our eigenvalues dictate that we should investigate the eigenvectors from index 19 to index 34. The functions below use the previously defined id_near_zero_eigenvalues and the eigenvectors to count the number of times a feature was found to have a high amount of collinearity. 

In [None]:

def exclude_collinear_vars(eig_vecs, explanatory_vars):
    """Return the variables whose loading on one eigenvector lies within [-0.1, 0.1].

    ``eig_vecs`` is a single eigenvector (one loading per variable) and
    ``explanatory_vars`` the matching variable names.
    """
    return [explanatory_vars[i]
            for i, loading in enumerate(list(eig_vecs))
            if -0.1 <= loading <= 0.1]

def id_best_feats(eig_vals, eig_vecs=None, explanatory_vars=None):
    """Count, per variable, how often it is NOT loaded on a near-zero eigenvector.

    Parameters
    ----------
    eig_vals : eigenvalues of the predictors' correlation matrix.
    eig_vecs : matching eigenvector matrix (one eigenvector per column);
        defaults to the module-level ``eigenvectors``.
    explanatory_vars : variable names in matrix order; defaults to the
        module-level ``variables``.

    Returns
    -------
    pandas.DataFrame indexed by variable name with a single ``count`` column.
    """
    import itertools, collections
    # Optional arguments fall back to the notebook globals the function used to
    # read unconditionally, so existing one-argument calls keep working.
    if eig_vecs is None:
        eig_vecs = eigenvectors
    if explanatory_vars is None:
        explanatory_vars = variables
    # Use the eigenvalues that were passed in (they were previously ignored).
    assess_further = id_near_zero_eigenvalues(eig_vals)
    good_vars = [exclude_collinear_vars(eig_vecs[:, i], explanatory_vars)
                 for i in assess_further]
    counter = collections.Counter(itertools.chain(*good_vars))
    # Build the frame from explicit lists; wrapping dict views in a list is fragile.
    times_nomc_found = pd.DataFrame({'count': list(counter.values())},
                                    index=list(counter.keys()))
    return times_nomc_found

num_no_mc_found = id_best_feats(eigenvalues)

num_no_mc_found
# import itertools, collections
# counter = collections.Counter(itertools.chain(*num_no_mc_found))
# times_nomc_found = pd.DataFrame([counter.values()], columns=counter.keys()).T


In [None]:
# `times_nomc_found` only exists inside id_best_feats; at module level the
# counts live in `num_no_mc_found`, so bind the name explicitly (on a copy)
# instead of relying on leftover kernel state.
times_nomc_found = num_no_mc_found.copy()
times_nomc_found.columns = ['count']
# Keep the features counted at least 10 times.
least_redundant_feats = times_nomc_found[times_nomc_found >= 10].dropna().index
feats_inc_y = [val for val in least_redundant_feats]
feats_inc_y += ['n_food_des']
corrmat = df[feats_inc_y].corr()

# Set up the matplotlib figure
f, ax = plt.subplots(figsize=(16, 12))

# Draw the heatmap using seaborn, and add a title to the plot
sns.heatmap(corrmat, vmax=.8, square=True)
ax.set_title('CA Food Desert Data Correlations')
f.tight_layout()

In [None]:
from sklearn.feature_selection import RFECV
from sklearn.cross_validation import KFold
from sklearn.cross_validation import train_test_split
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression

# Polynomial expansions: full degree-2 terms, and degree-3 interaction-only terms.
# NOTE(review): `third_order` is not used anywhere in this cell.
second_order=PolynomialFeatures(degree=2, interaction_only=False)
third_order=PolynomialFeatures(degree=3, interaction_only=True)

# Hold out 30% of the rows.
# NOTE(review): X and Y are whatever earlier cells last bound to those names — confirm on a fresh run.
X_train, X_test, y_train, y_test =  train_test_split(X, Y, test_size=0.30, random_state=1)
lm = LinearRegression()
# 10-fold shuffled cross-validation over the training rows.
cv_iterator = KFold(n=len(X_train), n_folds=10, shuffle=True, random_state=101)
# Recursive feature elimination, removing one feature per step and scoring each subset by CV.
recursive_selector = RFECV(estimator=lm, step=1, cv=cv_iterator,scoring='mean_squared_error')
# The selector is fitted on the degree-2 expansion of the training predictors.
recursive_selector.fit(second_order.fit_transform(X_train),y_train)
print ('Initial number of features : %i' % second_order.fit_transform(X_train).shape[1])
print ('Optimal number of features : %i' % recursive_selector.n_features_)


In [None]:
recursive_selector.ranking_ 

## Bootstrapping for Selecting Stable Features

In [None]:
import random

def Bootstrap(n, n_iter=3, random_state=None):
    """
    Random sampling with replacement cross-validation generator.
    For each iter a sample bootstrap of the indexes [0, n) is
    generated and the function returns the obtained sample
    and a list of all the excluded indexes.

    Parameters
    ----------
    n : number of observations to resample.
    n_iter : number of bootstrap samples to yield.
    random_state : seed for Python's global ``random`` module; ``None``
        leaves the generator state untouched.

    Yields
    ------
    (bs, out_bs) : the n sampled indexes (with repeats) and the sorted list
        of indexes that were not drawn.
    """
    # `is not None` so that a seed of 0 is honoured (a plain truth test skipped it).
    if random_state is not None:
        random.seed(random_state)
    population = set(range(n))
    for j in range(n_iter):
        bs = [random.randint(0, n-1) for i in range(n)]
        out_bs = sorted(population - set(bs))
        yield bs, out_bs
            
            
boot = Bootstrap(n=58, n_iter=5, random_state=101)
for train_idx, validation_idx in boot:
    print (train_idx, validation_idx)


In [None]:
# Number of bootstrap refits; used for both the generator and the result array.
n_boot = 20
boot = Bootstrap(n=len(X), n_iter=n_boot, random_state=101)
lm = LinearRegression()
# One row of fitted coefficients per bootstrap sample.
bootstrapped_coef = np.zeros((n_boot, len(X.columns)))
for k, (train_idx, validation_idx) in enumerate(boot):
    # Bootstrap yields positions in [0, n), so select rows positionally;
    # `.ix` has been removed from pandas.
    lm.fit(X.iloc[train_idx,:],y[train_idx])
    bootstrapped_coef[k,:] = lm.coef_

print(bootstrapped_coef[:,10])
print(X.columns.tolist())
pd.DataFrame(bootstrapped_coef, columns = X.columns).plot()

In [None]:
boot_df = pd.DataFrame(bootstrapped_coef, columns = X.columns)
sns.distplot(boot_df.std())
plt.show()
stable_feats = boot_df.std()[boot_df.std() < 1 ].index
sns.distplot(boot_df[list(stable_feats)].std())


In [None]:
boot_df[list(stable_feats)].plot()

## Ridge

In [None]:
from sklearn.linear_model import Ridge
ridge = Ridge(normalize=True)
ridge.fit(second_order.fit_transform(X), y)
lm.fit(second_order.fit_transform(X), y)

print ('Average coefficient: Non regularized = %0.3f Ridge = %0.3f' % (np.mean(lm.coef_), np.mean(ridge.coef_)))
print ('Min coefficient: Non regularized = %0.3f Ridge = %0.3f' % (np.min(lm.coef_), np.min(ridge.coef_)))
print ('Max coefficient: Non regularized = %0.3f Ridge = %0.3f' % (np.max(lm.coef_), np.max(ridge.coef_)))

In [None]:
len(corr_dataframe.columns.tolist())

In [None]:
n_food_des_corr.describe()

In [None]:
print correlation(df['n_food_des'], df['num_tracts'])
linear_regression = smf.ols(formula='n_food_des ~ num_tracts', data=df)
fitted_model = linear_regression.fit()
fitted_model.summary()

In [None]:
# linear_regression = smf.ols(formula='percent_food_desert ~ unemployment_rate', data=df)
linear_regression = smf.ols(formula='n_food_des ~ pop2010_in_des+num_tracts+n_urban+n_rural+urban_des+rural_des+Rural+Urban+LILATracts_1And10+high_food_des_prev+cnty_obesity_pct+cnty_obesity_pct_adj+cnty_dm_pct+cnty_dm_pct_adj+cnty_inactive_pct+cnty_inactive_pct_adj+POP2010+OHU2010+NUMGQTRS+HUNVFlag+Adolescent_births+ABR+p_hs_edatt+PC_PHYS_R+DENTIST_R+PSYCH_R+PCT_HSPNC+PCT_WHITE+PCT_BLACK+PCT_ASIAN+PCT_AMIND_ESK+PCT_ISLANDER+PCT_MULTI+PCT_OTHER+PCT_65OVER+PCT_18_64+PCT_UNDR18+PCT_UNDER5+des_percent+unemployment_rate+n_hospitals+mort_30_ami+mort_30_cabg+mort_30_copd+mort_30_hf+mort_30_pn+mort_30_stk+readm_30_ami+readm_30_cabg+readm_30_copd+readm_30_hf+readm_30_hip_knee+readm_30_hosp_wide+readm_30_pn+readm_30_stk+Chlamydia+Tuberculosis+Gonorrhea+HIV+Measles+Mumps+Pertussis+Rubella+opiods_rx_1000+opiods_greater_than_stateavg+MILK_PRICE10+SODA_PRICE10+MILK_SODA_PRICE10+PCH_FFR_07_12+FFR07+FFR12', data=df)
#linear_regression = smf.ols(formula='pop2010_in_des ~ n_food_des+num_tracts+n_urban+n_rural+urban_des+rural_des+Rural+Urban+LILATracts_1And10+high_food_des_prev+cnty_obesity_pct+cnty_obesity_pct_adj+cnty_dm_pct+cnty_dm_pct_adj+cnty_inactive_pct+cnty_inactive_pct_adj+POP2010+OHU2010+NUMGQTRS+HUNVFlag+Adolescent_births+ABR+p_hs_edatt+PC_PHYS_R+DENTIST_R+PSYCH_R+PCT_HSPNC+PCT_WHITE+PCT_BLACK+PCT_ASIAN+PCT_AMIND_ESK+PCT_ISLANDER+PCT_MULTI+PCT_OTHER+PCT_65OVER+PCT_18_64+PCT_UNDR18+PCT_UNDER5+des_percent+unemployment_rate+n_hospitals+mort_30_ami+mort_30_cabg+mort_30_copd+mort_30_hf+mort_30_pn+mort_30_stk+readm_30_ami+readm_30_cabg+readm_30_copd+readm_30_hf+readm_30_hip_knee+readm_30_hosp_wide+readm_30_pn+readm_30_stk+Chlamydia+Tuberculosis+Gonorrhea+HIV+Measles+Mumps+Pertussis+Rubella+opiods_rx_1000+opiods_greater_than_stateavg+MILK_PRICE10+SODA_PRICE10+MILK_SODA_PRICE10+PCH_FFR_07_12+FFR07+FFR12', data=df)
fitted_model = linear_regression.fit()
fitted_model.summary()

In [None]:
df=df.fillna(0)
df

In [None]:
from sklearn.cross_validation import cross_val_score, ShuffleSplit
from sklearn.datasets import load_boston
from sklearn.ensemble import RandomForestRegressor

cols = df.columns.tolist()
target = 'n_food_des'
exclude = ['County','n_food_des','rural_des','urban_des','pop2010_in_des','LILATracts_1And10','des_percent','cnty_obesity_pct','cnty_inactive_pct','cnty_dm_pct', 'high_food_des_prev']
# Predictors = every column not listed in `exclude` (an earlier, immediately
# overwritten assignment to cols2 was dead code and has been removed).
cols2 = [column for column in cols if column not in exclude]
names = cols2
X = df[cols2].values
Y = df[target].values


# Score each feature on its own: cross-validated R^2 of a small random forest
# trained on that single column.
rf = RandomForestRegressor(n_estimators=20, max_depth=4)
scores = []
for i in range(X.shape[1]):
    score = cross_val_score(rf, X[:, i:i+1], Y, scoring="r2",
                              cv=ShuffleSplit(len(X), 3, .3))
    scores.append((round(np.mean(score), 3), names[i]))
# Call form of print, consistent with the rest of the notebook.
print(sorted(scores, reverse=True))

In [None]:
from sklearn.linear_model import RandomizedLasso
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

#Data gets scaled automatically by sklearn's implementation
cols = df.columns.tolist()
target = 'n_food_des'
exclude = ['County','n_food_des','rural_des','urban_des','pop2010_in_des','LILATracts_1And10','des_percent','cnty_obesity_pct','cnty_inactive_pct','cnty_dm_pct', 'high_food_des_prev']
# Predictors = every column not listed in `exclude` (an earlier, immediately
# overwritten assignment to cols2 was dead code and has been removed).
cols2 = [column for column in cols if column not in exclude]
names = cols2
X = df[cols2].values
Y = df[target].values

# Stability selection: scores_ holds one selection score per feature.
rlasso = RandomizedLasso(alpha=0.025)
rlasso.fit(X, Y)

# Call form of print, consistent with the rest of the notebook.
print("Features sorted by their score:")
print(sorted(zip(map(lambda x: round(x, 4), rlasso.scores_), names), reverse=True))



In [None]:
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression

 
#use linear regression as the model
lr = LinearRegression()
#rank all features, i.e continue the elimination until the last one
rfe = RFE(lr, n_features_to_select=1)
rfe.fit(X,Y)
 
print "Features sorted by their rank:"
print sorted(zip(map(lambda x: round(x, 4), rfe.ranking_), names))

In [None]:
from sklearn.linear_model import (LinearRegression, Ridge, Lasso, RandomizedLasso)
from sklearn.feature_selection import RFE, f_regression
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestRegressor
from minepy import MINE
 
np.random.seed(0)
ranks = {}
 
def rank_to_dict(ranks, names, order=1):
    """Min-max scale a sequence of scores and map them onto feature names.

    Parameters
    ----------
    ranks : iterable of numeric scores, one per feature (list, ndarray or a
        lazy iterable such as ``map(float, ...)``).
    names : feature names, in the same order as ``ranks``.
    order : multiplier applied before scaling; pass -1 when a lower raw
        value means a better feature.

    Returns
    -------
    dict mapping each name to its scaled score rounded to 2 decimals.
    """
    minmax = MinMaxScaler()
    # Materialise first: np.array([iterator]) would wrap the iterator object
    # itself instead of its values.
    column = order * np.array([list(ranks)]).T
    scaled = minmax.fit_transform(column).T[0]
    return dict(zip(names, [round(x, 2) for x in scaled]))
 
lr = LinearRegression(normalize=True)
lr.fit(X, Y)
ranks["Linear reg"] = rank_to_dict(np.abs(lr.coef_), names)
 
ridge = Ridge(alpha=7)
ridge.fit(X, Y)
ranks["Ridge"] = rank_to_dict(np.abs(ridge.coef_), names)
 
lasso = Lasso(alpha=.05)
lasso.fit(X, Y)
ranks["Lasso"] = rank_to_dict(np.abs(lasso.coef_), names)
 
rlasso = RandomizedLasso(alpha=0.04)
rlasso.fit(X, Y)
ranks["Stability"] = rank_to_dict(np.abs(rlasso.scores_), names)
 
#stop the search when 5 features are left (they will get equal scores)
rfe = RFE(lr, n_features_to_select=5)
rfe.fit(X,Y)
ranks["RFE"] = rank_to_dict(map(float, rfe.ranking_), names, order=-1)
 
rf = RandomForestRegressor()
rf.fit(X,Y)
ranks["RF"] = rank_to_dict(rf.feature_importances_, names)
 
f, pval  = f_regression(X, Y, center=True)
ranks["Corr."] = rank_to_dict(f, names)
 
mine = MINE()
mic_scores = []
for i in range(X.shape[1]):
   mine.compute_score(X[:,i], Y)
   m = mine.mic()
   mic_scores.append(m)

ranks["MIC"] = rank_to_dict(mic_scores, names) 
 
r = {}
for name in names:
    r[name] = round(np.mean([ranks[method][name] 
                             for method in ranks.keys()]), 2)
methods = sorted(ranks.keys())
ranks["Mean"] = r
methods.append("Mean")
 
print "\t%s" % "\t".join(methods)
for name in names:
    print "%s\t%s" % (name, "\t".join(map(str, 
                         [ranks[method][name] for method in methods])))

In [None]:
feature_df = pd.DataFrame(ranks)
feature_df
# feature_df[:10].plot(rot=90)
# feature_df[10:20].plot(rot=90)
# feature_df[20:30].plot(rot=90)
# feature_df[30:40].plot(rot=90)
# feature_df[40:50].plot(rot=90)
# feature_df[60:70].plot(rot=90)
# feature_df[70:].plot(rot=90)

In [None]:
feature_df['Stability'].plot(kind='bar')

In [None]:
def forward_selected(data, response):
    """Linear model designed by forward selection.

    Parameters:
    -----------
    data : pandas DataFrame with all possible predictors and response

    response: string, name of response column in data

    Returns:
    --------
    model: an "optimal" fitted statsmodels linear model
           with an intercept
           selected by forward selection
           evaluated by adjusted R-squared

    Notes:
    ------
    At every round each remaining predictor is tried in turn and the one
    giving the highest adjusted R-squared is added. Selection stops as soon
    as the best candidate fails to strictly improve the score.
    """
    remaining = set(data.columns)
    remaining.remove(response)
    selected = []
    current_score = 0.0
    while remaining:
        scores_with_candidates = sorted(
            (smf.ols("{} ~ {} + 1".format(response,
                                          ' + '.join(selected + [candidate])),
                     data).fit().rsquared_adj,
             candidate)
            for candidate in remaining)
        best_new_score, best_candidate = scores_with_candidates[-1]
        # Explicit stop on "no strict improvement": the previous loop condition
        # (current_score == best_new_score) never ended when the best candidate
        # exactly tied the current score.
        if not current_score < best_new_score:
            break
        remaining.remove(best_candidate)
        selected.append(best_candidate)
        current_score = best_new_score
    formula = "{} ~ {} + 1".format(response,
                                   ' + '.join(selected))
    model = smf.ols(formula, data).fit()
    return model

In [None]:
print(__doc__)

# Author: Alexandre Gramfort and Gael Varoquaux
# License: BSD 3 clause

import warnings

import matplotlib.pyplot as plt
import numpy as np
from scipy import linalg

from sklearn.linear_model import (RandomizedLasso, lasso_stability_path,
                                  LassoLarsCV)
from sklearn.feature_selection import f_regression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import auc, precision_recall_curve
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.utils.extmath import pinvh
from sklearn.utils import ConvergenceWarning


def mutual_incoherence(X_relevant, X_irelevant):
    """Mutual incoherence, as defined by formula (26a) of [Wainwright2006].

    Computes X_irelevant^T X_relevant (X_relevant^T X_relevant)^+ and returns
    the largest row-wise sum of absolute values of that matrix.
    """
    gram_pinv = pinvh(np.dot(X_relevant.T, X_relevant))
    cross_term = np.dot(X_irelevant.T, X_relevant)
    projector = np.dot(cross_term, gram_pinv)
    row_l1_norms = np.abs(projector).sum(axis=1)
    return np.max(row_l1_norms)


# Stability-selection / feature-scoring plots. The data-simulation part of
# this cell is commented out, so the active code relies on names that are not
# assigned here.
# NOTE(review): `conditioning` is only referenced in a commented-out line
#   below, so both loop iterations execute the same active statements.
# NOTE(review): `X` and `y` are not assigned by any active line in this cell;
#   they must already exist in the kernel -- confirm which cell provides them.
# NOTE(review): `mi` and `n_relevant_features` are only assigned in
#   commented-out lines but are still used further down (plot titles and the
#   "Ground truth" markers); unless another cell defines them this raises
#   NameError on a fresh kernel.
for conditioning in (1, 1e-4):
    ###########################################################################
#     # Simulate regression data with a correlated design
    # NOTE(review): hard-coded; presumably has to equal the number of columns
    # of X because `coef` is used below as a boolean row mask -- TODO confirm.
    n_features = 65
#     n_relevant_features = 3
#     noise_level = .2
#     coef_min = .2
#     # The Donoho-Tanner phase transition is around n_samples=25: below we
#     # will completely fail to recover in the well-conditioned case
#     n_samples = 25
#     block_size = n_relevant_features

#     rng = np.random.RandomState(42)

#     # The coefficients of our model
    # With the fill-in line below commented out, `coef` stays all zeros, so
    # `coef != 0` is False everywhere and `coef == 0` is True everywhere.
    coef = np.zeros(n_features)
#     coef[:n_relevant_features] = coef_min + rng.rand(n_relevant_features)

#     # The correlation of our design: variables correlated by blocs of 3
#     corr = np.zeros((n_features, n_features))
#     for i in range(0, n_features, block_size):
#         corr[i:i + block_size, i:i + block_size] = 1 - conditioning
#     corr.flat[::n_features + 1] = 1
#     corr = linalg.cholesky(corr)

#     # Our design
#     X = rng.normal(size=(n_samples, n_features))
#     X = np.dot(X, corr)
#     # Keep [Wainwright2006] (26c) constant
#     X[:n_relevant_features] /= np.abs(
#         linalg.svdvals(X[:n_relevant_features])).max()
#     X = StandardScaler().fit_transform(X.copy())

#     # The output variable
#     y = np.dot(X, coef)
#     y /= np.std(y)
#     # We scale the added noise as a function of the average correlation
#     # between the design and the output variable
#     y += noise_level * rng.normal(size=n_samples)
#     mi = mutual_incoherence(X[:, :n_relevant_features],
#                             X[:, n_relevant_features:])

    ###########################################################################
    # Plot stability selection path, using a high eps for early stopping
    # of the path, to save computation time
    alpha_grid, scores_path = lasso_stability_path(X, y, random_state=42,
                                                   eps=0.05)

    plt.figure()
    # We plot the path as a function of alpha/alpha_max to the power 1/3: the
    # power 1/3 scales the path less brutally than the log, and enables to
    # see the progression along the path
    # NOTE(review): because `coef` is all zeros, the "relevant" (red) selection
    # picks no rows; `hg` is then likely an empty list and `hg[0]` in the
    # legend call below would raise IndexError -- verify.
    hg = plt.plot(alpha_grid[1:] ** .333, scores_path[coef != 0].T[1:], 'r')
    hb = plt.plot(alpha_grid[1:] ** .333, scores_path[coef == 0].T[1:], 'k')
    ymin, ymax = plt.ylim()
    plt.xlabel(r'$(\alpha / \alpha_{max})^{1/3}$')
    plt.ylabel('Stability score: proportion of times selected')
    plt.title('Stability Scores Path - Mutual incoherence: %.1f' % mi)
    plt.axis('tight')
    plt.legend((hg[0], hb[0]), ('relevant features', 'irrelevant features'),
               loc='best')

    ###########################################################################
    # Plot the estimated stability scores for a given alpha

    # Use 6-fold cross-validation rather than the default 3-fold: it leads to
    # a better choice of alpha:
    # Stop the user warnings outputs- they are not necessary for the example
    # as it is specifically set up to be challenging.
    with warnings.catch_warnings():
        warnings.simplefilter('ignore', UserWarning)
        warnings.simplefilter('ignore', ConvergenceWarning)
        lars_cv = LassoLarsCV(cv=6).fit(X, y)

    # Run the RandomizedLasso: we use a paths going down to .1*alpha_max
    # to avoid exploring the regime in which very noisy variables enter
    # the model
    alphas = np.linspace(lars_cv.alphas_[0], .1 * lars_cv.alphas_[0], 6)
    clf = RandomizedLasso(alpha=alphas, random_state=42).fit(X, y)
    # The tree ensemble is fitted without a random_state, unlike the
    # estimators above.
    trees = ExtraTreesRegressor(100).fit(X, y)
    # Compare with F-score
    F, _ = f_regression(X, y)

    plt.figure()
    # One curve per scoring method, each normalised by its own maximum and
    # floored at 1e-4 so it can be drawn on a log axis.
    # NOTE(review): the precision/recall AUC in the legend treats `coef != 0`
    # as ground truth, which is all False here -- the AUC values are not
    # meaningful in that case.
    for name, score in [('F-test', F),
                        ('Stability selection', clf.scores_),
                        ('Lasso coefs', np.abs(lars_cv.coef_)),
                        ('Trees', trees.feature_importances_),
                        ]:
        precision, recall, thresholds = precision_recall_curve(coef != 0,
                                                               score)
        plt.semilogy(np.maximum(score / np.max(score), 1e-4),
                     label="%s. AUC: %.3f" % (name, auc(recall, precision)))

    plt.plot(np.where(coef != 0)[0], [2e-4] * n_relevant_features, 'mo',
             label="Ground truth")
    plt.xlabel("Features")
    plt.ylabel("Score")
    # Plot only the 100 first coefficients
    plt.xlim(0, 100)
    plt.legend(loc='best')
    plt.title('Feature selection scores - Mutual incoherence: %.1f'
              % mi)

plt.show()

In [None]:
# Column listing; it is not the last expression of the cell, so its value is
# not shown as the cell output.
df.columns.tolist()
#sns.pairplot(df[['n_food_des', 'num_tracts','n_urban','n_rural', 'Rural', 'Urban','cnty_obesity_pct_adj', 'cnty_dm_pct_adj','cnty_inactive_pct_adj', 'POP2010','OHU2010','NUMGQTRS','HUNVFlag','Adolescent_births','ABR']])
#import statsmodels.api as sm
from scipy.stats.mstats import zscore
# OLS of z-scored 'n_food_des' on z-scored 'Adolescent_births'. No constant
# column is added to the regressor, so the fitted model has a single slope
# term. `y` and `x` are module-level names and remain defined after this cell.
y = df['n_food_des']
x = df['Adolescent_births']
sm.OLS(zscore(y), zscore(x)).fit().summary()

In [None]:

def order(frame, var):
    """Return `frame` with the columns named in `var` moved to the front.

    Parameters
    ----------
    frame : pandas DataFrame
    var : column label, or iterable of column labels (list, tuple, ...), to
        place first, in the given order. A bare string is treated as one
        label.

    Returns
    -------
    DataFrame selected from `frame` with the `var` columns first, followed by
    every other column in its original relative order.

    Raises
    ------
    KeyError if a label in `var` is not a column of `frame` (raised by the
    column selection).
    """
    # Materialise to a list: the original `var + varlist` only worked when
    # `var` was already a list and raised TypeError for tuples and strings.
    leading = [var] if isinstance(var, str) else list(var)
    trailing = [col for col in frame.columns if col not in leading]
    return frame[leading + trailing]

def covariance(variable_1, variable_2, bias=0):
    """Covariance of two equally long variables.

    The sum of products of the mean-centred values is divided by
    ``len(variable_1) - min(bias, 1)``: with the default ``bias=0`` the
    divisor is the number of observations, with ``bias>=1`` it is one less.
    """
    n_obs = float(len(variable_1))
    centred_1 = variable_1 - np.mean(variable_1)
    centred_2 = variable_2 - np.mean(variable_2)
    cross_sum = np.sum(centred_1 * centred_2)
    divisor = n_obs - min(bias, 1)
    return cross_sum / divisor

def standardize(variable):
    """Centre `variable` on its mean and scale it by `np.std(variable)`."""
    centred = variable - np.mean(variable)
    spread = np.std(variable)
    return centred / spread

def correlation(var1, var2, bias=0):
    """Covariance of the standardized versions of `var1` and `var2`.

    Both inputs go through `standardize`; `bias` is forwarded unchanged to
    `covariance`.
    """
    z_first = standardize(var1)
    z_second = standardize(var2)
    return covariance(z_first, z_second, bias)

In [None]:
# Put 'County' and the two target columns first (via `order`), then replace
# any missing values with 0. Both steps rebind the module-level `df`.
df=order(df, ['County','n_food_des','pop2010_in_des'])
df=df.fillna(0)

## Target Variables:
* 'n_food_des' - What counties tend to have a high number of food deserts?
* 'pop2010_in_des' - What counties are most affected by food deserts?

In [None]:
# Show the current column names of df, in order, as the cell output.
df.columns.tolist()

In [None]:
# Pairwise plot grid over the first ten columns of df, in current column order.
sns.pairplot(df[df.columns.tolist()[:10]])