In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Use seaborn's dark grid theme for every figure drawn in this notebook.
sns.set_style('darkgrid')
# Default matplotlib figure size as [width, height].
mpl.rcParams['figure.figsize'] = [18,10]

In [45]:
# Categorical / label columns that the numeric models cannot consume.
non_numeric = ['BMI_class', 'Height_class', 'Gender', 'Component', 'Branch']
# Kept for backward compatibility: the non-numeric columns plus the regression target.
regression = ['BMI_class', 'Height_class', 'Gender', 'Component', 'Branch', 'BMI']

def load_ansur(cols_to_drop, test_size=0.3, model_type='class'):
    """Load the male and female ANSUR II tables and split them for modelling.

    Parameters
    ----------
    cols_to_drop : str or list of str
        Columns removed from the feature matrix. The target column of the
        chosen ``model_type`` is always removed as well, so it can never leak
        into the features.
    test_size : float or int, default 0.3
        Forwarded to ``train_test_split``. The default matches the 0.3 split
        used by the other cells of this notebook.
    model_type : {'class', 'reg'}, default 'class'
        'class' uses 'Gender' as the target and an unseeded split;
        'reg' uses 'BMI' as the target and a split seeded with random_state=0.

    Returns
    -------
    tuple
        ``(X, y, X_train, X_test, y_train, y_test)``.

    Raises
    ------
    ValueError
        If ``model_type`` is neither 'class' nor 'reg'.
    """
    # Validate before touching the disk so a typo fails fast and loudly
    # (previously this printed a message and then crashed on the return).
    if model_type not in ('class', 'reg'):
        raise ValueError(
            "please specify model type: expected 'class' or 'reg', got {!r}".format(model_type)
        )

    # Accept a single column name as well as a list of names.
    if isinstance(cols_to_drop, str):
        cols_to_drop = [cols_to_drop]
    drop_cols = list(cols_to_drop)

    target = 'Gender' if model_type == 'class' else 'BMI'
    if target not in drop_cols:
        drop_cols.append(target)

    df_m = pd.read_csv('data/ANSUR_II_MALE.csv')
    df_f = pd.read_csv('data/ANSUR_II_FEMALE.csv')
    ansur_df = pd.concat([df_m, df_f], axis=0)

    X = ansur_df.drop(columns=drop_cols)
    y = ansur_df[target]

    # Only the regression split is seeded, mirroring the original behaviour.
    split_kwargs = {'random_state': 0} if model_type == 'reg' else {}
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, **split_kwargs
    )

    return X, y, X_train, X_test, y_train, y_test

In [4]:
# Load ANSUR for gender classification, holding out 30% of the rows for testing.
X, y, X_train, X_test, y_train, y_test = load_ansur(non_numeric, test_size=0.3)

from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training rows only and standardize them.
scaler = StandardScaler()
X_train_std = scaler.fit_transform(X_train)

### Creating a logistic regression model

In [5]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Standardize the test rows with the statistics already learned from the training rows.
X_test_std = scaler.transform(X_test)

# fit() returns the estimator itself, so construction and training can be chained.
lr = LogisticRegression().fit(X_train_std, y_train)

In [6]:
# Predict labels for the standardized test rows and report plain accuracy.
y_pred = lr.predict(X_test_std)
print(accuracy_score(y_test, y_pred))

1.0


### Inspecting the feature coefficients 

In [7]:
# Show the first ten fitted coefficients of the logistic regression.
print(lr.coef_[0][:10])

[ 0.08402224  0.1125312  -0.17673478 -0.38215931 -0.04257814 -0.04775831
  0.38436549  0.60495505  0.58372044 -0.7008085 ]


In [8]:
# Map every feature name to the magnitude of its logistic-regression coefficient.
coef_dict = dict(zip(X.columns, np.abs(lr.coef_[0])))

# Preview the first ten entries (dicts keep insertion order).
dict(list(coef_dict.items())[:10])

{'abdominalextensiondepthsitting': 0.08402224035469937,
 'acromialheight': 0.11253120359787197,
 'acromionradialelength': 0.17673477666427256,
 'anklecircumference': 0.38215931273441955,
 'axillaheight': 0.04257813798313884,
 'balloffootcircumference': 0.04775830704261175,
 'balloffootlength': 0.38436549110429374,
 'biacromialbreadth': 0.60495505215897,
 'bicepscircumferenceflexed': 0.5837204355527865,
 'bicristalbreadth': 0.7008084982203819}

In [9]:
# Features whose absolute coefficient falls below the 0.401 cut-off.
low_coef = {name: weight for name, weight in coef_dict.items() if weight < .401}

# Their column names, in the original column order.
cols = list(low_coef)

In [10]:
# Drop the weak features into a NEW frame instead of mutating X in place:
# an in-place drop makes this cell fail with a KeyError when it is run twice.
X_strong = X.drop(columns=cols)

# Fresh 70/30 split on the reduced feature set.
X_train, X_test, y_train, y_test = train_test_split(X_strong, y, test_size=0.3)

# Re-fit the scaler and the model on the reduced training rows.
lr.fit(scaler.fit_transform(X_train), y_train)

# Accuracy on the reduced, standardized test rows.
print(accuracy_score(y_test, lr.predict(scaler.transform(X_test))))

1.0


In [11]:
# Reload the full ANSUR feature set, holding out 30% of the rows for testing.
X, y, X_train, X_test, y_train, y_test = load_ansur(non_numeric, test_size=0.3)

# Fresh scaler: fit on the training rows, then apply the same scaling to the test rows.
scaler = StandardScaler()
X_train_std = scaler.fit_transform(X_train)
X_test_std = scaler.transform(X_test)

In [12]:
from sklearn.feature_selection import RFE

# Recursive feature elimination around a logistic regression, keeping 5 features.
rfe = RFE(estimator=LogisticRegression(), n_features_to_select=5, verbose=0)
# Left as the cell's last expression so the fitted selector is displayed.
rfe.fit(X_train_std, y_train)

RFE(estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
  n_features_to_select=5, step=1, verbose=0)

In [13]:
# Names of the features RFE kept; support_ is used as a boolean mask over X's columns.
X.columns[rfe.support_]

Index(['chestdepth', 'hipbreadthsitting', 'neckcircumference',
       'shouldercircumference', 'waistbacklength'],
      dtype='object')

In [14]:
#print(dict(zip(X.columns, rfe.ranking_)))

In [15]:
# Test accuracy of the fitted RFE selector on the standardized test rows.
print(accuracy_score(y_test, rfe.predict(X_test_std)))

0.9950576606260296


In [16]:
def load_pima(cols_to_drop):
    """Read the Pima Indians CSV and return the features, the target and a 70/30 split.

    ``cols_to_drop`` is removed from the feature matrix; the target is always
    the 'test' column. The split is not seeded, so it differs between calls.
    Returns ``(X, y, X_train, X_test, y_train, y_test)``.
    """
    pima = pd.read_csv('data/PimaIndians.csv')

    features = pima.drop(cols_to_drop, axis=1)
    target = pima['test']
    # train_test_split yields X_train, X_test, y_train, y_test in that order.
    splits = train_test_split(features, target, test_size=0.3)
    return (features, target, *splits)

# Load the Pima data; the 'test' column is the target, so it is dropped from the features.
X, y, X_train, X_test, y_train, y_test = load_pima('test')

# Fresh, unfitted scaler and classifier for the Pima experiments.
scaler = StandardScaler()
lr = LogisticRegression()

In [17]:
# Standardize both splits using statistics learned from the training rows only
X_train_std = scaler.fit_transform(X_train)
X_test_std = scaler.transform(X_test)

# Train on the scaled training data, then predict for the scaled test rows
y_pred = lr.fit(X_train_std, y_train).predict(X_test_std)

# Report the test accuracy and the absolute coefficient of every feature
print("{0:.1%} accuracy on test set.".format(accuracy_score(y_test, y_pred)))
print(dict(zip(X.columns, np.abs(lr.coef_[0]).round(2))))

79.7% accuracy on test set.
{'pregnant': 0.05, 'glucose': 1.21, 'diastolic': 0.08, 'triceps': 0.15, 'insulin': 0.15, 'bmi': 0.44, 'family': 0.39, 'age': 0.4}


In [18]:
X, y, X_train, X_test, y_train, y_test = load_pima('test')

# Standardize: statistics come from the training rows, then are applied to the test rows
X_train_std = scaler.fit_transform(X_train)
X_test_std = scaler.transform(X_test)

# Build an RFE around a LogisticRegression that keeps 3 features and fit it
# in one step (fit() returns the selector itself)
rfe = RFE(estimator=LogisticRegression(), n_features_to_select=3, verbose=1).fit(X_train_std, y_train)

# Ranking per feature (high = dropped early on)
print(dict(zip(X.columns, rfe.ranking_)))

# Features that survived the elimination
print(X.columns[rfe.support_])

# Accuracy of the reduced model on the test set
acc = accuracy_score(y_test, rfe.predict(X_test_std))
print("{0:.1%} accuracy on test set.".format(acc))

Fitting estimator with 8 features.
Fitting estimator with 7 features.
Fitting estimator with 6 features.
Fitting estimator with 5 features.
Fitting estimator with 4 features.
{'pregnant': 4, 'glucose': 1, 'diastolic': 3, 'triceps': 6, 'insulin': 5, 'bmi': 1, 'family': 2, 'age': 1}
Index(['glucose', 'bmi', 'age'], dtype='object')
74.6% accuracy on test set.


In [19]:
# Reload ANSUR for the tree-based experiments, holding out 30% of the rows for testing.
X, y, X_train, X_test, y_train, y_test = load_ansur(non_numeric, test_size=0.3)

In [20]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Default forest with no random_state, so results vary between runs.
rf = RandomForestClassifier()

# Trained on the unscaled training features.
rf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [21]:
# Accuracy of the random forest on the held-out test rows.
print(accuracy_score(y_test, rf.predict(X_test)))

0.985172981878089


In [22]:
# Importance score of every feature, in the column order of X_train.
print(rf.feature_importances_)

[2.20559406e-03 1.05586720e-03 8.99116235e-05 6.00281198e-04
 6.66486775e-04 2.83791670e-03 1.03215409e-04 7.97763202e-02
 6.71589414e-03 1.06308404e-02 2.15797365e-03 4.81766383e-02
 2.13048570e-03 1.77328447e-03 7.82481107e-04 1.46036092e-02
 4.32135175e-03 1.48485580e-04 1.30086529e-03 2.16774038e-03
 3.06981783e-03 6.91710442e-04 2.18635682e-03 1.41535038e-03
 6.07428435e-03 6.41092461e-02 2.94235926e-04 7.41402394e-03
 1.04520221e-03 1.13129269e-03 2.90625977e-04 1.37261045e-03
 2.46668183e-04 2.93978267e-03 7.60303476e-04 4.45806521e-02
 7.23791984e-04 5.67690366e-02 3.48224961e-03 8.63198792e-04
 6.07897313e-04 6.76984227e-02 8.57388751e-02 5.65540297e-04
 1.74761641e-03 9.05855248e-04 9.94209883e-04 1.85861943e-03
 1.01204749e-03 1.44559387e-02 1.86397493e-02 4.40297730e-04
 2.04611572e-04 3.03323353e-03 3.66760222e-03 5.88982451e-04
 3.32722758e-04 3.90562246e-04 1.59699055e-02 4.71527464e-03
 1.94681143e-03 7.29351683e-02 1.01236598e-01 6.95936187e-04
 4.51981094e-04 9.827695

In [23]:
# Boolean mask: True for features whose importance exceeds 0.03.
mask = rf.feature_importances_ > 0.03
print(mask)

[False False False False False False False  True False False False  True
 False False False False False False False False False False False False
 False  True False False False False False False False False False  True
 False  True False False False  True  True False False False False False
 False False False False False False False False False False False False
 False  True  True False False False False  True False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False]


In [24]:
# Keep only the training columns flagged by the importance mask.
X_reduced = X_train.loc[:, mask]
print(X_reduced.columns)

Index(['biacromialbreadth', 'bimalleolarbreadth', 'chestheight', 'footlength',
       'forearmcircumferenceflexed', 'handbreadth', 'handcircumference',
       'neckcircumference', 'neckcircumferencebase', 'shouldercircumference'],
      dtype='object')


### RFE with random forests

In [25]:
from sklearn.feature_selection import RFE

# RFE around a random forest: step=10 removes features ten at a time until
# six remain, which needs far fewer fits than one-at-a-time elimination.
rfe = RFE(estimator=RandomForestClassifier(), 
          n_features_to_select=6, step=10, 
            verbose=1)

# Left as the cell's last expression so the fitted selector is displayed.
rfe.fit(X_train, y_train)

Fitting estimator with 94 features.
Fitting estimator with 84 features.
Fitting estimator with 74 features.
Fitting estimator with 64 features.
Fitting estimator with 54 features.
Fitting estimator with 44 features.
Fitting estimator with 34 features.
Fitting estimator with 24 features.
Fitting estimator with 14 features.


RFE(estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
  n_features_to_select=6, step=10, verbose=1)

In [26]:
# Names of the six features that survived the elimination.
print(X_train.columns[rfe.support_])

Index(['biacromialbreadth', 'handbreadth', 'hipbreadthsitting',
       'neckcircumference', 'neckcircumferencebase', 'shouldercircumference'],
      dtype='object')


In [27]:
# Reload the Pima data with a fresh (unseeded) train/test split.
X, y, X_train, X_test, y_train, y_test = load_pima('test')

In [28]:
# Fit the random forest model to the training data.
# random_state=0 fixes the forest's own randomness; the train/test split
# produced by load_pima is not seeded, so results can still vary between runs.
rf = RandomForestClassifier(random_state=0)
rf.fit(X_train, y_train)

# Calculate the accuracy on the held-out test rows
acc = accuracy_score(y_test, rf.predict(X_test))

# Print the importances per feature, rounded to two decimals
print(dict(zip(X_train.columns, rf.feature_importances_.round(2))))

# Print accuracy
print("{0:.1%} accuracy on test set.".format(acc))

{'pregnant': 0.08, 'glucose': 0.18, 'diastolic': 0.07, 'triceps': 0.13, 'insulin': 0.14, 'bmi': 0.1, 'family': 0.07, 'age': 0.21}
78.0% accuracy on test set.


In [29]:
# Create a boolean mask that is True for feature importances above the 0.15 threshold
mask = rf.feature_importances_ > 0.15

# Prints out the mask
print(mask)

[False  True False False False False False  True]


In [30]:
# Recompute the mask for feature importances above the 0.15 threshold
mask = rf.feature_importances_ > 0.15

# Apply the mask to the columns of the training features X_train
reduced_X = X_train.loc[:, mask]

# Print the selected column names
print(reduced_X.columns)

Index(['glucose', 'age'], dtype='object')


In [31]:
# Reload the Pima data with a fresh (unseeded) train/test split.
X, y, X_train, X_test, y_train, y_test = load_pima('test')

In [32]:
# Wrap the feature eliminator around the random forest model
# (two features are removed per step until two remain)
rfe = RFE(estimator=RandomForestClassifier(), n_features_to_select=2, step=2, verbose=1)

# Fit the model to the training data
rfe.fit(X_train, y_train)

# Create a mask using an attribute of rfe
mask = rfe.support_

# Apply the mask to the columns of the full feature dataset X and print the result
reduced_X = X.loc[:, mask]
print(reduced_X.columns)

Fitting estimator with 8 features.
Fitting estimator with 6 features.
Fitting estimator with 4 features.
Index(['glucose', 'age'], dtype='object')


In [33]:
# Reload ANSUR with only the non-numeric columns removed, so the BMI column is still
# available in X; 30% of the rows are held out for testing.
X, y, X_train, X_test, y_train, y_test = load_ansur(non_numeric, test_size=0.3)

In [34]:
from sklearn.linear_model import Lasso

# Split the BMI column off as the regression target and remove it from the features.
y = X["BMI"]
X = X.drop(columns="BMI")

In [35]:
# Hold out 30% of the rows; random_state=0 makes this split repeatable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
# Fit the scaler on the training features and transform these in one go
X_train_std = scaler.fit_transform(X_train)

# Create the Lasso model with its default regularization strength
la = Lasso()

# Fit it to the standardized training data
la.fit(X_train_std, y_train)

Lasso(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=False, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False)

In [36]:
# Apply the already-fitted scaler to the test features
X_test_std = scaler.transform(X_test)

# Coefficient of determination (R squared) on the standardized test set
r_squared = la.score(X_test_std, y_test)
print("The model can predict {0:.1%} of the variance in the test set.".format(r_squared))

# Boolean array that is True wherever Lasso shrank a coefficient to exactly 0
zero_coef = la.coef_ == 0

# Count the True entries, i.e. the features the model ignores
n_ignored = zero_coef.sum()
print("The model has ignored {} out of {} features.".format(n_ignored, len(la.coef_)))

The model can predict 82.9% of the variance in the test set.
The model has ignored 83 out of 93 features.


In [37]:
# Candidate regularization strengths, strongest first; the goal is to find the
# highest alpha whose R-squared stays above 98%
alphas = [1, 0.5, 0.1, 0.01]
for a in alphas:
    # Build and fit the model in one step (fit() returns the estimator)
    la = Lasso(alpha=a, random_state=0).fit(X_train_std, y_train)

    # Performance stats: explained variance and number of zeroed coefficients
    r_squared = la.score(X_test_std, y_test)
    n_ignored_features = (la.coef_ == 0).sum()

    # Print performance stats
    print("The model can predict {0:.1%} of the variance in the test set.".format(r_squared))
    print("{} out of {} features were ignored.".format(n_ignored_features, len(la.coef_)))

The model can predict 82.9% of the variance in the test set.
83 out of 93 features were ignored.
The model can predict 90.8% of the variance in the test set.
82 out of 93 features were ignored.
The model can predict 98.5% of the variance in the test set.
69 out of 93 features were ignored.
The model can predict 99.3% of the variance in the test set.
53 out of 93 features were ignored.


### Combining feature selectors 
#### Taking a step back
- A random forest is a combination of decision trees
- We can use a combination of models for feature selection too

131 and union 

In [47]:
# Load ANSUR in regression mode (target: BMI) with a 25% test split.
X, y, X_train, X_test, y_train, y_test = load_ansur(non_numeric, 0.25, 'reg')

In [48]:
from sklearn.linear_model import LassoCV

# Cross-validated Lasso. Note: unlike the earlier Lasso cells, the features
# are passed in unscaled here.
lcv = LassoCV()
lcv.fit(X_train, y_train)

# R squared on the held-out test rows (displayed as the cell's result).
lcv.score(X_test, y_test)

0.9895676219048437

In [49]:
# True for every feature whose LassoCV coefficient is non-zero, i.e. was kept.
lcv_mask = lcv.coef_ != 0
# Number of kept features (displayed as the cell's result).
sum(lcv_mask)

38

In [50]:
from sklearn.ensemble import RandomForestRegressor

# RFE around a random forest regressor, removing 5 features per step.
# NOTE(review): 38 appears to mirror the number of features LassoCV kept in the
# previous cell; update it by hand if that count changes.
rfe_rf = RFE(estimator=RandomForestRegressor(),
            n_features_to_select=38, step=5, verbose=5)

# Left as the cell's last expression so the fitted selector is displayed.
rfe_rf.fit(X_train, y_train)

Fitting estimator with 93 features.
Fitting estimator with 88 features.
Fitting estimator with 83 features.
Fitting estimator with 78 features.
Fitting estimator with 73 features.
Fitting estimator with 68 features.
Fitting estimator with 63 features.
Fitting estimator with 58 features.
Fitting estimator with 53 features.
Fitting estimator with 48 features.
Fitting estimator with 43 features.


RFE(estimator=RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False),
  n_features_to_select=38, step=5, verbose=5)

In [52]:
# Boolean mask of the features kept by the random-forest RFE.
rf_mask = rfe_rf.support_

In [54]:
from sklearn.ensemble import GradientBoostingRegressor

# Same elimination settings as the random-forest RFE (keep 38, remove 5 per step),
# but with a gradient boosting regressor as the estimator.
rfe_gb = RFE(estimator=GradientBoostingRegressor(), 
            n_features_to_select=38, step=5, verbose=1)

# Left as the cell's last expression so the fitted selector is displayed.
rfe_gb.fit(X_train, y_train)

Fitting estimator with 93 features.
Fitting estimator with 88 features.
Fitting estimator with 83 features.
Fitting estimator with 78 features.
Fitting estimator with 73 features.
Fitting estimator with 68 features.
Fitting estimator with 63 features.
Fitting estimator with 58 features.
Fitting estimator with 53 features.
Fitting estimator with 48 features.
Fitting estimator with 43 features.


RFE(estimator=GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.1, loss='ls', max_depth=3, max_features=None,
             max_leaf_nodes=None, min_impurity_decrease=0.0,
             min_impurity_split=None, min_samples_leaf=1,
             min_samples_split=2, min_weight_fraction_leaf=0.0,
             n_estimators=100, presort='auto', random_state=None,
             subsample=1.0, verbose=0, warm_start=False),
  n_features_to_select=38, step=5, verbose=1)

In [55]:
# Boolean mask of the features kept by the gradient-boosting RFE.
gb_mask = rfe_gb.support_

In [56]:
# Stack the three boolean masks and add them up per feature: each entry is the
# number of selectors (0-3) that voted to keep that feature.
votes = np.stack([lcv_mask, rf_mask, gb_mask]).sum(axis=0)
print(votes)

[3 1 0 1 3 0 0 0 3 0 3 0 1 0 2 2 3 0 0 1 3 2 0 3 3 1 1 1 1 0 0 0 1 1 1 0 0
 3 3 0 2 0 1 0 0 1 0 1 1 0 1 3 1 3 0 1 0 2 0 3 0 3 0 1 0 2 0 2 0 0 2 1 0 1
 2 2 3 2 1 3 0 1 1 2 0 3 3 1 3 1 2 3 2]


In [57]:
# Keep a feature when at least two of the three selectors voted for it.
mask = votes >= 2

In [60]:
# Reduce the full feature set to the columns selected by the vote mask.
reduced_X = X.loc[:, mask]
#reduced_X

In [None]:
# Shell command: export the named notebook to HTML with nbconvert.
!jupyter nbconvert --to html 3_Screening_for_model_accuracy.ipynb

In [None]:
# Shell command: runs the helper script one directory up and discards both its
# stdout and stderr.
# NOTE(review): what the script does is not visible from this notebook — confirm before re-running.
!../gitbsh > /dev/null 2>&1