In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns
import warnings
# NOTE(review): this silences every warning for the rest of the session, so
# warnings raised by the libraries used in later cells will not be shown.
warnings.filterwarnings('ignore')

In [2]:
# Notebook-wide plot defaults: seaborn "darkgrid" style and a large default
# matplotlib figure size.
sns.set_style('darkgrid')
mpl.rcParams['figure.figsize'] = [18,10]

In [3]:
# Columns left out of the ANSUR feature matrix; the list includes 'Gender',
# which is the prediction target.
non_numeric = ['BMI_class', 'Height_class', 'Gender', 'Component', 'Branch']

def load_ansur(cols_to_drop, random_state=None):
    """Load the male and female ANSUR II tables and build a train/test split.

    Parameters
    ----------
    cols_to_drop : list of str
        Columns to leave out of the feature matrix ``X``. The target column
        'Gender' should be part of this list, otherwise it stays in ``X``.
    random_state : int or None, optional
        Forwarded to ``train_test_split``; pass an int for a reproducible
        split. The default (None) keeps the split random on every call.

    Returns
    -------
    tuple
        ``(X, y, X_train, X_test, y_train, y_test)`` where ``y`` is the
        'Gender' column and 30% of the rows are held out for testing.
    """
    df_m = pd.read_csv('data/ANSUR_II_MALE.csv')
    df_f = pd.read_csv('data/ANSUR_II_FEMALE.csv')
    # Rows are stacked male-first; the original row labels of both files are kept.
    ansur_df = pd.concat([df_m, df_f], axis=0)

    # Drop exactly the columns the caller asked for, not a module-level list.
    X = ansur_df.drop(cols_to_drop, axis=1)
    y = ansur_df['Gender']

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=random_state)
    return X, y, X_train, X_test, y_train, y_test

In [4]:
# Load ANSUR and split it into training and test sets.
X, y, X_train, X_test, y_train, y_test = load_ansur(non_numeric)

from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training rows only and standardise them in one call.
scaler = StandardScaler()
X_train_std = scaler.fit_transform(X_train)

### Creating a logistic regression model

In [5]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Logistic regression with default settings, trained on the standardised
# training features.
lr = LogisticRegression()
lr.fit(X_train_std, y_train)

# Scale the held-out features with the scaler fitted on the training data.
X_test_std = scaler.transform(X_test)

In [6]:
# Predict on the standardised test features and report the accuracy.
y_pred = lr.predict(X_test_std)
print(accuracy_score(y_test, y_pred))

1.0


### Inspecting the feature coefficients 

In [7]:
# First ten entries of the fitted coefficient vector (signed values).
print(lr.coef_[0][:10])

[ 0.09777254  0.12410134 -0.17732402 -0.34330663 -0.04441239 -0.10595977
  0.19960551  0.70085036  0.76774048 -0.79175596]


In [8]:
# Map every feature name to the magnitude of its coefficient.
coef_dict = {name: weight for name, weight in zip(X.columns, abs(lr.coef_[0]))}

# Display only the first ten entries.
dict(list(coef_dict.items())[:10])

{'abdominalextensiondepthsitting': 0.09777253652707805,
 'acromialheight': 0.12410134428349003,
 'acromionradialelength': 0.17732402491484334,
 'anklecircumference': 0.34330663292165575,
 'axillaheight': 0.0444123936761637,
 'balloffootcircumference': 0.10595977382708337,
 'balloffootlength': 0.19960551153544623,
 'biacromialbreadth': 0.7008503626621028,
 'bicepscircumferenceflexed': 0.767740480518377,
 'bicristalbreadth': 0.791755956664065}

In [9]:
# Keep the entries whose absolute coefficient is below the 0.401 cut-off.
low_coef = dict(filter(lambda item: item[1] < .401, coef_dict.items()))

# Names of those features, in the same order as in coef_dict.
cols = list(low_coef.keys())

In [10]:
# Drop the weak features into a new frame. Leaving X untouched keeps this
# cell re-runnable: an in-place drop raises KeyError on the second run
# because the columns are already gone.
X_reduced = X.drop(columns=cols)

# NOTE(review): this draws a fresh random split, so rows that were in the
# training set when `cols` was chosen can end up in this test set.
X_train, X_test, y_train, y_test = train_test_split(X_reduced, y, test_size=0.3)

# Refit the scaler and the model on the reduced training features.
lr.fit(scaler.fit_transform(X_train), y_train)

print(accuracy_score(y_test, lr.predict(scaler.transform(X_test))))

1.0


In [11]:
# Reload ANSUR (which also draws a new train/test split) and standardise it
# with a freshly fitted scaler.
X, y, X_train, X_test, y_train, y_test = load_ansur(non_numeric)

scaler = StandardScaler()
X_train_std = scaler.fit_transform(X_train)
X_test_std = scaler.transform(X_test)

In [12]:
from sklearn.feature_selection import RFE

# Recursive feature elimination around a logistic regression, keeping the
# 5 features that survive the elimination.
rfe = RFE(estimator=LogisticRegression(), n_features_to_select=5, verbose=0)
rfe.fit(X_train_std, y_train)

RFE(estimator=LogisticRegression(C=1.0, class_weight=None, dual=False,
                                 fit_intercept=True, intercept_scaling=1,
                                 l1_ratio=None, max_iter=100,
                                 multi_class='auto', n_jobs=None, penalty='l2',
                                 random_state=None, solver='lbfgs', tol=0.0001,
                                 verbose=0, warm_start=False),
    n_features_to_select=5, step=1, verbose=0)

In [13]:
# Names of the features that RFE kept.
X.columns[rfe.support_]

Index(['biacromialbreadth', 'bicepscircumferenceflexed', 'chestdepth',
       'hipbreadthsitting', 'neckcircumference'],
      dtype='object')

In [14]:
#print(dict(zip(X.columns, rfe.ranking_)))

In [15]:
# Test-set accuracy of the predictions made through the fitted RFE object.
print(accuracy_score(y_test, rfe.predict(X_test_std)))

0.9956068094453597


In [16]:
def load_pima(cols_to_drop, random_state=None):
    """Load the Pima Indians diabetes table and build a train/test split.

    Parameters
    ----------
    cols_to_drop : str or list of str
        Column label(s) to leave out of the feature matrix ``X``. The target
        column 'test' should be included, otherwise it stays in ``X``.
    random_state : int or None, optional
        Forwarded to ``train_test_split``; pass an int for a reproducible
        split. The default (None) keeps the split random on every call.

    Returns
    -------
    tuple
        ``(X, y, X_train, X_test, y_train, y_test)`` where ``y`` is the
        'test' column and 30% of the rows are held out for testing.
    """
    df = pd.read_csv('data/PimaIndians.csv')

    X = df.drop(cols_to_drop, axis=1)
    y = df['test']
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=random_state)
    return X, y, X_train, X_test, y_train, y_test

# 'test' is the target column, so it is the only column removed from X.
X, y, X_train, X_test, y_train, y_test = load_pima('test')

# Fresh, unfitted scaler and model for the Pima data.
scaler = StandardScaler()
lr = LogisticRegression()

In [17]:
# Fit the scaler on the training features and transform them in one go
X_train_std = scaler.fit_transform(X_train)

# Fit the logistic regression model on the scaled training data
lr.fit(X_train_std, y_train)

# Scale the test features with the scaler fitted on the training data
X_test_std = scaler.transform(X_test)

# Predict diabetes presence on the scaled test set
y_pred = lr.predict(X_test_std)

# Print the test accuracy and each feature's absolute coefficient (2 decimals)
print("{0:.1%} accuracy on test set.".format(accuracy_score(y_test, y_pred))) 
print(dict(zip(X.columns, abs(lr.coef_[0]).round(2))))

78.0% accuracy on test set.
{'pregnant': 0.27, 'glucose': 1.17, 'diastolic': 0.06, 'triceps': 0.16, 'insulin': 0.13, 'bmi': 0.56, 'family': 0.34, 'age': 0.32}


In [18]:
# Reload the Pima data (this also draws a new train/test split)
X, y, X_train, X_test, y_train, y_test = load_pima('test')

# Refit the existing scaler on the new training rows, then scale the test rows
X_train_std = scaler.fit_transform(X_train)
X_test_std = scaler.transform(X_test)

# Create the RFE with a LogisticRegression estimator and 3 features to select
rfe = RFE(estimator=LogisticRegression(), n_features_to_select=3, verbose=1)

# Fit the eliminator to the scaled training data
rfe.fit(X_train_std, y_train)

# Print the features and their ranking (1 = kept, high = dropped early on)
print(dict(zip(X.columns, rfe.ranking_)))

# Print the features that are not eliminated
print(X.columns[rfe.support_])

# Calculate the test set accuracy
acc = accuracy_score(y_test, rfe.predict(X_test_std))
print("{0:.1%} accuracy on test set.".format(acc)) 

Fitting estimator with 8 features.
Fitting estimator with 7 features.
Fitting estimator with 6 features.
Fitting estimator with 5 features.
Fitting estimator with 4 features.
{'pregnant': 3, 'glucose': 1, 'diastolic': 6, 'triceps': 5, 'insulin': 4, 'bmi': 1, 'family': 2, 'age': 1}
Index(['glucose', 'bmi', 'age'], dtype='object')
82.2% accuracy on test set.


In [19]:
# Back to the ANSUR data for the random-forest cells (new train/test split).
X, y, X_train, X_test, y_train, y_test = load_ansur(non_numeric)

In [20]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Random forest with default settings; no random_state is given, so the
# fitted forest and its importances can differ between runs.
rf = RandomForestClassifier()

# Trained directly on the unscaled training features.
rf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [21]:
# Test-set accuracy of the random forest.
print(accuracy_score(y_test, rf.predict(X_test)))

0.9928610653487095


In [22]:
# Importance score of every feature, as computed by the fitted forest.
print(rf.feature_importances_)

[0.00102682 0.000709   0.00290473 0.00078562 0.00089664 0.01165197
 0.00094452 0.09641113 0.0041592  0.00572928 0.01404881 0.01677186
 0.0009335  0.00284188 0.00099539 0.0119006  0.00357585 0.00055406
 0.002149   0.00117575 0.00189102 0.00758936 0.00087306 0.00163212
 0.00675128 0.02683303 0.00066478 0.00550082 0.00051414 0.00094254
 0.00088809 0.00206574 0.00047947 0.00466727 0.01727022 0.01498999
 0.00463261 0.0330843  0.00804855 0.00088816 0.0004025  0.07012903
 0.06512801 0.00051525 0.00155923 0.00094931 0.00074567 0.0234489
 0.00045068 0.01233253 0.02888353 0.00067019 0.00041721 0.00332492
 0.00914504 0.00032144 0.00050347 0.00080526 0.0062136  0.0052436
 0.00163285 0.11690865 0.09823149 0.00633823 0.00042961 0.00682375
 0.00643037 0.04959145 0.00085367 0.00095863 0.00392878 0.02870384
 0.00108215 0.010089   0.00093779 0.00120349 0.00810497 0.00177173
 0.00064047 0.00067062 0.00052553 0.00072461 0.00100291 0.01489107
 0.0015061  0.00165098 0.00074522 0.00055309 0.00061034 0.063734

In [23]:
# Boolean mask: True where a feature's importance is strictly above 0.03.
mask = rf.feature_importances_ > 0.03
print(mask)

[False False False False False False False  True False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False  True False False False  True  True False False False False False
 False False False False False False False False False False False False
 False  True  True False False False False  True False False False False
 False False False False False False False False False False False False
 False False False False False  True False False False False]


In [24]:
# Keep only the training columns selected by the mask and list their names.
X_reduced = X_train.loc[:, mask]
print(X_reduced.columns)

Index(['biacromialbreadth', 'forearmcircumferenceflexed', 'handbreadth',
       'handcircumference', 'neckcircumference', 'neckcircumferencebase',
       'shouldercircumference', 'wristcircumference'],
      dtype='object')


### RFE with random forests

In [25]:
from sklearn.feature_selection import RFE

# Recursive feature elimination around a random forest: remove 10 features
# per round until 6 remain, logging each round.
rfe = RFE(
    estimator=RandomForestClassifier(),
    n_features_to_select=6,
    step=10,
    verbose=1,
)

rfe.fit(X_train, y_train)

Fitting estimator with 94 features.
Fitting estimator with 84 features.
Fitting estimator with 74 features.
Fitting estimator with 64 features.
Fitting estimator with 54 features.
Fitting estimator with 44 features.
Fitting estimator with 34 features.
Fitting estimator with 24 features.
Fitting estimator with 14 features.


RFE(estimator=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                     class_weight=None, criterion='gini',
                                     max_depth=None, max_features='auto',
                                     max_leaf_nodes=None, max_samples=None,
                                     min_impurity_decrease=0.0,
                                     min_impurity_split=None,
                                     min_samples_leaf=1, min_samples_split=2,
                                     min_weight_fraction_leaf=0.0,
                                     n_estimators=100, n_jobs=None,
                                     oob_score=False, random_state=None,
                                     verbose=0, warm_start=False),
    n_features_to_select=6, step=10, verbose=1)

In [26]:
# Names of the features that RFE kept.
print(X_train.columns[rfe.support_])

Index(['biacromialbreadth', 'handbreadth', 'handcircumference',
       'neckcircumference', 'neckcircumferencebase', 'shouldercircumference'],
      dtype='object')


In [27]:
# Reload the Pima data for the random-forest cells (new train/test split).
X, y, X_train, X_test, y_train, y_test = load_pima('test')

In [28]:
# Fit the random forest model to the training data
# (random_state=0 fixes the forest's own randomness; the train/test split
# itself is still drawn without a seed)
rf = RandomForestClassifier(random_state=0)
rf.fit(X_train, y_train)

# Calculate the accuracy on the test set
acc = accuracy_score(y_test, rf.predict(X_test))

# Print the importance of each feature, rounded to 2 decimals
print(dict(zip(X_train.columns, rf.feature_importances_.round(2))))

# Print accuracy
print("{0:.1%} accuracy on test set.".format(acc))

{'pregnant': 0.08, 'glucose': 0.24, 'diastolic': 0.07, 'triceps': 0.09, 'insulin': 0.15, 'bmi': 0.1, 'family': 0.11, 'age': 0.15}
78.0% accuracy on test set.


In [29]:
# Create a mask for feature importances above the threshold
# (strict comparison: an importance must exceed 0.15 to be selected)
mask = rf.feature_importances_ > 0.15

# Print out the mask
print(mask)

[False  True False False  True False False  True]


In [30]:
# Create a mask for feature importances above the threshold
# (same expression as in the previous cell, recomputed here)
mask = rf.feature_importances_ > 0.15

# Apply the mask to the training features
reduced_X = X_train.loc[:, mask]

# Print out the selected column names
print(reduced_X.columns)

Index(['glucose', 'insulin', 'age'], dtype='object')


In [31]:
# Wrap the feature eliminator around the random forest model
# NOTE(review): this forest has no random_state, unlike the `rf` fitted
# earlier with random_state=0, so the selected features may vary between runs.
rfe = RFE(estimator=RandomForestClassifier(), n_features_to_select=2, verbose=1)

In [32]:
# Fit the eliminator to the (unscaled) training data
rfe.fit(X_train, y_train)

Fitting estimator with 8 features.
Fitting estimator with 7 features.
Fitting estimator with 6 features.
Fitting estimator with 5 features.
Fitting estimator with 4 features.
Fitting estimator with 3 features.


RFE(estimator=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                     class_weight=None, criterion='gini',
                                     max_depth=None, max_features='auto',
                                     max_leaf_nodes=None, max_samples=None,
                                     min_impurity_decrease=0.0,
                                     min_impurity_split=None,
                                     min_samples_leaf=1, min_samples_split=2,
                                     min_weight_fraction_leaf=0.0,
                                     n_estimators=100, n_jobs=None,
                                     oob_score=False, random_state=None,
                                     verbose=0, warm_start=False),
    n_features_to_select=2, step=1, verbose=1)

In [33]:
# NOTE(review): runs a local script outside this notebook; judging by the
# recorded output it commits and pushes the repository — confirm this side
# effect is wanted on every "Run All".
!../gitbsh

[master 1eb3912] 2020-02-04
 1 file changed, 389 insertions(+), 117 deletions(-)
Counting objects: 4, done.
Delta compression using up to 4 threads.
Compressing objects: 100% (4/4), done.
Writing objects: 100% (4/4), 3.01 KiB | 0 bytes/s, done.
Total 4 (delta 3), reused 0 (delta 0)
remote: Resolving deltas: 100% (3/3), completed with 3 local objects.[K
To git@github.com:MikSm1th/datacamp_notes.git
   7f08f81..1eb3912  master -> master
Committed: 2020-02-04
