In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

In [2]:
sns.set_style('darkgrid')
mpl.rcParams['figure.figsize'] = [18,10]

In [3]:
df_m = pd.read_csv('data/ANSUR_II_MALE.csv')
df_f = pd.read_csv('data/ANSUR_II_FEMALE.csv')
ansur_df = pd.concat([df_m, df_f], axis=0)

In [4]:
from sklearn.model_selection import train_test_split

non_numeric = ['BMI_class', 'Height_class', 'Gender', 'Component', 'Branch']

X = ansur_df.drop(non_numeric, axis=1)
y = ansur_df['Gender']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

In [5]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train_std = scaler.fit_transform(X_train)

### Creating a logistic regression model

In [6]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

lr = LogisticRegression()
lr.fit(X_train_std, y_train)

X_test_std = scaler.transform(X_test)

In [7]:
y_pred = lr.predict(X_test_std)
print(accuracy_score(y_test, y_pred))

1.0


### Inspecting the feature coefficients 

In [8]:
print(lr.coef_[0][:10])

[ 0.04742924  0.09439899 -0.13213384 -0.27824795 -0.08765584 -0.08159093
  0.22131329  0.79911497  0.66546261 -0.72941953]


In [9]:
coef_dict = dict(zip(X.columns, abs(lr.coef_[0])))

{k: v for i, (k, v) in enumerate(coef_dict.items()) if i < 10}

{'abdominalextensiondepthsitting': 0.04742923615806754,
 'acromialheight': 0.09439898904685105,
 'acromionradialelength': 0.13213384495868946,
 'anklecircumference': 0.2782479457516087,
 'axillaheight': 0.08765583853722264,
 'balloffootcircumference': 0.08159092507245104,
 'balloffootlength': 0.22131329174333864,
 'biacromialbreadth': 0.7991149695007576,
 'bicepscircumferenceflexed': 0.6654626133665815,
 'bicristalbreadth': 0.7294195347814207}

In [10]:
low_coef = {k: v for k, v in coef_dict.items() if v < .401}

cols = [k for k, v in low_coef.items()]

In [11]:
X.drop(cols, axis=1, inplace=True)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

lr.fit(scaler.fit_transform(X_train), y_train)

print(accuracy_score(y_test, lr.predict(scaler.transform(X_test))))

1.0


In [45]:
non_numeric = ['BMI_class', 'Height_class', 'Gender', 'Component', 'Branch']

X = ansur_df.drop(non_numeric, axis=1)
y = ansur_df['Gender']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

scaler = StandardScaler()
X_train_std = scaler.fit_transform(X_train)
X_test_std = scaler.transform(X_test)

In [46]:
from sklearn.feature_selection import RFE

rfe = RFE(estimator=LogisticRegression(), n_features_to_select=5, verbose=0)
rfe.fit(X_train_std, y_train)

RFE(estimator=LogisticRegression(C=1.0, class_weight=None, dual=False,
                                 fit_intercept=True, intercept_scaling=1,
                                 l1_ratio=None, max_iter=100,
                                 multi_class='auto', n_jobs=None, penalty='l2',
                                 random_state=None, solver='lbfgs', tol=0.0001,
                                 verbose=0, warm_start=False),
    n_features_to_select=5, step=1, verbose=0)

In [47]:
X.columns[rfe.support_]

Index(['chestheight', 'hipbreadthsitting', 'neckcircumference',
       'shouldercircumference', 'tenthribheight'],
      dtype='object')

In [48]:
#print(dict(zip(X.columns, rfe.ranking_)))

In [49]:
print(accuracy_score(y_test, rfe.predict(X_test_std)))

0.9961559582646897


In [50]:
df = pd.read_csv('data/PimaIndians.csv')

X = df.drop('test', axis=1)
y = df['test']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

scaler = StandardScaler()
lr = LogisticRegression()

In [53]:
# Fit the scaler on the training features and transform these in one go
X_train_std = scaler.fit_transform(X_train)

# Fit the logistic regression model on the scaled training data
lr.fit(X_train_std, y_train)

# Scale the test features
X_test_std = scaler.transform(X_test)

# Predict diabetes presence on the scaled test set
y_pred = lr.predict(X_test_std)

# Prints accuracy metrics and feature coefficients
print("{0:.1%} accuracy on test set.".format(accuracy_score(y_test, y_pred))) 
print(dict(zip(X.columns, abs(lr.coef_[0]).round(2))))

76.3% accuracy on test set.
{'pregnant': 0.37, 'glucose': 1.08, 'diastolic': 0.08, 'triceps': 0.15, 'insulin': 0.1, 'bmi': 0.37, 'family': 0.32, 'age': 0.45}


In [54]:
# Remove the feature with the lowest model coefficient
X = df[['pregnant', 'glucose', 'diastolic', 'triceps', 'insulin', 'bmi', 'family', 'age']]

# Performs a 25-75% train test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)

# Scales features and fits the logistic regression model
lr.fit(scaler.fit_transform(X_train), y_train)

# Calculates the accuracy on the test set and prints coefficients
acc = accuracy_score(y_test, lr.predict(scaler.transform(X_test)))
print("{0:.1%} accuracy on test set.".format(acc)) 
print(dict(zip(X.columns, abs(lr.coef_[0]).round(2))))

79.6% accuracy on test set.
{'pregnant': 0.05, 'glucose': 1.23, 'diastolic': 0.03, 'triceps': 0.24, 'insulin': 0.19, 'bmi': 0.38, 'family': 0.35, 'age': 0.34}


In [59]:
df = pd.read_csv('data/PimaIndians.csv')

X = df.drop('test', axis=1)
y = df['test']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

X_train_std = scaler.fit_transform(X_train)
X_test_std = scaler.transform(X_test)

# Create the RFE with a LogisticRegression estimator and 3 features to select
rfe = RFE(estimator=LogisticRegression(), n_features_to_select=3, verbose=1)

# Fits the eliminator to the data
rfe.fit(X_train_std, y_train)

# Print the features and their ranking (high = dropped early on)
print(dict(zip(X.columns, rfe.ranking_)))

# Print the features that are not eliminated
print(X.columns[rfe.support_])

# Calculates the test set accuracy
acc = accuracy_score(y_test, rfe.predict(X_test_std))
print("{0:.1%} accuracy on test set.".format(acc)) 

Fitting estimator with 8 features.
Fitting estimator with 7 features.
Fitting estimator with 6 features.
Fitting estimator with 5 features.
Fitting estimator with 4 features.
{'pregnant': 3, 'glucose': 1, 'diastolic': 4, 'triceps': 6, 'insulin': 5, 'bmi': 1, 'family': 2, 'age': 1}
Index(['glucose', 'bmi', 'age'], dtype='object')
79.7% accuracy on test set.


In [12]:
!../gitbsh

[master b92b782] 2020-02-04
 1 file changed, 26 insertions(+), 27 deletions(-)
Counting objects: 4, done.
Delta compression using up to 4 threads.
Compressing objects: 100% (4/4), done.
Writing objects: 100% (4/4), 831 bytes | 0 bytes/s, done.
Total 4 (delta 3), reused 0 (delta 0)
remote: Resolving deltas: 100% (3/3), completed with 3 local objects.[K
To git@github.com:MikSm1th/datacamp_notes.git
   a6bd6d2..b92b782  master -> master
Committed: 2020-02-04
