In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns
import warnings
# Suppress every warning for the remainder of the kernel session.
# NOTE(review): this is a blanket filter, so any warning raised by later cells
# is hidden as well — consider restricting it to specific categories.
warnings.filterwarnings('ignore')

In [2]:
# Global plotting defaults applied to figures created after this cell.
sns.set_style('darkgrid')  # seaborn "darkgrid" theme
mpl.rcParams.update({'figure.figsize': [18, 10]})  # default figure size

In [3]:
# Load the two ANSUR II CSV files and stack them row-wise into one frame.
df_m = pd.read_csv('data/ANSUR_II_MALE.csv')
df_f = pd.read_csv('data/ANSUR_II_FEMALE.csv')

# ignore_index=True rebuilds a unique 0..n-1 index. Without it both source
# frames keep their own 0-based labels, so the combined frame would contain
# duplicated index values and label-based lookups would be ambiguous.
ansur_df = pd.concat([df_m, df_f], axis=0, ignore_index=True)

In [4]:
# train_test_split is already imported in the notebook's first cell, so the
# import is not repeated here.

# Columns listed as non-numeric; they are removed from the feature matrix.
non_numeric = ['BMI_class', 'Height_class', 'Gender', 'Component', 'Branch']

X = ansur_df.drop(columns=non_numeric)  # feature matrix
y = ansur_df['Gender']                  # classification target

# Hold out 30% of the rows for evaluation. A fixed random_state makes the
# partition, and every score derived from it, reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

In [5]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training split only, then standardise that same split.
scaler = StandardScaler()
X_train_std = scaler.fit(X_train).transform(X_train)

### Creating a logistic regression model

In [6]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Standardise the held-out rows with the already-fitted scaler (no refit).
X_test_std = scaler.transform(X_test)

# Logistic regression with default settings, trained on the standardised
# training features.
lr = LogisticRegression().fit(X_train_std, y_train)

In [7]:
# Predict the held-out rows and report the fraction classified correctly.
y_pred = lr.predict(X_test_std)
test_accuracy = accuracy_score(y_test, y_pred)
print(test_accuracy)

0.99945085118067


### Inspecting the feature coefficients 

In [8]:
# Show the first ten fitted coefficients of the model's first coefficient row.
print(lr.coef_[0, :10])

[ 0.15981007  0.12013785  0.02855193 -0.26125167 -0.06859919 -0.04579309
  0.27280256  0.7415875   0.7315371  -0.75536158]


In [9]:
# Map each feature column name to the absolute value of its coefficient.
coef_dict = {
    feature: magnitude
    for feature, magnitude in zip(X.columns, np.abs(lr.coef_[0]))
}

# Preview the first ten entries (in column order) as the cell output.
dict(list(coef_dict.items())[:10])

{'abdominalextensiondepthsitting': 0.15981007109768466,
 'acromialheight': 0.12013784547633599,
 'acromionradialelength': 0.02855192888666946,
 'anklecircumference': 0.2612516724723182,
 'axillaheight': 0.06859919360637151,
 'balloffootcircumference': 0.04579308826386677,
 'balloffootlength': 0.27280256408800196,
 'biacromialbreadth': 0.7415875017798398,
 'bicepscircumferenceflexed': 0.7315371039033777,
 'bicristalbreadth': 0.7553615792908718}

In [20]:
# Keep only the features whose absolute coefficient is below the 0.401 cut-off.
low_coef = {
    feature: magnitude
    for feature, magnitude in coef_dict.items()
    if magnitude < .401
}

# Names of those low-coefficient features, in the same order.
cols = list(low_coef)

In [21]:
# Remove the low-coefficient features from the feature matrix.
# X is rebound instead of mutated in place, and labels that are already gone
# are ignored, so re-running this cell no longer raises a KeyError.
X = X.drop(columns=cols, errors='ignore')

# Re-split the reduced feature matrix. A fixed random_state makes the
# partition, and therefore the reported accuracy, reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Refit the scaler and the classifier on the reduced training features.
lr.fit(scaler.fit_transform(X_train), y_train)

# Accuracy on the held-out rows, scaled with the freshly fitted scaler.
print(accuracy_score(y_test, lr.predict(scaler.transform(X_test))))

0.99945085118067
