In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Notebook-wide plotting defaults: seaborn's dark grid theme and an 18x10 figure size.
sns.set_style('darkgrid')
mpl.rcParams.update({'figure.figsize': [18, 10]})

In [3]:
# Load the male and female ANSUR II tables and stack them row-wise into one frame.
df_m = pd.read_csv('data/ANSUR_II_MALE.csv')
df_f = pd.read_csv('data/ANSUR_II_FEMALE.csv')

# ignore_index=True gives the combined frame a fresh 0..n-1 index. Without it each
# CSV keeps its own default row labels, so the same labels appear in both halves
# and any label-based lookup or alignment on ansur_df becomes ambiguous.
ansur_df = pd.concat([df_m, df_f], axis=0, ignore_index=True)

In [4]:
from sklearn.model_selection import train_test_split

# Columns this notebook treats as non-numeric; they are removed from the feature
# matrix. 'Gender' is among them because it is used as the prediction target below.
non_numeric = ['BMI_class', 'Height_class', 'Gender', 'Component', 'Branch']

X = ansur_df.drop(columns=non_numeric)
y = ansur_df['Gender']

# Hold out 30% of the rows for testing. The fixed random_state pins the shuffle:
# without it every run draws a different split, so the accuracy reported further
# down could not be reproduced after a kernel restart.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [5]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training split only, then standardise that split.
scaler = StandardScaler().fit(X_train)
X_train_std = scaler.transform(X_train)

### Creating a logistic regression model

In [6]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# fit() returns the estimator itself, so construction and training are chained.
lr = LogisticRegression().fit(X_train_std, y_train)

# Scale the held-out rows with the already-fitted scaler (transform only, no refit).
X_test_std = scaler.transform(X_test)

In [7]:
# Predict on the standardised test rows and report the share of correct labels.
y_pred = lr.predict(X_test_std)
print(accuracy_score(y_true=y_test, y_pred=y_pred))

1.0


### Inspecting the feature coefficients 

In [8]:
# Peek at the first ten entries of the first coefficient row.
print(lr.coef_[0, :10])

[ 0.13070719  0.13000421 -0.13544264 -0.37586946 -0.02865747 -0.14524606
  0.23776275  0.68718176  0.68175807 -0.80815808]


In [9]:
# Pair every feature name with the magnitude of its coefficient (sign discarded).
coef_dict = dict(zip(X.columns, np.abs(lr.coef_[0])))

# Preview the first ten entries, in column order.
dict(list(coef_dict.items())[:10])

{'abdominalextensiondepthsitting': 0.1307071903280662,
 'acromialheight': 0.13000420973242838,
 'acromionradialelength': 0.13544263796148184,
 'anklecircumference': 0.37586946479712596,
 'axillaheight': 0.02865747049994805,
 'balloffootcircumference': 0.14524605720370898,
 'balloffootlength': 0.2377627528859949,
 'biacromialbreadth': 0.6871817575248153,
 'bicepscircumferenceflexed': 0.6817580731621957,
 'bicristalbreadth': 0.8081580804251913}

In [10]:
# Features whose absolute coefficient falls below the .401 cut-off.
low_coef = {
    feature: weight
    for feature, weight in coef_dict.items()
    if weight < .401
}

# Just the names of those features, in the same order.
cols = list(low_coef)

In [11]:
# Remove the low-coefficient features. Rebinding X instead of dropping in place,
# with errors='ignore', keeps this cell re-runnable: the in-place version raised a
# KeyError on a second run because the columns were already gone.
X = X.drop(columns=cols, errors='ignore')

# Re-split on the reduced feature set. The fixed random_state makes the split, and
# therefore the printed accuracy, reproducible from run to run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Refit the existing scaler and model on the new training split; this overwrites
# what they learned from the full feature set.
lr.fit(scaler.fit_transform(X_train), y_train)

# Accuracy on the held-out rows, scaled with the freshly fitted scaler.
print(accuracy_score(y_test, lr.predict(scaler.transform(X_test))))

1.0


In [12]:
# Housekeeping, not analysis: runs the `gitbsh` helper from the parent directory.
# Per the recorded output it commits the working tree and pushes it to the remote,
# so re-running the notebook end to end creates a new commit each time.
!../gitbsh

[master a6bd6d2] 2020-02-04
 1 file changed, 28 insertions(+)
Counting objects: 4, done.
Delta compression using up to 4 threads.
Compressing objects: 100% (4/4), done.
Writing objects: 100% (4/4), 801 bytes | 0 bytes/s, done.
Total 4 (delta 3), reused 0 (delta 0)
remote: Resolving deltas: 100% (3/3), completed with 3 local objects.
To git@github.com:MikSm1th/datacamp_notes.git
   7363c0e..a6bd6d2  master -> master
Committed: 2020-02-04
