In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Global plot styling: seaborn 'darkgrid' theme and a default matplotlib
# figure size of 18 x 10 (matplotlib measures figsize in inches).
sns.set_style('darkgrid')
mpl.rcParams['figure.figsize'] = [18,10]

In [3]:
# Label-like (non-measurement) columns; passed to load_ansur as the columns to drop.
non_numeric = ['BMI_class', 'Height_class', 'Gender', 'Component', 'Branch']
# The same columns plus 'BMI' -- presumably the drop list for the regression
# case, where BMI is the target (TODO confirm; not used in this notebook).
regression = non_numeric + ['BMI']

def load_ansur(cols_to_drop, test_size, model_type='class', random_state=None):
    """Load the ANSUR II male and female CSV files and split them for modelling.

    Both files are read from ``data/`` and stacked row-wise (male rows first).

    Parameters
    ----------
    cols_to_drop : str or list of str
        Column label(s) removed from the feature frame ``X``.
    test_size : float or int
        Forwarded unchanged to ``train_test_split``.
    model_type : {'class', 'reg'}, default 'class'
        ``'class'`` uses the ``Gender`` column as target, ``'reg'`` uses ``BMI``.
    random_state : int or None, default None
        Seed forwarded to ``train_test_split``. ``None`` keeps the previous
        unseeded behaviour.

    Returns
    -------
    tuple
        ``(X, y, X_train, X_test, y_train, y_test)``.

    Raises
    ------
    ValueError
        If ``model_type`` is not ``'class'`` or ``'reg'``.
    """
    target_by_model_type = {'class': 'Gender', 'reg': 'BMI'}

    # Fail fast, before touching the disk. Previously an unknown model_type
    # only printed a message and then crashed on the return statement because
    # X, y and the splits were never assigned.
    if model_type not in target_by_model_type:
        raise ValueError(
            "model_type must be 'class' or 'reg', got {!r}".format(model_type)
        )

    df_m = pd.read_csv('data/ANSUR_II_MALE.csv')
    df_f = pd.read_csv('data/ANSUR_II_FEMALE.csv')
    # Row-wise stack; each file keeps its own row labels (no ignore_index).
    ansur_df = pd.concat([df_m, df_f], axis=0)

    X = ansur_df.drop(cols_to_drop, axis=1)
    y = ansur_df[target_by_model_type[model_type]]
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    return X, y, X_train, X_test, y_train, y_test

In [4]:
def load_pima(cols_to_drop, test_size=0.3, random_state=None):
    """Load the Pima Indians CSV and split it into train and test sets.

    Parameters
    ----------
    cols_to_drop : str or list of str
        Column label(s) removed from the feature frame ``X``.
    test_size : float or int, default 0.3
        Forwarded to ``train_test_split``; the default matches the value that
        used to be hard-coded here.
    random_state : int or None, default None
        Seed forwarded to ``train_test_split``. ``None`` keeps the previous
        unseeded behaviour.

    Returns
    -------
    tuple
        ``(X, y, X_train, X_test, y_train, y_test)`` where ``y`` is the
        ``'test'`` column of the file.
    """
    df = pd.read_csv('data/PimaIndians.csv')

    X = df.drop(cols_to_drop, axis=1)
    y = df['test']
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )
    return X, y, X_train, X_test, y_train, y_test

# Load the Pima data; the 'test' column is the target, so it is dropped from X.
X, y, X_train, X_test, y_train, y_test = load_pima('test')

# Feature extraction 
### Feature generation 

In [5]:
# Read the grocery sales data and preview its first three rows.
sales_df = pd.read_csv('data/grocery_sales.csv')
sales_df.head(3)

Unnamed: 0,storeID,product,quantity,revenue
0,A,Apples,1811,9300.6
1,A,Bananas,1003,3375.2
2,A,Oranges,1604,8528.5


In [6]:
# Derive the unit price: revenue divided element-wise by quantity sold
sales_df['price'] = sales_df['revenue'].div(sales_df['quantity'])

# The two source columns are now redundant, so leave them out of the reduced frame
reduced_df = sales_df.drop(columns=['quantity', 'revenue'])

print(reduced_df.head())

  storeID  product     price
0       A   Apples  5.135616
1       A  Bananas  3.365105
2       A  Oranges  5.317020
3       B   Apples  5.143417
4       B  Bananas  3.898517


In [7]:
# Load ANSUR without the label-like columns; y is the Gender column.
X, y, X_train, X_test, y_train, y_test = load_ansur(non_numeric, 0.3)
# Encode the target numerically: Male -> 1, Female -> 0.
y = y.map({"Male":1, "Female":0})
# assign() returns a new frame, so X itself stays free of the label column.
# (Previously `df = X` was only an alias and the 'gender' column leaked into X.)
df = X.assign(gender=y)

### Principal component analysis

In [8]:
from sklearn.preprocessing import StandardScaler

# Standardise every column of df (this includes the 'gender' column added to it).
scaler = StandardScaler()
std_df = scaler.fit_transform(df)

from sklearn.decomposition import PCA

# Fit PCA with default settings and print the transformed data
# (one row per sample, one column per principal component).
pca = PCA()
print(pca.fit_transform(std_df))

[[-3.51674371e+00  1.59374878e+00 -8.08230517e-01 ... -4.36363115e-02
   5.28128161e-02  1.52702201e-02]
 [ 8.30842190e-01  2.06436542e+00 -1.25006217e+00 ...  5.15906913e-02
  -2.67285268e-02  1.98695000e-02]
 [-5.71342970e+00 -1.17930607e+00 -1.09541044e+00 ... -7.09899034e-02
  -3.26406702e-03 -6.79468060e-02]
 ...
 [ 3.92273351e+00 -2.42152121e+00  2.42973003e+00 ...  1.08307643e-01
  -1.30813641e-01  1.82733652e-02]
 [ 1.08510311e+01 -1.41402737e+00  1.45659251e+00 ...  1.51538913e-01
   2.51262482e-02  6.86466609e-02]
 [ 7.75056165e+00  1.05226348e+00  3.32876020e-01 ...  5.31085541e-02
   2.37015728e-02  1.09083700e-02]]


In [9]:
# Display the column labels of the feature frame X.
X.columns

Index(['abdominalextensiondepthsitting', 'acromialheight',
       'acromionradialelength', 'anklecircumference', 'axillaheight',
       'balloffootcircumference', 'balloffootlength', 'biacromialbreadth',
       'bicepscircumferenceflexed', 'bicristalbreadth', 'bideltoidbreadth',
       'bimalleolarbreadth', 'bitragionchinarc', 'bitragionsubmandibulararc',
       'bizygomaticbreadth', 'buttockcircumference', 'buttockdepth',
       'buttockheight', 'buttockkneelength', 'buttockpopliteallength',
       'calfcircumference', 'cervicaleheight', 'chestbreadth',
       'chestcircumference', 'chestdepth', 'chestheight', 'crotchheight',
       'crotchlengthomphalion', 'crotchlengthposterioromphalion', 'earbreadth',
       'earlength', 'earprotrusion', 'elbowrestheight', 'eyeheightsitting',
       'footbreadthhorizontal', 'footlength', 'forearmcenterofgriplength',
       'forearmcircumferenceflexed', 'forearmforearmbreadth',
       'forearmhandlength', 'functionalleglength', 'handbreadth',
      

In [10]:
# Narrow X down to four columns for a small PCA example.
ansur_df = X[['stature_m', 'buttockheight', 'waistcircumference', 'shouldercircumference']]

In [None]:
# Pairwise scatter plots of the four selected features.
sns.pairplot(ansur_df)
plt.show()

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardise the selected features; fit_transform returns a NumPy array,
# so the column labels of ansur_df are not carried over to ansur_std.
scaler = StandardScaler()
ansur_std = scaler.fit_transform(ansur_df)

In [None]:
from sklearn.decomposition import PCA

# Fit PCA with default settings on the standardised features and project them.
pca = PCA()
pc = pca.fit_transform(ansur_std)

# Wrap the NumPy output in a DataFrame again. The labels 'PC 1', 'PC 2', ...
# are generated from the actual number of component columns, so this cell
# keeps working if the feature selection (and hence the column count) changes.
pc_df = pd.DataFrame(pc, columns=['PC {}'.format(i + 1) for i in range(pc.shape[1])])

In [None]:
# Pairwise scatter plots of the principal components, for comparison with
# the pairplot of the original features.
sns.pairplot(pc_df)
plt.show()

Notice how, in contrast to the input features, none of the principal components are correlated with one another.

In [None]:
# Replace ansur_df with a wider selection of 13 columns from X.
ansur_df = X[['stature_m', 'buttockheight', 'waistdepth', 'span', 'waistcircumference', 'shouldercircumference', 
              'footlength', 'handlength', 'functionalleglength', 'chestheight', 'chestcircumference',
              'cervicaleheight', 'sittingheight']]

In [None]:
# Standardise the 13 selected features
scaler = StandardScaler()
ansur_std = scaler.fit_transform(ansur_df)

# Fit PCA only (no transform); the fitted attributes are inspected afterwards
pca = PCA()
pca.fit(ansur_std)

In [None]:
# Fraction of the total variance explained by each principal component
print(pca.explained_variance_ratio_)

In [None]:
from sklearn.pipeline import Pipeline

# Chain scaling and PCA so both are applied with a single fit_transform call.
pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('reducer', PCA())
])
pc = pipe.fit_transform(ansur_df)

# Show only the first two component columns.
print(pc[:,:2])

In [None]:
# Reload dropping only 'span', so the class columns (Height_class, BMI_class)
# remain available in X.
# NOTE(review): `pc` was computed from the earlier load; this assumes both loads
# return rows in the same order (files are read and stacked identically) -- confirm.
X, y, X_train, X_test, y_train, y_test = load_ansur('span', 0.3)

In [None]:
# Pair the first two principal components with the two class labels,
# built in one go from a column -> values mapping.
ansur_categories = pd.DataFrame({
    'PC 1': pc[:, 0],
    'PC 2': pc[:, 1],
    'Height_class': X['Height_class'].values,
    'BMI_class': X['BMI_class'].values,
})

In [None]:
# First two principal components, coloured by height class.
sns.scatterplot(data=ansur_categories,
               x='PC 1', y='PC 2',
               hue='Height_class',
               alpha= 0.4)

plt.show()

In [None]:
# Same projection, coloured by BMI class.
sns.scatterplot(data=ansur_categories,
               x='PC 1', y='PC 2',
               hue='BMI_class',
               alpha= 0.4)

plt.show()

In [None]:
# Reload with the label-like columns dropped; y is Gender (default model_type='class').
X, y, X_train, X_test, y_train, y_test = load_ansur(non_numeric, 0.3)

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Scale -> keep 3 principal components -> random forest classifier.
# The forest is seeded (random_state=0, as in the later pipelines of this
# notebook) so that refitting on the same split gives the same model.
pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('reducer', PCA(n_components=3)),
    ('classifier', RandomForestClassifier(random_state=0))
])
pipe.fit(X_train, y_train)

In [None]:
# Show the pipeline's second step, i.e. the ('reducer', PCA) pair.
print(pipe.steps[1])

In [None]:
# Cumulative share of variance explained by the retained components.
pipe.steps[1][1].explained_variance_ratio_.cumsum()

In [None]:
# Score the fitted pipeline on the held-out test split.
print(pipe.score(X_test, y_test))

In [None]:
# Load the pokemon data, using the first CSV column as the index.
df = pd.read_csv('data/pokemon.csv', index_col=0)
# Select numeric columns via np.number rather than a hand-written dtype list,
# so this is the exact complement of the `exclude=[np.number]` selection used
# for the categorical frame later on (no column can fall between the two).
poke_df = df.select_dtypes(include=[np.number])

In [None]:
# Preview the numeric pokemon features.
poke_df.head(3)

In [None]:
# Two-step pipeline: standardise, then keep the first two principal components
pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('reducer', PCA(n_components=2)),
])

In [None]:
# Fit on the numeric features and keep the PCA component vectors
# (one row per component, one entry per input feature), rounded to 2 decimals.
pipe.fit(poke_df)
vectors = pipe.steps[1][1].components_.round(2)

In [None]:
# Print, per component, the weight of each input feature (feature name -> weight)
print('PC 1 effects = ' + str(dict(zip(poke_df.columns, vectors[0]))))
print('PC 2 effects = ' + str(dict(zip(poke_df.columns, vectors[1]))))

In [None]:
# Everything that is not numeric: the non-numeric columns of the pokemon data.
poke_cat_df = df.select_dtypes(exclude=[np.number])
poke_cat_df.head(3)

In [None]:
# Refit the pipeline on the numeric features and keep the projected data
# (two columns, since the reducer keeps n_components=2).
pc = pipe.fit_transform(poke_df)

In [None]:
# Add the 2 components to poke_cat_df.
# assign() builds a new frame instead of writing columns into the result of
# select_dtypes, which avoids pandas' chained-assignment pitfalls (the global
# warnings filter in this notebook would hide the corresponding warning).
poke_cat_df = poke_cat_df.assign(**{'PC 1': pc[:, 0], 'PC 2': pc[:, 1]})

In [None]:
# First two principal components, coloured by the 'Type 1' column.
sns.scatterplot(data=poke_cat_df, 
                x='PC 1', y='PC 2', hue='Type 1')
plt.show()

In [None]:
# Same projection, coloured by the 'Legendary' column.
sns.scatterplot(data=poke_cat_df, 
                x='PC 1', y='PC 2', hue='Legendary')
plt.show()

In [None]:
# Reload the pokemon data and list its columns.
# NOTE(review): unlike the earlier load there is no index_col=0 here, so the
# first CSV column stays a regular column; if it is numeric it will end up
# among the model features selected afterwards -- confirm this is intended.
df = pd.read_csv('data/pokemon.csv')
df.columns

In [None]:
# Features: all numeric columns; target: the 'Legendary' column.
X = df.select_dtypes(include=[np.number])
y = df['Legendary']

In [None]:
# 70/30 train/test split. The split is seeded so the accuracies reported for
# the 2- and 3-component pipelines are reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

In [None]:
# Pipeline: standardise -> PCA keeping 2 components -> seeded random forest
pipe = Pipeline([
        ('scaler', StandardScaler()),
        ('reducer', PCA(n_components=2)),
        ('classifier', RandomForestClassifier(random_state=0))])

In [None]:
# Fit all pipeline steps on the training split only
pipe.fit(X_train, y_train)

# Share of variance explained by each retained component
# (steps[1] is the 'reducer' step, [1] picks the estimator out of the pair)
print(pipe.steps[1][1].explained_variance_ratio_)

In [None]:
# Accuracy of the 2-component pipeline on the held-out test split
accuracy = pipe.score(X_test, y_test)

# Report it as a percentage with one decimal place
print('{0:.1%} test set accuracy'.format(accuracy))

In [None]:
# Same pipeline as before, but PCA now keeps 3 components instead of 2
pipe = Pipeline([
        ('scaler', StandardScaler()),
        ('reducer', PCA(n_components=3)),
        ('classifier', RandomForestClassifier(random_state=0))])

In [None]:
# Fit all pipeline steps on the training split only
pipe.fit(X_train, y_train)

# Share of variance explained by each of the 3 retained components
print(pipe.steps[1][1].explained_variance_ratio_)

In [None]:
# Accuracy of the 3-component pipeline on the same held-out test split
accuracy = pipe.score(X_test, y_test)

# Report it as a percentage with one decimal place
print('{0:.1%} test set accuracy'.format(accuracy))

In [None]:
# Shell escape: convert 4_Feature_extraction.ipynb to HTML with nbconvert.
!jupyter nbconvert --to html 4_Feature_extraction.ipynb

In [None]:
# Shell escape: run the ../gitbsh script with stdout and stderr discarded.
# NOTE(review): failures of this script are silenced as well; what it does is
# not visible from this notebook -- confirm before relying on it.
!../gitbsh > /dev/null 2>&1