In [1]:
# Feature extraction is the process of combining two, three or more features into a single feature such that it
# preserves the quality of all the original features, with little or no information lost.

# ex: 
# df['BMI'] = df['weight'] / df['height'] ** 2
# df = df.drop(['weight', 'height'], axis=1)

# PCA

##  Manual feature extraction I

In [None]:
"""You want to compare prices for specific products between stores. 
The features in the pre-loaded dataset sales_df are: storeID, product, quantity and revenue. 
The quantity and revenue features tell you 
how many items of a particular product were sold in a store and what the total revenue was.

Calculate the product price from the quantity sold and total revenue.
Drop the quantity and revenue features from the dataset.

"""

# Derive the per-item price: total revenue divided by the number of items sold
sales_df['price'] = sales_df['revenue'].div(sales_df['quantity'])

# price now combines quantity and revenue, so leave those two columns out
reduced_df = sales_df.drop(columns=['quantity', 'revenue'])

print(reduced_df.head())

## Manual feature extraction II

In [None]:
"""

You're working on a variant of the ANSUR dataset, height_df, where a person's height was measured 3 times. 
Add a feature with the mean height to the dataset and then drop the 3 original features.

"""
# Average the three repeated measurements row by row into a single height feature
repeated_measurements = ['height_1', 'height_2', 'height_3']
height_df['height'] = height_df[repeated_measurements].mean(axis='columns')

# Keep only the averaged feature; the individual measurements are dropped
reduced_df = height_df.drop(columns=repeated_measurements)

print(reduced_df.head())


In [None]:
from sklearn.decomposition import PCA
# Fit a PCA that keeps every component (fit returns the estimator itself).
# NOTE(review): std_df is presumably the already-standardized data; confirm.
pca = PCA().fit(std_df)

# Share of the total variance explained by each component
print(pca.explained_variance_ratio_)

# The cumulative explained variance can be checked with numpy's cumsum(),
# e.g. pca.explained_variance_ratio_.cumsum()



In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

# Pairplot of the raw features, for comparison with the principal components below
sns.pairplot(ansur_df)
plt.show()

# Standardize the features before applying PCA
scaler = StandardScaler()
ansur_std = scaler.fit_transform(ansur_df)

# Fit PCA with all components kept and project the standardized data onto them
pca = PCA()
pc = pca.fit_transform(ansur_std)

# Label one column per component actually returned instead of hard-coding four
# names, so the DataFrame construction does not fail when ansur_df has a
# different number of features.
pc_labels = ['PC {}'.format(i + 1) for i in range(pc.shape[1])]
pc_df = pd.DataFrame(pc, columns=pc_labels)

# Pairplot of the principal component dataframe
sns.pairplot(pc_df)
plt.show()

In [None]:
# PCA on a larger dataset

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

# Standardize every feature of ansur_df
scaler = StandardScaler()
ansur_std = scaler.fit_transform(ansur_df)

# Fit PCA on the standardized data (fit returns the estimator itself)
pca = PCA().fit(ansur_std)

# Report, one per line: the explained variance ratio per component,
# its cumulative sum, and the component vectors
variance_ratio = pca.explained_variance_ratio_
print(variance_ratio, variance_ratio.cumsum(), pca.components_, sep='\n')

In [None]:
# Pipeline




# Pipeline is used from here on but was never imported in this notebook,
# which raises a NameError on a fresh kernel.
from sklearn.pipeline import Pipeline

# Build the pipeline: standardize the features, then keep 2 principal components
pipe = Pipeline([('scaler', StandardScaler()),
                 ('reducer', PCA(n_components=2))])

# Fit it to the dataset and extract the component vectors (rounded to 2 decimals).
# The PCA step is looked up by name so this keeps working if steps are added
# or reordered.
pipe.fit(poke_df)
vectors = pipe.named_steps['reducer'].components_.round(2)

# Print feature effects: one {feature name: weight} mapping per component
for component_number, vector in enumerate(vectors, start=1):
    effects = dict(zip(poke_df.columns, vector))
    print('PC {} effects = '.format(component_number) + str(effects))


"""

pipe.steps[1][1]
PCA(copy=True, iterated_power='auto', n_components=2, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)
  

PC 1 effects = {'Sp. Def': 0.45, 'HP': 0.39, 'Attack': 0.44, 'Sp. Atk': 0.46, 'Speed': 0.34, 'Defense': 0.36}
PC 2 effects = {'Sp. Def': 0.24, 'HP': 0.08, 'Attack': -0.01, 'Sp. Atk': -0.31, 'Speed': -0.67, 'Defense': 0.63}


"""

In [None]:
# Two-step pipeline: standardize, then reduce to 2 principal components
pipeline_steps = [('scaler', StandardScaler()),
                  ('reducer', PCA(n_components=2))]
pipe = Pipeline(pipeline_steps)

# Fit on poke_df and project it onto the components in a single call
pc = pipe.fit_transform(poke_df)

print(pc)

# Store each component as its own column of poke_cat_df
poke_cat_df['PC 1'], poke_cat_df['PC 2'] = pc[:, 0], pc[:, 1]

# Scatter PC 1 against PC 2, colored by the Type feature
sns.scatterplot(x='PC 1', y='PC 2', hue='Type', data=poke_cat_df)
plt.show()



"""
Pipeline(memory=None,
     steps=[('scaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('reducer', PCA(copy=True, iterated_power='auto', n_components=2, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False))])

"""



In [None]:
# RandomForestClassifier was used without ever being imported in this notebook
from sklearn.ensemble import RandomForestClassifier

# Build the pipeline: standardize, reduce with PCA, then classify.
# NOTE(review): the exercise blank (`____`) was never filled in and raised a
# NameError. 2 components matches the other PCA pipelines in this notebook;
# TODO confirm the intended value.
pipe = Pipeline([
        ('scaler', StandardScaler()),
        ('reducer', PCA(n_components=2)),
        ('classifier', RandomForestClassifier(random_state=0))])

# Fit the pipeline to the training data
pipe.fit(X_train, y_train)

# Score the accuracy on the test set
accuracy = pipe.score(X_test, y_test)

# Print the explained variance ratio of the PCA step (looked up by name rather
# than by position) and the accuracy
print(pipe.named_steps['reducer'].explained_variance_ratio_)
print('{0:.1%} test set accuracy'.format(accuracy))

In [None]:
# Build the pipeline: standardize, reduce with PCA, then classify.
# NOTE(review): the exercise blank (`____`) was never filled in and raised a
# NameError. 3 components is a guess that gives the curve below more than two
# points; TODO confirm the intended value.
pipe = Pipeline([
        ('scaler', StandardScaler()),
        ('reducer', PCA(n_components=3)),
        ('classifier', RandomForestClassifier(random_state=0))])

# Fit the pipeline to the training data
pipe.fit(X_train, y_train)

# Explained variance ratio of each retained component, read from the PCA step
# (looked up by name rather than by position)
var = pipe.named_steps['reducer'].explained_variance_ratio_

# Plot the ratio against the component index
plt.plot(var)

plt.xlabel("Principal component Index")
plt.ylabel("Explained Variance Ratio")
plt.show()

In [None]:
# A fractional n_components asks PCA to keep enough components to explain
# 90% of the variance
pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('reducer', PCA(n_components=0.9)),
])

# Fit the pipe to the data
pipe.fit(ansur_df)

# components_ holds one row per retained component
n_selected = pipe.named_steps['reducer'].components_.shape[0]
print('{} components selected'.format(n_selected))

In [None]:
# Standardize the data, then keep 10 principal components
pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('reducer', PCA(n_components=10)),
])

# Fit the pipe to the data
pipe.fit(ansur_df)

# Plot the explained variance ratio of each of the retained components
variance_ratio = pipe.named_steps['reducer'].explained_variance_ratio_
plt.plot(variance_ratio)

plt.xlabel('Principal component index')
plt.ylabel('Explained variance ratio')
plt.show()

In [None]:
# NOTE(review): this uses whichever `pipe` is currently in the kernel; the most
# recent one defined in this notebook was fitted on ansur_df. Confirm it was
# fitted on data with the same features as X_test.
# Project the test data onto the principal components
pc = pipe.transform(X_test)

# Map the components back into the original feature space
X_rebuilt = pipe.inverse_transform(pc)

# Report how many features (columns) the reconstruction has
n_rebuilt_features = X_rebuilt.shape[1]
print("X_rebuilt has {} features".format(n_rebuilt_features))

In [None]:
# Project X_test onto the principal components of the fitted pipeline.
# NOTE(review): relies on a `pipe` already fitted in the kernel; confirm it was
# fitted on data with the same features as X_test.
pc = pipe.transform(X_test)

# Inverse transform the components back to the original feature space
X_rebuilt = pipe.inverse_transform(pc)

# Plot the reconstructed data.
# NOTE(review): plot_digits is not defined in the visible notebook; presumably
# a pre-loaded plotting helper. Verify it is available.
plot_digits(X_rebuilt)