In [1]:
# run this to shorten the data import from the files
path_data = '/home/nero/Documents/Estudos/DataCamp/Python/Dimensionality_Reduction_in_Python/datasets/'
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt

In [19]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score

In [17]:
data = pd.read_csv(path_data + 'PimaIndians.csv')

X = data.drop('test', axis = 1)
y = data['test']


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25)

In [20]:
scaler = StandardScaler()
lr = LogisticRegression()

In [21]:
# exercise 01

"""
Building a diabetes classifier

You'll be using the Pima Indians diabetes dataset to predict whether a person has diabetes using logistic regression. There are 8 features and one target in this dataset. The data has been split into a training and test set and pre-loaded for you as X_train, y_train, X_test, and y_test.

A StandardScaler() instance has been predefined as scaler and a LogisticRegression() one as lr.
"""

# Instructions

"""

    Fit the scaler on the training features and transform these features in one go.
    Fit the logistic regression model on the scaled training data.
    Scale the test features.
    Predict diabetes presence on the scaled test set.

"""

# solution

# Fit the scaler on the training features and transform these in one go
X_train_std = scaler.fit_transform(X_train)

# Fit the logistic regression model on the scaled training data
lr.fit(X_train_std, y_train)

# Scale the test features
X_test_std = scaler.transform(X_test)

# Predict diabetes presence on the scaled test set
y_pred = lr.predict(X_test_std)

# Prints accuracy metrics and feature coefficients
print(f"{accuracy_score(y_test, y_pred):.1%} accuracy on test set.")
print(dict(zip(X.columns, abs(lr.coef_[0]).round(2))))

#----------------------------------#

# Conclusion

"""
Great! We get almost 80% accuracy on the test set. Take a look at the differences in model coefficients for the different features.
"""

78.6% accuracy on test set.
{'pregnant': 0.23, 'glucose': 1.22, 'diastolic': 0.06, 'triceps': 0.2, 'insulin': 0.24, 'bmi': 0.48, 'family': 0.45, 'age': 0.43}


'\nGreat! We get almost 80% accuracy on the test set. Take a look at the differences in model coefficients for the different features.\n'

In [22]:
# exercise 02

"""
Manual Recursive Feature Elimination

Now that we've created a diabetes classifier, let's see if we can reduce the number of features without hurting the model accuracy too much.

On the second line of code the features are selected from the original DataFrame. Adjust this selection.

A StandardScaler() instance has been predefined as scaler and a LogisticRegression() one as lr.

All necessary functions and packages have been pre-loaded too.
"""

# Instructions

"""

    First, run the given code, then remove the feature with the lowest model coefficient from X.

    Run the code and remove 2 more features with the lowest model coefficients.

    Run the code and only keep the feature with the highest coefficient.

"""

# solution

# Remove the feature with the lowest model coefficient
X = data[['pregnant', 'glucose', 'triceps', 'insulin', 'bmi', 'family', 'age']]

# Performs a 25-75% train test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)

# Scales features and fits the logistic regression model
lr.fit(scaler.fit_transform(X_train), y_train)

# Calculates the accuracy on the test set and prints coefficients
acc = accuracy_score(y_test, lr.predict(scaler.transform(X_test)))
print(f"{acc:.1%} accuracy on test set.") 
print(dict(zip(X.columns, abs(lr.coef_[0]).round(2))))

#----------------------------------#

# Remove the 2 features with the lowest model coefficients
X = data[['glucose', 'triceps', 'bmi', 'family', 'age']]

# Performs a 25-75% train test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)

# Scales features and fits the logistic regression model
lr.fit(scaler.fit_transform(X_train), y_train)

# Calculates the accuracy on the test set and prints coefficients
acc = accuracy_score(y_test, lr.predict(scaler.transform(X_test)))
print(f"{acc:.1%} accuracy on test set.") 
print(dict(zip(X.columns, abs(lr.coef_[0]).round(2))))

#----------------------------------#

# Only keep the feature with the highest coefficient
X = data[['glucose']]

# Performs a 25-75% train test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)

# Scales features and fits the logistic regression model to the data
lr.fit(scaler.fit_transform(X_train), y_train)

# Calculates the accuracy on the test set and prints coefficients
acc = accuracy_score(y_test, lr.predict(scaler.transform(X_test)))
print(f"{acc:.1%} accuracy on test set.")  
print(dict(zip(X.columns, abs(lr.coef_[0]).round(2))))

#----------------------------------#

# Conclusion

"""
Interesting! Removing all but one feature only reduced the accuracy by a few percent.
"""

80.6% accuracy on test set.
{'pregnant': 0.05, 'glucose': 1.24, 'triceps': 0.24, 'insulin': 0.2, 'bmi': 0.39, 'family': 0.34, 'age': 0.35}
79.6% accuracy on test set.
{'glucose': 1.13, 'triceps': 0.25, 'bmi': 0.34, 'family': 0.34, 'age': 0.37}
75.5% accuracy on test set.
{'glucose': 1.28}


'\nInteresting! Removing all but one feature only reduced the accuracy by a few percent.\n'

In [23]:
from sklearn.feature_selection import RFE

In [25]:
data = pd.read_csv(path_data + 'PimaIndians.csv')

X = data.drop('test', axis = 1)
y = data['test']


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25)

In [27]:
# exercise 03

"""
Automatic Recursive Feature Elimination

Now let's automate this recursive process. Wrap a Recursive Feature Eliminator (RFE) around our logistic regression estimator and pass it the desired number of features.

All the necessary functions and packages have been pre-loaded and the features have been scaled for you.
"""

# Instructions

"""

    Create the RFE with a LogisticRegression() estimator and 3 features to select.
    Print the features and their ranking.
    Print the features that are not eliminated.

"""

# solution

# Create the RFE with a LogisticRegression estimator and 3 features to select
rfe = RFE(estimator=LogisticRegression(max_iter = 999), n_features_to_select=3, verbose=1)

# Fits the eliminator to the data
rfe.fit(X_train, y_train)

# Print the features and their ranking (high = dropped early on)
print(dict(zip(X.columns, rfe.ranking_)))

# Print the features that are not eliminated
print(X.columns[rfe.support_])

# Calculates the test set accuracy
acc = accuracy_score(y_test, rfe.predict(X_test))
print(f"{acc:.1%} accuracy on test set.") 

#----------------------------------#

# Conclusion

"""
Great! When we eliminate all but the 3 most relevant features we get a 80.6% accuracy on the test set.
"""

Fitting estimator with 8 features.
Fitting estimator with 7 features.
Fitting estimator with 6 features.
Fitting estimator with 5 features.
Fitting estimator with 4 features.
{'pregnant': 2, 'glucose': 3, 'diastolic': 5, 'triceps': 4, 'insulin': 6, 'bmi': 1, 'family': 1, 'age': 1}
Index(['bmi', 'family', 'age'], dtype='object')
71.4% accuracy on test set.


'\nGreat! When we eliminate all but the 3 most relevant features we get a 80.6% accuracy on the test set.\n'

In [28]:
from sklearn.ensemble import RandomForestClassifier

In [29]:
# exercise 04

"""
Building a random forest model

You'll again work on the Pima Indians dataset to predict whether an individual has diabetes. This time using a random forest classifier. You'll fit the model on the training data after performing the train-test split and consult the feature importance values.

The feature and target datasets have been pre-loaded for you as X and y. Same goes for the necessary packages and functions.
"""

# Instructions

"""

    Set a 25% test size to perform a 75%-25% train-test split.
    Fit the random forest classifier to the training data.
    Calculate the accuracy on the test set.
    Print the feature importances per feature.

"""

# solution

# Perform a 75% training and 25% test data split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)

# Fit the random forest model to the training data
rf = RandomForestClassifier(random_state=0)
rf.fit(X_train, y_train)

# Calculate the accuracy
acc = accuracy_score(y_test, rf.predict(X_test))

# Print the importances per feature
print(dict(zip(X.columns, rf.feature_importances_.round(2))))

# Print accuracy
print(f"{acc:.1%} accuracy on test set.") 

#----------------------------------#

# Conclusion

"""
Good job! The random forest model gets 78% accuracy on the test set and 'glucose' is the most important feature (0.21).
"""

{'pregnant': 0.07, 'glucose': 0.25, 'diastolic': 0.09, 'triceps': 0.09, 'insulin': 0.14, 'bmi': 0.12, 'family': 0.12, 'age': 0.13}
79.6% accuracy on test set.


"\nGood job! The random forest model gets 78% accuracy on the test set and 'glucose' is the most important feature (0.21).\n"

In [34]:
# exercise 05

"""
Random forest for feature selection

Now lets use the fitted random model to select the most important features from our input dataset X.

The trained model from the previous exercise has been pre-loaded for you as rf.
"""

# Instructions

"""

    Create a mask for features with an importance higher than 0.15.
---

    Sub-select the most important features by applying the mask to X.

"""

# solution

# Create a mask for features importances above the threshold
mask = rf.feature_importances_ > 0.12

# Apply the mask to the feature dataset X
reduced_X = X.loc[:, mask]

# prints out the selected column names
print(reduced_X.columns)

#----------------------------------#

# Conclusion

"""
Well done! Only the features 'glucose' and 'age' were considered sufficiently important.
"""

Index(['glucose', 'insulin', 'age'], dtype='object')


"\nWell done! Only the features 'glucose' and 'age' were considered sufficiently important.\n"

In [36]:
# exercise 06

"""
Recursive Feature Elimination with random forests

You'll wrap a Recursive Feature Eliminator around a random forest model to remove features step by step. This method is more conservative compared to selecting features after applying a single importance threshold. Since dropping one feature can influence the relative importances of the others.

You'll need these pre-loaded datasets: X, X_train, y_train.

Functions and classes that have been pre-loaded for you are: RandomForestClassifier(), RFE(), train_test_split().
"""

# Instructions

"""

    Create a recursive feature eliminator that will select the 2 most important features using a random forest model.
---

    Fit the recursive feature eliminator to the training data.
---

    Create a mask using the fitted eliminator's support_ attribute, then apply it to the feature dataset X.
---

    Change the settings of RFE() to eliminate 2 features at each step.

"""

# solution

# Set the feature eliminator to remove 2 features on each step
rfe = RFE(estimator=RandomForestClassifier(), n_features_to_select=2, step=2, verbose=1)

# Fit the model to the training data
rfe.fit(X_train, y_train)

# Create a mask
mask = rfe.support_

# Apply the mask to the feature dataset X and print the result
reduced_X = X.loc[:, mask]
print(reduced_X.columns)

#----------------------------------#

# Conclusion

"""
Great! Compared to the quick and dirty single threshold method from the previous exercise one of the selected features is different.
"""

Fitting estimator with 8 features.
Fitting estimator with 6 features.
Fitting estimator with 4 features.
Index(['glucose', 'bmi'], dtype='object')


'\nGreat! Compared to the quick and dirty single threshold method from the previous exercise one of the selected features is different.\n'

In [39]:
ansur_df_1 = pd.read_csv(path_data + 'ANSUR_II_MALE.csv')
ansur_df_2 = pd.read_csv(path_data + 'ANSUR_II_FEMALE.csv')

ansur_df = pd.concat([ansur_df_1, ansur_df_2])
# Non-numerical columns in the dataset
non_numeric = ['Branch', 'Component', 'BMI_class', 'Height_class', 'Gender']

# Drop the non-numerical columns from df
ansur_df = ansur_df.drop(non_numeric, axis=1)

In [41]:
X = ansur_df.drop('BMI', axis = 1)
y = ansur_df['BMI']

In [42]:
from sklearn.linear_model import Lasso

In [43]:
# exercise 07

"""
Creating a LASSO regressor

You'll be working on the numeric ANSUR body measurements dataset to predict a persons Body Mass Index (BMI) using the pre-imported Lasso() regressor. BMI is a metric derived from body height and weight but those two features have been removed from the dataset to give the model a challenge.

You'll standardize the data first using the StandardScaler() that has been instantiated for you as scaler to make sure all coefficients face a comparable regularizing force trying to bring them down.

All necessary functions and classes plus the input datasets X and y have been pre-loaded.
"""

# Instructions

"""

    Set the test size to 30% to get a 70-30% train test split.
    Fit the scaler on the training features and transform these in one go.
    Create the Lasso model.
    Fit it to the scaled training data.

"""

# solution

# Set the test size to 30% to get a 70-30% train test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

# Fit the scaler on the training features and transform these in one go
X_train_std = scaler.fit_transform(X_train)

# Create the Lasso model
la = Lasso()

# Fit it to the standardized training data
la.fit(X_train_std, y_train)

#----------------------------------#

# Conclusion

"""
Good job! You've fitted the Lasso model to the standardized training data. Now let's look at the results!
"""

"\nGood job! You've fitted the Lasso model to the standardized training data. Now let's look at the results!\n"

In [44]:
# exercise 08

"""
Lasso model results

Now that you've trained the Lasso model, you'll score its predictive capacity (

) on the test set and count how many features are ignored because their coefficient is reduced to zero.

The X_test and y_test datasets have been pre-loaded for you.

The Lasso() model and StandardScaler() have been instantiated as la and scaler respectively and both were fitted to the training data.
"""

# Instructions

"""
Transform the test set with the pre-fitted scaler.
Calculate the R²
value on the scaled test data.
Create a list that has True values when coefficients equal 0.
Calculate the total number of features with a coefficient of 0.
"""

# solution

# Transform the test set with the pre-fitted scaler
X_test_std = scaler.transform(X_test)

# Calculate the coefficient of determination (R squared) on X_test_std
r_squared = la.score(X_test_std,y_test)
print(f"The model can predict {r_squared:.1%} of the variance in the test set.")

# Create a list that has True values when coefficients equal 0
zero_coef = la.coef_ == 0

# Calculate how many features have a zero coefficient
n_ignored = sum(zero_coef)
print(f"The model has ignored {n_ignored} out of {len(la.coef_)} features.")

#----------------------------------#

# Conclusion

"""
Good! We can predict almost 85% of the variance in the BMI value using just 9 out of 91 of the features. The R^2 could be higher though.
"""

The model can predict 82.9% of the variance in the test set.
The model has ignored 83 out of 93 features.


'\nGood! We can predict almost 85% of the variance in the BMI value using just 9 out of 91 of the features. The R^2 could be higher though.\n'

In [45]:
# exercise 09

"""
Adjusting the regularization strength

Your current Lasso model has an R² score of 84.7%. When a model applies overly powerful regularization it can suffer from high bias, hurting its predictive power.

Let's improve the balance between predictive power and model simplicity by tweaking the alpha parameter.
"""

# Instructions

"""
Find the highest value for alpha that gives an R² value above 98% from the options: 1, 0.5, 0.1, and 0.01.
"""

# solution

# Find the highest alpha value with R-squared above 98%
la = Lasso(alpha=0.1, random_state=0)

# Fits the model and calculates performance stats
la.fit(X_train_std, y_train)
r_squared = la.score(X_test_std, y_test)
n_ignored_features = sum(la.coef_ == 0)

# Print peformance stats 
print(f"The model can predict {r_squared:.1%} of the variance in the test set.")
print(f"{n_ignored_features} out of {len(la.coef_)} features were ignored.")

#----------------------------------#

# Conclusion

"""
Wow! With this more appropriate regularization strength we can predict 98% of the variance in the BMI value while ignoring 2/3 of the features.
"""

The model can predict 98.5% of the variance in the test set.
69 out of 93 features were ignored.


'\nWow! With this more appropriate regularization strength we can predict 98% of the variance in the BMI value while ignoring 2/3 of the features.\n'

In [47]:
# exercise 10

"""
Creating a LassoCV regressor

You'll be predicting biceps circumference on a subsample of the male ANSUR dataset using the LassoCV() regressor that automatically tunes the regularization strength (alpha value) using Cross-Validation.

The standardized training and test data has been pre-loaded for you as X_train, X_test, y_train, and y_test.
"""

# Instructions

"""
Create and fit the LassoCV model on the training set.
Calculate R² on the test set.
Create a mask for coefficients not equal to zero.
"""

# solution

from sklearn.linear_model import LassoCV

# Create and fit the LassoCV model on the training set
lcv = LassoCV(max_iter = 2000)
lcv.fit(X_train, y_train)
print(f'Optimal alpha = {lcv.alpha_:.3f}')

# Calculate R squared on the test set
r_squared = lcv.score(X_test, y_test)
print(f'The model explains {r_squared:.1%} of the test set variance')

# Create a mask for coefficients not equal to zero
lcv_mask = lcv.coef_ != 0
print(f'{sum(lcv_mask)} features out of {len(lcv_mask)} selected')

#----------------------------------#

# Conclusion

"""
Great! We got a decent R squared and removed 10 features. We'll save the lcv_mask for later on.
"""

Optimal alpha = 0.406
The model explains 98.9% of the test set variance
38 features out of 93 selected


"\nGreat! We got a decent R squared and removed 10 features. We'll save the lcv_mask for later on.\n"

In [48]:
# exercise 11

"""
Ensemble models for extra votes

The LassoCV() model selected 22 out of 32 features. Not bad, but not a spectacular dimensionality reduction either. Let's use two more models to select the 10 features they consider most important using the Recursive Feature Eliminator (RFE).

The standardized training and test data has been pre-loaded for you as X_train, X_test, y_train, and y_test.
"""

# Instructions

"""

    Select 10 features with RFE on a GradientBoostingRegressor and drop 3 features on each step.
---
    Calculate the R² on the test set.
---

    Assign the support array of the fitted model to gb_mask.
---

    Modify the first step to select 10 features with RFE on a RandomForestRegressor() and drop 3 features on each step.

"""

# solution

from sklearn.feature_selection import RFE
from sklearn.ensemble import GradientBoostingRegressor

# Select 10 features with RFE on a GradientBoostingRegressor, drop 3 features on each step
rfe_gb = RFE(estimator=GradientBoostingRegressor(), 
             n_features_to_select=10, step=10, verbose=1)
rfe_gb.fit(X_train, y_train)

# Calculate the R squared on the test set
r_squared = rfe_gb.score(X_test, y_test)
print(f'The model can explain {r_squared:.1%} of the variance in the test set')

# Assign the support array to gb_mask
gb_mask = rfe_gb.support_

#----------------------------------#

from sklearn.feature_selection import RFE
from sklearn.ensemble import RandomForestRegressor

# Select 10 features with RFE on a RandomForestRegressor, drop 3 features on each step
rfe_rf = RFE(estimator=RandomForestRegressor(), 
             n_features_to_select=10, step=10, verbose=1)
rfe_rf.fit(X_train, y_train)

# Calculate the R squared on the test set
r_squared = rfe_rf.score(X_test, y_test)
print(f'The model can explain {r_squared:.1%} of the variance in the test set')

# Assign the support array to rf_mask
rf_mask = rfe_rf.support_

#----------------------------------#

# Conclusion

"""
Good job! Including the Lasso linear model from the previous exercise, we now have the votes from 3 models on which features are important.
"""

Fitting estimator with 93 features.
Fitting estimator with 90 features.
Fitting estimator with 87 features.
Fitting estimator with 84 features.
Fitting estimator with 81 features.
Fitting estimator with 78 features.
Fitting estimator with 75 features.
Fitting estimator with 72 features.
Fitting estimator with 69 features.
Fitting estimator with 66 features.
Fitting estimator with 63 features.
Fitting estimator with 60 features.
Fitting estimator with 57 features.
Fitting estimator with 54 features.
Fitting estimator with 51 features.
Fitting estimator with 48 features.
Fitting estimator with 45 features.
Fitting estimator with 42 features.
Fitting estimator with 39 features.
Fitting estimator with 36 features.
Fitting estimator with 33 features.
Fitting estimator with 30 features.
Fitting estimator with 27 features.
Fitting estimator with 24 features.
Fitting estimator with 21 features.
Fitting estimator with 18 features.
Fitting estimator with 15 features.
Fitting estimator with 12 fe

'\nGood job! Including the Lasso linear model from the previous exercise, we now have the votes from 3 models on which features are important.\n'

In [50]:
from sklearn.linear_model import LinearRegression
lm = LinearRegression()

In [51]:
# exercise 12

"""
Combining 3 feature selectors

We'll combine the votes of the 3 models you built in the previous exercises, to decide which features are important into a meta mask. We'll then use this mask to reduce dimensionality and see how a simple linear regressor performs on the reduced dataset.

The per model votes have been pre-loaded as lcv_mask, rf_mask, and gb_mask and the feature and target datasets as X and y.
"""

# Instructions

"""

    Sum the votes of the three models using np.sum().
---

    Create a mask for features selected by all 3 models.
---

    Apply the dimensionality reduction on X and print which features were selected.
---

    Plug the reduced dataset into the code for simple linear regression that has been written for you.

"""

# solution

# Sum the votes of the three models
votes = np.sum([lcv_mask, rf_mask, gb_mask], axis=0)

# Create a mask for features selected by all 3 models
meta_mask = votes == 3

# Apply the dimensionality reduction on X
X_reduced = X.loc[:, meta_mask]

# Plug the reduced dataset into a linear regression pipeline
X_train, X_test, y_train, y_test = train_test_split(X_reduced, y, test_size=0.3, random_state=0)
lm.fit(scaler.fit_transform(X_train), y_train)
r_squared = lm.score(scaler.transform(X_test), y_test)
print(f'The model can explain {r_squared:.1%} of the variance in the test set using {len(lm.coef_)} features.')

#----------------------------------#

# Conclusion

"""
Awesome! Using the votes from 3 models you were able to select just 7 features that allowed a simple linear model to get a high accuracy!
"""

The model can explain 97.1% of the variance in the test set using 7 features.


'\nAwesome! Using the votes from 3 models you were able to select just 7 features that allowed a simple linear model to get a high accuracy!\n'