In [4]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [None]:
# Hold out 30% of the rows for testing. The fixed seed keeps the split, and
# every score computed from it, reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

In [3]:
# Standardizer instance shared by the scaling cells below.
scaler = StandardScaler()

In [None]:
# Fit the scaler on the training features and standardize them in the same call.
X_train_std = scaler.fit_transform(X_train)

In [None]:
# Standardize the test features with the statistics learned from the training
# split. Re-fitting the scaler here (fit_transform) would leak test-set
# information and put the train and test data on different scales.
X_test_std = scaler.transform(X_test)

In [5]:
# Logistic regression classifier created with its default hyperparameters.
lr = LogisticRegression()

In [None]:
# Train the classifier on the standardized training features.
lr.fit(X_train_std,y_train)

In [None]:
# predict() only accepts the feature matrix; passing y_test as a second
# positional argument raises a TypeError.
y_pred = lr.predict(X_test_std)

In [None]:
# Accuracy of the predictions in y_pred against the true test labels.
print(accuracy_score(y_test,y_pred))

In [None]:
# Raw fitted coefficients of the logistic regression model.
print(lr.coef_)

In [None]:
# Map every feature name to the magnitude of its fitted coefficient.
print({name: abs(weight) for name, weight in zip(X.columns, lr.coef_[0])})

In [None]:
# Drop the 'handlength' column without mutating the frame in place;
# errors="ignore" keeps the cell re-runnable once the column is already gone.
# NOTE(review): X_train / X_test were split from X before this drop, so the
# split has to be re-run for later column masks to line up with X.columns.
X = X.drop(columns="handlength", errors="ignore")

# Recursive Feature Elimination

In [6]:
from sklearn.feature_selection import RFE

In [7]:
# Recursive feature eliminator wrapped around a logistic regression model,
# configured to keep two features.
rfe = RFE(
    estimator=LogisticRegression(),
    n_features_to_select=2,
    verbose=1,
)

In [None]:
# Fit the eliminator on the standardized training data; the cell output is the
# training matrix reduced to the selected feature columns.
rfe.fit_transform(X_train_std, y_train)

In [None]:
# Names of the features RFE kept (support_ is used as a boolean mask).
# NOTE(review): assumes X still has exactly the columns RFE was fitted on - confirm.
X.columns[rfe.support_]

In [None]:
# Pair every feature name with its RFE ranking (1 = kept, higher = dropped
# earlier). zip() needs both sequences: indexing the columns with ranking_
# yielded 1-tuples, which dict() rejects with a ValueError.
print(dict(zip(X.columns, rfe.ranking_)))

In [None]:
# Score the RFE-selected model itself. y_pred still holds the predictions of
# the earlier full-feature model, so re-printing its accuracy says nothing
# about the effect of the feature elimination.
print(accuracy_score(y_test, rfe.predict(X_test_std)))

# Dataset used: Diabetes dataset

In [None]:
# Standardize the training features (fit and transform in a single call)
X_train_std = scaler.fit_transform(X=X_train)

# Standardize the test features with the already-fitted scaler
X_test_std = scaler.transform(X=X_test)

# Train the logistic regression model on the scaled training data
lr.fit(X=X_train_std, y=y_train)

# Predict diabetes presence for the scaled test set
y_pred = lr.predict(X=X_test_std)

# Report the test accuracy and the absolute coefficient of every feature
print("{0:.1%} accuracy on test set.".format(accuracy_score(y_true=y_test, y_pred=y_pred)))
print({name: weight for name, weight in zip(X.columns, abs(lr.coef_[0]).round(2))})

In [None]:
# Restrict the feature set to a five-column subset of the diabetes data
X = diabetes_df[['glucose', 'triceps', 'bmi', 'family', 'age']]

# Split into 75% training and 25% test data with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=0, test_size=0.25
)

# Fit the scaler on the training features, then fit the model on the scaled result
lr.fit(X=scaler.fit_transform(X_train), y=y_train)

# Accuracy on the test set, scaled with the training statistics
acc = accuracy_score(
    y_true=y_test,
    y_pred=lr.predict(scaler.transform(X_test)),
)
print("{0:.1%} accuracy on test set.".format(acc))

# Absolute coefficient per feature, rounded to two decimals
print({name: weight for name, weight in zip(X.columns, abs(lr.coef_[0]).round(2))})

In [None]:
# Standardize with statistics from the training split so the coefficient
# magnitudes RFE ranks on are comparable across features; on unscaled inputs
# the ranking depends on each feature's units and the solver may not converge.
rfe_scaler = StandardScaler()
X_train_rfe = rfe_scaler.fit_transform(X_train)
X_test_rfe = rfe_scaler.transform(X_test)

# Create the RFE with a LogisticRegression estimator and 3 features to select
rfe = RFE(estimator=LogisticRegression(), n_features_to_select=3, verbose=1)

# Fit the eliminator to the scaled training data
rfe.fit(X_train_rfe, y_train)

# Print the features and their ranking (high = dropped early on)
print(dict(zip(X.columns, rfe.ranking_)))

# Print the features that are not eliminated
print(X.columns[rfe.support_])

# Calculate the test set accuracy of the model fitted on the kept features
acc = accuracy_score(y_test, rfe.predict(X_test_rfe))
print("{0:.1%} accuracy on test set.".format(acc))

# Random Forest

In [1]:
# rf.feature_importances_
# is used to find the important features of a random forest classifier and
# can be thresholded into a boolean mask:
#
# mask = rf.feature_importances_ > 0.1
# print(mask)
#
# X_reduced = X.loc[:, mask]

# To reduce the features step by step instead of all at once, wrap the forest
# in RFE. RandomForestClassifier is imported here because no other visible
# cell imports it, so a fresh kernel would raise a NameError.
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFE

# Re-fitting the forest after every elimination needs a lot of computation;
# the step parameter removes several features per iteration to cut the number
# of refits. The fixed random_state keeps the selection reproducible.
rfe = RFE(
    estimator=RandomForestClassifier(random_state=0),
    n_features_to_select=6,
    step=10,
    verbose=1,
)

In [None]:
# Split the data: 75% for training, 25% held out for testing
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=0, test_size=0.25
)

# Train a seeded random forest on the training portion
rf = RandomForestClassifier(random_state=0)
rf.fit(X=X_train, y=y_train)

# Accuracy of the forest on the held-out data
acc = accuracy_score(y_true=y_test, y_pred=rf.predict(X_test))

# Show the importance of every feature, rounded to two decimals
print({name: score for name, score in zip(X.columns, rf.feature_importances_.round(2))})

# Show the accuracy
print("{0:.1%} accuracy on test set.".format(acc))

In [None]:
# Boolean mask that is True for every feature whose random forest importance
# exceeds the 0.15 threshold
mask = rf.feature_importances_ > 0.15

# Show the mask
print(mask)

In [None]:
# Boolean mask that is True for every feature whose random forest importance
# exceeds the 0.15 threshold
mask = rf.feature_importances_ > 0.15

# Keep only the columns of X selected by the mask
reduced_X = X.loc[:,mask]

# Show the names of the selected columns
print(reduced_X.columns)

In [None]:
"""Create a recursive feature eliminator 
that will select the 2 most important features using a random forest model.

You'll need these pre-loaded datasets: X, X_train, y_train.

Functions and classes that have been pre-loaded for you are: 
RandomForestClassifier(), RFE(), train_test_split()
"""

# Wrap the feature eliminator around the random forest model. The fixed seed
# makes the forest, and therefore the selected features, reproducible.
rfe = RFE(estimator=RandomForestClassifier(random_state=0), n_features_to_select=2, verbose=1)

In [None]:
# Wrap the feature eliminator around the random forest model. The fixed seed
# makes the forest, and therefore the selected features, reproducible.
rfe = RFE(estimator=RandomForestClassifier(random_state=0), n_features_to_select=2, verbose=1)

# Fit the eliminator to the training data
rfe.fit(X_train, y_train)

In [None]:
# Set the feature eliminator to remove 2 features on each step. The fixed seed
# makes the forest, and therefore the resulting mask, reproducible.
rfe = RFE(estimator=RandomForestClassifier(random_state=0), n_features_to_select=2, step=2, verbose=1)

# Fit the eliminator to the training data
rfe.fit(X_train, y_train)

# Boolean mask of the features that survived the elimination
mask = rfe.support_

# Apply the mask to the feature dataset X and print the kept column names
reduced_X = X.loc[:, mask]
print(reduced_X.columns)

# Playing with Linear Regression Features


In [None]:
from sklearn.linear_model import LinearRegression

# Ordinary least squares fit on the training split.
# NOTE(review): this rebinds `lr`, which earlier cells use for the
# LogisticRegression model.
lr = LinearRegression()
lr.fit(X_train, y_train)

# A notebook cell only displays its last expression, so the coefficients and
# the intercept are printed explicitly instead of being silently discarded.
print(lr.coef_)       # shows which features have a significant effect on the model
print(lr.intercept_)  # intercept value used by the model

lr.score(X_test, y_test)  # model score / R-squared value on the test split



In [None]:
from sklearn.linear_model import Lasso

# Lasso regression with the default regularization strength
la = Lasso()
la.fit(X_train, y_train)

# A notebook cell only displays its last expression, so the coefficients and
# the intercept are printed explicitly instead of being silently discarded.
print(la.coef_)       # shows which features have a significant effect on the model
print(la.intercept_)  # intercept value used by the model

la.score(X_test, y_test)  # model score on the test split

In [None]:
# alpha sets the regularization strength
la = Lasso(alpha=0.05)
la.fit(X_train, y_train)

# A notebook cell only displays its last expression, so the coefficients and
# the intercept are printed explicitly instead of being silently discarded.
print(la.coef_)       # shows which features have a significant effect on the model
print(la.intercept_)  # intercept value used by the model

la.score(X_test, y_test)  # model score on the test split

In [None]:
"""
Set the test size to 30% to get a 70-30% train test split.
Fit the scaler on the training features and transform these in one go.
Create the Lasso model.
Fit it to the scaled training data.


Lasso(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=False, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False)

"""

# 70-30% train test split (test_size=0.3) with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=0, test_size=0.3
)

# Fit the scaler on the training features and transform them in the same call
X_train_std = scaler.fit_transform(X=X_train)

# Create the Lasso model with its default settings
la = Lasso()

# Train it on the standardized training data
la.fit(X=X_train_std, y=y_train)


In [None]:
"""
Transform the test set with the pre-fitted scaler.
Calculate the R2 value on the scaled test data.
Create a list that has True values when coefficients equal 0.
Calculate the total number of features with a coefficient of 0.

The model can predict 84.7% of the variance in the test set.
The model has ignored 82 out of 91 features.

"""

# Scale the test set with the scaler that was fitted on the training data
X_test_std = scaler.transform(X=X_test)

# Coefficient of determination (R squared) of the Lasso model on X_test_std
r_squared = la.score(X=X_test_std, y=y_test)
print("The model can predict {0:.1%} of the variance in the test set.".format(r_squared))

# Boolean array that is True wherever a coefficient equals 0
zero_coef = (la.coef_ == 0)

# Count the features with a zero coefficient and report them against the total
n_ignored = sum(zero_coef)
print("The model has ignored {} out of {} features.".format(n_ignored, len(la.coef_)))

In [None]:
# Candidate alpha in the search for the highest value that keeps R-squared above 98%
la = Lasso(random_state=0, alpha=0.01)

# Train on the scaled training data
la.fit(X=X_train_std, y=y_train)

# Performance stats: R squared on the scaled test set and the number of zeroed coefficients
r_squared = la.score(X=X_test_std, y=y_test)
n_ignored_features = sum((la.coef_ == 0))

# Report the performance stats
print("The model can predict {0:.1%} of the variance in the test set.".format(r_squared))
print("{} out of {} features were ignored.".format(n_ignored_features, len(la.coef_)))

In [None]:
from sklearn.linear_model import LassoCV

# Cross-validated Lasso fitted on the training split
lcv = LassoCV()
lcv.fit(X=X_train, y=y_train)

# Regularization strength selected during fitting
print(lcv.alpha_)

# Boolean mask of the features that kept a non-zero coefficient
mask = (lcv.coef_ != 0)
print(mask)

In [2]:
import numpy as np

# Count, per feature, how many of the three models selected it.
# NOTE(review): lcv_mask, rf_mask and gb_mask are only created in later cells,
# so this cell raises a NameError when the notebook is run top to bottom.
votes = np.sum([lcv_mask, rf_mask, gb_mask], axis=0)
print(votes)

# To be conservative and keep as many features as possible, keep every feature
# with at least 1 vote; here at least 2 of the 3 models must agree.
# The threshold is applied to `votes` (`model_votes` was never defined).
mask = votes >= 2

reduced_X = X.loc[:, mask]

NameError: name 'lcv_mask' is not defined

In [None]:

"""You'll be predicting biceps circumference on a subsample of the male ANSUR dataset
using the LassoCV() regressor that automatically tunes the regularization strength (alpha value) 
using Cross-Validation.

The standardized training and test data has been pre-loaded for you as 
X_train, X_test, y_train, and y_test

Create and fit the LassoCV model on the training set.
Calculate R2 on the test set.
Create a mask for coefficients not equal to zero.

"""

from sklearn.linear_model import LassoCV

# Cross-validated Lasso regressor, trained on the training set
lcv = LassoCV()
lcv.fit(X=X_train, y=y_train)
print('Optimal alpha = {0:.3f}'.format(lcv.alpha_))

# R squared of the fitted model on the test set
r_squared = lcv.score(X=X_test, y=y_test)
print('The model explains {0:.1%} of the test set variance'.format(r_squared))

# Mask that is True for every feature with a non-zero coefficient
lcv_mask = (lcv.coef_ != 0)
print('{} features out of {} selected'.format(sum(lcv_mask), len(lcv_mask)))




In [None]:
"""
The LassoCV() model selected 26 out of 32 features. 
Not bad, but not a spectacular dimensionality reduction either. 
Let's use two more models to select the 10 features 
they consider most important using the Recursive Feature Eliminator (RFE).

Select 10 features with RFE on a GradientBoostingRegressor 
and drop 3 features on each step.


"""
from sklearn.feature_selection import RFE
from sklearn.ensemble import GradientBoostingRegressor

# Select 10 features with RFE on a GradientBoostingRegressor, dropping 3
# features on each step. The fixed seed keeps the resulting mask reproducible.
rfe_gb = RFE(estimator=GradientBoostingRegressor(random_state=0),
             n_features_to_select=10, step=3, verbose=1)
rfe_gb.fit(X_train, y_train)

# Calculate the R squared on the test set
r_squared = rfe_gb.score(X_test, y_test)
print('The model can explain {0:.1%} of the variance in the test set'.format(r_squared))

# Assign the support array to gb_mask
gb_mask = rfe_gb.support_

In [None]:
"""
The LassoCV() model selected 26 out of 32 features. 
Not bad, but not a spectacular dimensionality reduction either. 
Let's use two more models to select the 10 features 
they consider most important using the Recursive Feature Eliminator (RFE).




"""

from sklearn.feature_selection import RFE
from sklearn.ensemble import RandomForestRegressor

# Select 10 features with RFE on a RandomForestRegressor, dropping 3 features
# on each step. The fixed seed keeps the resulting mask reproducible.
rfe_rf = RFE(estimator=RandomForestRegressor(random_state=0),
             n_features_to_select=10, step=3, verbose=1)
rfe_rf.fit(X_train, y_train)

# Calculate the R squared on the test set
r_squared = rfe_rf.score(X_test, y_test)
print('The model can explain {0:.1%} of the variance in the test set'.format(r_squared))

# Assign the support array to rf_mask
rf_mask = rfe_rf.support_

In [None]:
"""
We'll combine the votes of the 3 models you built in the previous exercises,
to decide which features are important into a meta mask.
We'll then use this mask to reduce dimensionality and see how a simple linear regressor
performs on the reduced dataset.

The per model votes have been pre-loaded as lcv_mask, rf_mask, and gb_mask 
and the feature and target datasets as X and y


"""


# Sum the votes of the three models per feature. The masks are stacked as
# rows (one row per model), so the sum must run down the rows (axis=0);
# axis=1 would instead count how many features each model selected.
import numpy as np
votes = np.sum([lcv_mask, rf_mask, gb_mask], axis=0)
print(votes)

In [None]:
from sklearn.linear_model import LinearRegression

# Sum the votes of the three models (one vote per model and feature)
votes = np.sum([lcv_mask, rf_mask, gb_mask], axis=0)

# Create a mask for features selected by all 3 models
meta_mask = votes >= 3

# Apply the dimensionality reduction on X
X_reduced = X.loc[:, meta_mask]

# Plug the reduced dataset into a linear regression pipeline. `lm` is created
# here because no other visible cell defines it, so the fit below would raise
# a NameError on a fresh kernel.
X_train, X_test, y_train, y_test = train_test_split(X_reduced, y, test_size=0.3, random_state=0)
lm = LinearRegression()
lm.fit(scaler.fit_transform(X_train), y_train)
r_squared = lm.score(scaler.transform(X_test), y_test)
print('The model can explain {0:.1%} of the variance in the test set using {1:} features.'.format(r_squared, len(lm.coef_)))