In [1]:
from sklearn.datasets import load_breast_cancer # The data set used
from sklearn.preprocessing import StandardScaler # Normalizes inputs
from sklearn.preprocessing import PolynomialFeatures # Combine features of observable domain and construct new domain
from sklearn.ensemble import RandomForestClassifier # Creates an aggregation of decision trees
from sklearn.feature_selection import RFECV # Recursive Feature Elimination with Cross Validation (eliminate feature noise)
from sklearn.model_selection import train_test_split # Splits test data and training data to avoid overfitting

In [2]:
# Fetch the Wisconsin breast-cancer dataset bundled with scikit-learn.
cancer = load_breast_cancer()

# Standardize every feature column (zero mean, unit variance): learn the
# per-column statistics first, then apply them to the same matrix.
x_scaled = StandardScaler().fit(cancer.data).transform(cancer.data)

print("Original data (rows, features):", x_scaled.shape)

Original data (rows, features): (569, 30)


In [3]:
%%time
poly = PolynomialFeatures(2) # Take 30 original features and create synthetic features
x_poly = poly.fit_transform(x_scaled)

print("All polynomial featues (order 2):", x_poly.shape)

All polynomial featues (order 2): (569, 496)
Wall time: 6.95 ms


In [4]:
%%time
# Create a Random Forest with a maximum depth of 7, 10 trees, 1 random state for replication
rfc = RandomForestClassifier(max_depth = 7, n_estimators = 10, random_state = 1) 

# Create a random forest classifier using the rfc constructed earlier, split into 5 pieces to compare
rfecv = RFECV(estimator = rfc, cv = 5, n_jobs = -1) 
x_poly_top = rfecv.fit_transform(x_poly, cancer.target) # get top features

print("Best polynomial features: ", x_poly_top.shape)

Best polynomial features:  (569, 278)
Wall time: 34.8 s


In [5]:
%%time
# Split training and testing data from the trimmed dataset with a random_state of 42 for reproducibility
x_train, x_test, y_train, y_test = train_test_split(x_poly_top, cancer.target, random_state = 42)

# Create a Random Forest with a maximum depth of 7, 10 trees, 1 random state for replication
rfc = RandomForestClassifier(max_depth = 7, n_estimators = 10, random_state = 1)

# Train the model using training data and score it's accuracy using the test data
acc = rfc.fit(x_train, y_train).score(x_test, y_test)

print("Test accuracy: {:.0f}%".format(100. * acc))

Test accuracy: 94%
Wall time: 29.9 ms
