In [10]:
# Import required libraries
import pandas as pd
import numpy as np  
import matplotlib.pyplot as plt 
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score

In [3]:
# Open dataset
data = pd.read_csv('../data/data_train_scaled.csv')
data.columns

Index([u'Unnamed: 0', u'Unnamed: 0.1', u'radius_mean', u'texture_mean',
       u'perimeter_mean', u'area_mean', u'smoothness_mean',
       u'compactness_mean', u'concavity_mean', u'concave_points_mean',
       u'symmetry_mean', u'fractal_dimension_mean', u'radius_se',
       u'texture_se', u'perimeter_se', u'area_se', u'smoothness_se',
       u'compactness_se', u'concavity_se', u'concave_points_se',
       u'symmetry_se', u'fractal_dimension_se', u'radius_worst',
       u'texture_worst', u'perimeter_worst', u'area_worst',
       u'smoothness_worst', u'compactness_worst', u'concavity_worst',
       u'concave_points_worst', u'symmetry_worst', u'fractal_dimension_worst',
       u'diagnosis', u'id'],
      dtype='object')

In [4]:
data.shape

(455, 34)

In [5]:
# Split data into features and target
X_train = data.drop(['Unnamed: 0', 'Unnamed: 0.1', 'diagnosis', 'id'], axis=1)
y_train = data['diagnosis']
X_train.shape

(455, 30)

In [13]:
'''
Search for the best parameters (max_features = # features to consider at each branch, n_estimators = # trees in the forest) 
using GridsearchCV
'''

parameters = {'max_features':range(3,30), 'n_estimators':range(5,20)}
clf = GridSearchCV(RandomForestClassifier(), parameters, n_jobs=4)
clf.fit(X_train, y_train)
tree_model = clf.best_estimator_
print (clf.best_score_, clf.best_params_) 

(0.97142857142857142, {'max_features': 8, 'n_estimators': 13})


In [14]:
# Create forest with the best parameters
forest = RandomForestClassifier(max_features=8, n_estimators=13, criterion='entropy')
forest.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=None, max_features=8, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=13, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)

In [21]:
# Open testing dataset
data_test = pd.read_csv('../data/data_test_scaled.csv')
data_test.columns

Index([u'radius_mean', u'texture_mean', u'perimeter_mean', u'area_mean',
       u'smoothness_mean', u'compactness_mean', u'concavity_mean',
       u'concave_points_mean', u'symmetry_mean', u'fractal_dimension_mean',
       u'radius_se', u'texture_se', u'perimeter_se', u'area_se',
       u'smoothness_se', u'compactness_se', u'concavity_se',
       u'concave_points_se', u'symmetry_se', u'fractal_dimension_se',
       u'radius_worst', u'texture_worst', u'perimeter_worst', u'area_worst',
       u'smoothness_worst', u'compactness_worst', u'concavity_worst',
       u'concave_points_worst', u'symmetry_worst', u'fractal_dimension_worst',
       u'diagnosis'],
      dtype='object')

In [22]:
data_test.shape

(114, 31)

In [24]:
# Split testing data into features and target
X_test = data_test.drop(['diagnosis'], axis=1)
y_test = data_test['diagnosis']
X_test.shape

(114, 30)

In [25]:
# Test the accuracy of the random forest model on the test set.
y_predicted = forest.predict(X_test)
accuracy_score(y_test, y_predicted)

0.95614035087719296