In [None]:

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.feature_selection import RFECV
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics #import plot_roc_curve



In [None]:

# Load the data set from the CSV file (path is relative to the notebook).
df = pd.read_csv("UCI_Breast_cancer.csv")

# DataFrame.info() prints its summary itself and returns None, so wrapping
# it in print() only appended a stray "None" line to the output.
df.info()



In [None]:

# Show the first 10 rows of the data.
# Left as the cell's last expression so the notebook renders it as a table
# instead of the plain-text dump produced by print().
df.head(10)



In [None]:

# Show the last 10 rows of the data.
# Left as the cell's last expression so the notebook renders it as a table
# instead of the plain-text dump produced by print().
df.tail(10)




<p>Compute descriptive statistics for the data.</p>


In [None]:

# Summary statistics of the data frame; as the cell's last expression the
# result is rendered as a table.
df.describe()



In [None]:

# Plot formatting

def plt_format():
    %matplotlib inline
    plt.rc('font', family='DejaVu Sans')
    plt.figure(figsize=(16,14))
    plt.rcParams['xtick.labelsize'] = 16
    plt.rcParams['ytick.labelsize'] = 16
    plt.rcParams['font.size'] = 16
    sns.set(style="ticks", color_codes=True)
    plt.rcParams['axes.labelcolor'] = 'black'
    plt.rcParams['axes.labelsize'] = 16
    plt.rcParams['axes.labelweight'] = 'bold'
    plt.rcParams['axes.titlesize'] = 32
    plt.rcParams['axes.titleweight'] = 'bold'
    plt.rcParams['text.color'] = 'black'
    plt.rcParams['xtick.labelsize'] = 16
    plt.rcParams['ytick.labelsize'] = 16
    plt.rcParams['legend.frameon'] = False
    plt.rcParams['axes.linewidth'] = 1
    
plt_format()



In [None]:

# Pairwise Pearson correlation between the first ten columns of the data,
# drawn as an annotated heatmap.

plt_format()
ax = plt.axes()

pearson_corr = df.iloc[:, 0:10].corr()
sns.heatmap(pearson_corr, annot=True, ax=ax)

ax.set_title('Heatmap of Pearson Correlation\n')
plt.axis('tight')
plt.show()



In [None]:

# Inputs: every row of the first 9 columns, as a NumPy array.
X = df.iloc[:, :9].values

# Target: the "Classification" column (labels), kept as a pandas Series.
y = df.loc[:, 'Classification']



In [None]:

# Hold out 30% of the samples as a test set and train on the remaining 70%.
# The fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)



In [None]:

# List the column labels of the loaded data frame.
df.columns



In [None]:

# Get a Series containing the counts of the unique values of 'Classification'.
# normalize=True turns the counts into relative frequencies.
df['Classification'].value_counts(normalize=True)




<p>This means that the data consists of 55% of class 2 and 45% of class 1.</p>


In [None]:

# Baseline random forest: 100 trees, fixed seed, trained on the training set.
#
# The configuration selected by the cross-validation experiment further down
# (n_estimators=2000, oob_score=True, everything else left at its default)
# could be used here instead.
rf_classifier = RandomForestClassifier(random_state=0, n_estimators=100).fit(
    X_train, y_train
)

# Display the fitted estimator as the cell output.
rf_classifier



In [None]:

# Evaluate the fitted forest on the held-out test data and report the score.
test_accuracy = rf_classifier.score(X_test, y_test)
print("Accuracy on the Test data:", test_accuracy)



In [None]:

# Predict the test labels explicitly and score the predictions with
# sklearn.metrics.
y_pred = rf_classifier.predict(X_test)
prediction_accuracy = metrics.accuracy_score(y_test, y_pred)
print("Accuracy:", prediction_accuracy)



In [None]:

# Importance of each input feature as reported by the fitted forest, plus the
# standard deviation of the per-tree importances (used as error bars).
importances = rf_classifier.feature_importances_
per_tree_importances = np.array(
    [tree.feature_importances_ for tree in rf_classifier.estimators_]
)
std = per_tree_importances.std(axis=0)

# Feature positions ordered from most to least important.
indices = np.argsort(importances)[::-1]
n_features = X_train.shape[1]

# Print the feature ranking
print("Feature ranking:")
for rank, feature_idx in enumerate(indices[:n_features], start=1):
    print("{:d}. feature {:d} ({:f})".format(rank, feature_idx, importances[feature_idx]))

# Bar chart of the importances, most important feature first; the tick labels
# are the original column positions.
bar_positions = range(n_features)
plt.figure()
plt.title("Feature importances")
plt.bar(
    bar_positions,
    importances[indices],
    yerr=std[indices],
    color="r",
    align="center",
)
plt.xticks(bar_positions, indices)
plt.xlim([-1, n_features])
plt.show()




<p>Let's experiment with some parameters of the random forest.
We will try different values for <code>n_estimators</code> (the number of trees).</p>


In [None]:

# Candidate forest sizes (number of trees) for the grid search.
param_to_test = dict(n_estimators=[100, 500, 1000, 2000])

# The grid could be widened to cover max_features as well:
#   "sqrt" -> max_features = sqrt(n_features)
#   "log2" -> max_features = log2(n_features)
#   None   -> max_features = n_features

# Base estimator for the search; oob_score=True uses the out-of-bag samples
# to estimate the generalization accuracy.
rf_classifier2 = RandomForestClassifier(random_state=0, oob_score=True)



In [None]:

# Experiments with n-fold cross validation

# 10-fold cross validation over the candidate forest sizes, scored by
# accuracy; return_train_score=True also records the training scores.
grid10 = GridSearchCV(
    rf_classifier2,
    param_grid=param_to_test,
    cv=10,
    scoring='accuracy',
    return_train_score=True,
)

# Do the training
grid10.fit(X_train, y_train)

# Show the results as a table instead of dumping the raw cv_results_ dict.
# Only the parameter, mean/std and rank columns are kept; the per-split
# columns are still available in grid10.cv_results_.
pd.DataFrame(grid10.cv_results_).filter(regex=r'^(param_|mean_|std_|rank_)')



In [None]:

# Report the winning estimator of the cross-validation experiment together
# with its mean cross-validated score.
best_model = grid10.best_estimator_
best_score = grid10.best_score_

print('The best model:', best_model)
print('The best score: ', best_score)




<p>Let's run an experiment to determine how many features should be used.</p>


In [None]:

# Recursive feature elimination with cross-validation (RFECV), evaluated on
# stratified 10-fold splits.
# https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.StratifiedKFold.html
# https://scikit-learn.org/stable/auto_examples/feature_selection/plot_rfe_with_cross_validation.html#sphx-glr-auto-examples-feature-selection-plot-rfe-with-cross-validation-py

# Re-create the best model from the 10-fold cross validation experiment.
# Only the non-default settings are spelled out: the full parameter dump that
# was pasted here before included max_features='auto' and
# min_impurity_split=None, which current scikit-learn releases reject.
model = RandomForestClassifier(n_estimators=2000, oob_score=True, random_state=0)

# Create the RFE object and compute a cross-validated score, removing one
# feature per step. The "accuracy" scoring is the fraction of correct
# classifications.
rfecv = RFECV(
    estimator=model,
    step=1,
    cv=StratifiedKFold(n_splits=10),
    scoring='accuracy',
)
rfecv.fit(X_train, y_train)

print("Optimal number of features : %d" % rfecv.n_features_)

# Newer scikit-learn exposes the per-step scores as
# cv_results_["mean_test_score"]; older releases only have grid_scores_.
if hasattr(rfecv, "cv_results_"):
    cv_scores = rfecv.cv_results_["mean_test_score"]
else:
    cv_scores = rfecv.grid_scores_

# Plot number of features VS. cross-validation scores
plt.figure()
plt.xlabel("Number of features selected")
plt.ylabel("Cross validation score (accuracy)")
plt.plot(range(1, len(cv_scores) + 1), cv_scores)
plt.show()

