Importing all needed libraries

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.utils import resample
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier

PART A. Creating the pandas dataframe

In [2]:
# Load the cancer dataset from the CSV that sits next to the notebook and
# preview its first rows.
CSV_PATH = "Cancer.csv"
df = pd.read_csv(CSV_PATH)
df.head()

Unnamed: 0,Clump_Thickness,Uniformity_of_Cell_Size,Uniformity_of_Cell_Shape,Marginal_Adhesion,Single_Epithelial_Cell_Size,Bare_Nuclei,Bland_Chromatin,Normal_Nucleoli,Mitoses,Malignant_Cancer
0,5,1,1,1,2,1,3,1,1,0
1,5,4,4,5,7,10,3,2,1,0
2,3,1,1,1,2,2,3,1,1,0
3,6,8,8,1,3,4,3,7,1,0
4,4,1,1,3,2,1,3,1,1,0


PART B. Splitting the data into training and testing sets

In [3]:
# Columns used as model inputs; "Malignant_Cancer" is the label to predict.
features = [
    "Clump_Thickness",
    "Uniformity_of_Cell_Size",
    "Uniformity_of_Cell_Shape",
    "Marginal_Adhesion",
    "Single_Epithelial_Cell_Size",
    "Bare_Nuclei",
    "Bland_Chromatin",
    "Normal_Nucleoli",
    "Mitoses",
]
X = df[features]
y = df["Malignant_Cancer"]

# Hold out 35% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.35,
    random_state=3,
)

PART C. Predicting with a Decision Tree Classifier and printing its accuracy

In [4]:
# Build a single decision tree (seeded for repeatability) and fit it on the
# training split; fit() returns the estimator itself, so the call can be chained.
my_DecisionTree = DecisionTreeClassifier(random_state=3).fit(X_train, y_train)

# Predict the held-out labels and measure the fraction that match y_test.
y_predict_dt = my_DecisionTree.predict(X_test)
dt_score = accuracy_score(y_test, y_predict_dt)
print(f"Accuracy Score: {dt_score!s}")

Accuracy Score: 0.8301886792452831


PART D. Use Bagging and Voting to make a prediction based on 19 decision trees

In [49]:
# Glue the labels back onto the training features so that every bootstrap draw
# keeps each row together with its own label.
training = pd.concat([X_train, y_train], axis=1)

# Each bootstrap sample has 80% as many rows as there are non-null
# 'Clump_Thickness' values in the training features.
bootstrap_size = int(0.8 * X_train['Clump_Thickness'].count())

# One array of test-set predictions per bagged tree ends up in this list.
prediction_results = []

# Train 19 trees, each on its own resampled copy of the training data. The loop
# index doubles as the resampling seed, so every tree sees a different sample.
for seed in range(19):
    sample = resample(training, n_samples=bootstrap_size, random_state=seed)
    tree = DecisionTreeClassifier(random_state=3)
    tree.fit(sample[features], sample['Malignant_Cancer'])
    prediction_results.append(tree.predict(X_test))


Result of Voting: 0.9056603773584906


In [71]:
#Function to add the rows together
def getMaj(row):
    """Return the number of positive (1) votes cast for one test sample.

    Reads the module-level ``predict_df`` and sums, for the row at position
    ``row``, every column whose name starts with ``'Prediction_'``. The number
    of voters is taken from the frame itself rather than assumed to be 19, so
    the function keeps working when the ensemble size changes and ignores any
    non-prediction column (e.g. ``'Majority'``) already present in the frame.

    Parameters
    ----------
    row : int
        Zero-based position of the sample in ``predict_df``.

    Returns
    -------
    pandas.Series
        A one-element Series holding the vote total for that row (empty when
        ``row`` is past the end of the frame), same shape the callers already
        expect.
    """
    vote_columns = [
        name for name in predict_df.columns
        if str(name).startswith('Prediction_')
    ]
    # iloc keeps the slice explicitly positional; summing across axis=1 adds
    # the individual tree votes for that single row.
    return predict_df[vote_columns].iloc[row:row + 1].sum(axis=1)

In [85]:
#Creating a new data frame containing the results of each prediction:
#one 'Prediction_<i>' column per bagged tree, one row per test sample.
#Building it from a dict in one go avoids growing the frame column by column.
predict_df = pd.DataFrame(
    {'Prediction_' + str(i): preds for i, preds in enumerate(prediction_results)}
)

rows = len(predict_df)          #number of test samples that were predicted
col = len(predict_df.columns)   #number of voting trees

#Every tree must have voted on every test sample, otherwise the accuracy
#computed below would compare misaligned rows.
assert rows == len(y_test), "prediction rows do not match the size of y_test"

#Obtaining the majority for each row: the labels are 0/1, so the row sum is the
#number of trees voting 1; a sample is labelled 1 when more than half agree.
#The sizes come from the data itself instead of being hard-coded.
votes_for_positive = predict_df.sum(axis=1)

#List to keep track of the majority value of each row
majority_res = (votes_for_positive > col / 2).astype(int).tolist()

#Adding the column in the prediction dataframe with all the majorities
predict_df['Majority'] = majority_res

#Calculating the score using the Majority obtained from voting
maj_score = accuracy_score(y_test, predict_df['Majority'])
print('Bagging/Voting Accuracy: ' + str(maj_score))

Bagging/Voting Accuracy: 0.9056603773584906


PART E. Use the Random Forest Classifier to make the prediction and print the accuracy

In [6]:
# Random forest with the same number of trees (19) as the hand-rolled bagging
# ensemble, bootstrapping enabled and a fixed seed for repeatable results.
my_RandomForest = RandomForestClassifier(
    n_estimators=19,
    bootstrap=True,
    random_state=3,
)

# Fit on the training split, then predict the held-out samples.
my_RandomForest.fit(X_train, y_train)
y_predict_rf = my_RandomForest.predict(X_test)

# Fraction of test labels the forest predicted correctly.
rf_score = accuracy_score(y_test, y_predict_rf)
print(f"Random Forest Score: {rf_score!s}")


Random Forest Score: 0.9245283018867925
