Import libraries

In [1]:
from sklearn.ensemble import RandomForestClassifier #our ensemble classifier
from sklearn.model_selection import train_test_split #to split dataset randomly
from sklearn import metrics #to check performance of model
from statistics import median,mode #median and mode as functions
import pandas as pd

Load dataset

In [2]:
# Read the dataset. header=None tells pandas not to treat the first row as
# column names, so the columns get integer labels instead.
data = pd.read_csv(
    "wdbc.data",
    header=None,
)
# Show the first 5 rows to spot which column holds the label (here it is column 1).
data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,22,23,24,25,26,27,28,29,30,31
0,842302,M,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,842517,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,84300903,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,84348301,M,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,84358402,M,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


Data description and filling missing values

In [3]:
print("Dataset description")
data.info() #prints dtype, non-null count and total rows for each column
# Fill missing values column by column: numeric columns get their median,
# every other column (e.g. strings) gets its most frequent value.
for col in data.columns:
  column=data[col]
  if column.dtype.kind in "iuf": #signed/unsigned integer or float column
    # Series.median() skips NaN; statistics.median() is not NaN-aware and
    # gives an unreliable result as soon as a value is actually missing.
    fill_value=column.median()
  else:
    # Series.mode() returns the most frequent value(s). Passing the bare
    # `mode` function to fillna (as before) never computed a value at all.
    most_frequent=column.mode()
    if most_frequent.empty:
      continue #column is entirely missing, there is nothing to fill with
    fill_value=most_frequent.iloc[0] #on ties, take the first mode
  # Assign the result back instead of calling fillna(inplace=True) on
  # data[col]: that chained form does not reliably update the DataFrame.
  data[col]=column.fillna(fill_value)

Dataset description
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 569 entries, 0 to 568
Data columns (total 32 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   0       569 non-null    int64  
 1   1       569 non-null    object 
 2   2       569 non-null    float64
 3   3       569 non-null    float64
 4   4       569 non-null    float64
 5   5       569 non-null    float64
 6   6       569 non-null    float64
 7   7       569 non-null    float64
 8   8       569 non-null    float64
 9   9       569 non-null    float64
 10  10      569 non-null    float64
 11  11      569 non-null    float64
 12  12      569 non-null    float64
 13  13      569 non-null    float64
 14  14      569 non-null    float64
 15  15      569 non-null    float64
 16  16      569 non-null    float64
 17  17      569 non-null    float64
 18  18      569 non-null    float64
 19  19      569 non-null    float64
 20  20      569 non-null    float64
 21  21      569 non-nul

Splitting of train and test data

In [4]:
# Features: every column except the label (column 1). Dropping column 1 keeps
# column 0 followed by columns 2 onwards, in their original order.
# NOTE(review): column 0 looks like a record identifier in the data preview -
# confirm it is really meant to be used as a feature.
X=data.drop(columns=[1])
y=data.loc[:,1] #label column, here it is column number 1
test_size=float(input("Enter the test ratio:")) #eg:0.2 for 20 percent
#split data into train and test parts (the order of the four outputs must not be changed)
X_train,X_test,y_train,y_test=train_test_split(
    X,
    y,
    test_size=test_size,
    random_state=42, #fixed seed so the same rows land in each part on every run
)

Enter the test ratio:0.2


Model build, train, predict

In [7]:
def _parse_yes_no(answer):
  """Convert a typed answer such as "True" or "False" into a real boolean.

  bool() cannot be used for this: it is True for ANY non-empty string, so
  bool("False") is True and bootstrap could never be switched off.

  Parameters:
    answer: the raw text typed by the user.

  Returns:
    True for true/t/yes/y/1, False for false/f/no/n/0 or an empty answer
    (case-insensitive, surrounding whitespace ignored).

  Raises:
    ValueError: if the text is none of the recognised answers.
  """
  normalized=answer.strip().lower()
  if normalized in ("true","t","yes","y","1"):
    return True
  if normalized in ("false","f","no","n","0",""): #empty stays False, as bool("") was
    return False
  raise ValueError(f"Expected True or False, got {answer!r}")

no_of_trees=int(input("Enter the number of trees:")) #no of trees used
criterion=input("Enter the  function to measure quality of the split:").strip() # any measure from {"gini", "entropy", "log_loss"} must be given; stray spaces are removed
max_depth=int(input("Maximum depth of the tree:"))
bootstrap=_parse_yes_no(input("Is bootstrap required?")) # if required give value as "True" else give "False"

Enter the number of trees:50
Enter the  function to measure quality of the split:entropy
Maximum depth of the tree:4
Is bootstrap required?True


In [8]:
#build the model from the values entered by the user.
#random_state is fixed (same seed as the train/test split) so the randomness used
#while growing the trees is repeatable and the scores do not change from run to run.
clf=RandomForestClassifier(
    n_estimators=no_of_trees,
    criterion=criterion,
    max_depth=max_depth,
    bootstrap=bootstrap,
    random_state=42,
)
clf.fit(X_train,y_train) #train the model using train feature columns and their respective label column
y_pred=clf.predict(X_test) #predict labels for the test feature columns

Confusion matrix and other metrics

In [9]:
#confusion matrix: rows are the true classes, columns are the predicted classes,
#both in the order given by labels ("B" first, then "M")
confusionMatrix=metrics.confusion_matrix(y_test,y_pred,labels=["B","M"])
print("Confusion Matrix:",confusionMatrix)
#confusion matrix format, with "M" as the positive class
#[[TrueNegative FalsePositive]
#[FalseNegative TruePositive]]
accuracy=metrics.accuracy_score(y_test,y_pred) # accuracy=(TruePositive+TrueNegative)/total observations
print("Accuracy:",round(accuracy,4)) #round function used to round the output values
precision=metrics.precision_score(y_test,y_pred,pos_label="M") #precision=TruePositive/(TruePositive+FalsePositive)
print("Precision:",round(precision,4))
recall=metrics.recall_score(y_test,y_pred,pos_label="M") #recall=TruePositive/(TruePositive+FalseNegative)
print("Recall:",round(recall,4))
f1Score=metrics.f1_score(y_test,y_pred,pos_label="M") #f1=2/((1/precision)+(1/recall)), the harmonic mean of precision and recall
print("F1_score:",round(f1Score,4))

Confusion Matrix: [[70  1]
 [ 3 40]]
Accuracy: 0.9649
Precision: 0.9756
Recall: 0.9302
F1_score: 0.9524
