In [1]:
# Import the pandas library under the alias 'pds'
import pandas as pds
# Import the NumPy library under the alias 'npy'
import numpy as npy 

from random import sample

# Load the breast-cancer recurrence data set from the working directory.
BreastCancer = pds.read_csv("breast-cancer.csv")

# Relabel the two outcomes as asked in the question:
# 'no-recurrence-events' -> 'Negative', 'recurrence-events' -> 'Positive'.
# replace() matches whole cell values, so the two keys cannot interfere.
BreastCancer = BreastCancer.replace({
    'no-recurrence-events': 'Negative',
    'recurrence-events': 'Positive',
})

# Work on a copy so the loaded frame itself is left untouched by the split.
BreastCancer_copy = BreastCancer.copy()

# Split configuration: share of rows used for training and the seed that
# makes the random draw reproducible across kernel restarts.
TRAIN_FRACTION = 0.7
RANDOM_SEED = 0

# 70% of the rows, drawn at random, form the training set.
Training_set = BreastCancer_copy.sample(frac=TRAIN_FRACTION, random_state=RANDOM_SEED)

# The remaining 30% of the rows (everything not sampled above) form the testing set.
Testing_set = BreastCancer_copy.drop(Training_set.index)

# Names of all columns (the 'Class' label and every feature).
Col_names = list(Training_set.columns)

# Number of training rows per class label.
class_occurences = Training_set["Class"].value_counts()

# Prior probability of each class: share of training rows carrying that label.
n_training_rows = len(Training_set)
class_prob = {
    label: count / n_training_rows
    for label, count in class_occurences.items()
}

# Number of distinct values each feature takes in the training set
# (the label column 'Class' is excluded).
dic_uni = {
    column: len(Training_set[column].unique())
    for column in Col_names
    if column != 'Class'
}

# Naive Bayes likelihoods with m-estimate (Laplace) smoothing.
#
# For every feature, P(value | class) is estimated as
#     (count(value, class) + m * p) / (count(class) + m)
# where m is the number of distinct values of the feature in the training set
# and p = 1 / m is a uniform prior, which simplifies to
#     (count(value, class) + 1) / (count(class) + m).
# Every (value, class) pair gets an entry, including pairs that never occur
# together in the training data (count 0).
Feature_names = [name for name in Col_names if name != 'Class']
Class_labels = list(class_occurences.index)

Final_dict = {}  # feature -> {(value, class): count(value, class) + 1}
Dict = {}        # feature -> {(value, class): P(value | class)}
for feature in Feature_names:
    # Raw co-occurrence counts; only pairs present in the training data appear here.
    observed = Training_set.groupby(feature)['Class'].value_counts().to_dict()
    seen_values = Training_set[feature].drop_duplicates().tolist()

    # Add-one smoothing over the full value x class grid, so unobserved
    # pairs end up with a smoothed count of 1 instead of being absent.
    Final_dict[feature] = {
        (val, label): observed.get((val, label), 0) + 1
        for val in seen_values
        for label in Class_labels
    }

    # Each feature gets its own table; nothing is carried over from the
    # previously processed features.
    Dict[feature] = {
        (val, label): smoothed / (class_occurences[label] + dic_uni[feature])
        for (val, label), smoothed in Final_dict[feature].items()
    }


def _row_likelihood(row, label):
    """Return the product of P(row[feature] | label) over all features.

    A feature value with no table entry (never seen in training) falls
    back to the smoothed zero-count estimate 1 / (count(label) + m).
    """
    likelihood = 1
    for feature in Feature_names:
        unseen = 1 / (class_occurences[label] + dic_uni[feature])
        likelihood = Dict[feature].get((row[feature], label), unseen) * likelihood
    return likelihood


# Score every testing row under both classes (likelihood times class prior)
# and keep the label with the higher score, keyed by the row's index label.
final_pred = {}
for row_id, row in Testing_set.iterrows():
    Neg_pred = _row_likelihood(row, 'Negative') * class_prob['Negative']
    pos_pred = _row_likelihood(row, 'Positive') * class_prob['Positive']

    # 'Negative' only wins when strictly more probable; ties go to 'Positive'.
    final_pred[row_id] = 'Negative' if Neg_pred > pos_pred else 'Positive'

# One-column frame of predictions, indexed by the testing rows' index labels.
# Naming the column at construction time also works when final_pred is empty.
df_final_pred = pds.DataFrame.from_dict(
    final_pred, orient='index', columns=['Predicted_Class']
)

# Attach each prediction to its testing row (join aligns on the index) and
# display the combined frame as the cell output.
resulting_df = Testing_set.join(df_final_pred)
resulting_df




Unnamed: 0,Class,menopause,node-caps,deg-malig,breast,breast-quad,irradiat,Predicted_Class
1,Negative,premeno,no,2,right,right_up,no,Negative
6,Negative,premeno,no,2,left,left_low,no,Negative
9,Negative,premeno,no,2,right,left_up,no,Negative
11,Negative,ge40,no,2,left,left_low,no,Negative
17,Negative,premeno,no,3,left,left_low,no,Positive
...,...,...,...,...,...,...,...,...
273,Positive,ge40,yes,2,right,right_up,yes,Positive
275,Positive,premeno,yes,3,right,left_up,no,Positive
276,Positive,premeno,yes,3,left,right_low,yes,Positive
277,Positive,ge40,yes,3,left,left_up,yes,Positive


In [2]:
# Write the testing rows together with their predicted class to predictions.csv
resulting_df.to_csv(r'predictions.csv')

# Confusion-matrix cells, taking 'Positive' as the positive class.
# Row-wise boolean masks over the actual and predicted labels:
actual_positive = resulting_df['Class'] == 'Positive'
actual_negative = resulting_df['Class'] == 'Negative'
prediction_matches = resulting_df['Class'] == resulting_df['Predicted_Class']

# Count the rows falling into each cell.
TruePositives = int((prediction_matches & actual_positive).sum())
TrueNegatives = int((prediction_matches & actual_negative).sum())
FalseNegatives = int((~prediction_matches & actual_positive).sum())
FalsePositives = int((~prediction_matches & actual_negative).sum())

def _safe_ratio(numerator, denominator):
    """Return numerator / denominator, or NaN when the denominator is zero."""
    return numerator / denominator if denominator else float('nan')


# Accuracy    = (TP + TN) / all counted rows
# Sensitivity = TP / (TP + FN)   -- share of actual positives that were found
# Specificity = TN / (TN + FP)   -- share of actual negatives that were found
# A metric whose denominator is zero (e.g. no actual positives among the
# testing rows) is undefined and reported as NaN instead of raising.
Accuracy = _safe_ratio(
    TruePositives + TrueNegatives,
    TruePositives + FalsePositives + TrueNegatives + FalseNegatives,
)
Sensitivity = _safe_ratio(TruePositives, TruePositives + FalseNegatives)
Specificity = _safe_ratio(TrueNegatives, TrueNegatives + FalsePositives)

# Report the three metrics.
print("ACCURACY", Accuracy)
print("SENSITIVITY", Sensitivity)
print("SPECIFICITY", Specificity)


ACCURACY 0.7441860465116279
SENSITIVITY 0.5454545454545454
SPECIFICITY 0.8125
