In [None]:
import random

import numpy as np
import pandas as pd
from sklearn.naive_bayes import CategoricalNB

In [None]:
# Read the dataset. The CSV has no header row, so the column names are given
# explicitly; the first column is the class label, the rest are the features.
names=['Class','handicapped-infants','water-project-cost-sharing','adoption-of-the-budget-resolution','physician-fee-freeze','el-salvador-aid','religious-groups-in-schools','anti-satellite-test-ban','aid-to-nicaraguan-contras','mx-missile','immigration','synfuels-corporation-cutback','education-spending','superfund-right-to-sue','crime','duty-free-exports','export-administration-act-south-africa']
f=pd.read_csv('house-votes.csv', header=None,names=names)

# Feature matrix: every column except the label, cast to integers.
tr=f[names[1:]].to_numpy().astype(int)

# Target as an (n, 1) integer column vector: republican -> 0, democrat -> 1.
# The labels are recoded through an explicit mapping instead of boolean-mask
# assignment on a frame sliced from `f`; any value that is not one of the two
# party names is passed through unchanged and must itself be castable to int.
label_codes={'republican': 0, 'democrat': 1}
tar=f['Class'].map(lambda label: label_codes.get(label, label))
tar=tar.to_numpy().astype(int).reshape(-1, 1)

In [None]:
model = CategoricalNB()

# Fit the model. `tar` is built as an (n, 1) column vector, while scikit-learn
# expects a 1-D label array (it warns and flattens otherwise), so flatten it
# explicitly here. np.ravel is a no-op on an already 1-D array.
model.fit(tr, np.ravel(tar))

In [None]:
#Categories is a list. Its elements are lists that contain the values in each variable's range, eg [[0,1],[1,2,3],...]
def positive_rules_with_categorical(model,categories, batch_size=4096):
    """Enumerate every feature combination that the model classifies as 1.

    Parameters
    ----------
    model : object exposing ``predict(X)`` for a 2-D array ``X`` and returning
        one prediction per row.
    categories : list of lists; ``categories[k]`` holds the admissible values
        of the k-th feature. The Cartesian product of these lists is scanned.
    batch_size : int, optional
        Number of combinations sent to ``model.predict`` per call. Predicting
        in batches avoids one ``predict`` call per combination and avoids
        holding the full Cartesian product in memory at once.

    Returns
    -------
    list of lists of str
        One rule per positively classified combination, in product order.
        Each rule is ``['x1=<v1>', 'x2=<v2>', ...]`` with 1-based positions.

    Raises
    ------
    ValueError
        If ``batch_size`` is smaller than 1.
    """
    import itertools

    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    positive_rules=[]
    combinations = itertools.product(*categories)
    while True:
        chunk = list(itertools.islice(combinations, batch_size))
        if not chunk:
            break
        # One row per combination; predictions come back in the same order.
        predictions = model.predict(np.array(chunk))
        for comb, prediction in zip(chunk, predictions):
            if prediction == 1:
                positive_rules.append(
                    ['x'+str(position)+'='+str(value) for position, value in enumerate(comb, start=1)]
                )
    return positive_rules

In [None]:
#call it with categorical names, eg x2
def rewrite_categorical_rules(rules, cat_vars, cat_levels):
    """Rewrite categorical features of the rules into one-hot indicator form.

    A feature ``'<var>=<k>'`` whose ``<var>`` is listed in ``cat_vars`` is
    replaced, in place, by ``'<var>_<k>=1'``. Levels are 1-based.

    Parameters
    ----------
    rules : list of lists of str, each entry of the form ``'<name>=<value>'``.
        The inner lists are modified in place.
    cat_vars : list of str, the rule-level names of the categorical variables
        (e.g. ``'x2'``).
    cat_levels : list of int, ``cat_levels[k]`` is the number of levels of
        ``cat_vars[k]``.

    Returns
    -------
    (rules, new_names) : the same ``rules`` object, and a list holding, for
        every categorical variable, the list of its indicator names
        ``['<var>_1', ..., '<var>_<levels>']``.

    Raises
    ------
    IndexError
        If a categorical feature carries a level outside ``1..levels``.
        (Level 0 would otherwise wrap around to the last level through
        negative indexing.)
    """
    new_names=[
        [cat_vars[var]+'_'+str(level) for level in range(1, cat_levels[var]+1)]
        for var in range(len(cat_vars))
    ]

    # Name -> indicator names. setdefault keeps the first entry if a name is
    # listed twice.
    indicators_of={}
    for name, indicators in zip(cat_vars, new_names):
        indicators_of.setdefault(name, indicators)
    if not indicators_of:
        return rules, new_names

    # Single pass over the rules instead of one full pass per variable.
    for rule in rules:
        for feat in range(len(rule)):
            v, value = rule[feat].split('=',1)
            indicators = indicators_of.get(v)
            if indicators is None:
                continue
            level = int(value)
            if not 1 <= level <= len(indicators):
                raise IndexError(
                    "level "+str(level)+" of "+v+" is outside 1.."+str(len(indicators))
                )
            rule[feat]=indicators[level-1]+'=1'
    return rules, new_names


In [None]:
#3o
# Number the model features from 1 so that key k corresponds to the rule/solver
# variable 'x<k>'. The first entry of `names` is the class label, not a feature,
# so it is excluded; otherwise 'x1' would be mapped to 'Class' and every
# feature name would be shifted by one.
variable_names=names[1:]
names_dict=dict(enumerate(variable_names, start=1))

In [None]:
#import solver
from ortools.linear_solver import pywraplp
solver = pywraplp.Solver.CreateSolver('SCIP')
# CreateSolver returns None when the requested backend is not available; fail
# here with a clear message instead of with an AttributeError further down.
if solver is None:
    raise RuntimeError("OR-Tools could not create the 'SCIP' solver backend")

In [None]:
#cat_vars is a list with the names of the categorical variables
#categories is a list where each element corresponds to the number of levels the corresponding categorical variable has
#names_dict is a dictionary that has a numbering of the variables (defined above)
def make_variables(names_dict,solver, cat_vars=[],categories=[]):
    """Create the 0/1 optimization variables, one per feature or per level.

    Parameters
    ----------
    names_dict : dict mapping a variable number to its original feature name.
    solver : object exposing ``IntVar(lower, upper, name)``.
    cat_vars : list of original feature names that are categorical. The list
        is only read, never modified.
    categories : list of int; ``categories[k]`` is the number of levels of
        ``cat_vars[k]``.

    Returns
    -------
    list
        Solver variables in ``names_dict`` order. A non-categorical feature
        numbered ``n`` yields one variable named ``'x<n>'``; a categorical one
        yields ``'x<n>_1' ... 'x<n>_<levels>'``.
    """
    variables=[]
    # Iterate (number, name) pairs directly: the number is taken from the entry
    # being visited rather than recovered by searching the values, so two
    # entries sharing a feature name still get distinct variable names.
    for num, var in names_dict.items():
        if var in cat_vars:
            levels=categories[cat_vars.index(var)]
            for level in range(1, levels+1):
                variables.append(solver.IntVar(0.0, 1.0, 'x'+str(num)+'_'+str(level)))
        else:
            variables.append(solver.IntVar(0.0, 1.0, 'x'+str(num)))
    return variables

In [None]:
#new_categorical_names is a list. Its elements are lists, each one having the new names for the introduced categorical variable
#eg if the variable was x2, the element is [x2_1, x2_2,...]
def add_categorical_constrs(new_categorical_names, solver, variables):
    """Add one "exactly one level is selected" constraint per categorical variable.

    Parameters
    ----------
    new_categorical_names : list of lists of str; each inner list holds the
        indicator names of one categorical variable.
    solver : object exposing ``Add(constraint)``.
    variables : list of solver variables exposing ``name()``.

    For every group, the variables whose name appears in the group are summed
    and the sum is constrained to equal 1.
    """
    for new_name in new_categorical_names:
        # The builtin sum starts from 0, matching a manual accumulation.
        level_total = sum(var for var in variables if var.name() in new_name)
        solver.Add(level_total==1)

In [1]:
#diversity constraints
#eq_constrs is a list of tuples: (feat, constraint value)
#must have initialized the optimization problem and all dicts and remaining constraints. This is the last step before solving!
#cat_vars list with the original names, not x1, x2
def diverse_counterfactual_constraints(eq_constrs, variables, cat_vars, solver):
    """Pin selected features to given values in the optimization problem.

    Reads the module-level ``names_dict`` (variable number -> original feature
    name) and ``index_dict`` (solver variable name -> position in
    ``variables``); both must exist before this is called.

    Parameters
    ----------
    eq_constrs : list of ``(feat, constr_val)`` tuples, ``feat`` being an
        original feature name.
    variables : list of solver variables.
    cat_vars : list of original names of the categorical features.
    solver : object exposing ``Add(constraint)``.

    For a categorical feature the indicator of level ``constr_val`` is forced
    to 1. For any other feature the variable itself is forced to
    ``constr_val`` (previously it was forced to 1 regardless of the value).

    Raises
    ------
    ValueError
        If ``feat`` is not a value of ``names_dict``.
    KeyError
        If the derived variable name is not in ``index_dict``.
    """
    # Materialize the key/value lists once rather than once per constraint.
    numbers = list(names_dict.keys())
    feature_names = list(names_dict.values())
    for feat, constr_val in eq_constrs:
        var = numbers[feature_names.index(feat)]
        if feat in cat_vars:
            var_index=index_dict['x'+str(var)+'_'+str(constr_val)]
            solver.Add(variables[var_index]==1)
        else:
            var_index=index_dict['x'+str(var)]
            solver.Add(variables[var_index]==constr_val)
                


In [None]:
#construct the left hand side of the constraints, assuming we use the positive polynomial
#positiverules is the list of the modified rules
#variables is the list with the optimization variables
#counter_outcome is a binary variable, selecting what the counterfactual outcome should be.
def add_constraints(positiverules, variables, counter_outcome, solver, index_dict=None):
    """Add the constraints that force the requested model outcome.

    Every rule is turned into a linear expression that equals the number of
    its features exactly when the assignment satisfies the whole rule: a
    feature ``'<name>=0'`` contributes ``1 - var``, any other contributes
    ``var``.

    Parameters
    ----------
    positiverules : list of rules, each a list of ``'<name>=<value>'`` strings.
    variables : list of solver variables.
    counter_outcome : 0 or 1, the outcome the counterfactual must receive.
        Any other value adds no constraints.
    solver : object exposing ``Add(constraint)`` and ``IntVar(lo, hi, name)``.
    index_dict : optional dict mapping a variable name to its position in
        ``variables``. When omitted it is derived from ``variables`` through
        ``name()``, so the function no longer relies on a global.

    Notes
    -----
    Variables are located by name through ``index_dict``; parsing a digit out
    of the feature name does not work for multi-digit indices.
    """
    if index_dict is None:
        index_dict = {var.name(): position for position, var in enumerate(variables)}

    # (left-hand side, number of features) for every rule.
    constrs=[]
    for rule in positiverules:
        terms=[]
        for feat in rule:
            name, _, value = feat.partition('=')
            var = variables[int(index_dict[name])]
            # Compare the whole value, not just its last character.
            terms.append(1 - var if value == "0" else var)
        constrs.append((sum(terms), len(terms)))

    if counter_outcome==0:
        # Negative outcome: no positive rule may be fully satisfied.
        for lhs, size in constrs:
            solver.Add(lhs <= size-1)
    elif counter_outcome==1:
        # Positive outcome: exactly one selector is 1, and the selected rule
        # must then be fully satisfied.
        deltas=[]
        for rule_no, (lhs, size) in enumerate(constrs):
            # Each selector gets its own name instead of all sharing 'delta'.
            delta=solver.IntVar(0.0, 1.0, 'delta_'+str(rule_no))
            solver.Add(lhs >= delta*size)
            deltas.append(delta)
        solver.Add(sum(deltas)==1)

In [None]:
#datapoint: the original datapoint, one value per feature
#cat_vars: original names (not x1, x2, ...) of the categorical features
#categories: number of levels of each entry of cat_vars, in the same order
#variable_names: feature names, indexed by position in the datapoint
def make_actual_datapoint_with_categorical(datapoint, cat_vars=[],categories=[],variable_names=names):
    """Expand a datapoint so that categorical features become 0/1 indicators.

    A feature whose name (``variable_names[position]``) is in ``cat_vars`` is
    replaced by one entry per level ``1..levels``: 1 for the level equal to
    the feature's value, 0 for the others. All other features are copied as
    they are. Returns the expanded values as a numpy array.

    NOTE(review): the default ``variable_names`` is the module-level ``names``
    list as it exists when this function is defined — confirm that its order
    lines up with the positions of ``datapoint``.
    """
    encoded=[]
    for position, raw_value in enumerate(datapoint):
        feature = variable_names[position]
        if feature not in cat_vars:
            encoded.append(raw_value)
            continue
        n_levels = categories[cat_vars.index(feature)]
        # One indicator per level, levels being numbered from 1.
        encoded.extend(1 if level == raw_value else 0 for level in range(1, n_levels+1))
    return np.array(encoded)

In [None]:
#build the objective function
#actualdatapoint: the transformed datapoint (categorical features expanded)
#coeff: one coefficient per entry of actualdatapoint
#variables: the optimization variables, in the same order
def make_objective(actualdatapoint, coeff, variables, solver):
    """Minimize the weighted number of entries that differ from the datapoint.

    For an entry equal to 0 the term is ``coeff * var``; for any other entry
    it is ``coeff * (1 - var)``. The sum of the terms is handed to
    ``solver.Minimize``. Nothing is returned.
    """
    objective = 0
    for position, current_value in enumerate(actualdatapoint):
        weight = coeff[position]
        var = variables[position]
        # Expression that is 1 exactly when the variable leaves its current value.
        change = var if current_value == 0 else 1 - var
        objective = objective + weight * change
    solver.Minimize(objective)

In [None]:
def make_counterfactual(solution,cat_vars=[]):
    """Read the counterfactual datapoint off the solved variables.

    Parameters
    ----------
    solution : iterable of solved variables exposing ``name()`` and
        ``solution_value()``.
    cat_vars : unused; kept so existing calls keep working.

    Returns
    -------
    numpy.ndarray
        One entry per variable without ``'_'`` in its name (its 0/1 value),
        and, for indicator variables named ``'<var>_<level>'``, the level of
        each indicator that is set to 1.

    Solver values are floats and may come back as 0.9999999 or 1.0000001 for
    an integer variable, so they are rounded to the nearest integer rather
    than truncated or compared with ``== 1``.
    """
    counterfactual=[]
    for var in solution:
        value = int(round(var.solution_value()))
        name = var.name()
        if '_' in name:
            if value == 1:
                level = name.split('_',1)[1]
                counterfactual.append(int(level))
        else:
            counterfactual.append(value)
    return np.array(counterfactual)

In [None]:
#names is a list with the original variable names
def written_counterfactual(actualdatapoint, counterfactual, names):
    """Describe the entries in which the counterfactual differs from the datapoint.

    Positions are compared one by one over the length of ``actualdatapoint``;
    every differing position yields the string ``'<name> = <new value>'``.
    Returns the list of these strings in position order.
    """
    return [
        names[position]+' = '+str(counterfactual[position])
        for position in range(len(actualdatapoint))
        if actualdatapoint[position]!=counterfactual[position]
    ]

In [None]:
def make_coefficients(dataset, cat_vars=[],categories=[],variable_names=names):
    """Compute one objective coefficient per optimization variable.

    For column ``k`` of ``dataset`` (named ``variable_names[k]``) the share
    ``p`` of rows equal to a given value is measured and the coefficient is
    ``sqrt(p * (1 - p))``. A categorical column (name in ``cat_vars``)
    contributes one coefficient per level ``1..levels``; any other column
    contributes a single coefficient for the value 1.

    Returns the coefficients as a numpy array, in column order.

    NOTE(review): the default ``variable_names`` is the module-level ``names``
    list as it exists when this function is defined — confirm it has exactly
    one entry per column of ``dataset``.
    """
    weights=[]
    for col, feature in enumerate(variable_names):
        column = dataset[:,col]
        total = len(column)
        if feature in cat_vars:
            n_levels = categories[cat_vars.index(feature)]
            tracked_values = range(1, n_levels+1)
        else:
            tracked_values = (1,)
        for value in tracked_values:
            share = len(column[column==value])/total
            weights.append(np.sqrt(share*(1-share)))
    return np.array(weights)

In [None]:
K=5
SEED=0  # fixed seed so the same datapoints are sampled on every run

# Only the vote columns are features: the first entry of `names` is the class
# label. Passing the full list made make_coefficients index one column past
# the end of `tr` and shifted every reported feature name by one.
feature_names=names[1:]
feature_dict=dict(enumerate(feature_names, start=1))

# NOTE(review): `positive_rules` is not defined anywhere else in this notebook.
# It is built here by enumerating all 0/1 assignments of the features, which
# matches the 0/1 solver variables created below — confirm that every feature
# is in fact binary.
positive_rules=positive_rules_with_categorical(model, [[0,1]]*len(feature_names))

# The coefficients depend only on the training data, so compute them once.
coeff=make_coefficients(tr, cat_vars=[],categories=[],variable_names=feature_names)

indices = random.Random(SEED).sample(range(len(tr)), K)
exp_data=[tr[i] for i in sorted(indices)]
exp_tar=model.predict(np.array(exp_data))

# One entry per sampled datapoint: the list of changed features, or None when
# the solver found no solution.
counterfactuals=[]
for datapoint, predicted in zip(exp_data, exp_tar):
    # Ask for the opposite of the model's current prediction.
    counter_outcome = 0 if predicted == 1 else 1

    #initialize solver
    solver = pywraplp.Solver.CreateSolver('SCIP')
    if solver is None:
        raise RuntimeError("OR-Tools could not create the 'SCIP' solver backend")

    variables=make_variables(names_dict=feature_dict,solver=solver, cat_vars=[],categories=[])

    # Variable name -> position in `variables`; read by add_constraints.
    index_dict={var.name(): position for position, var in enumerate(variables)}

    add_constraints(positive_rules, variables, counter_outcome, solver)
    new_datapoint=make_actual_datapoint_with_categorical(datapoint, cat_vars=[],categories=[],variable_names=feature_names)
    make_objective(new_datapoint, coeff, variables, solver)

    status = solver.Solve()
    if status not in (pywraplp.Solver.OPTIMAL, pywraplp.Solver.FEASIBLE):
        # Solution values are meaningless without a feasible solution.
        counterfactuals.append(None)
        continue

    cntr=make_counterfactual(variables)
    counterfactual=written_counterfactual(new_datapoint, cntr, feature_names)
    counterfactuals.append(counterfactual)