In [1]:
from collections import defaultdict

import lime
import lime.lime_tabular
import matplotlib.pyplot as plt
import numpy as np
import sklearn
import sklearn.datasets
import sklearn.ensemble
import sklearn.metrics
import sklearn.model_selection
import sklearn.preprocessing
from sklearn.cluster import KMeans
np.random.seed(1)

In [2]:
# Names of the 14 feature columns, in file order (the class label column is not listed).
feature_names = [
    "Age", "Workclass", "fnlwgt", "Education", "Education-Num", "Marital Status", "Occupation",
    "Relationship", "Race", "Sex", "Capital Gain", "Capital Loss", "Hours per week", "Country",
]

# Positions in `feature_names` that hold categorical (non-numeric) values:
# Workclass, Education, Marital Status, Occupation, Relationship, Race, Sex, Country.
categorical_features = [1, 3, 5, 6, 7, 8, 9, 13]

# Model to be explained, and the explainer class (passed uninstantiated).
classifier = sklearn.ensemble.RandomForestClassifier(n_estimators=500)
explainer = lime.lime_tabular.LimeTabularExplainer

# Every cell is read as a string; fields are separated by ", ".
# NOTE(review): absolute, machine-specific path — the notebook only runs where this file exists.
data = np.genfromtxt(
    "/Users/mbakogu/Desktop/Academics/AIML_Research_Sameer_Singh/Adult/adult.data",
    delimiter=', ',
    dtype=str,
)


In [3]:
class GenerateExplanations():
    """
    Fit a classifier on a tabular dataset and expose helpers to explain its
    predictions with a LIME-style tabular explainer.

    Construction splits the data into train/test parts, one-hot encodes the
    categorical columns for the classifier, fits the classifier, and builds the
    explainer on the (non one-hot) training rows.
    """

    def __init__(self, data, classifier, explainer, feature_names = None, categorical_features = None, train_percentage = 0.80):
        """
        data: n columns consisting of n-1 column features followed by 1 column classification.
        The caller's array is left untouched; encoding happens on a copy.

        classifier: any initialized classifier
        e.g.: xgboost.XGBClassifier(n_estimators=300, max_depth=5)

        explainer: uninitiated explainer
        e.g.: lime.lime_tabular.LimeTabularExplainer

        feature_names: optional list representing names of each feature (excluding classification); integers
        (as strings) used if omitted or empty
        e.g.: [name, gender, age, money] corresponds to features 0-3 in data

        categorical_features = denotes which features, by index, are categorical data
        e.g.: [2,7,9] notes that features in column 2,7, and 9 are categorical

        train_percentage: fraction of the rows used for training; the remaining
        1 - train_percentage is held out as the test split
        """

        # asarray does not copy; the copies that protect the caller's array are
        # made by process_data() (non-float input) and astype(float) below.
        raw = np.asarray(data)

        self.classifier = classifier
        # None replaces the former mutable `[]` default; an explicit [] still works.
        self.categorical_features = [] if categorical_features is None else categorical_features

        # Last column is the class label, everything before it is a feature.
        self.labels = raw[:, -1]
        self.data = raw[:, :-1]

        if feature_names is None or len(feature_names) == 0:
            feature_names = [str(x) for x in range(self.data.shape[1])]

        categorical_names = {}

        if raw.dtype != "float64":
            # Non-float input: turn class labels and categorical columns into integer codes.
            le = sklearn.preprocessing.LabelEncoder()
            le.fit(self.labels)
            self.labels = le.transform(self.labels)
            class_names = le.classes_

            self.data, categorical_names = self.process_data(self.data)
        else:
            # Sorted, indexable list of the distinct labels (a plain set has no stable order).
            class_names = list(np.unique(self.labels))

        self.data = self.data.astype(float)

        # NOTE(review): the `categorical_features` argument of OneHotEncoder was deprecated in
        # scikit-learn 0.20 and removed in 0.22 — on newer versions this needs a ColumnTransformer.
        # Confirm the target scikit-learn version before changing it.
        self.encoder = sklearn.preprocessing.OneHotEncoder(categorical_features=self.categorical_features)

        self.train, self.test, self.labels_train, self.labels_test = sklearn.model_selection.train_test_split(
            self.data, self.labels, train_size=train_percentage, test_size=1-train_percentage)

        # The encoder is fitted on train and test rows together (self.data), not on the training split only.
        self.encoder.fit(self.data)

        encoded_train = self.encoder.transform(self.train)

        self.classifier.fit(encoded_train, self.labels_train)

        # The explainer receives the integer-coded (not one-hot) training rows.
        self.explainer = explainer(self.train, feature_names = feature_names, class_names=class_names,
                                   categorical_features=self.categorical_features,
                                   categorical_names=categorical_names, kernel_width=3)

        self.train_standard = np.array(self.normalize(self.train))
        self.test_standard = np.array(self.normalize(self.test))

    def normalize(self, data):
        """
        Scale each column of a 2-D numeric array by the sum of its absolute values.

        Every output cell is abs(value) / sum(abs(column)); a column whose
        absolute sum is 0 is divided by 1 instead, so it stays all zeros.

        data: 2-D array-like of numbers (rows x columns)

        Returns a list of row lists of floats. Input without any rows gives [].
        Raises ValueError if non-empty input is not 2-dimensional.
        """
        matrix = np.asarray(data, dtype=float)
        if matrix.size == 0 and (matrix.ndim < 2 or matrix.shape[0] == 0):
            return []
        if matrix.ndim != 2:
            raise ValueError("normalize expects a 2-dimensional array")

        magnitudes = np.abs(matrix)
        totals = magnitudes.sum(axis=0)
        # Avoid division by zero for all-zero columns.
        totals[totals == 0] = 1

        return (magnitudes / totals).tolist()

    def process_data(self, data):
        """
        Label-encode the categorical columns listed in self.categorical_features.

        data: 2-D array of feature columns

        Returns (encoded, categorical_names): `encoded` is a copy of `data` with
        each categorical column replaced by its integer codes (the input array is
        not modified); `categorical_names` maps column index -> array of the
        original category values, indexed by code.
        """
        encoded = np.array(data)  # copy, so the caller's array keeps its original values
        categorical_names = {}
        for i in self.categorical_features:
            le = sklearn.preprocessing.LabelEncoder()
            le.fit(encoded[:, i])
            encoded[:, i] = le.transform(encoded[:, i])
            categorical_names[i] = le.classes_

        return encoded, categorical_names

    def classify_accuracy(self, test_data, test_labels):
        """
        Return the accuracy score of the fitted classifier on the given rows.

        test_data: integer-coded (not one-hot) feature rows
        test_labels: true labels for those rows
        """
        predictions = self.classifier.predict(self.encoder.transform(test_data))
        return sklearn.metrics.accuracy_score(test_labels, predictions)

    def predict(self, instance):
        """
        Return class probabilities (as floats) for integer-coded feature rows.

        The rows are one-hot encoded before being handed to the classifier; this
        is the prediction function given to the explainer.
        """
        return self.classifier.predict_proba(self.encoder.transform(instance)).astype(float)

    def explain_with_lime(self, instance, num_features):
        """
        Explain a single instance and return the explanation's `as_list()` result.

        instance: one integer-coded feature row
        num_features: forwarded to the explainer as `num_features`
        """
        return self.explainer.explain_instance(instance, self.predict, num_features=num_features).as_list()

    def k_cluster(self, data = None, k = 2, func = None):
        """
        Cluster rows and return the clustering result.

        data: rows to cluster; defaults to self.test_standard
        k: number of clusters for the default KMeans clustering
        func: optional callable; when given, `func(data)` is returned instead of
        running KMeans (k is then only used in the printed message)
        """
        print("Kmeans for {} clusters".format(k))
        if data is None:
            data = self.test_standard

        if func is None:
            means = KMeans(n_clusters=k, random_state=0).fit_predict(data)
            return means

        return func(data)
        
      

In [4]:
# Build the pipeline: encodes the data, splits it, fits the classifier and the explainer.
gener = GenerateExplanations(
    data=data,
    classifier=classifier,
    explainer=explainer,
    feature_names=feature_names,
    categorical_features=categorical_features,
)

Note (tail of the scikit-learn FutureWarning printed by the cell above): if you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, you can now use the OneHotEncoder directly.


In [5]:
# Array copies of the held-out split: raw rows, column-normalised rows, and labels.
instances, instances_standard, labels = (
    np.array(gener.test),
    np.array(gener.test_standard),
    np.array(gener.labels_test),
)

# Filled by the following cells, one explanation per test instance.
explanations, explanations_standard = [], []
    

In [6]:
# One explanation (14 features requested) per raw test row, appended in row order.
explanations.extend(gener.explain_with_lime(row, 14) for row in instances)


In [8]:
# One explanation per row of `instances_standard`, appended in row order.
# NOTE(review): the loop is sized by `instances_standard` but explains the raw
# `instances` row at the same position — confirm whether the normalised row was
# meant here, or whether this simply recomputes the explanations above.
for idx, _ in enumerate(instances_standard):
    explanations_standard.append(gener.explain_with_lime(instances[idx], 14))


In [9]:
# Keep only the second element of every explanation entry, as a float.
# The inner lists are rewritten in place, so this cell cannot be run twice:
# on a second run the entries are already floats and indexing them fails.
for weights in explanations:
    weights[:] = [float(entry[1]) for entry in weights]

for weights in explanations_standard:
    weights[:] = [float(entry[1]) for entry in weights]

# Column-wise normalisation of the weight matrix (abs(value) / sum of abs per column).
# NOTE(review): this treats position j as the same feature in every explanation;
# LIME's as_list() appears to order entries by weight magnitude — confirm the columns line up.
explanations_standard = np.array(gener.normalize(np.array(explanations_standard)))
