In [1]:
import pandas as pd
from matplotlib import pyplot as plt
import numpy as np
import time
from datetime import datetime
#from Wisard import Wisard
import wisardpkg as wp
import math
import sys

In [2]:
params = {}

In [3]:
# The preprocessing part is partialy based on the following "kernel" on Kaggle:
# https://www.kaggle.com/jgoldberg/donorschoose-eda-text-classification/notebook

def preprocess(training_dataframe, resources_dataframe):
    #print(training_dataframe.shape)
    #print(resources_dataframe.shape)
    
    #
    total_price = resources_dataframe.quantity * resources_dataframe.price
    resources_dataframe["total_price"] = total_price
    
    # dropping irrelevant columns
    resources_dataframe = resources_dataframe.drop(["description", "price"], axis=1)
    training_dataframe = training_dataframe.drop(["teacher_id"], axis=1)
    
    # grouping resources data by id
    grouped_resources_dataframe = resources_dataframe.groupby("id", as_index=False, sort=False).sum()
    grouped_resources_dataframe
    
    # merging the two dataframes
    cleaned_df = pd.merge(training_dataframe, grouped_resources_dataframe, how="inner", on=["id"])
    
    # splitting project categories
    
    cleaned_df[['category_1','category_2']] = cleaned_df['project_subject_categories'].str.replace(", Care & Hunger", "").str.split(', ', 3, expand=True)
    #print(cleaned_df['project_subject_categories'].str.replace(", Care & Hunger", "").str.split(', ', 3, expand=True))
    
    cleaned_df[['subcategory_1','subcategory_2']] = cleaned_df['project_subject_subcategories'].str.replace(", Care & Hunger", "").str.split(', ', 3, expand=True)
    
    #cleaned_df["category_1"] = cleaned_df["category_1"].fillna("Not Informed")
    cleaned_df["category_2"] = cleaned_df["category_2"].fillna("Not Informed")
    cleaned_df["subcategory_2"] = cleaned_df["subcategory_2"].fillna("Not Informed")
    
    cleaned_df["total_price_category"] = pd.cut(
        cleaned_df["total_price"], 
        bins=[0,100,250,500,1000,16000], 
        labels=["0-100","101-250","251-500","501-1000",">1000"]
    )
    
    cleaned_df["n_previous_projects"] = pd.cut(
        cleaned_df["teacher_number_of_previously_posted_projects"],
        bins=[-1,1,5,10,25,50,500],
        labels=['0-1','2-5','6-10','11-25','26-50','51+']
    )
    
    cleaned_df["project_submitted_datetime"] = pd.to_datetime(cleaned_df['project_submitted_datetime'])
    cleaned_df["month"] = cleaned_df['project_submitted_datetime'].dt.month
    cleaned_df["quarter"] = cleaned_df['project_submitted_datetime'].dt.quarter
    
    cleaned_df["teacher_prefix"] = cleaned_df["teacher_prefix"].fillna("unknown")
    
    cleaned_df["project_essay_1"] = cleaned_df["project_essay_1"].fillna("")
    cleaned_df["project_essay_2"] = cleaned_df["project_essay_2"].fillna("")
    cleaned_df["project_essay_3"] = cleaned_df["project_essay_3"].fillna("")
    cleaned_df["project_essay_4"] = cleaned_df["project_essay_4"].fillna("")
    
    #cleaned_df["merged_essays"] = cleaned_df['project_title'].astype(str) + " " + cleaned_df['project_essay_1'].astype(str) + " " + cleaned_df['project_essay_2'].astype(str) + " " + cleaned_df['project_essay_3'].astype(str) + " " + cleaned_df['project_essay_4'].astype(str)
    cleaned_df["merged_essays"] = cleaned_df['project_essay_1'].astype(str) + " " + cleaned_df['project_essay_2'].astype(str) + " " + cleaned_df['project_essay_3'].astype(str) + " " + cleaned_df['project_essay_4'].astype(str)
    
    # dropping more columns
    cleaned_df = cleaned_df.drop([
        "project_submitted_datetime", 
        "project_essay_1", 
        "project_essay_2", 
        "project_essay_3", 
        "project_essay_4",
        "quantity",
        "total_price",
        "teacher_number_of_previously_posted_projects"], 
        axis=1
    )
    
    return cleaned_df


def get_bits_string(value_order, encoding, length, thershold = None):
    if (value_order <= 0) or (value_order > length):
        print("Value order should be 0 < value_order <= length.")
        #sys.exit(1)
        exit()
    
    string = ""
    
    # 2 = 0001, if length = 4
    if encoding == "standard": 
        if (value_order - 1) > math.pow(2, length):
            print("It is not possible to represent " + str(value_order) + " different values with " + str(length) + " bits.")
        #sys.exit(1)
        exit()
        
        string = ("{0:0" + str(length) + "b}").format(value_order - 1)
        
    # 2 = 1100, if length = 4
    elif encoding == "thermometer": 
        string = "1" * value_order + "0" * (length - value_order)
    
    # 2 = 0100, if length = 4
    elif encoding == "onehot":
        string = "0" * length
        string = list(string)
        string[value_order - 1] = "1"
        string = "".join(string)
        
    elif encoding == "threshold":
        if threshold != None:
            if value_order > threshold:
                string = "1"
            else:
                string = "0"
        else:
            print("Threshold not provided.")
            #sys.exit(1)
            exit()
    
    else:
        print("Invalid encoding: " + str(encoding) + ".\nValid encodings are \"standard\", \"thermometer\", \"onehot\" and \"threshold.\".")
        #sys.exit(1)
        exit()
        
    return string

def choose_field_size(minimum_size, expected_size):
    if minimum_size > expected_size:
        return minimum_size
    else:
        return expected_size
    

def convert_to_bits_string(dataframe, params):
    print("Converting dataframe of shape ", dataframe.shape, " to a list of binary values.")
    
    project_grade_category_mapping = { # 4
        'Grades PreK-2':  get_bits_string(1, params["encoding"], choose_field_size(4, params["field_size"]), params["encoding_threshold"]),
        'Grades 3-5':     get_bits_string(2, params["encoding"], choose_field_size(4, params["field_size"]), params["encoding_threshold"]), 
        'Grades 6-8':     get_bits_string(3, params["encoding"], choose_field_size(4, params["field_size"]), params["encoding_threshold"]), 
        'Grades 9-12':    get_bits_string(4, params["encoding"], choose_field_size(4, params["field_size"]), params["encoding_threshold"])
    }

    teacher_prefix_mapping = { # 6
        'Ms.':     get_bits_string(1, params["encoding"], choose_field_size(6, params["field_size"]), params["encoding_threshold"]), 
        'Mrs.':    get_bits_string(2, params["encoding"], choose_field_size(6, params["field_size"]), params["encoding_threshold"]), 
        'Mr.':     get_bits_string(3, params["encoding"], choose_field_size(6, params["field_size"]), params["encoding_threshold"]), 
        'Teacher': get_bits_string(4, params["encoding"], choose_field_size(6, params["field_size"]), params["encoding_threshold"]), 
        'Dr.':     get_bits_string(5, params["encoding"], choose_field_size(6, params["field_size"]), params["encoding_threshold"]), 
        'unknown': get_bits_string(6, params["encoding"], choose_field_size(6, params["field_size"]), params["encoding_threshold"])
    }

    n_previous_projects_mapping = { # 6
        '0-1':     get_bits_string(1, params["encoding"], choose_field_size(6, params["field_size"]), params["encoding_threshold"]),
        '2-5':     get_bits_string(2, params["encoding"], choose_field_size(6, params["field_size"]), params["encoding_threshold"]),
        '6-10':    get_bits_string(3, params["encoding"], choose_field_size(6, params["field_size"]), params["encoding_threshold"]),
        '11-25':   get_bits_string(4, params["encoding"], choose_field_size(6, params["field_size"]), params["encoding_threshold"]),
        '26-50':   get_bits_string(5, params["encoding"], choose_field_size(6, params["field_size"]), params["encoding_threshold"]),
        '51+':     get_bits_string(6, params["encoding"], choose_field_size(6, params["field_size"]), params["encoding_threshold"])
    }

    total_price_category_mapping = { # 6
        "0-100":     get_bits_string(1, params["encoding"], choose_field_size(6, params["field_size"]), params["encoding_threshold"]),
        "101-250":   get_bits_string(2, params["encoding"], choose_field_size(6, params["field_size"]), params["encoding_threshold"]),
        "251-500":   get_bits_string(3, params["encoding"], choose_field_size(6, params["field_size"]), params["encoding_threshold"]),
        "501-1000":  get_bits_string(4, params["encoding"], choose_field_size(6, params["field_size"]), params["encoding_threshold"]),
        ">1000":     get_bits_string(5, params["encoding"], choose_field_size(6, params["field_size"]), params["encoding_threshold"])
    }
    
    month_mapping = { # 12
        "1":  get_bits_string(1, params["encoding"], choose_field_size(12, params["field_size"]), params["encoding_threshold"]),
        "2":  get_bits_string(2, params["encoding"], choose_field_size(12, params["field_size"]), params["encoding_threshold"]),
        "3":  get_bits_string(3, params["encoding"], choose_field_size(12, params["field_size"]), params["encoding_threshold"]),
        "4":  get_bits_string(4, params["encoding"], choose_field_size(12, params["field_size"]), params["encoding_threshold"]),
        "5":  get_bits_string(5, params["encoding"], choose_field_size(12, params["field_size"]), params["encoding_threshold"]),
        "6":  get_bits_string(6, params["encoding"], choose_field_size(12, params["field_size"]), params["encoding_threshold"]),
        "7":  get_bits_string(7, params["encoding"], choose_field_size(12, params["field_size"]), params["encoding_threshold"]),
        "8":  get_bits_string(8, params["encoding"], choose_field_size(12, params["field_size"]), params["encoding_threshold"]),
        "9":  get_bits_string(9, params["encoding"], choose_field_size(12, params["field_size"]), params["encoding_threshold"]),
        "10": get_bits_string(10, params["encoding"], choose_field_size(12, params["field_size"]), params["encoding_threshold"]),
        "11": get_bits_string(11, params["encoding"], choose_field_size(12, params["field_size"]), params["encoding_threshold"]),
        "12": get_bits_string(12, params["encoding"], choose_field_size(12, params["field_size"]), params["encoding_threshold"])
    }
    
    quarter_mapping = { # 4
        "1": get_bits_string(1, params["encoding"], choose_field_size(4, params["field_size"]), params["encoding_threshold"]),
        "2": get_bits_string(2, params["encoding"], choose_field_size(4, params["field_size"]), params["encoding_threshold"]),
        "3": get_bits_string(3, params["encoding"], choose_field_size(4, params["field_size"]), params["encoding_threshold"]),
        "4": get_bits_string(4, params["encoding"], choose_field_size(4, params["field_size"]), params["encoding_threshold"])
    }
    
    category_mapping = { # 10
        "Not Informed":          get_bits_string(1, params["encoding"], choose_field_size(10, params["field_size"]), params["encoding_threshold"]),
        "Applied Learning":      get_bits_string(2, params["encoding"], choose_field_size(10, params["field_size"]), params["encoding_threshold"]),
        "Health & Sports":       get_bits_string(3, params["encoding"], choose_field_size(10, params["field_size"]), params["encoding_threshold"]),
        "History & Civics":      get_bits_string(4, params["encoding"], choose_field_size(10, params["field_size"]), params["encoding_threshold"]),
        "Literacy & Language":   get_bits_string(5, params["encoding"], choose_field_size(10, params["field_size"]), params["encoding_threshold"]),
        "Math & Science":        get_bits_string(6, params["encoding"], choose_field_size(10, params["field_size"]), params["encoding_threshold"]),
        "Music & The Arts":      get_bits_string(7, params["encoding"], choose_field_size(10, params["field_size"]), params["encoding_threshold"]),
        "Special Needs":         get_bits_string(8, params["encoding"], choose_field_size(10, params["field_size"]), params["encoding_threshold"]),
        "Warmth":                get_bits_string(9, params["encoding"], choose_field_size(10, params["field_size"]), params["encoding_threshold"]),
        "Care & Hunger":         get_bits_string(9, params["encoding"], choose_field_size(10, params["field_size"]), params["encoding_threshold"]), # Equals to warmth, because they are the same thing
    }
    
    subcategory_mapping = { # 30
        "Not Informed":          get_bits_string(1, params["encoding"], choose_field_size(30, params["field_size"]), params["encoding_threshold"]),
        "Literacy":              get_bits_string(2, params["encoding"], choose_field_size(30, params["field_size"]), params["encoding_threshold"]),
        "Performing Arts":       get_bits_string(3, params["encoding"], choose_field_size(30, params["field_size"]), params["encoding_threshold"]),
        "Applied Sciences":      get_bits_string(4, params["encoding"], choose_field_size(30, params["field_size"]), params["encoding_threshold"]),
        "Health & Wellness":     get_bits_string(5, params["encoding"], choose_field_size(30, params["field_size"]), params["encoding_threshold"]),
        "Character Education":   get_bits_string(6, params["encoding"], choose_field_size(30, params["field_size"]), params["encoding_threshold"]),
        "Early Development":     get_bits_string(7, params["encoding"], choose_field_size(30, params["field_size"]), params["encoding_threshold"]),
        "Mathematics":           get_bits_string(8, params["encoding"], choose_field_size(30, params["field_size"]), params["encoding_threshold"]),
        "Literature & Writing":  get_bits_string(9, params["encoding"], choose_field_size(30, params["field_size"]), params["encoding_threshold"]),
        "Special Needs":         get_bits_string(10, params["encoding"], choose_field_size(30, params["field_size"]), params["encoding_threshold"]), 
        "ESL":                   get_bits_string(11, params["encoding"], choose_field_size(30, params["field_size"]), params["encoding_threshold"]), 
        "Health & Life Science": get_bits_string(12, params["encoding"], choose_field_size(30, params["field_size"]), params["encoding_threshold"]), 
        "College & Career Prep": get_bits_string(13, params["encoding"], choose_field_size(30, params["field_size"]), params["encoding_threshold"]), 
        "Environmental Science": get_bits_string(14, params["encoding"], choose_field_size(30, params["field_size"]), params["encoding_threshold"]), 
        "Other":                 get_bits_string(15, params["encoding"], choose_field_size(30, params["field_size"]), params["encoding_threshold"]), 
        "Music":                 get_bits_string(16, params["encoding"], choose_field_size(30, params["field_size"]), params["encoding_threshold"]), 
        "Visual Arts":           get_bits_string(17, params["encoding"], choose_field_size(30, params["field_size"]), params["encoding_threshold"]), 
        "History & Geography":   get_bits_string(18, params["encoding"], choose_field_size(30, params["field_size"]), params["encoding_threshold"]), 
        "Gym & Fitness":         get_bits_string(19, params["encoding"], choose_field_size(30, params["field_size"]), params["encoding_threshold"]), 
        "Warmth":                get_bits_string(20, params["encoding"], choose_field_size(30, params["field_size"]), params["encoding_threshold"]), 
        "Extracurricular":       get_bits_string(21, params["encoding"], choose_field_size(30, params["field_size"]), params["encoding_threshold"]), 
        "Team Sports":           get_bits_string(22, params["encoding"], choose_field_size(30, params["field_size"]), params["encoding_threshold"]), 
        "Social Sciences":       get_bits_string(23, params["encoding"], choose_field_size(30, params["field_size"]), params["encoding_threshold"]), 
        "Foreign Languages":     get_bits_string(24, params["encoding"], choose_field_size(30, params["field_size"]), params["encoding_threshold"]), 
        "Parent Involvement":    get_bits_string(25, params["encoding"], choose_field_size(30, params["field_size"]), params["encoding_threshold"]), 
        "Nutrition Education":   get_bits_string(26, params["encoding"], choose_field_size(30, params["field_size"]), params["encoding_threshold"]), 
        "Community Service":     get_bits_string(27, params["encoding"], choose_field_size(30, params["field_size"]), params["encoding_threshold"]), 
        "Financial Literacy":    get_bits_string(28, params["encoding"], choose_field_size(30, params["field_size"]), params["encoding_threshold"]), 
        "Civics & Government":   get_bits_string(29, params["encoding"], choose_field_size(30, params["field_size"]), params["encoding_threshold"]), 
        "Economics":             get_bits_string(30, params["encoding"], choose_field_size(30, params["field_size"]), params["encoding_threshold"]), 
        
    }
    
    school_state_mapping = { # 52 (51 possible values)
        'NV': get_bits_string(1, params["encoding"], choose_field_size(52, params["field_size"]), params["encoding_threshold"]), 
        'GA': get_bits_string(2, params["encoding"], choose_field_size(52, params["field_size"]), params["encoding_threshold"]),
        'UT': get_bits_string(3, params["encoding"], choose_field_size(52, params["field_size"]), params["encoding_threshold"]), 
        'NC': get_bits_string(4, params["encoding"], choose_field_size(52, params["field_size"]), params["encoding_threshold"]), 
        'CA': get_bits_string(5, params["encoding"], choose_field_size(52, params["field_size"]), params["encoding_threshold"]), 
        'DE': get_bits_string(6, params["encoding"], choose_field_size(52, params["field_size"]), params["encoding_threshold"]), 
        'MO': get_bits_string(7, params["encoding"], choose_field_size(52, params["field_size"]), params["encoding_threshold"]), 
        'SC': get_bits_string(8, params["encoding"], choose_field_size(52, params["field_size"]), params["encoding_threshold"]), 
        'IN': get_bits_string(9, params["encoding"], choose_field_size(52, params["field_size"]), params["encoding_threshold"]), 
        'IL': get_bits_string(10, params["encoding"], choose_field_size(52, params["field_size"]), params["encoding_threshold"]), 
        'VA': get_bits_string(11, params["encoding"], choose_field_size(52, params["field_size"]), params["encoding_threshold"]),
        'PA': get_bits_string(12, params["encoding"], choose_field_size(52, params["field_size"]), params["encoding_threshold"]), 
        'NY': get_bits_string(13, params["encoding"], choose_field_size(52, params["field_size"]), params["encoding_threshold"]), 
        'FL': get_bits_string(14, params["encoding"], choose_field_size(52, params["field_size"]), params["encoding_threshold"]), 
        'NJ': get_bits_string(15, params["encoding"], choose_field_size(52, params["field_size"]), params["encoding_threshold"]), 
        'TX': get_bits_string(16, params["encoding"], choose_field_size(52, params["field_size"]), params["encoding_threshold"]), 
        'LA': get_bits_string(17, params["encoding"], choose_field_size(52, params["field_size"]), params["encoding_threshold"]), 
        'ID': get_bits_string(18, params["encoding"], choose_field_size(52, params["field_size"]), params["encoding_threshold"]), 
        'OH': get_bits_string(19, params["encoding"], choose_field_size(52, params["field_size"]), params["encoding_threshold"]), 
        'OR': get_bits_string(20, params["encoding"], choose_field_size(52, params["field_size"]), params["encoding_threshold"]), 
        'MD': get_bits_string(21, params["encoding"], choose_field_size(52, params["field_size"]), params["encoding_threshold"]), 
        'WA': get_bits_string(22, params["encoding"], choose_field_size(52, params["field_size"]), params["encoding_threshold"]),
        'MA': get_bits_string(23, params["encoding"], choose_field_size(52, params["field_size"]), params["encoding_threshold"]), 
        'KY': get_bits_string(24, params["encoding"], choose_field_size(52, params["field_size"]), params["encoding_threshold"]), 
        'AZ': get_bits_string(25, params["encoding"], choose_field_size(52, params["field_size"]), params["encoding_threshold"]), 
        'MI': get_bits_string(26, params["encoding"], choose_field_size(52, params["field_size"]), params["encoding_threshold"]), 
        'CT': get_bits_string(27, params["encoding"], choose_field_size(52, params["field_size"]), params["encoding_threshold"]), 
        'AR': get_bits_string(28, params["encoding"], choose_field_size(52, params["field_size"]), params["encoding_threshold"]), 
        'WV': get_bits_string(29, params["encoding"], choose_field_size(52, params["field_size"]), params["encoding_threshold"]), 
        'NM': get_bits_string(30, params["encoding"], choose_field_size(52, params["field_size"]), params["encoding_threshold"]), 
        'WI': get_bits_string(31, params["encoding"], choose_field_size(52, params["field_size"]), params["encoding_threshold"]), 
        'MN': get_bits_string(32, params["encoding"], choose_field_size(52, params["field_size"]), params["encoding_threshold"]), 
        'OK': get_bits_string(33, params["encoding"], choose_field_size(52, params["field_size"]), params["encoding_threshold"]),
        'AL': get_bits_string(34, params["encoding"], choose_field_size(52, params["field_size"]), params["encoding_threshold"]), 
        'TN': get_bits_string(35, params["encoding"], choose_field_size(52, params["field_size"]), params["encoding_threshold"]), 
        'IA': get_bits_string(36, params["encoding"], choose_field_size(52, params["field_size"]), params["encoding_threshold"]), 
        'KS': get_bits_string(37, params["encoding"], choose_field_size(52, params["field_size"]), params["encoding_threshold"]), 
        'CO': get_bits_string(38, params["encoding"], choose_field_size(52, params["field_size"]), params["encoding_threshold"]), 
        'DC': get_bits_string(39, params["encoding"], choose_field_size(52, params["field_size"]), params["encoding_threshold"]), 
        'WY': get_bits_string(40, params["encoding"], choose_field_size(52, params["field_size"]), params["encoding_threshold"]), 
        'NH': get_bits_string(41, params["encoding"], choose_field_size(52, params["field_size"]), params["encoding_threshold"]), 
        'HI': get_bits_string(42, params["encoding"], choose_field_size(52, params["field_size"]), params["encoding_threshold"]), 
        'SD': get_bits_string(43, params["encoding"], choose_field_size(52, params["field_size"]), params["encoding_threshold"]), 
        'MT': get_bits_string(44, params["encoding"], choose_field_size(52, params["field_size"]), params["encoding_threshold"]),
        'MS': get_bits_string(45, params["encoding"], choose_field_size(52, params["field_size"]), params["encoding_threshold"]), 
        'RI': get_bits_string(46, params["encoding"], choose_field_size(52, params["field_size"]), params["encoding_threshold"]), 
        'VT': get_bits_string(47, params["encoding"], choose_field_size(52, params["field_size"]), params["encoding_threshold"]), 
        'ME': get_bits_string(48, params["encoding"], choose_field_size(52, params["field_size"]), params["encoding_threshold"]), 
        'NE': get_bits_string(49, params["encoding"], choose_field_size(52, params["field_size"]), params["encoding_threshold"]), 
        'AK': get_bits_string(50, params["encoding"], choose_field_size(52, params["field_size"]), params["encoding_threshold"]), 
        'ND': get_bits_string(51, params["encoding"], choose_field_size(52, params["field_size"]), params["encoding_threshold"])
    }
    
    combined_input_and_expected_output = []
    input_list = []
    expected_output_list = []
    
    n = 0
    for index, row in dataframe.iterrows():
        #print(row)
        #if n >= 10:
        #    break
        #n = n + 1
        
        # total 128 bits
        bits_string = ""
        bits_string = project_grade_category_mapping[row["project_grade_category"]]
        #bits_string = bits_string + teacher_prefix_mapping[row["teacher_prefix"]]
        bits_string = bits_string + n_previous_projects_mapping[row["n_previous_projects"]]
        bits_string = bits_string + total_price_category_mapping[row["total_price_category"]]
        
        #bits_string = bits_string + month_mapping[str(row["month"])]
        #bits_string = bits_string + quarter_mapping[str(row["quarter"])]
        #bits_string = bits_string + category_mapping[row["category_1"]]
        #bits_string = bits_string + category_mapping[row["category_2"]] # perhaps it is possible to ignore this one
        
        #bits_string = bits_string + subcategory_mapping[row["subcategory_1"]]
        #bits_string = bits_string + subcategory_mapping[row["subcategory_2"]]
        #bits_string = bits_string + school_state_mapping[row["school_state"]]
        
        bit_int_list = [int(c) for c in bits_string]
        expected_output = str(row["project_is_approved"])
        
        input_list.append(bit_int_list)
        expected_output_list.append(expected_output)
        
        combined_input_and_expected_output.append([bit_int_list, expected_output])
        
    return input_list, expected_output_list, combined_input_and_expected_output

In [4]:
def loadData():
    # load data
    train_file_path = 'train.csv'
    # Test data file not considered because it doesn't contain the classes of the entries
    #test_file_path = 'test.csv'
    resources_file_path = 'resources.csv'

    # Read data and store in DataFrame
    train_data = pd.read_csv(train_file_path, sep=',')
    #test_data = pd.read_csv(test_file_path, sep=',')
    resources_data = pd.read_csv(resources_file_path, sep=',')
    
    return train_data, resources_data

# splitting the training dataset into training and test, because the official test dataset
# doesn't have the entries' classification, requiring validation with Kaggle's website
def splitData(train_data, resources_data, training_set_total_aproved, training_set_total_reproved):
    
    print("Total data: ", len(train_data))
    print("Total aproved: ", train_data["project_is_approved"].sum())
    print("Total reproved: ", len(train_data) - train_data["project_is_approved"].sum())
    print("Percent aproved: ", float(train_data["project_is_approved"].sum()) / float(len(train_data)))
    print("Percent reproved: ", 1.0 - (float(train_data["project_is_approved"].sum()) / float(len(train_data))), "\n")

    #train = train_data.sample(n=10000,random_state=200)
    train = train_data.sample(frac=1,random_state=200)
    print("Distribution over a random sample of 182080 observations used to get the observations to train the classifier: ",
          float(train["project_is_approved"].sum()) / float(len(train["project_is_approved"])))
    print("Total aproved in that sample: ", train["project_is_approved"].sum(), "\n")
    
    aproved = train[train["project_is_approved"] == 1][:training_set_total_aproved]
    reproved = train[train["project_is_approved"] == 0][:training_set_total_reproved]

    training_set = pd.concat([aproved, reproved])
    training_set = training_set.sample(frac=1, random_state=200)
    test_set = train_data.drop(training_set.index)

    print("Total training data: ", len(training_set))
    print("Total aproved: ", training_set["project_is_approved"].sum())
    print("Total reproved: ", len(training_set) - training_set["project_is_approved"].sum())
    print("Percent aproved: ", float(training_set["project_is_approved"].sum()) / float(len(training_set)))
    print("Percent reproved: ", 1.0 - (float(training_set["project_is_approved"].sum()) / float(len(training_set))), "\n")

    print("Total test data: ", len(test_set))
    print("Total aproved: ", test_set["project_is_approved"].sum())
    print("Total reproved: ", len(test_set) - test_set["project_is_approved"].sum())
    print("Percent aproved: ", float(test_set["project_is_approved"].sum()) / float(len(test_set)))
    print("Percent reproved: ", 1.0 - (float(test_set["project_is_approved"].sum()) / float(len(test_set))), "\n")

    print("Training set + test set: ", len(training_set) + len(test_set))

    return training_set, test_set


def getData(training_set_total_aproved, training_set_total_reproved, params):
    train_data, resources_data = loadData()
    training_set, test_set = splitData(train_data, resources_data, training_set_total_aproved, training_set_total_reproved)
    
    training_df = preprocess(training_set, resources_data)
    test_df = preprocess(test_set, resources_data)
    
    training_input, expected_output, training_combined = convert_to_bits_string(training_df, params)
    test_input, test_expected_output, test_combined = convert_to_bits_string(test_df, params)
    
    return training_input, expected_output, training_combined, test_input, test_expected_output, test_combined, training_set, test_set

In [5]:
# Trains using a WiSARD classifier
# Using personal implementation, without bleaching
def train(algorithm, training_input, expected_output, params, tuple_size = 2, bleaching = False, seed = 0):
    #wann = Wisard(tuple_size, 3546, bleaching)
    #wann.train(training_input, expected_output)
    
    wann = None
    
    if algorithm == "wisard":
        print("Algorithm: WiSARD.")
        wann = wp.Wisard(tuple_size, bleachingActivated=bleaching, ignoreZero=False, verbose=True)
        wann.train(training_input,expected_output)
    elif algorithm == "cluswisard":
        print("Algorithm: ClusWiSARD.")
        wann = wp.ClusWisard(tuple_size, params["min_score"], params["threshold"], params["discriminator_limit"])
        wann.train(training_input,expected_output)
    
    
    return wann

In [6]:
#Evaluates Guilherme's wisard implementation
def evaluate_performance(wann, test_data_combined, current_params):
    #print("Number of observations: ", test_data_combined)
    correct_predictions = 0
    wrong_predictions = 0
    zeros_predicted = 0
    ones_predicted = 0
    zeros_correct = 0
    ones_correct = 0
    zeros_wrong = 0
    ones_wrong = 0
    total_ties = 0
    ones_total_ties = 0
    zeros_total_ties = 0
    avg_time = time.time()
    
    for combined in test_data_combined:
        #prediction, tie = wann.classify([combined[0]], returnActivationDegree=True, returnConfidence=True)
        prediction = wann.classify([combined[0]])#, searchBestConfidence = False)#, returnActivationDegree=True, returnConfidence=True)
        #print(prediction)
        prediction = prediction[0]#["class"]
        tie = 0
        
        if prediction == "0":
            #print("Prediction: ", prediction[0], combined)
            zeros_predicted = zeros_predicted + 1
        elif prediction == "1":
            ones_predicted = ones_predicted + 1
        #print(prediction)
        
        expected = combined[1]
        #print(prediction, expected)
        if prediction == expected:
            #print("Correct!")
            correct_predictions = correct_predictions + 1
            
            if prediction == "0":
                zeros_correct = zeros_correct + 1
            elif prediction == "1":
                ones_correct = ones_correct + 1
        else:
            wrong_predictions = wrong_predictions + 1
            
            if prediction == "0":
                zeros_wrong = zeros_wrong + 1
            elif prediction == "1":
                ones_wrong = ones_wrong + 1
                
        if tie:
            total_ties = total_ties + 1
            if expected == "0":
                zeros_total_ties = zeros_total_ties + 1
            elif expected == "1":
                ones_total_ties = ones_total_ties + 1
                
    avg_time = float(time.time() - avg_time) / float(len(test_data_combined)) 
    
    print("Number of observations: ", len(test_data_combined))
    print("Predicted correctly: ", correct_predictions)
    print("Predicted wrongly: ", wrong_predictions)
    print("Predicted zeros: ", zeros_predicted)
    print("Predicted ones: ", ones_predicted)
    print("Zeros correct: ", zeros_correct)
    print("Ones correct: ", ones_correct)
    print("Zeros wrong: ", zeros_wrong)
    print("Ones Wrong: ", ones_wrong)
    print("Total ties: ", total_ties)
    print("Zeros total ties: ", zeros_total_ties)
    print("Ones total ties: ", ones_total_ties)
    print("Avg. Time: ", avg_time, " seconds.")
    return correct_predictions, [
        len(test_data_combined), correct_predictions, wrong_predictions, zeros_predicted, ones_predicted,
        zeros_correct, ones_correct, zeros_wrong, ones_wrong, total_ties, zeros_total_ties, ones_total_ties,
        avg_time
    ]

#Evaluates Firminos's wisard implementation
def evaluate_performance2(w, test_data_combined):
    #print("Number of observations: ", test_data_combined)
    correct_predictions = 0
    wrong_predictions = 0
    zeros_predicted = 0
    ones_predicted = 0
    zeros_correct = 0
    ones_correct = 0
    zeros_wrong = 0
    ones_wrong = 0
    avg_time = time.time()
    
    for combined in test_data_combined:
        prediction = w.predict([combined[0]])
        if prediction[0] == "0":
            #print("Prediction: ", prediction[0], combined)
            zeros_predicted = zeros_predicted + 1
        elif prediction[0] == "1":
            ones_predicted = ones_predicted + 1
        #print(prediction)
        expected = combined[1]
        #print(prediction, expected)
        if prediction[0] == expected:
            #print("Correct!")
            correct_predictions = correct_predictions + 1
            
            if prediction[0] == "0":
                zeros_correct = zeros_correct + 1
            elif prediction[0] == "1":
                ones_correct = ones_correct + 1
        else:
            wrong_predictions = wrong_predictions + 1
            
            if prediction[0] == "0":
                zeros_wrong = zeros_wrong + 1
            elif prediction[0] == "1":
                ones_wrong = ones_wrong + 1
                
    avg_time = float(time.time() - avg_time) / float(len(test_data_combined)) 
    
    print("Number of observations: ", len(test_data_combined))
    print("Predicted correctly: ", correct_predictions)
    print("Predicted wrongly: ", wrong_predictions)
    print("Predicted zeros: ", zeros_predicted)
    print("Predicted ones: ", ones_predicted)
    print("Zeros correct: ", zeros_correct)
    print("Ones correct: ", ones_correct)
    print("Zeros wrong: ", zeros_wrong)
    print("Ones Wrong: ", ones_wrong)
    print("Avg. Time: ", avg_time, " seconds.")
    return correct_predictions, [
        len(test_data_combined), correct_predictions, wrong_predictions, zeros_predicted, ones_predicted,
        zeros_correct, ones_correct, zeros_wrong, ones_wrong, avg_time
    ]

In [7]:
def write_result_line(algorithm, seed, encoding_type, field_size, training_set_distribuition, a_tuple_size, bleaching,
                     training_time, in_sample_additional_info, in_sample_evaluation_time, out_sample_additional_info,
                     out_sample_evaluation_time, in_sample_performance, training_combined, training_set, out_sample_performance,
                     test_combined, test_set, output_file):
    line = ""
    line_contents = [
        # Algorithm
        str(algorithm) + ";",
        # Seed
        str(seed) + ";",
        # Encoding type
        str(encoding_type) + ";",
        # Field size
        str(field_size) + ";",
        # training / test
        str(training_set_distribuition[0]) + "/" + str(training_set_distribuition[1]) + ";",
        # tuple size
        str(a_tuple_size) + ";",
        # bleaching active or not
        str(bleaching) + ";",
        # training time in seconds
        str(training_time) + ";",
        # in sample pattern average evaluation time in seconds
        str(in_sample_additional_info[12]) + ";",
        # in sample total evaluation time in seconds
        str(in_sample_evaluation_time) + ";",
        # out sample pattern average evaluation time in seconds
        str(out_sample_additional_info[12]) + ";",
        # out sample total evaluation time in seconds
        str(out_sample_evaluation_time) + ";",

        # total traning observations
        str(training_set_distribuition[0] + training_set_distribuition[1]) + ";",
        # total of correct prediction in the training dataset
        str(in_sample_performance) + ";",
        # percentage of right answers
        str(float(in_sample_performance) / float(len(training_combined))) + ";",
        # total approved in the training dataset
        str(training_set["project_is_approved"].sum()) + ";",
        # total approved correctly predicted in the training dataset
        str(in_sample_additional_info[6]) + ";",
        # total approved wrongly predicted in the training dataset
        str(in_sample_additional_info[8]) + ";",
        # percentage of approved projects predicted correctly in the training dataset
        str(float(in_sample_additional_info[6]) / float(training_set["project_is_approved"].sum())) + ";",
        # total reproved in the training dataset
        str((training_set["project_is_approved"].sum() - len(training_set["project_is_approved"])) * -1) + ";",
        # total reproved correctly predicted in the training dataset
        str(in_sample_additional_info[5]) + ";",
        # total reproved wrongly predicted in the training dataset
        str(in_sample_additional_info[7]) + ";",
        # percentage of reproved projects predicted correctly in the training dataset
        str(float(in_sample_additional_info[5]) / float((training_set["project_is_approved"].sum() - len(training_set["project_is_approved"])) * -1)) + ";",


        # total test observations
        str(len(test_set["project_is_approved"])) + ";",
        # total of correct prediction in the test dataset
        str(out_sample_performance) + ";",
        # percentage of right answers
        str(float(out_sample_performance) / float(len(test_combined))) + ";",
        # total approved in the test dataset
        str(test_set["project_is_approved"].sum()) + ";",
        # total approved correctly predicted in the test dataset
        str(out_sample_additional_info[6]) + ";",
        # total approved wrongly predicted in the test dataset
        str(out_sample_additional_info[8]) + ";",
        # percentage of approved projects predicted correctly in the test dataset
        str(float(out_sample_additional_info[6]) / float(test_set["project_is_approved"].sum())) + ";",
        # total reproved in the test dataset
        str((test_set["project_is_approved"].sum() - len(test_set["project_is_approved"])) * -1) + ";",
        # total reproved correctly predicted in the training dataset
        str(out_sample_additional_info[5]) + ";",
        # total reproved wrongly predicted in the training dataset
        str(out_sample_additional_info[7]) + ";",
        # percentage of reproved projects predicted correctly in the training dataset
        str(float(out_sample_additional_info[5]) / float((test_set["project_is_approved"].sum() - len(test_set["project_is_approved"])) * -1)) + ";",

        # total ties
        str(out_sample_additional_info[9]) + ";",
        # total ties when prediction should have been zero
        str(out_sample_additional_info[10]) + ";",
        # total ties when prediction should have been one
        str(out_sample_additional_info[11]) + ";",

        "\n",
        #str() + ";",
    ]

    for content in line_contents:
        #print(content)
        line = line + content

    print(line)

    file = open(output_file, "a+")
    file.write(line)
    file.close()

In [8]:
def experiment(params): #training_set_distribuitions, tuple_sizes, bleaching_mode = [False]):
    output_file = "insights/results_experiment" + datetime.now().strftime('%Y%m%d%H%M%S') + ".csv"
    file = open(output_file, "w")
    file.write("algorithm;seed;encoding;field_size;data_distribution;tuple_size;bleaching_active;total_training_time;avg_in_sample_evaluation_time;total_in_sample_evaluation_time;avg_out_sample_evaluation_time;total_out_sample_evaluation_time;total_training_data;total_correct_training;" +
               "percent_correct_training;total_approved_training;correctly_approved_training;wrongly_approved_training;" +
               "percent_approved_correctly_training;total_reproved_training;correctly_reproved_training;" +
               "wrongly_reproved_training;percent_reproved_correctly_training;total_test_data;total_correct_test;" +
               "percent_correct_test;total_approved_test;correctly_approved_test;wrongly_approved_test;" +
               "percent_approved_correctly_test;total_reproved_test;correctly_reproved_test;" +
               "wrongly_reproved_test;percent_reproved_correctly_test;total_ties;ties_for_zeros;ties_for_ones\n"
              )
    file.close()
    
    for seed in params["seeds"]:
        print("Seed: ", seed)
        
        for encoding_type in params["encoding_types"]:
            print("Input encoding type: ", encoding_type)
            
            for field_size in params["field_sizes"]:
                print("Input field sizes: " + field_size if field_size != 0 else "Default")
                
                for training_set_distribuition in params["training_set_distribuitions"]:
                    print("\nTraining with a training set distribution of ", 
                          training_set_distribuition[0], training_set_distribuition[1],
                          " for approved and repproved, respectively.\n")
                    
                    
                    current_params = {
                        #"algorithm": algorithm,
                        "seed": seed, # 0 = default
                        "encoding": encoding_type,
                        "encoding_threshold": None,
                        "field_size": field_size, # 0 = default
                        "training_set_distribuition": training_set_distribuition,
                        "tuple_size": None,
                        "bleaching_mode": None,
                        
                        # TODO change the code to iterate over the possible values of the following parameters
                        "min_score": params["min_score"],
                        "threshold": params["threshold"], 
                        "discriminator_limit": params["discriminator_limit"]
                    }        

                    training_input, expected_output, training_combined, test_input, test_expected_output, test_combined, training_set, test_set = getData(training_set_distribuition[0], training_set_distribuition[1], current_params)

                    for a_tuple_size in params["tuple_sizes"]:
                        print("Training with a tupple of size: ", a_tuple_size)

                        for bleaching in params["bleaching_modes"]:
                            print("Bleaching is set to: ", bleaching, "\n")
                            
                            for algorithm in params["algorithms"]:
                                current_params["algorithm"] = algorithm
                                
                                training_time = time.time()
                                wann = train(algorithm, training_input, expected_output, current_params, a_tuple_size, bleaching, seed)
                                training_time = time.time() - training_time

                                print("Training complete. " + str(training_time) + " seconds.")

                                in_sample_evaluation_time = time.time()
                                in_sample_performance, in_sample_additional_info =  evaluate_performance(wann, training_combined, current_params)
                                in_sample_evaluation_time = time.time() - in_sample_evaluation_time

                                # Evaluates Guilherme's wisard implementation
                                print("In-sample performance: ", float(in_sample_performance) / float(len(training_combined)))
                                print("Ones distribution: ", float(training_set["project_is_approved"].sum()) / float(len(training_set["project_is_approved"])))
                                print("Ones: ", training_set["project_is_approved"].sum(), "Zeros: ", training_set["project_is_approved"].sum() - len(training_set["project_is_approved"]))
                                print("\n")

                                out_sample_evaluation_time = time.time()
                                out_sample_performance, out_sample_additional_info =  evaluate_performance(wann, test_combined, current_params)
                                out_sample_evaluation_time = time.time() - out_sample_evaluation_time

                                print("Expected out-sample performance: ", float(out_sample_performance) / float(len(test_combined)))
                                print("Ones distribution: ", float(test_set["project_is_approved"].sum()) / float(len(test_set["project_is_approved"])))
                                print("Ones: ", test_set["project_is_approved"].sum(), "Zeros: ", (test_set["project_is_approved"].sum() - len(test_set["project_is_approved"])), "\n\n")



                                write_result_line(algorithm, seed, encoding_type, field_size, training_set_distribuition, a_tuple_size, bleaching,
                                                 training_time, in_sample_additional_info, in_sample_evaluation_time, out_sample_additional_info,
                                                 out_sample_evaluation_time, in_sample_performance, training_combined, training_set, out_sample_performance,
                                                 test_combined, test_set, output_file)


    

In [9]:
#tuple_sizes = [2, 3, 4, 5, 6, 7, 8, 9, 10, 20, 25, 30, 50, 100]
#tuple_sizes = [2]
tuple_sizes = [2, 3, 4, 5, 7, 8, 10]
#training_set_distribuitions = [[5, 5], [10, 10], [20, 20], [30, 30], [50, 50], [75, 75], [86, 14], [100, 100]]
training_set_distribuitions = [[5, 5], [10, 10], [100, 100]]#, [1000, 1000]]

param_variations = {
    "seeds": [0], # 0 = default
    #"encoding_types": ["standard", "thermometer", "onehot"],
    "encoding_types": ["thermometer", "onehot"],
    "field_sizes": [0], # 0 = default
    "training_set_distribuitions": training_set_distribuitions,
    "tuple_sizes": tuple_sizes,
    #"bleaching_modes": [False, True],
    "bleaching_modes": [True],
    #"algorithms": ["wisard","cluswisard"],
    "algorithms": ["cluswisard"],
    #"algorithms": ["wisard"],
    
    # cluswisard parameters
    "min_score": 0.1,
    "threshold": 10, 
    "discriminator_limit": 5
}

#experiment(training_set_distribuitions, tuple_sizes, [False, True])
experiment(param_variations)

Seed:  0
Input encoding type:  thermometer
Default

Training with a training set distribution of  5 5  for approved and repproved, respectively.

Total data:  182080
Total aproved:  154346
Total reproved:  27734
Percent aproved:  0.8476823374340949
Percent reproved:  0.15231766256590507 

Distribution over a random sample of 182080 observations used to get the observations to train the classifier:  0.8476823374340949
Total aproved in that sample:  154346 

Total training data:  10
Total aproved:  5
Total reproved:  5
Percent aproved:  0.5
Percent reproved:  0.5 

Total test data:  182070
Total aproved:  154341
Total reproved:  27729
Percent aproved:  0.8477014335145823
Percent reproved:  0.15229856648541773 

Training set + test set:  182080
Converting dataframe of shape  (10, 18)  to a list of binary values.
Converting dataframe of shape  (182070, 18)  to a list of binary values.
Training with a tupple of size:  2
Bleaching is set to:  True 

Algorithm: ClusWiSARD.
Training complete. 

Number of observations:  182070
Predicted correctly:  123485
Predicted wrongly:  58585
Predicted zeros:  42368
Predicted ones:  139702
Zeros correct:  5756
Ones correct:  117729
Zeros wrong:  36612
Ones Wrong:  21973
Total ties:  0
Zeros total ties:  0
Ones total ties:  0
Avg. Time:  9.747645460401053e-06  seconds.
Expected out-sample performance:  0.6782281540067007
Ones distribution:  0.8477014335145823
Ones:  154341 Zeros:  -27729 


cluswisard;0;thermometer;0;5/5;10;True;7.367134094238281e-05;8.916854858398437e-06;0.0020618438720703125;9.747645460401053e-06;1.776759386062622;10;9;0.9;5;5;1;1.0;5;4;0;0.8;182070;123485;0.6782281540067007;154341;117729;21973;0.7627850020409354;27729;5756;36612;0.20758051137797973;0;0;0;


Training with a training set distribution of  10 10  for approved and repproved, respectively.

Total data:  182080
Total aproved:  154346
Total reproved:  27734
Percent aproved:  0.8476823374340949
Percent reproved:  0.15231766256590507 

Distribution over a random 

Number of observations:  182060
Predicted correctly:  135535
Predicted wrongly:  46525
Predicted zeros:  26265
Predicted ones:  155795
Zeros correct:  3732
Ones correct:  131803
Zeros wrong:  22533
Ones Wrong:  23992
Total ties:  0
Zeros total ties:  0
Ones total ties:  0
Avg. Time:  7.380117076734232e-06  seconds.
Expected out-sample performance:  0.744452378336812
Ones distribution:  0.8477205316928486
Ones:  154336 Zeros:  -27724 


cluswisard;0;thermometer;0;10/10;8;True;0.00012350082397460938;1.0919570922851562e-05;0.004136323928833008;7.380117076734232e-06;1.3460829257965088;20;18;0.9;10;10;2;1.0;10;8;0;0.8;182060;135535;0.744452378336812;154336;131803;23992;0.8540003628447025;27724;3732;22533;0.13461261001298513;0;0;0;

Training with a tupple of size:  10
Bleaching is set to:  True 

Algorithm: ClusWiSARD.
Training complete. 0.00013971328735351562 seconds.
Number of observations:  20
Predicted correctly:  18
Predicted wrongly:  2
Predicted zeros:  12
Predicted ones:  8
Zeros cor

Number of observations:  181880
Predicted correctly:  86319
Predicted wrongly:  95561
Predicted zeros:  100977
Predicted ones:  80903
Zeros correct:  16525
Ones correct:  69794
Zeros wrong:  84452
Ones Wrong:  11109
Total ties:  0
Zeros total ties:  0
Ones total ties:  0
Avg. Time:  1.5284350764130088e-05  seconds.
Expected out-sample performance:  0.4745931383329668
Ones distribution:  0.8480646580162745
Ones:  154246 Zeros:  -27634 


cluswisard;0;thermometer;0;100/100;7;True;0.0005643367767333984;1.6634464263916015e-05;0.005296945571899414;1.5284350764130088e-05;2.7831761837005615;200;118;0.59;100;48;30;0.48;100;70;52;0.7;181880;86319;0.4745931383329668;154246;69794;11109;0.4524849915070731;27634;16525;84452;0.5979952232756749;0;0;0;

Training with a tupple of size:  8
Bleaching is set to:  True 

Algorithm: ClusWiSARD.
Training complete. 0.0005655288696289062 seconds.
Number of observations:  200
Predicted correctly:  115
Predicted wrongly:  85
Predicted zeros:  81
Predicted ones: 

Number of observations:  182070
Predicted correctly:  85338
Predicted wrongly:  96732
Predicted zeros:  100647
Predicted ones:  81423
Zeros correct:  15822
Ones correct:  69516
Zeros wrong:  84825
Ones Wrong:  11907
Total ties:  0
Zeros total ties:  0
Ones total ties:  0
Avg. Time:  3.311451651966604e-06  seconds.
Expected out-sample performance:  0.4687098368759268
Ones distribution:  0.8477014335145823
Ones:  154341 Zeros:  -27729 


cluswisard;0;onehot;0;5/5;5;True;0.00010085105895996094;1.3279914855957032e-05;0.0029129981994628906;3.311451651966604e-06;0.6052043437957764;10;9;0.9;5;4;0;0.8;5;5;1;1.0;182070;85338;0.4687098368759268;154341;69516;11907;0.45040527144439907;27729;15822;84825;0.5705939629990263;0;0;0;

Training with a tupple of size:  7
Bleaching is set to:  True 

Algorithm: ClusWiSARD.
Training complete. 0.00034928321838378906 seconds.
Number of observations:  10
Predicted correctly:  9
Predicted wrongly:  1
Predicted zeros:  4
Predicted ones:  6
Zeros correct:  4
Ones

Number of observations:  182060
Predicted correctly:  111943
Predicted wrongly:  70117
Predicted zeros:  61425
Predicted ones:  120635
Zeros correct:  9516
Ones correct:  102427
Zeros wrong:  51909
Ones Wrong:  18208
Total ties:  0
Zeros total ties:  0
Ones total ties:  0
Avg. Time:  4.593173344161738e-06  seconds.
Expected out-sample performance:  0.6148687245962869
Ones distribution:  0.8477205316928486
Ones:  154336 Zeros:  -27724 


cluswisard;0;onehot;0;10/10;5;True;0.14296269416809082;2.2935867309570313e-05;0.005007028579711914;4.593173344161738e-06;0.8390381336212158;20;18;0.9;10;9;1;0.9;10;9;1;0.9;182060;111943;0.6148687245962869;154336;102427;18208;0.6636623989218329;27724;9516;51909;0.34324051363439617;0;0;0;

Training with a tupple of size:  7
Bleaching is set to:  True 

Algorithm: ClusWiSARD.
Training complete. 0.00011801719665527344 seconds.
Number of observations:  20
Predicted correctly:  18
Predicted wrongly:  2
Predicted zeros:  12
Predicted ones:  8
Zeros correct:  1

Number of observations:  181880
Predicted correctly:  135697
Predicted wrongly:  46183
Predicted zeros:  25795
Predicted ones:  156085
Zeros correct:  3623
Ones correct:  132074
Zeros wrong:  22172
Ones Wrong:  24011
Total ties:  0
Zeros total ties:  0
Ones total ties:  0
Avg. Time:  6.529749789971007e-06  seconds.
Expected out-sample performance:  0.7460798328568287
Ones distribution:  0.8480646580162745
Ones:  154246 Zeros:  -27634 


cluswisard;0;onehot;0;100/100;4;True;0.0006906986236572266;8.100271224975586e-06;0.0043773651123046875;6.529749789971007e-06;1.1923434734344482;200;104;0.52;100;88;84;0.88;100;16;12;0.16;181880;135697;0.7460798328568287;154246;132074;24011;0.8562555917171272;27634;3623;22172;0.13110660780198308;0;0;0;

Training with a tupple of size:  5
Bleaching is set to:  True 

Algorithm: ClusWiSARD.
Training complete. 0.0012366771697998047 seconds.
Number of observations:  200
Predicted correctly:  113
Predicted wrongly:  87
Predicted zeros:  125
Predicted ones:  7