In [1]:
import pandas as pd
from matplotlib import pyplot as plt
import numpy as np
import time
from datetime import datetime
#from Wisard import Wisard
import wisardpkg as wp
import math
import sys
import pickle

In [2]:
params = {}

file = open("compact_Health_Sports_50.pickle", "rb")
words_data = pickle.load(file)

In [3]:
# The preprocessing part is partialy based on the following "kernel" on Kaggle:
# https://www.kaggle.com/jgoldberg/donorschoose-eda-text-classification/notebook

def preprocess(training_dataframe, resources_dataframe):
    #print(training_dataframe.shape)
    #print(resources_dataframe.shape)
    
    #
    total_price = resources_dataframe.quantity * resources_dataframe.price
    resources_dataframe["total_price"] = total_price
    
    # dropping irrelevant columns
    resources_dataframe = resources_dataframe.drop(["description", "price"], axis=1)
    training_dataframe = training_dataframe.drop(["teacher_id"], axis=1)
    
    # grouping resources data by id
    grouped_resources_dataframe = resources_dataframe.groupby("id", as_index=False, sort=False).sum()
    grouped_resources_dataframe
    
    # merging the two dataframes
    cleaned_df = pd.merge(training_dataframe, grouped_resources_dataframe, how="inner", on=["id"])
    
    # splitting project categories
    
    #cleaned_df[['category_1','category_2']] = cleaned_df['project_subject_categories'].str.replace(", Care & Hunger", "").str.split(', ', 3, expand=True)
    ##print(cleaned_df['project_subject_categories'].str.replace(", Care & Hunger", "").str.split(', ', 3, expand=True))
    
    #cleaned_df[['subcategory_1','subcategory_2']] = cleaned_df['project_subject_subcategories'].str.replace(", Care & Hunger", "").str.split(', ', 3, expand=True)
    
    ##cleaned_df["category_1"] = cleaned_df["category_1"].fillna("Not Informed")
    #cleaned_df["category_2"] = cleaned_df["category_2"].fillna("Not Informed")
    #cleaned_df["subcategory_2"] = cleaned_df["subcategory_2"].fillna("Not Informed")
    
    cleaned_df["total_price_category"] = pd.cut(
        cleaned_df["total_price"], 
        bins=[0,100,250,500,1000,16000], 
        labels=["0-100","101-250","251-500","501-1000",">1000"]
    )
    
    cleaned_df["n_previous_projects"] = pd.cut(
        cleaned_df["teacher_number_of_previously_posted_projects"],
        bins=[-1,1,5,10,25,50,500],
        labels=['0-1','2-5','6-10','11-25','26-50','51+']
    )
    
    cleaned_df["project_submitted_datetime"] = pd.to_datetime(cleaned_df['project_submitted_datetime'])
    cleaned_df["month"] = cleaned_df['project_submitted_datetime'].dt.month
    cleaned_df["quarter"] = cleaned_df['project_submitted_datetime'].dt.quarter
    
    cleaned_df["teacher_prefix"] = cleaned_df["teacher_prefix"].fillna("unknown")
    
    cleaned_df["project_essay_1"] = cleaned_df["project_essay_1"].fillna("")
    cleaned_df["project_essay_2"] = cleaned_df["project_essay_2"].fillna("")
    cleaned_df["project_essay_3"] = cleaned_df["project_essay_3"].fillna("")
    cleaned_df["project_essay_4"] = cleaned_df["project_essay_4"].fillna("")
    
    #cleaned_df["merged_essays"] = cleaned_df['project_title'].astype(str) + " " + cleaned_df['project_essay_1'].astype(str) + " " + cleaned_df['project_essay_2'].astype(str) + " " + cleaned_df['project_essay_3'].astype(str) + " " + cleaned_df['project_essay_4'].astype(str)
    cleaned_df["merged_essays"] = cleaned_df['project_essay_1'].astype(str) + " " + cleaned_df['project_essay_2'].astype(str) + " " + cleaned_df['project_essay_3'].astype(str) + " " + cleaned_df['project_essay_4'].astype(str)
    
    # dropping more columns
    cleaned_df = cleaned_df.drop([
        "project_submitted_datetime", 
        "project_essay_1", 
        "project_essay_2", 
        "project_essay_3", 
        "project_essay_4",
        "quantity",
        "total_price",
        "teacher_number_of_previously_posted_projects"], 
        axis=1
    )
    
    #cleaned_df = pd.merge(cleaned_df, words_data, how="inner", on=["id"])
    
    print("Dim.:", cleaned_df.shape)
    
    return cleaned_df


def get_bits_string(value_order, encoding, min_lenght, length, thershold = None):
    if (value_order <= 0) or (value_order > length):
        print("Value order should be 0 < value_order <= length.")
        #sys.exit(1)
        exit()
    
    string = ""
    
    # 2 = 0001, if length = 4
    if encoding == "standard": 
        if (value_order - 1) > math.pow(2, length):
            print("It is not possible to represent " + str(value_order) + " different values with " + str(length) + " bits.")
        #sys.exit(1)
        exit()
        
        string = ("{0:0" + str(length) + "b}").format(value_order - 1)
        
    # 2 = 1100, if length = 4
    elif encoding == "thermometer": 
        value_size = math.floor(length / min_lenght) 
        string = "1" * (value_order * value_size) + "0" * (length - (value_order * value_size))
    
    # 2 = 0100, if length = 4
    elif encoding == "onehot":
        string = "0" * length
        string = list(string)
        string[value_order - 1] = "1"
        string = "".join(string)
        
    elif encoding == "threshold":
        if threshold != None:
            if value_order > threshold:
                string = "1"
            else:
                string = "0"
        else:
            print("Threshold not provided.")
            #sys.exit(1)
            exit()
    
    else:
        print("Invalid encoding: " + str(encoding) + ".\nValid encodings are \"standard\", \"thermometer\", \"onehot\" and \"threshold.\".")
        #sys.exit(1)
        exit()
        
    return string

def choose_field_size(minimum_size, expected_size):
    if minimum_size > expected_size:
        return minimum_size
    else:
        return expected_size
    

def encode_data(value_order, encoding, minimum_length, desired_length, threshold):
    return get_bits_string(
        value_order, 
        encoding, 
        minimum_length,
        choose_field_size(
            minimum_length, 
            desired_length
        ), 
        threshold
    )
    

def convert_to_bits_string(dataframe, params):
    print("Converting dataframe of shape ", dataframe.shape, " to a list of binary values.")
    
    #project_grade_category_mapping = { # 4
    #    'Grades PreK-2':  get_bits_string(1, params["encoding"], choose_field_size(4, params["field_size"]), params["encoding_threshold"]),
    #    'Grades 3-5':     get_bits_string(2, params["encoding"], choose_field_size(4, params["field_size"]), params["encoding_threshold"]), 
    #    'Grades 6-8':     get_bits_string(3, params["encoding"], choose_field_size(4, params["field_size"]), params["encoding_threshold"]), 
    #    'Grades 9-12':    get_bits_string(4, params["encoding"], choose_field_size(4, params["field_size"]), params["encoding_threshold"])
    #}
    print(1, params["encoding"], 4, params["field_size"], params["encoding_threshold"])
    project_grade_category_mapping = { # 4
        'Grades PreK-2':  encode_data(1, params["encoding"], 4, params["field_size"], params["encoding_threshold"]),
        'Grades 3-5':     encode_data(2, params["encoding"], 4, params["field_size"], params["encoding_threshold"]),
        'Grades 6-8':     encode_data(3, params["encoding"], 4, params["field_size"], params["encoding_threshold"]),
        'Grades 9-12':    encode_data(4, params["encoding"], 4, params["field_size"], params["encoding_threshold"]),
    }

    #teacher_prefix_mapping = { # 6
    #    'Ms.':     get_bits_string(1, params["encoding"], choose_field_size(6, params["field_size"]), params["encoding_threshold"]), 
    #    'Mrs.':    get_bits_string(2, params["encoding"], choose_field_size(6, params["field_size"]), params["encoding_threshold"]), 
    #    'Mr.':     get_bits_string(3, params["encoding"], choose_field_size(6, params["field_size"]), params["encoding_threshold"]), 
    #    'Teacher': get_bits_string(4, params["encoding"], choose_field_size(6, params["field_size"]), params["encoding_threshold"]), 
    #    'Dr.':     get_bits_string(5, params["encoding"], choose_field_size(6, params["field_size"]), params["encoding_threshold"]), 
    #    'unknown': get_bits_string(6, params["encoding"], choose_field_size(6, params["field_size"]), params["encoding_threshold"])
    #}
    
    teacher_prefix_mapping = { # 6
        'Ms.':     encode_data(1, params["encoding"], 6, params["field_size"], params["encoding_threshold"]),
        'Mrs.':    encode_data(2, params["encoding"], 6, params["field_size"], params["encoding_threshold"]),
        'Mr.':     encode_data(3, params["encoding"], 6, params["field_size"], params["encoding_threshold"]),
        'Teacher': encode_data(4, params["encoding"], 6, params["field_size"], params["encoding_threshold"]),
        'Dr.':     encode_data(5, params["encoding"], 6, params["field_size"], params["encoding_threshold"]),
        'unknown': encode_data(6, params["encoding"], 6, params["field_size"], params["encoding_threshold"]),
    }

    n_previous_projects_mapping = { # 6
        '0-1':     encode_data(1, params["encoding"], 6, params["field_size"], params["encoding_threshold"]),
        '2-5':     encode_data(2, params["encoding"], 6, params["field_size"], params["encoding_threshold"]),
        '6-10':    encode_data(3, params["encoding"], 6, params["field_size"], params["encoding_threshold"]),
        '11-25':   encode_data(4, params["encoding"], 6, params["field_size"], params["encoding_threshold"]),
        '26-50':   encode_data(5, params["encoding"], 6, params["field_size"], params["encoding_threshold"]),
        '51+':     encode_data(6, params["encoding"], 6, params["field_size"], params["encoding_threshold"]),
    }

    total_price_category_mapping = { # 6
        "0-100":     encode_data(1, params["encoding"], 6, params["field_size"], params["encoding_threshold"]),
        "101-250":   encode_data(2, params["encoding"], 6, params["field_size"], params["encoding_threshold"]),
        "251-500":   encode_data(3, params["encoding"], 6, params["field_size"], params["encoding_threshold"]),
        "501-1000":  encode_data(4, params["encoding"], 6, params["field_size"], params["encoding_threshold"]),
        ">1000":     encode_data(5, params["encoding"], 6, params["field_size"], params["encoding_threshold"]),
    }
    
    #month_mapping = { # 12
    #    "1":  get_bits_string(1, params["encoding"], choose_field_size(12, params["field_size"]), params["encoding_threshold"]),
    #    "2":  get_bits_string(2, params["encoding"], choose_field_size(12, params["field_size"]), params["encoding_threshold"]),
    #    "3":  get_bits_string(3, params["encoding"], choose_field_size(12, params["field_size"]), params["encoding_threshold"]),
    #    "4":  get_bits_string(4, params["encoding"], choose_field_size(12, params["field_size"]), params["encoding_threshold"]),
    #    "5":  get_bits_string(5, params["encoding"], choose_field_size(12, params["field_size"]), params["encoding_threshold"]),
    #    "6":  get_bits_string(6, params["encoding"], choose_field_size(12, params["field_size"]), params["encoding_threshold"]),
    #    "7":  get_bits_string(7, params["encoding"], choose_field_size(12, params["field_size"]), params["encoding_threshold"]),
    #    "8":  get_bits_string(8, params["encoding"], choose_field_size(12, params["field_size"]), params["encoding_threshold"]),
    #    "9":  get_bits_string(9, params["encoding"], choose_field_size(12, params["field_size"]), params["encoding_threshold"]),
    #    "10": get_bits_string(10, params["encoding"], choose_field_size(12, params["field_size"]), params["encoding_threshold"]),
    #    "11": get_bits_string(11, params["encoding"], choose_field_size(12, params["field_size"]), params["encoding_threshold"]),
    #    "12": get_bits_string(12, params["encoding"], choose_field_size(12, params["field_size"]), params["encoding_threshold"])
    #}
    
    month_mapping = { # 12
        "1":  encode_data(1, params["encoding"], 12, params["field_size"], params["encoding_threshold"]),
        "2":  encode_data(2, params["encoding"], 12, params["field_size"], params["encoding_threshold"]),
        "3":  encode_data(3, params["encoding"], 12, params["field_size"], params["encoding_threshold"]),
        "4":  encode_data(4, params["encoding"], 12, params["field_size"], params["encoding_threshold"]),
        "5":  encode_data(5, params["encoding"], 12, params["field_size"], params["encoding_threshold"]),
        "6":  encode_data(6, params["encoding"], 12, params["field_size"], params["encoding_threshold"]),
        "7":  encode_data(7, params["encoding"], 12, params["field_size"], params["encoding_threshold"]),
        "8":  encode_data(8, params["encoding"], 12, params["field_size"], params["encoding_threshold"]),
        "9":  encode_data(9, params["encoding"], 12, params["field_size"], params["encoding_threshold"]),
        "10": encode_data(10, params["encoding"], 12, params["field_size"], params["encoding_threshold"]),
        "11": encode_data(11, params["encoding"], 12, params["field_size"], params["encoding_threshold"]),
        "12": encode_data(12, params["encoding"], 12, params["field_size"], params["encoding_threshold"]),
    }
    
    quarter_mapping = { # 4
        "1": encode_data(1, params["encoding"], 4, params["field_size"], params["encoding_threshold"]),
        "2": encode_data(2, params["encoding"], 4, params["field_size"], params["encoding_threshold"]),
        "3": encode_data(3, params["encoding"], 4, params["field_size"], params["encoding_threshold"]),
        "4": encode_data(4, params["encoding"], 4, params["field_size"], params["encoding_threshold"]),
    }
    
    category_mapping = { # 10
        "Not Informed":          encode_data(1, params["encoding"], 10, params["field_size"], params["encoding_threshold"]),
        "Applied Learning":      encode_data(2, params["encoding"], 10, params["field_size"], params["encoding_threshold"]),
        "Health & Sports":       encode_data(3, params["encoding"], 10, params["field_size"], params["encoding_threshold"]),
        "History & Civics":      encode_data(4, params["encoding"], 10, params["field_size"], params["encoding_threshold"]),
        "Literacy & Language":   encode_data(5, params["encoding"], 10, params["field_size"], params["encoding_threshold"]),
        "Math & Science":        encode_data(6, params["encoding"], 10, params["field_size"], params["encoding_threshold"]),
        "Music & The Arts":      encode_data(7, params["encoding"], 10, params["field_size"], params["encoding_threshold"]),
        "Special Needs":         encode_data(8, params["encoding"], 10, params["field_size"], params["encoding_threshold"]),
        "Warmth":                encode_data(9, params["encoding"], 10, params["field_size"], params["encoding_threshold"]),
        "Care & Hunger":         encode_data(9, params["encoding"], 10, params["field_size"], params["encoding_threshold"]), # Equals to warmth, because they are the same thing
    }
    
    subcategory_mapping = { # 30
        "Not Informed":          encode_data(1, params["encoding"], 30, params["field_size"], params["encoding_threshold"]),
        "Literacy":              encode_data(2, params["encoding"], 30, params["field_size"], params["encoding_threshold"]),
        "Performing Arts":       encode_data(3, params["encoding"], 30, params["field_size"], params["encoding_threshold"]),
        "Applied Sciences":      encode_data(4, params["encoding"], 30, params["field_size"], params["encoding_threshold"]),
        "Health & Wellness":     encode_data(5, params["encoding"], 30, params["field_size"], params["encoding_threshold"]),
        "Character Education":   encode_data(6, params["encoding"], 30, params["field_size"], params["encoding_threshold"]),
        "Early Development":     encode_data(7, params["encoding"], 30, params["field_size"], params["encoding_threshold"]),
        "Mathematics":           encode_data(8, params["encoding"], 30, params["field_size"], params["encoding_threshold"]),
        "Literature & Writing":  encode_data(9, params["encoding"], 30, params["field_size"], params["encoding_threshold"]),
        "Special Needs":         encode_data(10, params["encoding"], 30, params["field_size"], params["encoding_threshold"]),
        "ESL":                   encode_data(11, params["encoding"], 30, params["field_size"], params["encoding_threshold"]),
        "Health & Life Science": encode_data(12, params["encoding"], 30, params["field_size"], params["encoding_threshold"]),
        "College & Career Prep": encode_data(13, params["encoding"], 30, params["field_size"], params["encoding_threshold"]),
        "Environmental Science": encode_data(14, params["encoding"], 30, params["field_size"], params["encoding_threshold"]),
        "Other":                 encode_data(15, params["encoding"], 30, params["field_size"], params["encoding_threshold"]),
        "Music":                 encode_data(16, params["encoding"], 30, params["field_size"], params["encoding_threshold"]),
        "Visual Arts":           encode_data(17, params["encoding"], 30, params["field_size"], params["encoding_threshold"]),
        "History & Geography":   encode_data(18, params["encoding"], 30, params["field_size"], params["encoding_threshold"]),
        "Gym & Fitness":         encode_data(19, params["encoding"], 30, params["field_size"], params["encoding_threshold"]),
        "Warmth":                encode_data(20, params["encoding"], 30, params["field_size"], params["encoding_threshold"]),
        "Extracurricular":       encode_data(21, params["encoding"], 30, params["field_size"], params["encoding_threshold"]),
        "Team Sports":           encode_data(22, params["encoding"], 30, params["field_size"], params["encoding_threshold"]),
        "Social Sciences":       encode_data(23, params["encoding"], 30, params["field_size"], params["encoding_threshold"]),
        "Foreign Languages":     encode_data(24, params["encoding"], 30, params["field_size"], params["encoding_threshold"]),
        "Parent Involvement":    encode_data(25, params["encoding"], 30, params["field_size"], params["encoding_threshold"]),
        "Nutrition Education":   encode_data(26, params["encoding"], 30, params["field_size"], params["encoding_threshold"]),
        "Community Service":     encode_data(27, params["encoding"], 30, params["field_size"], params["encoding_threshold"]),
        "Financial Literacy":    encode_data(28, params["encoding"], 30, params["field_size"], params["encoding_threshold"]),
        "Civics & Government":   encode_data(29, params["encoding"], 30, params["field_size"], params["encoding_threshold"]),
        "Economics":             encode_data(30, params["encoding"], 30, params["field_size"], params["encoding_threshold"]),
        
    }

    school_state_mapping = { # 52 (51 possible values)
        'NV': encode_data(1, params["encoding"], 52, params["field_size"], params["encoding_threshold"]),
        'GA': encode_data(2, params["encoding"], 52, params["field_size"], params["encoding_threshold"]),
        'UT': encode_data(3, params["encoding"], 52, params["field_size"], params["encoding_threshold"]),
        'NC': encode_data(4, params["encoding"], 52, params["field_size"], params["encoding_threshold"]),
        'CA': encode_data(5, params["encoding"], 52, params["field_size"], params["encoding_threshold"]),
        'DE': encode_data(6, params["encoding"], 52, params["field_size"], params["encoding_threshold"]),
        'MO': encode_data(7, params["encoding"], 52, params["field_size"], params["encoding_threshold"]),
        'SC': encode_data(8, params["encoding"], 52, params["field_size"], params["encoding_threshold"]),
        'IN': encode_data(9, params["encoding"], 52, params["field_size"], params["encoding_threshold"]),
        'IL': encode_data(10, params["encoding"], 52, params["field_size"], params["encoding_threshold"]),
        'VA': encode_data(11, params["encoding"], 52, params["field_size"], params["encoding_threshold"]),
        'PA': encode_data(12, params["encoding"], 52, params["field_size"], params["encoding_threshold"]),
        'NY': encode_data(13, params["encoding"], 52, params["field_size"], params["encoding_threshold"]),
        'FL': encode_data(14, params["encoding"], 52, params["field_size"], params["encoding_threshold"]),
        'NJ': encode_data(15, params["encoding"], 52, params["field_size"], params["encoding_threshold"]),
        'TX': encode_data(16, params["encoding"], 52, params["field_size"], params["encoding_threshold"]),
        'LA': encode_data(17, params["encoding"], 52, params["field_size"], params["encoding_threshold"]),
        'ID': encode_data(18, params["encoding"], 52, params["field_size"], params["encoding_threshold"]),
        'OH': encode_data(19, params["encoding"], 52, params["field_size"], params["encoding_threshold"]),
        'OR': encode_data(20, params["encoding"], 52, params["field_size"], params["encoding_threshold"]),
        'MD': encode_data(21, params["encoding"], 52, params["field_size"], params["encoding_threshold"]),
        'WA': encode_data(22, params["encoding"], 52, params["field_size"], params["encoding_threshold"]),
        'MA': encode_data(23, params["encoding"], 52, params["field_size"], params["encoding_threshold"]),
        'KY': encode_data(24, params["encoding"], 52, params["field_size"], params["encoding_threshold"]),
        'AZ': encode_data(25, params["encoding"], 52, params["field_size"], params["encoding_threshold"]),
        'MI': encode_data(26, params["encoding"], 52, params["field_size"], params["encoding_threshold"]),
        'CT': encode_data(27, params["encoding"], 52, params["field_size"], params["encoding_threshold"]),
        'AR': encode_data(28, params["encoding"], 52, params["field_size"], params["encoding_threshold"]),
        'WV': encode_data(29, params["encoding"], 52, params["field_size"], params["encoding_threshold"]),
        'NM': encode_data(30, params["encoding"], 52, params["field_size"], params["encoding_threshold"]),
        'WI': encode_data(31, params["encoding"], 52, params["field_size"], params["encoding_threshold"]),
        'MN': encode_data(32, params["encoding"], 52, params["field_size"], params["encoding_threshold"]),
        'OK': encode_data(33, params["encoding"], 52, params["field_size"], params["encoding_threshold"]),
        'AL': encode_data(34, params["encoding"], 52, params["field_size"], params["encoding_threshold"]),
        'TN': encode_data(35, params["encoding"], 52, params["field_size"], params["encoding_threshold"]),
        'IA': encode_data(36, params["encoding"], 52, params["field_size"], params["encoding_threshold"]),
        'KS': encode_data(37, params["encoding"], 52, params["field_size"], params["encoding_threshold"]),
        'CO': encode_data(38, params["encoding"], 52, params["field_size"], params["encoding_threshold"]),
        'DC': encode_data(39, params["encoding"], 52, params["field_size"], params["encoding_threshold"]),
        'WY': encode_data(40, params["encoding"], 52, params["field_size"], params["encoding_threshold"]),
        'NH': encode_data(41, params["encoding"], 52, params["field_size"], params["encoding_threshold"]),
        'HI': encode_data(42, params["encoding"], 52, params["field_size"], params["encoding_threshold"]),
        'SD': encode_data(43, params["encoding"], 52, params["field_size"], params["encoding_threshold"]),
        'MT': encode_data(44, params["encoding"], 52, params["field_size"], params["encoding_threshold"]),
        'MS': encode_data(45, params["encoding"], 52, params["field_size"], params["encoding_threshold"]),
        'RI': encode_data(46, params["encoding"], 52, params["field_size"], params["encoding_threshold"]),
        'VT': encode_data(47, params["encoding"], 52, params["field_size"], params["encoding_threshold"]),
        'ME': encode_data(48, params["encoding"], 52, params["field_size"], params["encoding_threshold"]),
        'NE': encode_data(49, params["encoding"], 52, params["field_size"], params["encoding_threshold"]),
        'AK': encode_data(50, params["encoding"], 52, params["field_size"], params["encoding_threshold"]),
        'ND': encode_data(51, params["encoding"], 52, params["field_size"], params["encoding_threshold"]),
    }
    
    combined_input_and_expected_output = []
    input_list = []
    expected_output_list = []
    
    n = 0
    num_bits = 0
    for index, row in dataframe.iterrows():
        #print(row)
        #if n >= 10:
        #    break
        #n = n + 1
        
        # total 128 bits
        bits_string = ""
        bits_string = project_grade_category_mapping[row["project_grade_category"]]
        bits_string = bits_string + teacher_prefix_mapping[row["teacher_prefix"]]
        bits_string = bits_string + n_previous_projects_mapping[row["n_previous_projects"]]
        bits_string = bits_string + total_price_category_mapping[row["total_price_category"]]
        
        bits_string = bits_string + month_mapping[str(row["month"])]
        bits_string = bits_string + quarter_mapping[str(row["quarter"])]
        #bits_string = bits_string + category_mapping[row["category_1"]]
        #bits_string = bits_string + category_mapping[row["category_2"]] # perhaps it is possible to ignore this one
        
        #bits_string = bits_string + subcategory_mapping[row["subcategory_1"]]
        #bits_string = bits_string + subcategory_mapping[row["subcategory_2"]]
        bits_string = bits_string + school_state_mapping[row["school_state"]]
        
        words_bits_columns = [col for col in dataframe.columns if col.startswith("col")]
        
        words_bits_string = ""
        
        for column in words_bits_columns:
            words_bits_string = words_bits_string + str(row[column])
            
        bits_string = bits_string + words_bits_string
        
        num_bits = len(bits_string)
        
        bit_int_list = [int(c) for c in bits_string]
        expected_output = str(row["project_is_approved"])
        
        input_list.append(bit_int_list)
        expected_output_list.append(expected_output)
        
        combined_input_and_expected_output.append([bit_int_list, expected_output])
        
    print("Number of bits in the input: ", num_bits)
        
    return input_list, expected_output_list, combined_input_and_expected_output

In [4]:
def loadData():
    # load data
    train_file_path = 'train.csv'
    # Test data file not considered because it doesn't contain the classes of the entries
    #test_file_path = 'test.csv'
    resources_file_path = 'resources.csv'

    # Read data and store in DataFrame
    train_data = pd.read_csv(train_file_path, sep=',')
    #test_data = pd.read_csv(test_file_path, sep=',')
    resources_data = pd.read_csv(resources_file_path, sep=',')
    
    return train_data, resources_data

# splitting the training dataset into training and test, because the official test dataset
# doesn't have the entries' classification, requiring validation with Kaggle's website
def splitData(train_data, resources_data, training_set_total_aproved, training_set_total_reproved):
    
    print("Total data: ", len(train_data))
    print("Total aproved: ", train_data["project_is_approved"].sum())
    print("Total reproved: ", len(train_data) - train_data["project_is_approved"].sum())
    print("Percent aproved: ", float(train_data["project_is_approved"].sum()) / float(len(train_data)))
    print("Percent reproved: ", 1.0 - (float(train_data["project_is_approved"].sum()) / float(len(train_data))), "\n")

    #train = train_data.sample(n=10000,random_state=200)
    train = train_data.sample(frac=1,random_state=200)
    print("Distribution over a random sample of 182080 observations used to get the observations to train the classifier: ",
          float(train["project_is_approved"].sum()) / float(len(train["project_is_approved"])))
    print("Total aproved in that sample: ", train["project_is_approved"].sum(), "\n")
    
    aproved = train[train["project_is_approved"] == 1][:training_set_total_aproved]
    reproved = train[train["project_is_approved"] == 0][:training_set_total_reproved]

    training_set = pd.concat([aproved, reproved])
    training_set = training_set.sample(frac=1, random_state=200)
    test_set = train_data.drop(training_set.index)

    print("Total training data: ", len(training_set))
    print("Total aproved: ", training_set["project_is_approved"].sum())
    print("Total reproved: ", len(training_set) - training_set["project_is_approved"].sum())
    print("Percent aproved: ", float(training_set["project_is_approved"].sum()) / float(len(training_set)))
    print("Percent reproved: ", 1.0 - (float(training_set["project_is_approved"].sum()) / float(len(training_set))), "\n")

    print("Total test data: ", len(test_set))
    print("Total aproved: ", test_set["project_is_approved"].sum())
    print("Total reproved: ", len(test_set) - test_set["project_is_approved"].sum())
    print("Percent aproved: ", float(test_set["project_is_approved"].sum()) / float(len(test_set)))
    print("Percent reproved: ", 1.0 - (float(test_set["project_is_approved"].sum()) / float(len(test_set))), "\n")

    print("Training set + test set: ", len(training_set) + len(test_set))

    return training_set, test_set


def getData(training_set_total_aproved, training_set_total_reproved, params):
    train_data, resources_data = loadData()
    train_data = pd.merge(train_data, words_data, how="inner", on=["id"])
    print("train_data shape", train_data.shape)
    training_set, test_set = splitData(train_data, resources_data, training_set_total_aproved, training_set_total_reproved)
    
    training_df = preprocess(training_set, resources_data)
    test_df = preprocess(test_set, resources_data)
    
    training_input, expected_output, training_combined = convert_to_bits_string(training_df, params)
    test_input, test_expected_output, test_combined = convert_to_bits_string(test_df, params)
    
    return training_input, expected_output, training_combined, test_input, test_expected_output, test_combined, training_set, test_set

In [5]:
# Trains using a WiSARD classifier
# Using personal implementation, without bleaching
def train(algorithm, training_input, expected_output, params, tuple_size = 2, bleaching = False, seed = 0):
    #wann = Wisard(tuple_size, 3546, bleaching)
    #wann.train(training_input, expected_output)
    
    wann = None
    
    if algorithm == "wisard":
        print("Algorithm: WiSARD.")
        wann = wp.Wisard(tuple_size, bleachingActivated=bleaching, ignoreZero=False, verbose=True)
        wann.train(training_input,expected_output)
    elif algorithm == "cluswisard":
        print("Algorithm: ClusWiSARD.")
        wann = wp.ClusWisard(tuple_size, params["min_score"], params["threshold"], params["discriminator_limit"])
        wann.train(training_input,expected_output)
    
    
    return wann

In [6]:
#Evaluates Guilherme's wisard implementation
def evaluate_performance(wann, test_data_combined, current_params):
    #print("Number of observations: ", test_data_combined)
    correct_predictions = 0
    wrong_predictions = 0
    zeros_predicted = 0
    ones_predicted = 0
    zeros_correct = 0
    ones_correct = 0
    zeros_wrong = 0
    ones_wrong = 0
    total_ties = 0
    ones_total_ties = 0
    zeros_total_ties = 0
    avg_time = time.time()
    
    for combined in test_data_combined:
        #prediction, tie = wann.classify([combined[0]], returnActivationDegree=True, returnConfidence=True)
        prediction = wann.classify([combined[0]])#, searchBestConfidence = False)#, returnActivationDegree=True, returnConfidence=True)
        #print(prediction)
        prediction = prediction[0]#["class"]
        tie = 0
        
        if prediction == "0":
            #print("Prediction: ", prediction[0], combined)
            zeros_predicted = zeros_predicted + 1
        elif prediction == "1":
            ones_predicted = ones_predicted + 1
        #print(prediction)
        
        expected = combined[1]
        #print(prediction, expected)
        if prediction == expected:
            #print("Correct!")
            correct_predictions = correct_predictions + 1
            
            if prediction == "0":
                zeros_correct = zeros_correct + 1
            elif prediction == "1":
                ones_correct = ones_correct + 1
        else:
            wrong_predictions = wrong_predictions + 1
            
            if prediction == "0":
                zeros_wrong = zeros_wrong + 1
            elif prediction == "1":
                ones_wrong = ones_wrong + 1
                
        if tie:
            total_ties = total_ties + 1
            if expected == "0":
                zeros_total_ties = zeros_total_ties + 1
            elif expected == "1":
                ones_total_ties = ones_total_ties + 1
                
    avg_time = float(time.time() - avg_time) / float(len(test_data_combined)) 
    
    print("Number of observations: ", len(test_data_combined))
    print("Predicted correctly: ", correct_predictions)
    print("Predicted wrongly: ", wrong_predictions)
    print("Predicted zeros: ", zeros_predicted)
    print("Predicted ones: ", ones_predicted)
    print("Zeros correct: ", zeros_correct)
    print("Ones correct: ", ones_correct)
    print("Zeros wrong: ", zeros_wrong)
    print("Ones Wrong: ", ones_wrong)
    print("Total ties: ", total_ties)
    print("Zeros total ties: ", zeros_total_ties)
    print("Ones total ties: ", ones_total_ties)
    print("Avg. Time: ", avg_time, " seconds.")
    return correct_predictions, [
        len(test_data_combined), correct_predictions, wrong_predictions, zeros_predicted, ones_predicted,
        zeros_correct, ones_correct, zeros_wrong, ones_wrong, total_ties, zeros_total_ties, ones_total_ties,
        avg_time
    ]

#Evaluates Firminos's wisard implementation
def evaluate_performance2(w, test_data_combined):
    #print("Number of observations: ", test_data_combined)
    correct_predictions = 0
    wrong_predictions = 0
    zeros_predicted = 0
    ones_predicted = 0
    zeros_correct = 0
    ones_correct = 0
    zeros_wrong = 0
    ones_wrong = 0
    avg_time = time.time()
    
    for combined in test_data_combined:
        prediction = w.predict([combined[0]])
        if prediction[0] == "0":
            #print("Prediction: ", prediction[0], combined)
            zeros_predicted = zeros_predicted + 1
        elif prediction[0] == "1":
            ones_predicted = ones_predicted + 1
        #print(prediction)
        expected = combined[1]
        #print(prediction, expected)
        if prediction[0] == expected:
            #print("Correct!")
            correct_predictions = correct_predictions + 1
            
            if prediction[0] == "0":
                zeros_correct = zeros_correct + 1
            elif prediction[0] == "1":
                ones_correct = ones_correct + 1
        else:
            wrong_predictions = wrong_predictions + 1
            
            if prediction[0] == "0":
                zeros_wrong = zeros_wrong + 1
            elif prediction[0] == "1":
                ones_wrong = ones_wrong + 1
                
    avg_time = float(time.time() - avg_time) / float(len(test_data_combined)) 
    
    print("Number of observations: ", len(test_data_combined))
    print("Predicted correctly: ", correct_predictions)
    print("Predicted wrongly: ", wrong_predictions)
    print("Predicted zeros: ", zeros_predicted)
    print("Predicted ones: ", ones_predicted)
    print("Zeros correct: ", zeros_correct)
    print("Ones correct: ", ones_correct)
    print("Zeros wrong: ", zeros_wrong)
    print("Ones Wrong: ", ones_wrong)
    print("Avg. Time: ", avg_time, " seconds.")
    return correct_predictions, [
        len(test_data_combined), correct_predictions, wrong_predictions, zeros_predicted, ones_predicted,
        zeros_correct, ones_correct, zeros_wrong, ones_wrong, avg_time
    ]

In [7]:
def write_result_line(algorithm, seed, encoding_type, field_size, training_set_distribuition, a_tuple_size, bleaching,
                     training_time, in_sample_additional_info, in_sample_evaluation_time, out_sample_additional_info,
                     out_sample_evaluation_time, in_sample_performance, training_combined, training_set, out_sample_performance,
                     test_combined, test_set, output_file):
    line = ""
    line_contents = [
        # Algorithm
        str(algorithm) + ";",
        # Seed
        str(seed) + ";",
        # Encoding type
        str(encoding_type) + ";",
        # Field size
        str(field_size) + ";",
        # training / test
        str(training_set_distribuition[0]) + "/" + str(training_set_distribuition[1]) + ";",
        # tuple size
        str(a_tuple_size) + ";",
        # bleaching active or not
        str(bleaching) + ";",
        # training time in seconds
        str(training_time) + ";",
        # in sample pattern average evaluation time in seconds
        str(in_sample_additional_info[12]) + ";",
        # in sample total evaluation time in seconds
        str(in_sample_evaluation_time) + ";",
        # out sample pattern average evaluation time in seconds
        str(out_sample_additional_info[12]) + ";",
        # out sample total evaluation time in seconds
        str(out_sample_evaluation_time) + ";",

        # total traning observations
        str(training_set_distribuition[0] + training_set_distribuition[1]) + ";",
        # total of correct prediction in the training dataset
        str(in_sample_performance) + ";",
        # percentage of right answers
        str(float(in_sample_performance) / float(len(training_combined))) + ";",
        # total approved in the training dataset
        str(training_set["project_is_approved"].sum()) + ";",
        # total approved correctly predicted in the training dataset
        str(in_sample_additional_info[6]) + ";",
        # total approved wrongly predicted in the training dataset
        str(in_sample_additional_info[8]) + ";",
        # percentage of approved projects predicted correctly in the training dataset
        str(float(in_sample_additional_info[6]) / float(training_set["project_is_approved"].sum())) + ";",
        # total reproved in the training dataset
        str((training_set["project_is_approved"].sum() - len(training_set["project_is_approved"])) * -1) + ";",
        # total reproved correctly predicted in the training dataset
        str(in_sample_additional_info[5]) + ";",
        # total reproved wrongly predicted in the training dataset
        str(in_sample_additional_info[7]) + ";",
        # percentage of reproved projects predicted correctly in the training dataset
        str(float(in_sample_additional_info[5]) / float((training_set["project_is_approved"].sum() - len(training_set["project_is_approved"])) * -1)) + ";",


        # total test observations
        str(len(test_set["project_is_approved"])) + ";",
        # total of correct prediction in the test dataset
        str(out_sample_performance) + ";",
        # percentage of right answers
        str(float(out_sample_performance) / float(len(test_combined))) + ";",
        # total approved in the test dataset
        str(test_set["project_is_approved"].sum()) + ";",
        # total approved correctly predicted in the test dataset
        str(out_sample_additional_info[6]) + ";",
        # total approved wrongly predicted in the test dataset
        str(out_sample_additional_info[8]) + ";",
        # percentage of approved projects predicted correctly in the test dataset
        str(float(out_sample_additional_info[6]) / float(test_set["project_is_approved"].sum())) + ";",
        # total reproved in the test dataset
        str((test_set["project_is_approved"].sum() - len(test_set["project_is_approved"])) * -1) + ";",
        # total reproved correctly predicted in the training dataset
        str(out_sample_additional_info[5]) + ";",
        # total reproved wrongly predicted in the training dataset
        str(out_sample_additional_info[7]) + ";",
        # percentage of reproved projects predicted correctly in the training dataset
        str(float(out_sample_additional_info[5]) / float((test_set["project_is_approved"].sum() - len(test_set["project_is_approved"])) * -1)) + ";",

        # total ties
        str(out_sample_additional_info[9]) + ";",
        # total ties when prediction should have been zero
        str(out_sample_additional_info[10]) + ";",
        # total ties when prediction should have been one
        str(out_sample_additional_info[11]) + ";",

        "\n",
        #str() + ";",
    ]

    for content in line_contents:
        #print(content)
        line = line + content

    print(line)

    file = open(output_file, "a+")
    file.write(line)
    file.close()

In [8]:
def experiment(params): #training_set_distribuitions, tuple_sizes, bleaching_mode = [False]):
    output_file = "insights/results_experiment" + datetime.now().strftime('%Y%m%d%H%M%S') + ".csv"
    file = open(output_file, "w")
    file.write("algorithm;seed;encoding;field_size;data_distribution;tuple_size;bleaching_active;total_training_time;avg_in_sample_evaluation_time;total_in_sample_evaluation_time;avg_out_sample_evaluation_time;total_out_sample_evaluation_time;total_training_data;total_correct_training;" +
               "percent_correct_training;total_approved_training;correctly_approved_training;wrongly_approved_training;" +
               "percent_approved_correctly_training;total_reproved_training;correctly_reproved_training;" +
               "wrongly_reproved_training;percent_reproved_correctly_training;total_test_data;total_correct_test;" +
               "percent_correct_test;total_approved_test;correctly_approved_test;wrongly_approved_test;" +
               "percent_approved_correctly_test;total_reproved_test;correctly_reproved_test;" +
               "wrongly_reproved_test;percent_reproved_correctly_test;total_ties;ties_for_zeros;ties_for_ones\n"
              )
    file.close()
    
    for seed in params["seeds"]:
        print("Seed: ", seed)
        
        for encoding_type in params["encoding_types"]:
            print("Input encoding type: ", encoding_type)
            
            for field_size in params["field_sizes"]:
                print("Input field sizes: " + str(field_size) if field_size != 0 else "Default")
                
                for training_set_distribuition in params["training_set_distribuitions"]:
                    print("\nTraining with a training set distribution of ", 
                          training_set_distribuition[0], training_set_distribuition[1],
                          " for approved and repproved, respectively.\n")
                    
                    
                    current_params = {
                        #"algorithm": algorithm,
                        "seed": seed, # 0 = default
                        "encoding": encoding_type,
                        "encoding_threshold": None,
                        "field_size": field_size, # 0 = default
                        "training_set_distribuition": training_set_distribuition,
                        "tuple_size": None,
                        "bleaching_mode": None,
                        
                        # TODO change the code to iterate over the possible values of the following parameters
                        "min_score": params["min_score"],
                        "threshold": params["threshold"], 
                        "discriminator_limit": params["discriminator_limit"]
                    }        

                    training_input, expected_output, training_combined, test_input, test_expected_output, test_combined, training_set, test_set = getData(training_set_distribuition[0], training_set_distribuition[1], current_params)

                    for a_tuple_size in params["tuple_sizes"]:
                        print("Training with a tupple of size: ", a_tuple_size)

                        for bleaching in params["bleaching_modes"]:
                            print("Bleaching is set to: ", bleaching, "\n")
                            
                            for algorithm in params["algorithms"]:
                                current_params["algorithm"] = algorithm
                                
                                training_time = time.time()
                                wann = train(algorithm, training_input, expected_output, current_params, a_tuple_size, bleaching, seed)
                                training_time = time.time() - training_time

                                print("Training complete. " + str(training_time) + " seconds.")

                                in_sample_evaluation_time = time.time()
                                in_sample_performance, in_sample_additional_info =  evaluate_performance(wann, training_combined, current_params)
                                in_sample_evaluation_time = time.time() - in_sample_evaluation_time

                                # Evaluates Guilherme's wisard implementation
                                print("In-sample performance: ", float(in_sample_performance) / float(len(training_combined)))
                                print("Ones distribution: ", float(training_set["project_is_approved"].sum()) / float(len(training_set["project_is_approved"])))
                                print("Ones: ", training_set["project_is_approved"].sum(), "Zeros: ", training_set["project_is_approved"].sum() - len(training_set["project_is_approved"]))
                                print("\n")

                                out_sample_evaluation_time = time.time()
                                out_sample_performance, out_sample_additional_info =  evaluate_performance(wann, test_combined, current_params)
                                out_sample_evaluation_time = time.time() - out_sample_evaluation_time

                                print("Expected out-sample performance: ", float(out_sample_performance) / float(len(test_combined)))
                                print("Ones distribution: ", float(test_set["project_is_approved"].sum()) / float(len(test_set["project_is_approved"])))
                                print("Ones: ", test_set["project_is_approved"].sum(), "Zeros: ", (test_set["project_is_approved"].sum() - len(test_set["project_is_approved"])), "\n\n")



                                write_result_line(algorithm, seed, encoding_type, field_size, training_set_distribuition, a_tuple_size, bleaching,
                                                 training_time, in_sample_additional_info, in_sample_evaluation_time, out_sample_additional_info,
                                                 out_sample_evaluation_time, in_sample_performance, training_combined, training_set, out_sample_performance,
                                                 test_combined, test_set, output_file)


    

In [9]:
#tuple_sizes = [2, 3, 4, 5, 6, 7, 8, 9, 10, 20, 25, 30, 50, 100]
#tuple_sizes = [2]
tuple_sizes = [2, 3, 4, 5, 7, 8, 10]
#training_set_distribuitions = [[5, 5], [10, 10], [20, 20], [30, 30], [50, 50], [75, 75], [86, 14], [100, 100]]
training_set_distribuitions = [[5, 5], [10, 10], [100, 100]]#, [1000, 1000]]

param_variations = {
    "seeds": [0], # 0 = default
    #"encoding_types": ["standard", "thermometer", "onehot"],
    "encoding_types": ["thermometer", "onehot"],
    "field_sizes": [52], # 0 = default
    "training_set_distribuitions": training_set_distribuitions,
    "tuple_sizes": tuple_sizes,
    #"bleaching_modes": [False, True],
    "bleaching_modes": [True],
    #"algorithms": ["wisard","cluswisard"],
    "algorithms": ["wisard"],
    #"algorithms": ["wisard"],
    
    # cluswisard parameters
    "min_score": 0.1,
    "threshold": 10, 
    "discriminator_limit": 5
}

#experiment(training_set_distribuitions, tuple_sizes, [False, True])
experiment(param_variations)

Seed:  0
Input encoding type:  thermometer
Input field sizes: 52

Training with a training set distribution of  5 5  for approved and repproved, respectively.

train_data shape (21754, 67)
Total data:  21754
Total aproved:  18442
Total reproved:  3312
Percent aproved:  0.8477521375379241
Percent reproved:  0.1522478624620759 

Distribution over a random sample of 182080 observations used to get the observations to train the classifier:  0.8477521375379241
Total aproved in that sample:  18442 

Total training data:  10
Total aproved:  5
Total reproved:  5
Percent aproved:  0.5
Percent reproved:  0.5 

Total test data:  21744
Total aproved:  18437
Total reproved:  3307
Percent aproved:  0.8479120676968359
Percent reproved:  0.1520879323031641 

Training set + test set:  21754
Dim.: (10, 65)
Dim.: (21744, 65)
Converting dataframe of shape  (10, 65)  to a list of binary values.
1 thermometer 4 52 None
Number of bits in the input:  414
Converting dataframe of shape  (21744, 65)  to a list o

Number of observations:  21744
Predicted correctly:  11206
Predicted wrongly:  10538
Predicted zeros:  10197
Predicted ones:  11547
Zeros correct:  1483
Ones correct:  9723
Zeros wrong:  8714
Ones Wrong:  1824
Total ties:  0
Zeros total ties:  0
Ones total ties:  0
Avg. Time:  0.00011242993006029052  seconds.
Expected out-sample performance:  0.5153605592347315
Ones distribution:  0.8479120676968359
Ones:  18437 Zeros:  -3307 


wisard;0;thermometer;52;5/5;10;True;0.0009870529174804688;9.660720825195312e-05;0.00521397590637207;0.00011242993006029052;2.4458837509155273;10;10;1.0;5;5;0;1.0;5;5;0;1.0;21744;11206;0.5153605592347315;18437;9723;1824;0.5273634539241742;3307;1483;8714;0.44844269730873904;0;0;0;


Training with a training set distribution of  10 10  for approved and repproved, respectively.

train_data shape (21754, 67)
Total data:  21754
Total aproved:  18442
Total reproved:  3312
Percent aproved:  0.8477521375379241
Percent reproved:  0.1522478624620759 

Distribution over a 

Number of observations:  21734
Predicted correctly:  7713
Predicted wrongly:  14021
Predicted zeros:  15721
Predicted ones:  6013
Zeros correct:  2501
Ones correct:  5212
Zeros wrong:  13220
Ones Wrong:  801
Total ties:  0
Zeros total ties:  0
Ones total ties:  0
Avg. Time:  0.00010390596292358297  seconds.
Expected out-sample performance:  0.35488175209349404
Ones distribution:  0.8480721450262262
Ones:  18432 Zeros:  -3302 


wisard;0;thermometer;52;10/10;8;True;0.0018520355224609375;0.00010305643081665039;0.010468482971191406;0.00010390596292358297;2.2670698165893555;20;20;1.0;10;10;0;1.0;10;10;0;1.0;21734;7713;0.35488175209349404;18432;5212;801;0.2827690972222222;3302;2501;13220;0.7574197456087219;0;0;0;

Training with a tupple of size:  10
Bleaching is set to:  True 

Algorithm: WiSARD.
Training complete. 0.0018656253814697266 seconds.
Number of observations:  20
Predicted correctly:  20
Predicted wrongly:  0
Predicted zeros:  10
Predicted ones:  10
Zeros correct:  10
Ones correct

Number of observations:  21554
Predicted correctly:  12757
Predicted wrongly:  8797
Predicted zeros:  8925
Predicted ones:  12629
Zeros correct:  1670
Ones correct:  11087
Zeros wrong:  7255
Ones Wrong:  1542
Total ties:  0
Zeros total ties:  0
Ones total ties:  0
Avg. Time:  0.00011555345688722809  seconds.
Expected out-sample performance:  0.5918622993411896
Ones distribution:  0.8509789366242925
Ones:  18342 Zeros:  -3212 


wisard;0;thermometer;52;100/100;7;True;0.01358342170715332;0.00015866518020629883;0.034789085388183594;0.00011555345688722809;2.500969171524048;200;183;0.915;100;95;12;0.95;100;88;5;0.88;21554;12757;0.5918622993411896;18342;11087;1542;0.6044597099552939;3212;1670;7255;0.5199252801992528;0;0;0;

Training with a tupple of size:  8
Bleaching is set to:  True 

Algorithm: WiSARD.
Training complete. 0.012630939483642578 seconds.
Number of observations:  200
Predicted correctly:  179
Predicted wrongly:  21
Predicted zeros:  87
Predicted ones:  113
Zeros correct:  83
O

Number of observations:  21744
Predicted correctly:  8155
Predicted wrongly:  13589
Predicted zeros:  15200
Predicted ones:  6544
Zeros correct:  2459
Ones correct:  5696
Zeros wrong:  12741
Ones Wrong:  848
Total ties:  0
Zeros total ties:  0
Ones total ties:  0
Avg. Time:  0.00011894545456870151  seconds.
Expected out-sample performance:  0.3750459896983076
Ones distribution:  0.8479120676968359
Ones:  18437 Zeros:  -3307 


wisard;0;onehot;52;5/5;5;True;0.0009987354278564453;0.00011878013610839843;0.007203578948974609;0.00011894545456870151;2.588076114654541;10;10;1.0;5;5;0;1.0;5;5;0;1.0;21744;8155;0.3750459896983076;18437;5696;848;0.30894397136193524;3307;2459;12741;0.743574236468098;0;0;0;

Training with a tupple of size:  7
Bleaching is set to:  True 

Algorithm: WiSARD.
Training complete. 0.0009222030639648438 seconds.
Number of observations:  10
Predicted correctly:  10
Predicted wrongly:  0
Predicted zeros:  5
Predicted ones:  5
Zeros correct:  5
Ones correct:  5
Zeros wrong: 

Number of observations:  21734
Predicted correctly:  11784
Predicted wrongly:  9950
Predicted zeros:  9650
Predicted ones:  12084
Zeros correct:  1501
Ones correct:  10283
Zeros wrong:  8149
Ones Wrong:  1801
Total ties:  0
Zeros total ties:  0
Ones total ties:  0
Avg. Time:  0.0001357632660982064  seconds.
Expected out-sample performance:  0.5421919573019233
Ones distribution:  0.8480721450262262
Ones:  18432 Zeros:  -3302 


wisard;0;onehot;52;10/10;4;True;0.0015974044799804688;0.00011767148971557617;0.004559993743896484;0.0001357632660982064;2.952467203140259;20;20;1.0;10;10;0;1.0;10;10;0;1.0;21734;11784;0.5421919573019233;18432;10283;1801;0.5578884548611112;3302;1501;8149;0.45457298606904906;0;0;0;

Training with a tupple of size:  5
Bleaching is set to:  True 

Algorithm: WiSARD.
Training complete. 0.0015568733215332031 seconds.
Number of observations:  20
Predicted correctly:  20
Predicted wrongly:  0
Predicted zeros:  10
Predicted ones:  10
Zeros correct:  10
Ones correct:  10
Z

Number of observations:  21554
Predicted correctly:  12623
Predicted wrongly:  8931
Predicted zeros:  9387
Predicted ones:  12167
Zeros correct:  1834
Ones correct:  10789
Zeros wrong:  7553
Ones Wrong:  1378
Total ties:  0
Zeros total ties:  0
Ones total ties:  0
Avg. Time:  0.00019607928220223791  seconds.
Expected out-sample performance:  0.5856453558504222
Ones distribution:  0.8509789366242925
Ones:  18342 Zeros:  -3212 


wisard;0;onehot;52;100/100;3;True;0.012637853622436523;0.00018510103225708008;0.0451202392578125;0.00019607928220223791;4.234862804412842;200;148;0.74;100;75;27;0.75;100;73;25;0.73;21554;12623;0.5856453558504222;18342;10789;1378;0.5882128448369861;3212;1834;7553;0.5709838107098381;0;0;0;

Training with a tupple of size:  4
Bleaching is set to:  True 

Algorithm: WiSARD.
Training complete. 0.020827293395996094 seconds.
Number of observations:  200
Predicted correctly:  143
Predicted wrongly:  57
Predicted zeros:  93
Predicted ones:  107
Zeros correct:  68
Ones co