In [48]:
import pandas as pd
from matplotlib import pyplot as plt
import numpy as np
from WiSARD import WiSARD
from Wisard import Wisard

from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from nltk.stem import *

import json


In [2]:
# load data
train_file_path = 'train.csv'
test_file_path = 'test.csv'
resources_file_path = 'resources.csv'

# Read data and store in DataFrame
train_data = pd.read_csv(train_file_path, sep=',')
#test_data = pd.read_csv(test_file_path, sep=',')
resources_data = pd.read_csv(resources_file_path, sep=',')

In [3]:
# splitting the training dataset int training (~80%) and test (~20%), because the official test dataset
# doesn't have the the entries' classification, requiring validation with Kaggle's website

#msk = np.random.rand(len(train_data)) < 0.8
#train = train_data[msk]
#test_data = train_data[~msk]
#train_data = train

print(train_data["project_is_approved"].sum())
print("Percent aproved: ", float(train_data["project_is_approved"].sum()) / float(len(train_data)), "\n")
#print(train_data.groupby('project_subject_categories')['project_subject_categories'].nunique())

train = train_data.sample(frac=0.8,random_state=200)
test_data = train_data.drop(train.index)

train_data = train

print(len(train_data))
print(len(test_data))

154346
Percent aproved:  0.8476823374340949 

145664
36416


In [4]:
# The preprocessing part is partialy based on the following "kernel" on Kaggle:
# https://www.kaggle.com/jgoldberg/donorschoose-eda-text-classification/notebook

def preprocess(training_dataframe, resources_dataframe):
    print(training_dataframe.shape)
    print(resources_dataframe.shape)
    
    #
    total_price = resources_dataframe.quantity * resources_dataframe.price
    resources_dataframe["total_price"] = total_price
    
    # dropping irrelevant columns
    resources_dataframe = resources_dataframe.drop(["description", "price"], axis=1)
    training_dataframe = training_dataframe.drop(["teacher_id"], axis=1)
    
    # grouping resources data by id
    grouped_resources_dataframe = resources_dataframe.groupby("id", as_index=False, sort=False).sum()
    grouped_resources_dataframe
    
    # merging the two dataframes
    cleaned_df = pd.merge(training_dataframe, grouped_resources_dataframe, how="inner", on=["id"])
    
    # splitting project categories
    
    cleaned_df[['category_1','category_2', "category_3"]] = cleaned_df['project_subject_categories'].str.split(', ', 3, expand=True)
    
    #cleaned_df["category_1"] = cleaned_df["category_1"].fillna("Not Informed")
    cleaned_df["category_2"] = cleaned_df["category_2"].fillna("Not Informed")
    
    cleaned_df["total_price_category"] = pd.cut(
        cleaned_df["total_price"], 
        bins=[0,100,250,500,1000,16000], 
        labels=["0-100","101-250","251-500","501-1000",">1000"]
    )
    
    cleaned_df["n_previous_projects"] = pd.cut(
        cleaned_df["teacher_number_of_previously_posted_projects"],
        bins=[-1,1,5,10,25,50,500],
        labels=['0-1','2-5','6-10','11-25','26-50','51+']
    )
    
    cleaned_df["project_submitted_datetime"] = pd.to_datetime(cleaned_df['project_submitted_datetime'])
    cleaned_df["month"] = cleaned_df['project_submitted_datetime'].dt.month
    cleaned_df["quarter"] = cleaned_df['project_submitted_datetime'].dt.quarter
    
    cleaned_df["teacher_prefix"] = cleaned_df["teacher_prefix"].fillna("unknown")
    
    cleaned_df["project_essay_1"] = cleaned_df["project_essay_1"].fillna("")
    cleaned_df["project_essay_2"] = cleaned_df["project_essay_2"].fillna("")
    cleaned_df["project_essay_3"] = cleaned_df["project_essay_3"].fillna("")
    cleaned_df["project_essay_4"] = cleaned_df["project_essay_4"].fillna("")
    
    #cleaned_df["merged_essays"] = cleaned_df['project_title'].astype(str) + " " + cleaned_df['project_essay_1'].astype(str) + " " + cleaned_df['project_essay_2'].astype(str) + " " + cleaned_df['project_essay_3'].astype(str) + " " + cleaned_df['project_essay_4'].astype(str)
    cleaned_df["merged_essays"] = cleaned_df['project_essay_1'].astype(str) + " " + cleaned_df['project_essay_2'].astype(str) + " " + cleaned_df['project_essay_3'].astype(str) + " " + cleaned_df['project_essay_4'].astype(str)
    
    # dropping more columns
    cleaned_df = cleaned_df.drop([
        "project_submitted_datetime", 
        "project_essay_1", 
        "project_essay_2", 
        "project_essay_3", 
        "project_essay_4",
        "quantity",
        "total_price",
        "teacher_number_of_previously_posted_projects"], 
        axis=1
    )
    
    return cleaned_df

# returns a list with the following format
# [
#     ["001101...010101", 1]
#     ["001111...000001", 1]
#     ["101001...111100", 0]
# ]
def convert_to_bits_string(dataframe):
    print(dataframe.shape)
    
    project_grade_category_mapping = {
        'Grades PreK-2':"00000000000000000000000000000000000000000000000000", 
        'Grades 3-5':"10000000000000000000000000000000000000000000000000", 
        'Grades 6-8':"11000000000000000000000000000000000000000000000000", 
        'Grades 9-12':"11100000000000000000000000000000000000000000000000"
    }

    teacher_prefix_mapping = {
        'Ms.':"11110000000000000000000000000000000000000000000000", 
        'Mrs.':"11111000000000000000000000000000000000000000000000", 
        'Mr.':"11111100000000000000000000000000000000000000000000", 
        'Teacher':"11111110000000000000000000000000000000000000000000", 
        'Dr.':"11111111000000000000000000000000000000000000000000", 
        'unknown':"11111111100000000000000000000000000000000000000000"
    }

    n_previous_projects_mapping = {
        '0-1':"11111111110000000000000000000000000000000000000000",
        '2-5':"11111111111000000000000000000000000000000000000000",
        '6-10':"11111111111100000000000000000000000000000000000000",
        '11-25':"11111111111110000000000000000000000000000000000000",
        '26-50':"11111111111111000000000000000000000000000000000000",
        '51+':"11111111111111100000000000000000000000000000000000"
    }

    total_price_category_mapping = {
        "0-100":"11111111111111110000000000000000000000000000000000",
        "101-250":"11111111111111111000000000000000000000000000000000",
        "251-500":"11111111111111111100000000000000000000000000000000",
        "501-1000":"11111111111111111110000000000000000000000000000000",
        ">1000":"11111111111111111111000000000000000000000000000000"
    }
    
    month_mapping = {
        "1":"11111111111111111111100000000000000000000000000000",
        "2":"11111111111111111111110000000000000000000000000000",
        "3":"11111111111111111111111000000000000000000000000000",
        "4":"11111111111111111111111100000000000000000000000000",
        "5":"11111111111111111111111110000000000000000000000000",
        "6":"11111111111111111111111111000000000000000000000000",
        "7":"11111111111111111111111111100000000000000000000000",
        "8":"11111111111111111111111111110000000000000000000000",
        "9":"11111111111111111111111111111000000000000000000000",
        "10":"11111111111111111111111111111100000000000000000000",
        "11":"11111111111111111111111111111110000000000000000000",
        "12":"11111111111111111111111111111111000000000000000000"
    }
    
    quarter_mapping = {
        "1":"11111111111111111111111111111111100000000000000000",
        "2":"11111111111111111111111111111111110000000000000000",
        "3":"11111111111111111111111111111111111000000000000000",
        "4":"11111111111111111111111111111111111111111111100000"
    }
    
    category_mapping = {
        "Not Informed":"11111111111111111111111111111111111100000000000000",
        "Applied Learning":"11111111111111111111111111111111111110000000000000",
        "Health & Sports":"11111111111111111111111111111111111111000000000000",
        "History & Civics":"11111111111111111111111111111111111111100000000000",
        "Literacy & Language":"11111111111111111111111111111111111111110000000000",
        "Math & Science":"11111111111111111111111111111111111111111000000000",
        "Music & The Arts":"11111111111111111111111111111111111111111100000000",
        "Special Needs":"11111111111111111111111111111111111111111110000000",
        "Warmth":"11111111111111111111111111111111111111111111000000",
        "Care & Hunger":"11111111111111111111111111111111111111111111000000", # Equals to warmth, because they are the same thing
    }
    
    combined_input_and_expected_output = []
    input_list = []
    expected_output_list = []
    
    n = 0
    for index, row in dataframe.iterrows():
        #print(row)
        #if n >= 10:
        #    break
        #n = n + 1
        
        bits_string = ""
        bits_string = project_grade_category_mapping[row["project_grade_category"]]
        bits_string = bits_string + teacher_prefix_mapping[row["teacher_prefix"]]
        bits_string = bits_string + n_previous_projects_mapping[row["n_previous_projects"]]
        bits_string = bits_string + total_price_category_mapping[row["total_price_category"]]
        
        bits_string = bits_string + month_mapping[str(row["month"])]
        bits_string = bits_string + quarter_mapping[str(row["quarter"])]
        bits_string = bits_string + category_mapping[row["category_1"]]
        bits_string = bits_string + category_mapping[row["category_2"]]
        
        bit_int_list = [int(c) for c in bits_string]
        expected_output = str(row["project_is_approved"])
        
        input_list.append(bit_int_list)
        expected_output_list.append(expected_output)
        
        combined_input_and_expected_output.append([bit_int_list, expected_output])
        
    return input_list, expected_output_list, combined_input_and_expected_output



In [5]:
training_df = preprocess(train_data, resources_data)
training_df

(145664, 16)
(1541272, 4)


Unnamed: 0,id,teacher_prefix,school_state,project_grade_category,project_subject_categories,project_subject_subcategories,project_title,project_resource_summary,project_is_approved,category_1,category_2,category_3,total_price_category,n_previous_projects,month,quarter,merged_essays
0,p137962,Mr.,NJ,Grades 9-12,"Applied Learning, Music & The Arts","College & Career Prep, Visual Arts",Video Game Design Startup Success,"My students need (3) Amazon Fire Tablets, (3) ...",1,Applied Learning,Music & The Arts,,251-500,11-25,5,2,By the end of the school year I would like all...
1,p118981,Mrs.,NY,Grades PreK-2,"Math & Science, Literacy & Language","Applied Sciences, Literacy","Let the \""Osmo Games\"" Begin",My students need the Osmo Genuis Kit and the O...,1,Math & Science,Literacy & Language,,251-500,2-5,8,3,I teach kindergarten students at a high-povert...
2,p055582,Mrs.,IL,Grades PreK-2,Literacy & Language,"Literacy, Literature & Writing",Write On!,"My students need chart paper, copy paper, colo...",1,Literacy & Language,Not Informed,,101-250,6-10,7,3,I teach a group of hardworking kindergarteners...
3,p027636,Ms.,MI,Grades 3-5,Literacy & Language,Literacy,Nonfiction & Informational Texts,My students need more nonfiction books that wi...,1,Literacy & Language,Not Informed,,101-250,26-50,8,3,"Due to my students current circumstances, many..."
4,p198027,Mrs.,GA,Grades 3-5,Math & Science,Environmental Science,SCREAM Lab Brings Google Expeditions to Peeples!,My students need a Google Expeditions Kit to t...,1,Math & Science,Not Informed,,>1000,2-5,12,4,"\""When I grow up, I want to solve problems tha..."
5,p019556,Mrs.,OH,Grades PreK-2,Applied Learning,"Character Education, Early Development","Rain, Rain, Go Away! Indoor Recess to Save th...",My students need group and partner games and a...,1,Applied Learning,Not Informed,,101-250,0-1,8,3,"I teach in a Title I, low income school, where..."
6,p080293,Mrs.,TX,Grades PreK-2,"Literacy & Language, Math & Science","Literacy, Mathematics",Enhancing Technology,My students need the ability to engage with te...,1,Literacy & Language,Math & Science,,251-500,0-1,10,4,Elementary Kindergarten students are amazing! ...
7,p028305,Mrs.,KY,Grades 3-5,Literacy & Language,Literature & Writing,Wiggle and Write!,My students need exercise balls to replace the...,1,Literacy & Language,Not Informed,,101-250,0-1,1,1,As a teacher in a low-income/high poverty scho...
8,p174803,Mr.,PA,Grades 6-8,Literacy & Language,"Literacy, Literature & Writing",Reading Our Way to Media Literacy!,My students need new novels to help increase t...,1,Literacy & Language,Not Informed,,101-250,26-50,3,1,My average day in the classroom begins when I ...
9,p039630,Ms.,IL,Grades 9-12,History & Civics,"Civics & Government, Social Sciences",SOSS! Supply Our Social Sciences!\r\n,My students need markers and color pencils for...,1,History & Civics,Not Informed,,251-500,0-1,6,2,Inspiring is the first adjective that comes to...


In [6]:
test_df = preprocess(test_data, resources_data)
test_df

(36416, 16)
(1541272, 5)


Unnamed: 0,id,teacher_prefix,school_state,project_grade_category,project_subject_categories,project_subject_subcategories,project_title,project_resource_summary,project_is_approved,category_1,category_2,category_3,total_price_category,n_previous_projects,month,quarter,merged_essays
0,p036502,Ms.,NV,Grades PreK-2,Literacy & Language,Literacy,Super Sight Word Centers,My students need 6 Ipod Nano's to create and d...,1,Literacy & Language,Not Informed,,501-1000,26-50,11,4,Most of my kindergarten students come from low...
1,p013780,Mr.,CA,Grades 6-8,Health & Sports,Health & Wellness,We need clean water for our culinary arts class!,My students need a water filtration system for...,1,Health & Sports,Not Informed,,501-1000,26-50,8,3,My students are athletes and students who are ...
2,p191410,Mrs.,IL,Grades PreK-2,Literacy & Language,Literacy,Snuggle Up With A Good Book,My students need 2 youth sized reclining chair...,1,Literacy & Language,Not Informed,,101-250,11-25,9,3,I teach first grade in a small farming town in...
3,p173555,Mrs.,NY,Grades 3-5,Literacy & Language,Literature & Writing,Extra! Extra! Read all about it!! We love to ...,"My students need good books, with life lessons...",0,Literacy & Language,Not Informed,,501-1000,2-5,11,4,"Each day my fifth graders walk into our \""home..."
4,p070918,Mr.,TX,Grades PreK-2,"Literacy & Language, Math & Science","Literature & Writing, Mathematics",Mini Devices With Many Opportunities,"My students need four iPad minis, and four pro...",1,Literacy & Language,Math & Science,,>1000,0-1,6,2,"\""You can dream, create, design, and build the..."
5,p107356,Ms.,UT,Grades 3-5,"Math & Science, Literacy & Language","Environmental Science, Literature & Writing",Can you hear me now?,My students need earbud headphones to use in t...,1,Math & Science,Literacy & Language,,101-250,6-10,10,4,Our classroom is filled with bright smiles and...
6,p048313,Mrs.,MD,Grades PreK-2,"Literacy & Language, History & Civics","Literacy, Social Sciences",My Big World Readers to Help Us Explore,My students need My Big WOrld Scholastic reade...,1,Literacy & Language,History & Civics,,101-250,0-1,8,3,My Pre K students are so excited to come to sc...
7,p211511,Mrs.,NC,Grades PreK-2,"Literacy & Language, Math & Science","Literacy, Mathematics",Kindergarteners LOVE technology!,My students need a set of five kindle fire tab...,1,Literacy & Language,Math & Science,,251-500,0-1,12,4,I have taught at a wonderful school in North C...
8,p130035,Ms.,TX,Grades 9-12,Math & Science,Mathematics,Plotting flight patterns with Rockets!!!,My students need materials to do build rockets...,0,Math & Science,Not Informed,,251-500,0-1,4,2,My tenth grade is comprised of 30 amazing stud...
9,p207223,Mr.,CA,Grades 9-12,"Special Needs, Music & The Arts","Special Needs, Visual Arts",Learning in the Community through Art,My students need an opportunity to express the...,1,Special Needs,Music & The Arts,,251-500,11-25,1,1,The Community Inclusion Program provides educa...


In [7]:
print(training_df["category_1"].unique())
print(training_df["category_2"].unique())
print(training_df["category_3"].unique())
print(test_df["category_1"].unique())
print(test_df["category_2"].unique())
print(test_df["category_3"].unique())

print(test_df["month"].unique())
print(test_df["quarter"].unique())

['Applied Learning' 'Math & Science' 'Literacy & Language'
 'History & Civics' 'Warmth' 'Health & Sports' 'Music & The Arts'
 'Special Needs']
['Music & The Arts' 'Literacy & Language' 'Not Informed' 'Math & Science'
 'Care & Hunger' 'Special Needs' 'History & Civics' 'Applied Learning'
 'Health & Sports' 'Warmth']
[None 'Care & Hunger']
['Literacy & Language' 'Health & Sports' 'Math & Science' 'Special Needs'
 'Applied Learning' 'Music & The Arts' 'History & Civics' 'Warmth']
['Not Informed' 'Math & Science' 'Literacy & Language' 'History & Civics'
 'Music & The Arts' 'Special Needs' 'Applied Learning' 'Care & Hunger'
 'Health & Sports' 'Warmth']
[None 'Care & Hunger']
[11  8  9  6 10 12  4  1  3  7  2  5]
[4 3 2 1]


In [84]:
tokenizer = RegexpTokenizer(r'[a-z]+')
test_df["words_list"] = None
#training_df["words_list"] = training_df["words_list"].astype([])
words_universe_set = set()
count = 0

for row in test_df.index:
    text = test_df.loc[row, "merged_essays"]
    words_list = [word for word in tokenizer.tokenize(text.lower()) if word not in stopwords.words('english')]
    
    words_universe_set.update(words_list)
    test_df.loc[row , "words_list"] = json.dumps(words_list)
    #print(words_list)
    #print(words_universe_set)
    count = count + 1
    if count % 100 == 0:
        print(count)
        print(len(words_universe_set), "\n")
        #break
        
print(len(words_universe_set))
#print(words_universe_set)
#print(test_df["words_list"][:10])

#print(test_df["words_list"][:1])
#print(json.loads((test_df["words_list"][1])))
#a_list = json.loads(test_df["words_list"][1])
#print("\n\n\n", a_list)
#print("\n\n\n", a_list[3])

100
2804 

200
4151 

300
5244 

400
5979 

500
6701 

600
7232 

700
7655 

800
8201 

900
8620 

1000
8979 

1100
9402 

1200
9771 

1300
10062 

1400
10439 

1500
10721 

1600
10999 

1700
11276 

1800
11551 

1900
11815 

2000
12059 

2100
12371 

2200
12597 

2300
12792 

2400
12995 

2500
13207 

2600
13446 

2700
13673 

2800
13913 

2900
14140 

3000
14331 

3100
14508 

3200
14681 

3300
14864 

3400
15030 

3500
15208 

3600
15375 

3700
15534 

3800
15680 

3900
15855 

4000
16039 

4100
16182 

4200
16328 

4300
16479 

4400
16643 

4500
16782 

4600
16917 

4700
17056 

4800
17186 

4900
17310 

5000
17470 

5100
17589 

5200
17725 

5300
17863 

5400
18010 

5500
18136 

5600
18311 

5700
18440 

5800
18590 

5900
18736 

6000
18884 

6100
19011 

6200
19133 

6300
19225 

6400
19345 

6500
19478 

6600
19595 

6700
19743 

6800
19869 

6900
19990 

7000
20094 

7100
20199 

7200
20292 

7300
20396 

7400
20481 

7500
20559 

7600
20649 

7700
20738 

7800
20859 

7900
20

In [86]:
from nltk.corpus import wordnet

#syns = wordnet.synsets("program")
#print(syns[0].lemmas()[0].name())
#print(syns)

#syns = wordnet.synsets("plan")
#print(syns[0].lemmas()[0].name())
#print(syns)

syngroups = set()
unrecognized = 0

for word in words_universe_set:
    syns = wordnet.synsets(word)
    #print(word, syns)
    if len(syns) > 0:
        syngroups.add(wordnet.synsets(word)[0].lemmas()[0].name())
    else:
        unrecognized = unrecognized + 1

print(len(syngroups))
print(unrecognized)
#print(syngroups)

syngroups_level_2 = set()
unrecognized = 0

for word in syngroups:
    syns = wordnet.synsets(word)
    #print(word, syns)
    if len(syns) > 0:
        syngroups_level_2.add(wordnet.synsets(word)[0].lemmas()[0].name())
    else:
        unrecognized = unrecognized + 1
        
print(len(syngroups))
print(unrecognized)

14744
11688
14744
0


In [77]:
wordnet.synsets("sus")

[Synset('sus.n.01')]

In [8]:
training_input, expected_output, training_combined = convert_to_bits_string(training_df)
test_input, test_expected_output, test_combined = convert_to_bits_string(test_df)

(145664, 17)
(36416, 17)


In [9]:
print(len(training_input))
print(len(expected_output))

print(len(test_input))
print(len(test_expected_output))

145664
145664
36416
36416


In [11]:
# To get Firmino's implementation to return different to everything 1,
# I had to train only on them first 100 observations, set the tupple 
# size to 50 or more and the bleaching to True.
#w = WiSARD(num_bits_addr = 50, bleaching = True)

#a_training_input = training_input[:100]
#a_expected_output = expected_output[:100]

#w.fit(a_training_input, a_expected_output)

# Using personal Wisard implementation

wann = Wisard(50, 3546)
wann.train(training_input, expected_output)


Number of classes being trained: 2
dict_keys(['1', '0'])
Number of training samples for class 1: 123566
Number of training samples for class 0: 22098


In [12]:
#eo = np.sum(expected_output)

#print(expected_output[0:100])

# In-sample performance
#print()

In [13]:
#Evaluates Guilherme's wisard implementation
#def evaluate_performance(test_data_combined):
#    correct_predictions = 0
#    for combined in test_data_combined:
#        prediction = wann.predict(combined[0])
#        #print(prediction)
#        prediction = prediction["class"]
#        expected = combined[1]
#        #print(prediction, expected)
#        if prediction == expected:
#            #print("Correct!")
#            correct_predictions = correct_predictions + 1
#    print(correct_predictions)
#    return correct_predictions

def evaluate_performance(test_data_combined):
    #print("Number of observations: ", test_data_combined)
    correct_predictions = 0
    wrong_predictions = 0
    zeros_predicted = 0
    ones_predicted = 0
    zeros_correct = 0
    ones_correct = 0
    zeros_wrong = 0
    ones_wrong = 0
    for combined in test_data_combined:
        prediction = wann.predict(combined[0])
        prediction = prediction["class"]
        
        if prediction == "0":
            #print("Prediction: ", prediction[0], combined)
            zeros_predicted = zeros_predicted + 1
        elif prediction == "1":
            ones_predicted = ones_predicted + 1
        #print(prediction)
        expected = combined[1]
        #print(prediction, expected)
        if prediction == expected:
            #print("Correct!")
            correct_predictions = correct_predictions + 1
            
            if prediction == "0":
                zeros_correct = zeros_correct + 1
            elif prediction == "1":
                ones_correct = ones_correct + 1
        else:
            wrong_predictions = wrong_predictions + 1
            
            if prediction == "0":
                zeros_wrong = zeros_wrong + 1
            elif prediction == "1":
                ones_wrong = ones_wrong + 1
    
    print("Number of observations: ", len(test_data_combined))
    print("Predicted correctly: ", correct_predictions)
    print("Predicted wrongly: ", wrong_predictions)
    print("Predicted zeros: ", zeros_predicted)
    print("Predicted ones: ", ones_predicted)
    print("Zeros correct: ", zeros_correct)
    print("Ones correct: ", ones_correct)
    print("Zeros wrong: ", zeros_wrong)
    print("Ones Wrong: ", ones_wrong)
    return correct_predictions

#Evaluates Firminos's wisard implementation
def evaluate_performance2(test_data_combined):
    #print("Number of observations: ", test_data_combined)
    correct_predictions = 0
    wrong_predictions = 0
    zeros_predicted = 0
    ones_predicted = 0
    zeros_correct = 0
    ones_correct = 0
    zeros_wrong = 0
    ones_wrong = 0
    for combined in test_data_combined:
        prediction = w.predict([combined[0]])
        if prediction[0] == "0":
            #print("Prediction: ", prediction[0], combined)
            zeros_predicted = zeros_predicted + 1
        elif prediction[0] == "1":
            ones_predicted = ones_predicted + 1
        #print(prediction)
        expected = combined[1]
        #print(prediction, expected)
        if prediction[0] == expected:
            #print("Correct!")
            correct_predictions = correct_predictions + 1
            
            if prediction[0] == "0":
                zeros_correct = zeros_correct + 1
            elif prediction[0] == "1":
                ones_correct = ones_correct + 1
        else:
            wrong_predictions = wrong_predictions + 1
            
            if prediction[0] == "0":
                zeros_wrong = zeros_wrong + 1
            elif prediction[0] == "1":
                ones_wrong = ones_wrong + 1
    
    print("Number of observations: ", len(test_data_combined))
    print("Predicted correctly: ", correct_predictions)
    print("Predicted wrongly: ", wrong_predictions)
    print("Predicted zeros: ", zeros_predicted)
    print("Predicted ones: ", ones_predicted)
    print("Zeros correct: ", zeros_correct)
    print("Ones correct: ", ones_correct)
    print("Zeros wrong: ", zeros_wrong)
    print("Ones Wrong: ", ones_wrong)
    return correct_predictions

In [14]:
# First results, used as reference for improvement:
# In-sample performance:  0.711748840665465
# Expected out-sample performance:  0.585333986607872

# Evaluates Guilherme's wisard implementation
#print("In-sample performance: ", evaluate_performance(training_combined[:5]) / len(training_combined[:5]))
#print("Expected out-sample performance: ", evaluate_performance(test_combined[:5]) / len(test_combined[:5]))
#print("In-sample performance: ", evaluate_performance(training_combined) / len(training_combined))
#print("Expected out-sample performance: ", evaluate_performance(test_combined) / len(test_combined))

print("In-sample performance: ", float(evaluate_performance(training_combined)) / float(len(training_combined)))
print("Ones distribution: ", float(train_data["project_is_approved"].sum()) / float(len(train_data["project_is_approved"])))
print("Ones: ", train_data["project_is_approved"].sum(), "Zeros: ", train_data["project_is_approved"].sum() - len(train_data["project_is_approved"]))
print("\n")
print("Expected out-sample performance: ", float(evaluate_performance(test_combined)) / float(len(test_combined)))
print("Ones distribution: ", float(test_data["project_is_approved"].sum()) / float(len(test_data["project_is_approved"])))
print("Ones: ", test_data["project_is_approved"].sum(), "Zeros: ", (test_data["project_is_approved"].sum() - len(test_data["project_is_approved"])))


# Evaluates Firmino's wisard implementation
#print("In-sample performance: ", float(evaluate_performance2(training_combined)) / float(len(training_combined)))
#print("Ones distribution: ", float(train_data["project_is_approved"].sum()) / float(len(train_data["project_is_approved"])))
#print("Ones: ", train_data["project_is_approved"].sum(), "Zeros: ", train_data["project_is_approved"].sum() - len(train_data["project_is_approved"]))
#print("\n")
#print("Expected out-sample performance: ", float(evaluate_performance2(test_combined)) / float(len(test_combined)))
#print("Ones distribution: ", float(test_data["project_is_approved"].sum()) / float(len(test_data["project_is_approved"])))
#print("Ones: ", test_data["project_is_approved"].sum(), "Zeros: ", (test_data["project_is_approved"].sum() - len(test_data["project_is_approved"])))

Number of observations:  145664
Predicted correctly:  123568
Predicted wrongly:  22096
Predicted zeros:  2
Predicted ones:  145662
Zeros correct:  2
Ones correct:  123566
Zeros wrong:  0
Ones Wrong:  22096
In-sample performance:  0.8483084358523726
Ones distribution:  0.8482947056239016
Ones:  123566 Zeros:  -22098


Number of observations:  36416
Predicted correctly:  30779
Predicted wrongly:  5637
Predicted zeros:  1
Predicted ones:  36415
Zeros correct:  0
Ones correct:  30779
Zeros wrong:  1
Ones Wrong:  5636
Expected out-sample performance:  0.8452054042179262
Ones distribution:  0.8452328646748682
Ones:  30780 Zeros:  -5636


In [15]:
for discriminator in wann.discriminators:
    print(discriminator.input_class)
    print(discriminator.input_length)
    print(discriminator.tupple_size)
    print(discriminator.memory)

1
400
50
{0: {'10001100111011001010000100100100001011100010011000': 1, '10011100111011001010000101100100001011100010011000': 1, '10001100111011001010000101100100001001100010011000': 1, '10001100111111001010000101100100001001100010011000': 1, '10001100111111001010000101100100001011100010011000': 1, '10011100111111001010000101100100001011100010011000': 1, '10011100111111001010000100100100001011100010011000': 1, '10001100111111001010000100100100001011100010011000': 1, '10011100111011001010000100100100001011100010011000': 1, '10001100111011001010000100100100001001100010011000': 1, '10001100111111001010000100100100001001100010011000': 1, '10001100111011001010000101100100001011100010011000': 1}, 1: {'00110101011110110001001111111000111101001010110101': 1, '00110101011010110101001110111000111101001010110101': 1, '00110101011010110101001111111000111101001010110101': 1, '00110111111110110011001111111000111111001010110101': 1, '00110101111110110101001111111000111101001010110101': 1, '00110101011