In [1]:
import pandas as pd
from matplotlib import pyplot as plt
import numpy as np
from WiSARD import WiSARD
from Wisard import Wisard

from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from nltk.stem import *


In [2]:
# load data
train_file_path = 'train.csv'
test_file_path = 'test.csv'
resources_file_path = 'resources.csv'

# Read data and store in DataFrame
train_data = pd.read_csv(train_file_path, sep=',')
#test_data = pd.read_csv(test_file_path, sep=',')
resources_data = pd.read_csv(resources_file_path, sep=',')

In [3]:
# splitting the training dataset int training (~80%) and test (~20%), because the official test dataset
# doesn't have the the entries' classification, requiring validation with Kaggle's website

msk = np.random.rand(len(train_data)) < 0.8
train = train_data[msk]
test_data = train_data[~msk]
train_data = train

print(len(train_data))
print(len(test_data))

145864
36216


In [4]:
# The preprocessing part is partialy based on the following "kernel" on Kaggle:
# https://www.kaggle.com/jgoldberg/donorschoose-eda-text-classification/notebook

def preprocess(training_dataframe, resources_dataframe):
    print(training_dataframe.shape)
    print(resources_dataframe.shape)
    
    #
    total_price = resources_dataframe.quantity * resources_dataframe.price
    resources_dataframe["total_price"] = total_price
    
    # dropping irrelevant columns
    resources_dataframe = resources_dataframe.drop(["description", "price"], axis=1)
    training_dataframe = training_dataframe.drop(["teacher_id"], axis=1)
    
    # grouping resources data by id
    grouped_resources_dataframe = resources_dataframe.groupby("id", as_index=False, sort=False).sum()
    grouped_resources_dataframe
    
    # merging the two dataframes
    cleaned_df = pd.merge(training_dataframe, grouped_resources_dataframe, how="inner", on=["id"])
    
    # splitting project categories
    cleaned_df[['category_1','category_2','category_3']] = cleaned_df['project_subject_categories'].str.split(',', 3, expand=True)
    
    cleaned_df["total_price_category"] = pd.cut(
        cleaned_df["total_price"], 
        bins=[0,100,250,500,1000,16000], 
        labels=["0-100","101-250","251-500","501-1000",">1000"]
    )
    
    cleaned_df["n_previous_projects"] = pd.cut(
        cleaned_df["teacher_number_of_previously_posted_projects"],
        bins=[-1,1,5,10,25,50,500],
        labels=['0-1','2-5','6-10','11-25','26-50','51+']
    )
    
    cleaned_df["project_submitted_datetime"] = pd.to_datetime(cleaned_df['project_submitted_datetime'])
    cleaned_df["month"] = cleaned_df['project_submitted_datetime'].dt.month
    cleaned_df["quarter"] = cleaned_df['project_submitted_datetime'].dt.quarter
    
    cleaned_df["teacher_prefix"] = cleaned_df["teacher_prefix"].fillna("unknown")
    
    cleaned_df["project_essay_1"] = cleaned_df["project_essay_1"].fillna("")
    cleaned_df["project_essay_2"] = cleaned_df["project_essay_2"].fillna("")
    cleaned_df["project_essay_3"] = cleaned_df["project_essay_3"].fillna("")
    cleaned_df["project_essay_4"] = cleaned_df["project_essay_4"].fillna("")
    
    #cleaned_df["merged_essays"] = cleaned_df['project_title'].astype(str) + " " + cleaned_df['project_essay_1'].astype(str) + " " + cleaned_df['project_essay_2'].astype(str) + " " + cleaned_df['project_essay_3'].astype(str) + " " + cleaned_df['project_essay_4'].astype(str)
    cleaned_df["merged_essays"] = cleaned_df['project_essay_1'].astype(str) + " " + cleaned_df['project_essay_2'].astype(str) + " " + cleaned_df['project_essay_3'].astype(str) + " " + cleaned_df['project_essay_4'].astype(str)
    
    # dropping more columns
    cleaned_df = cleaned_df.drop([
        "project_submitted_datetime", 
        "project_essay_1", 
        "project_essay_2", 
        "project_essay_3", 
        "project_essay_4",
        "quantity",
        "total_price",
        "teacher_number_of_previously_posted_projects"], 
        axis=1
    )
    
    return cleaned_df

# returns a list with the following format
# [
#     ["001101...010101", 1]
#     ["001111...000001", 1]
#     ["101001...111100", 0]
# ]
def convert_to_bits_string(dataframe):
    print(dataframe.shape)
    
    project_grade_category_mapping = {
        'Grades PreK-2':"0000", 
        'Grades 3-5':"0001", 
        'Grades 6-8':"0010", 
        'Grades 9-12':"0011"
    }

    teacher_prefix_mapping = {
        'Ms.':"0000", 
        'Mrs.':"0001", 
        'Mr.':"0010", 
        'Teacher':"0011", 
        'Dr.':"0100", 
        'unknown':"0101"
    }

    n_previous_projects_mapping = {
        '0-1':"0000",
        '2-5':"0001",
        '6-10':"0010",
        '11-25':"0011",
        '26-50':"0100",
        '51+':"0101"
    }

    total_price_category_mapping = {
        "0-100":"0000",
        "101-250":"0001",
        "251-500":"0010",
        "501-1000":"0011",
        ">1000":"0100"
    }
    
    combined_input_and_expected_output = []
    input_list = []
    expected_output_list = []
    
    n = 0
    for index, row in dataframe.iterrows():
        #print(row)
        #if n >= 10:
        #    break
        #n = n + 1
        
        bits_string = project_grade_category_mapping[row["project_grade_category"]]
        bits_string = bits_string + teacher_prefix_mapping[row["teacher_prefix"]]
        bits_string = bits_string + n_previous_projects_mapping[row["n_previous_projects"]]
        bits_string = bits_string + total_price_category_mapping[row["total_price_category"]]
        
        bit_int_list = [int(c) for c in bits_string]
        expected_output = str(row["project_is_approved"])
        
        input_list.append(bit_int_list)
        expected_output_list.append(expected_output)
        
        combined_input_and_expected_output.append([bit_int_list, expected_output])
        
    return input_list, expected_output_list, combined_input_and_expected_output



In [5]:
trainig_df = preprocess(train_data, resources_data)
trainig_df

(145864, 16)
(1541272, 4)


Unnamed: 0,id,teacher_prefix,school_state,project_grade_category,project_subject_categories,project_subject_subcategories,project_title,project_resource_summary,project_is_approved,category_1,category_2,category_3,total_price_category,n_previous_projects,month,quarter,merged_essays
0,p036502,Ms.,NV,Grades PreK-2,Literacy & Language,Literacy,Super Sight Word Centers,My students need 6 Ipod Nano's to create and d...,1,Literacy & Language,,,501-1000,26-50,11,4,Most of my kindergarten students come from low...
1,p233823,Ms.,UT,Grades 3-5,"Math & Science, Literacy & Language","Applied Sciences, Literature & Writing",Lets 3Doodle to Learn,My students need the 3doodler. We are an SEM s...,1,Math & Science,Literacy & Language,,251-500,2-5,1,1,Hello;\r\nMy name is Mrs. Brotherton. I teach ...
2,p185307,Mr.,NC,Grades 3-5,Health & Sports,Health & Wellness,"\""Kid Inspired\"" Equipment to Increase Activit...",My students need balls and other activity equi...,0,Health & Sports,,,501-1000,11-25,8,3,My students are the greatest students but are ...
3,p013780,Mr.,CA,Grades 6-8,Health & Sports,Health & Wellness,We need clean water for our culinary arts class!,My students need a water filtration system for...,1,Health & Sports,,,501-1000,26-50,8,3,My students are athletes and students who are ...
4,p114989,Ms.,IN,Grades 6-8,Math & Science,Mathematics,Wobble Chairs Help Fidgety Kids Focus,My students need seating that allows the most ...,1,Math & Science,,,251-500,11-25,9,3,My seventh graders dream big. They can't wait ...
5,p191410,Mrs.,IL,Grades PreK-2,Literacy & Language,Literacy,Snuggle Up With A Good Book,My students need 2 youth sized reclining chair...,1,Literacy & Language,,,101-250,11-25,9,3,I teach first grade in a small farming town in...
6,p226941,Mrs.,CA,Grades PreK-2,"Literacy & Language, Math & Science","Literacy, Mathematics",Technology Boost!,My students need a projector and tablets to al...,1,Literacy & Language,Math & Science,,501-1000,0-1,9,3,My children come to school everyday with the s...
7,p173555,Mrs.,NY,Grades 3-5,Literacy & Language,Literature & Writing,Extra! Extra! Read all about it!! We love to ...,"My students need good books, with life lessons...",0,Literacy & Language,,,501-1000,2-5,11,4,"Each day my fifth graders walk into our \""home..."
8,p055350,Ms.,FL,Grades PreK-2,Special Needs,Special Needs,Flexible seating,My students need flexible seating options to p...,1,Special Needs,,,251-500,0-1,2,1,The children at our school come from a variety...
9,p060293,Mrs.,NJ,Grades 3-5,"Literacy & Language, Math & Science","Literacy, Mathematics",A Comfy Seat!,My students need cushions to sit on during cen...,1,Literacy & Language,Math & Science,,101-250,11-25,11,4,My third graders are eager to learn new concep...


In [6]:
test_df = preprocess(test_data, resources_data)
test_df

(36216, 16)
(1541272, 5)


Unnamed: 0,id,teacher_prefix,school_state,project_grade_category,project_subject_categories,project_subject_subcategories,project_title,project_resource_summary,project_is_approved,category_1,category_2,category_3,total_price_category,n_previous_projects,month,quarter,merged_essays
0,p039565,Mrs.,GA,Grades 3-5,"Music & The Arts, Health & Sports","Performing Arts, Team Sports",Keep Calm and Dance On,My students need matching shirts to wear for d...,0,Music & The Arts,Health & Sports,,251-500,0-1,4,2,Our elementary school is a culturally rich sch...
1,p063374,Mrs.,DE,Grades PreK-2,"Applied Learning, Literacy & Language","Character Education, Literature & Writing",Need to Reach Our Virtual Mentors!!!,My students need tablets in order to communic...,1,Applied Learning,Literacy & Language,,501-1000,0-1,11,4,My kids tell me each day that they want to mak...
2,p103285,Mrs.,MO,Grades PreK-2,Health & Sports,Health & Wellness,Active Kindergartners,My students need stability stools and inflatab...,1,Health & Sports,,,251-500,0-1,8,3,Kindergarten is the new first grade. My studen...
3,p181781,Mrs.,SC,Grades PreK-2,"Applied Learning, Literacy & Language","Early Development, Literature & Writing",Fabulous Firsties-Wiggling to Learn!,My students need wiggle stools to allow them t...,1,Applied Learning,Literacy & Language,,251-500,0-1,8,3,First graders are fantastic! They are excited ...
4,p030093,Teacher,VA,Grades PreK-2,"Literacy & Language, Math & Science","Literature & Writing, Mathematics","Writing, Writing, Writing!",My students need individual dry erase boards t...,1,Literacy & Language,Math & Science,,101-250,0-1,3,1,My classroom has 24 students in it. We have an...
5,p225747,Ms.,PA,Grades 3-5,"Applied Learning, Literacy & Language","Character Education, Literacy",Life Lessons through Literacy,My students need books that teach life lessons...,1,Applied Learning,Literacy & Language,,101-250,2-5,12,4,"My school is located in a high poverty area, j..."
6,p074849,Mrs.,IN,Grades PreK-2,Math & Science,"Health & Life Science, Mathematics",Taking Learning Outside of the Classroom!,My students need engaging math and science hom...,1,Math & Science,,,501-1000,51+,1,1,"\""Learning is an active process. We learn by d..."
7,p230221,Mrs.,PA,Grades 6-8,Math & Science,Applied Sciences,A-Maze-ing Problem Solving Strategies,My students need opportunities to increase the...,1,Math & Science,,,251-500,0-1,8,3,As a teacher in a low-income/high poverty scho...
8,p070918,Mr.,TX,Grades PreK-2,"Literacy & Language, Math & Science","Literature & Writing, Mathematics",Mini Devices With Many Opportunities,"My students need four iPad minis, and four pro...",1,Literacy & Language,Math & Science,,>1000,0-1,6,2,"\""You can dream, create, design, and build the..."
9,p116102,Mrs.,NC,Grades 6-8,"Math & Science, Special Needs","Mathematics, Special Needs",Balance Ball Seating Needed for Engaging Group...,My students need balance ball chairs for the g...,1,Math & Science,Special Needs,,251-500,26-50,8,3,As a teacher in a low-income/high poverty scho...


In [7]:
training_input, expected_output, training_combined = convert_to_bits_string(trainig_df)
test_input, test_expected_output, test_combined = convert_to_bits_string(test_df)

(145864, 17)
(36216, 17)


In [8]:
print(len(training_input))
print(len(expected_output))

print(len(test_input))
print(len(test_expected_output))

145864
145864
36216
36216


In [9]:
print(training_input[0])
print(training_combined[0])

print(training_input[1])
print(training_combined[1])

print(test_input[0])
print(test_combined[0])

print(test_input[1])
print(test_combined[1])

[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1]
[[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1], '1']
[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0]
[[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0], '1']
[0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0]
[[0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0], '0']
[0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1]
[[0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1], '1']


In [10]:
#retina_length = 16
#num_bits_addr = 16
#bleaching = False

#print(len(training_input[0]))

w = WiSARD(num_bits_addr = 2, bleaching = False)

w.fit(training_input, expected_output)

#wann = Wisard(2, 3546)
#wann.train(training_input, expected_output)


In [11]:
#eo = np.sum(expected_output)

print(expected_output[0:10])

# In-sample performance
#print()

['1', '1', '0', '1', '1', '1', '1', '0', '1', '1']


In [12]:
#Evaluates Guilherme's wisard implementation
#def evaluate_performance(test_data_combined):
#    correct_predictions = 0
#    for combined in test_data_combined:
#        if wann.predict(combined[0])["class"] == combined[1]:
#            correct_predictions = correct_predictions + 1
#    return correct_predictions

In [13]:
# First results, used as reference for improvement:
# In-sample performance:  0.711748840665465
# Expected out-sample performance:  0.585333986607872

# Evaluates Guilherme's wisard implementation
#print("In-sample performance: ", evaluate_performance(training_combined) / len(training_input))
#print("Expected out-sample performance: ", evaluate_performance(test_combined) / len(test_input))

In [14]:
# Saving reference
#reference = {
#    "train_data": train_data, 
#    "training_input": training_input, 
#    "expected_output": expected_output, 
#    "training_combined": training_combined, 
#    "test_data": test_data, 
#    "test_input": test_input, 
#    "test_expected_output": test_expected_output, 
#    "test_combined": test_combined, 
#    "wann": wann
#}

#import pickle
#pickle.dump(reference, open("reference.pickle", "wb"))

In [15]:
# Saving last
#reference = {
#    "train_data": train_data, 
#    "training_input": training_input, 
#    "expected_output": expected_output, 
#    "training_combined": training_combined, 
#    "test_data": test_data, 
#    "test_input": test_input, 
#    "test_expected_output": test_expected_output, 
#    "test_combined": test_combined, 
#    "wann": wann
#}

#import pickle
#pickle.dump(reference, open("last.pickle", "wb"))

In [16]:
#for discriminator in wann.discriminators:
#    print(discriminator.input_class)
#    print(discriminator.input_length)
#    print(discriminator.tupple_size)
#    print(discriminator.memory)

In [18]:
print(w.classes_)

print(test_combined[0])
print(np.array(test_combined[0][0]))
print(type(np.array(test_combined[0][0])))
print(w.predict(test_input[:10]))

dict_keys(['0', '1'])
[[0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0], '0']
[0 0 0 1 0 0 0 1 0 0 0 0 0 0 1 0]
<class 'numpy.ndarray'>


TypeError: 'dict_keys' object does not support indexing