In [1]:
import pandas as pd
from matplotlib import pyplot as plt
import numpy as np
from WiSARD import WiSARD
from Wisard import Wisard

from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from nltk.stem import *

import json


KeyboardInterrupt: 

In [None]:
# Locations of the input CSV files, relative to the notebook's working directory.
train_file_path, test_file_path, resources_file_path = 'train.csv', 'test.csv', 'resources.csv'

# Parse the resource requests and the labelled projects into DataFrames.
# The file at `test_file_path` is intentionally not read in this cell.
resources_data = pd.read_csv(resources_file_path, sep=',')
train_data = pd.read_csv(train_file_path, sep=',')

In [None]:
# The official test dataset has no labels (it can only be validated on Kaggle's
# website), so the labelled data is split into ~80% for training and ~20% that is
# held out as a local test set.
# NOTE: `train_data` is rebound to the 80% sample below, so re-running this cell
# without re-running the loading cell splits the already-reduced frame again.

approved_total = train_data["project_is_approved"].sum()
print(approved_total)
print("Percent aproved: ", float(approved_total) / float(len(train_data)), "\n")

# Fixed random_state keeps the split reproducible between runs.
train = train_data.sample(random_state=200, frac=0.8)
test_data = train_data.drop(index=train.index)
train_data = train

for subset in (train_data, test_data):
    print(len(subset))

In [None]:
# The preprocessing part is partially based on the following "kernel" on Kaggle:
# https://www.kaggle.com/jgoldberg/donorschoose-eda-text-classification/notebook

def preprocess(training_dataframe, resources_dataframe):
    """Merge project rows with their total resource cost and derive categorical features.

    Parameters
    ----------
    training_dataframe : pandas.DataFrame
        One row per project. Columns read here: ``id``, ``teacher_id``,
        ``teacher_prefix``, ``project_subject_categories``,
        ``project_submitted_datetime``, ``project_essay_1`` .. ``project_essay_4``
        and ``teacher_number_of_previously_posted_projects``.
    resources_dataframe : pandas.DataFrame
        One row per requested resource, with ``id``, ``description``,
        ``quantity`` and ``price``. The caller's frame is not modified.

    Returns
    -------
    pandas.DataFrame
        The inner join of both inputs on ``id`` with the added columns
        ``category_1``..``category_3``, ``total_price_category``,
        ``n_previous_projects``, ``month``, ``quarter`` and ``merged_essays``;
        the raw essay, datetime, quantity, price and teacher-id columns are dropped.
    """
    print(training_dataframe.shape)
    print(resources_dataframe.shape)

    # Cost of every resource line. `assign` works on a copy, so the frame passed
    # in by the caller does not silently gain a "total_price" column.
    priced_resources = resources_dataframe.assign(
        total_price=resources_dataframe.quantity * resources_dataframe.price
    )

    # dropping irrelevant columns
    priced_resources = priced_resources.drop(["description", "price"], axis=1)
    training_dataframe = training_dataframe.drop(["teacher_id"], axis=1)

    # grouping resources data by id (sums quantity and total_price per project)
    grouped_resources_dataframe = priced_resources.groupby("id", as_index=False, sort=False).sum()

    # merging the two dataframes
    cleaned_df = pd.merge(training_dataframe, grouped_resources_dataframe, how="inner", on=["id"])

    # splitting project categories
    # n=2 caps the result at three pieces, and the reindex pads it to exactly three
    # columns, so the assignment works whatever the largest number of categories
    # in this particular frame is.
    split_categories = (
        cleaned_df["project_subject_categories"]
        .str.split(", ", n=2, expand=True)
        .reindex(columns=range(3))
    )
    for position, column in enumerate(["category_1", "category_2", "category_3"]):
        cleaned_df[column] = split_categories[position]

    # category_1 is deliberately left unfilled; only the second category gets a placeholder.
    cleaned_df["category_2"] = cleaned_df["category_2"].fillna("Not Informed")

    # The last bin is open-ended so that values above the previous hard-coded
    # ceilings land in the top bucket instead of becoming NaN.
    cleaned_df["total_price_category"] = pd.cut(
        cleaned_df["total_price"],
        bins=[0, 100, 250, 500, 1000, float("inf")],
        labels=["0-100", "101-250", "251-500", "501-1000", ">1000"]
    )

    cleaned_df["n_previous_projects"] = pd.cut(
        cleaned_df["teacher_number_of_previously_posted_projects"],
        bins=[-1, 1, 5, 10, 25, 50, float("inf")],
        labels=['0-1', '2-5', '6-10', '11-25', '26-50', '51+']
    )

    cleaned_df["project_submitted_datetime"] = pd.to_datetime(cleaned_df['project_submitted_datetime'])
    cleaned_df["month"] = cleaned_df['project_submitted_datetime'].dt.month
    cleaned_df["quarter"] = cleaned_df['project_submitted_datetime'].dt.quarter

    cleaned_df["teacher_prefix"] = cleaned_df["teacher_prefix"].fillna("unknown")

    # Missing essays become empty strings so they can be concatenated below.
    essay_columns = ["project_essay_1", "project_essay_2", "project_essay_3", "project_essay_4"]
    for essay_column in essay_columns:
        cleaned_df[essay_column] = cleaned_df[essay_column].fillna("")

    # The project title is intentionally not part of the merged text.
    merged_essays = cleaned_df[essay_columns[0]].astype(str)
    for essay_column in essay_columns[1:]:
        merged_essays = merged_essays + " " + cleaned_df[essay_column].astype(str)
    cleaned_df["merged_essays"] = merged_essays

    # dropping more columns
    cleaned_df = cleaned_df.drop(
        ["project_submitted_datetime"]
        + essay_columns
        + ["quantity", "total_price", "teacher_number_of_previously_posted_projects"],
        axis=1
    )

    return cleaned_df

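# returns three lists: the encoded inputs, the expected outputs, and their pairing.
# The pairing (third list) has the following format, where each input is a list of
# 0/1 integers (written here as a string for brevity) and each label is a string:
# [
#     ["001101...010101", "1"]
#     ["001111...000001", "1"]
#     ["101001...111100", "0"]
# ]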
def convert_to_bits_string(dataframe):
    """Encode every row of ``dataframe`` as a flat list of 0/1 integers plus its label.

    Eight fields are encoded, each through a fixed lookup table of bit strings,
    and the segments are concatenated in this order: grade category, teacher
    prefix, number of previous projects, total price category, month, quarter,
    first category and second category. A value missing from its table raises
    ``KeyError``.

    Returns a tuple ``(input_list, expected_output_list, combined)`` where
    ``input_list`` holds the bit lists, ``expected_output_list`` holds
    ``str(project_is_approved)`` for every row, and ``combined`` holds
    ``[bit_list, expected_output]`` pairs that share the same bit-list objects.
    """
    print(dataframe.shape)

    grade_codes = {
        'Grades PreK-2': "00000000000000000000000000000000000000000000000000",
        'Grades 3-5': "10000000000000000000000000000000000000000000000000",
        'Grades 6-8': "11000000000000000000000000000000000000000000000000",
        'Grades 9-12': "11100000000000000000000000000000000000000000000000",
    }

    prefix_codes = {
        'Ms.': "11110000000000000000000000000000000000000000000000",
        'Mrs.': "11111000000000000000000000000000000000000000000000",
        'Mr.': "11111100000000000000000000000000000000000000000000",
        'Teacher': "11111110000000000000000000000000000000000000000000",
        'Dr.': "11111111000000000000000000000000000000000000000000",
        'unknown': "11111111100000000000000000000000000000000000000000",
    }

    previous_projects_codes = {
        '0-1': "11111111110000000000000000000000000000000000000000",
        '2-5': "11111111111000000000000000000000000000000000000000",
        '6-10': "11111111111100000000000000000000000000000000000000",
        '11-25': "11111111111110000000000000000000000000000000000000",
        '26-50': "11111111111111000000000000000000000000000000000000",
        '51+': "11111111111111100000000000000000000000000000000000",
    }

    price_codes = {
        "0-100": "11111111111111110000000000000000000000000000000000",
        "101-250": "11111111111111111000000000000000000000000000000000",
        "251-500": "11111111111111111100000000000000000000000000000000",
        "501-1000": "11111111111111111110000000000000000000000000000000",
        ">1000": "11111111111111111111000000000000000000000000000000",
    }

    month_codes = {
        "1": "11111111111111111111100000000000000000000000000000",
        "2": "11111111111111111111110000000000000000000000000000",
        "3": "11111111111111111111111000000000000000000000000000",
        "4": "11111111111111111111111100000000000000000000000000",
        "5": "11111111111111111111111110000000000000000000000000",
        "6": "11111111111111111111111111000000000000000000000000",
        "7": "11111111111111111111111111100000000000000000000000",
        "8": "11111111111111111111111111110000000000000000000000",
        "9": "11111111111111111111111111111000000000000000000000",
        "10": "11111111111111111111111111111100000000000000000000",
        "11": "11111111111111111111111111111110000000000000000000",
        "12": "11111111111111111111111111111111000000000000000000",
    }

    quarter_codes = {
        "1": "11111111111111111111111111111111100000000000000000",
        "2": "11111111111111111111111111111111110000000000000000",
        "3": "11111111111111111111111111111111111000000000000000",
        "4": "11111111111111111111111111111111111111111111100000",
    }

    category_codes = {
        "Not Informed": "11111111111111111111111111111111111100000000000000",
        "Applied Learning": "11111111111111111111111111111111111110000000000000",
        "Health & Sports": "11111111111111111111111111111111111111000000000000",
        "History & Civics": "11111111111111111111111111111111111111100000000000",
        "Literacy & Language": "11111111111111111111111111111111111111110000000000",
        "Math & Science": "11111111111111111111111111111111111111111000000000",
        "Music & The Arts": "11111111111111111111111111111111111111111100000000",
        "Special Needs": "11111111111111111111111111111111111111111110000000",
        "Warmth": "11111111111111111111111111111111111111111111000000",
        # Same code as "Warmth": the two labels are treated as one category.
        "Care & Hunger": "11111111111111111111111111111111111111111111000000",
    }

    # (column, code table, key converter) in concatenation order. Month and
    # quarter are stored as numbers, so they are turned into strings before lookup.
    segments = (
        ("project_grade_category", grade_codes, None),
        ("teacher_prefix", prefix_codes, None),
        ("n_previous_projects", previous_projects_codes, None),
        ("total_price_category", price_codes, None),
        ("month", month_codes, str),
        ("quarter", quarter_codes, str),
        ("category_1", category_codes, None),
        ("category_2", category_codes, None),
    )

    input_list, expected_output_list, combined_input_and_expected_output = [], [], []

    for _, row in dataframe.iterrows():
        pieces = []
        for column, codes, convert_key in segments:
            key = row[column]
            if convert_key is not None:
                key = convert_key(key)
            pieces.append(codes[key])

        bit_int_list = list(map(int, "".join(pieces)))
        expected_output = str(row["project_is_approved"])

        input_list.append(bit_int_list)
        expected_output_list.append(expected_output)
        combined_input_and_expected_output.append([bit_int_list, expected_output])

    return input_list, expected_output_list, combined_input_and_expected_output



In [None]:
# Build the training feature frame and preview only its first rows rather than
# rendering the whole frame as cell output.
training_df = preprocess(train_data, resources_data)
training_df.head()

In [None]:
# Build the held-out feature frame and preview only its first rows rather than
# rendering the whole frame as cell output.
test_df = preprocess(test_data, resources_data)
test_df.head()

In [None]:
# Sanity check: list the distinct category values of both splits, then the
# distinct months and quarters present in the held-out split.
for frame in (training_df, test_df):
    for column in ("category_1", "category_2", "category_3"):
        print(frame[column].unique())

for column in ("month", "quarter"):
    print(test_df[column].unique())

In [None]:
# Tokenise the merged essays of the held-out frame, drop English stopwords, store
# each row's word list as a JSON string and collect the overall vocabulary.
tokenizer = RegexpTokenizer(r'[a-z]+')

# Fetch the stopword list once and keep it as a set. Calling
# stopwords.words('english') inside the filter re-evaluated it for every single
# token and made each membership test a linear scan.
english_stopwords = frozenset(stopwords.words('english'))

words_universe_set = set()
serialized_word_lists = []
count = 0

for text in test_df["merged_essays"]:
    words_list = [word for word in tokenizer.tokenize(text.lower()) if word not in english_stopwords]

    words_universe_set.update(words_list)
    serialized_word_lists.append(json.dumps(words_list))

    # Progress report every 100 rows: rows processed and vocabulary size so far.
    count = count + 1
    if count % 100 == 0:
        print(count)
        print(len(words_universe_set), "\n")

# Assign the whole column at once instead of writing one cell per row with .loc.
test_df["words_list"] = serialized_word_lists

print(len(words_universe_set))

In [None]:
from nltk.corpus import wordnet


def _collapse_to_first_lemma(words):
    """Map each word to the first lemma name of its first WordNet synset.

    Returns ``(lemma_names, unrecognized_count)``: the set of lemma names that
    were found, and how many of ``words`` had no synset at all.
    """
    lemma_names = set()
    unrecognized_count = 0
    for word in words:
        # Look the word up once and reuse the result.
        syns = wordnet.synsets(word)
        if syns:
            lemma_names.add(syns[0].lemmas()[0].name())
        else:
            unrecognized_count = unrecognized_count + 1
    return lemma_names, unrecognized_count


# First pass: collapse the essay vocabulary.
syngroups, unrecognized = _collapse_to_first_lemma(words_universe_set)
print(len(syngroups))
print(unrecognized)

# Second pass: collapse the names produced by the first pass once more.
syngroups_level_2, unrecognized = _collapse_to_first_lemma(syngroups)
# Report the size of the second-level set; this previously printed
# len(syngroups) again, hiding the effect of the second pass.
print(len(syngroups_level_2))
print(unrecognized)

In [None]:
# Ad-hoc probe: display the synsets WordNet returns for the token "sus".
wordnet.synsets("sus")

In [None]:
# Encode both splits for the network. Each call yields the bit-list inputs, the
# labels as strings, and a list of [bit_list, label] pairs used for evaluation.
training_input, expected_output, training_combined = convert_to_bits_string(training_df)
test_input, test_expected_output, test_combined = convert_to_bits_string(test_df)

In [None]:
# Sizes of the encoded inputs and labels: training split first, then held-out split.
for collection in (training_input, expected_output, test_input, test_expected_output):
    print(len(collection))

In [None]:
# To get Firmino's implementation to predict anything other than all ones,
# I had to train on only the first 100 observations, set the tuple
# size to 50 or more and set bleaching to True.
#w = WiSARD(num_bits_addr = 50, bleaching = True)

#a_training_input = training_input[:100]
#a_expected_output = expected_output[:100]

#w.fit(a_training_input, a_expected_output)

# Using the personal Wisard implementation instead.
# NOTE(review): the meaning of the two positional arguments (50, 3546) is defined
# by the external Wisard module; 50 is presumably the tuple size - confirm.

wann = Wisard(50, 3546)
wann.train(training_input, expected_output)


In [None]:
#eo = np.sum(expected_output)

#print(expected_output[0:100])

# In-sample performance
#print()

In [None]:
#Evaluates Guilherme's wisard implementation
#def evaluate_performance(test_data_combined):
#    correct_predictions = 0
#    for combined in test_data_combined:
#        prediction = wann.predict(combined[0])
#        #print(prediction)
#        prediction = prediction["class"]
#        expected = combined[1]
#        #print(prediction, expected)
#        if prediction == expected:
#            #print("Correct!")
#            correct_predictions = correct_predictions + 1
#    print(correct_predictions)
#    return correct_predictions

def _tally_predictions(test_data_combined, predict_label):
    """Score predictions against expected labels and print a summary report.

    ``test_data_combined`` is a sequence of ``[encoded_input, expected_label]``
    pairs and ``predict_label`` is a callable that maps one encoded input to its
    predicted label. Per-class counters only track the labels "0" and "1"; any
    other predicted label still counts towards the overall correct/wrong totals.

    Returns the number of correct predictions.
    """
    correct_predictions = 0
    wrong_predictions = 0
    predicted = {"0": 0, "1": 0}
    correct = {"0": 0, "1": 0}
    wrong = {"0": 0, "1": 0}

    for combined in test_data_combined:
        prediction = predict_label(combined[0])
        expected = combined[1]

        if prediction == "0":
            label = "0"
        elif prediction == "1":
            label = "1"
        else:
            label = None

        if prediction == expected:
            correct_predictions = correct_predictions + 1
            outcome = correct
        else:
            wrong_predictions = wrong_predictions + 1
            outcome = wrong

        if label is not None:
            predicted[label] = predicted[label] + 1
            outcome[label] = outcome[label] + 1

    print("Number of observations: ", len(test_data_combined))
    print("Predicted correctly: ", correct_predictions)
    print("Predicted wrongly: ", wrong_predictions)
    print("Predicted zeros: ", predicted["0"])
    print("Predicted ones: ", predicted["1"])
    print("Zeros correct: ", correct["0"])
    print("Ones correct: ", correct["1"])
    print("Zeros wrong: ", wrong["0"])
    print("Ones Wrong: ", wrong["1"])
    return correct_predictions


def evaluate_performance(test_data_combined, model=None):
    """Evaluate the personal Wisard implementation and print a summary report.

    ``model`` must expose ``predict(encoded_input)`` returning a mapping whose
    "class" entry is the predicted label. When omitted, the notebook-level
    ``wann`` network is used, as before.

    Returns the number of correct predictions.
    """
    if model is None:
        model = wann
    return _tally_predictions(
        test_data_combined,
        lambda encoded_input: model.predict(encoded_input)["class"],
    )


#Evaluates Firmino's wisard implementation
def evaluate_performance2(test_data_combined, model=None):
    """Evaluate Firmino's WiSARD implementation and print a summary report.

    ``model`` must expose ``predict(batch)`` taking a list of encoded inputs and
    returning one label per input. When omitted, the notebook-level ``w`` is
    used; it must have been created beforehand (the cell that builds it is
    currently commented out).

    Returns the number of correct predictions.
    """
    if model is None:
        model = w
    return _tally_predictions(
        test_data_combined,
        lambda encoded_input: model.predict([encoded_input])[0],
    )

In [None]:
# First results, used as reference for improvement:
# In-sample performance:  0.711748840665465
# Expected out-sample performance:  0.585333986607872

# Evaluates Guilherme's wisard implementation
#print("In-sample performance: ", evaluate_performance(training_combined[:5]) / len(training_combined[:5]))
#print("Expected out-sample performance: ", evaluate_performance(test_combined[:5]) / len(test_combined[:5]))
#print("In-sample performance: ", evaluate_performance(training_combined) / len(training_combined))
#print("Expected out-sample performance: ", evaluate_performance(test_combined) / len(test_combined))

# Accuracy on the training split, followed by its class balance for reference.
train_labels = train_data["project_is_approved"]
print("In-sample performance: ", float(evaluate_performance(training_combined)) / float(len(training_combined)))
print("Ones distribution: ", float(train_labels.sum()) / float(len(train_labels)))
# Zeros are the rows that are not ones (len - sum); the reversed subtraction
# used before printed a negative count.
print("Ones: ", train_labels.sum(), "Zeros: ", len(train_labels) - train_labels.sum())
print("\n")
# Same report for the held-out split.
test_labels = test_data["project_is_approved"]
print("Expected out-sample performance: ", float(evaluate_performance(test_combined)) / float(len(test_combined)))
print("Ones distribution: ", float(test_labels.sum()) / float(len(test_labels)))
print("Ones: ", test_labels.sum(), "Zeros: ", len(test_labels) - test_labels.sum())


# Evaluates Firmino's wisard implementation
#print("In-sample performance: ", float(evaluate_performance2(training_combined)) / float(len(training_combined)))
#print("Ones distribution: ", float(train_data["project_is_approved"].sum()) / float(len(train_data["project_is_approved"])))
#print("Ones: ", train_data["project_is_approved"].sum(), "Zeros: ", train_data["project_is_approved"].sum() - len(train_data["project_is_approved"]))
#print("\n")
#print("Expected out-sample performance: ", float(evaluate_performance2(test_combined)) / float(len(test_combined)))
#print("Ones distribution: ", float(test_data["project_is_approved"].sum()) / float(len(test_data["project_is_approved"])))
#print("Ones: ", test_data["project_is_approved"].sum(), "Zeros: ", (test_data["project_is_approved"].sum() - len(test_data["project_is_approved"])))

In [None]:
# Dump the internal state of every discriminator of the trained network.
for discriminator in wann.discriminators:
    for attribute in ("input_class", "input_length", "tupple_size", "memory"):
        print(getattr(discriminator, attribute))