In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings as w
w.filterwarnings('ignore')

In [None]:
raw_data_train = pd.read_csv('../../DataSets/NumericCategorical/TrainingData.csv')

In [None]:
raw_data_test = pd.read_csv('../../DataSets/NumericCategorical/TestData.csv')
raw_data_submission_format = pd.read_csv('../../DataSets/NumericCategorical/SubmissionFormat.csv')

In [None]:
raw_data_train.head()

In [None]:
raw_data_train.shape   # (18) Features + (9) Labels 

# Preprocessing

### First we have to prepare our training and testing datasets in a layout similar to the submission format dataset

In [None]:
raw_data_submission_format.head()

## It contains the probability of each label category

In [None]:
raw_data_submission_format.shape

In [None]:
submission_col = raw_data_submission_format.columns.tolist()
submission_col

In [None]:
# Separate the labels: start the label frame with the 'Unnamed: 0' column of
# the training data so the rows keep their original order/identifier.
train_labels = pd.DataFrame(raw_data_train['Unnamed: 0']) 
train_labels.head()

In [None]:
######################
# create TrainLabels #
######################
# Each submission column is named '<label>__<category>'. For every such column
# add a 0/1 indicator to train_labels: 1 where the training row's <label>
# value equals <category>, 0 otherwise. The 'Unnamed: 0' column is skipped.
label_columns = [name for name in submission_col if name != 'Unnamed: 0']
for name in label_columns:
    pieces = name.split('__')
    is_match = raw_data_train[pieces[0]] == pieces[1]
    train_labels[name] = np.where(is_match, 1, 0)

In [None]:
sns.heatmap(raw_data_train.isna())
plt.show()

In [None]:
raw_data_train.isna().sum()

In [None]:
print(raw_data_train.columns.shape) # 15 + (1 unnamed + 1 ID + 1 total) features  9 labels
print(raw_data_test.columns.shape)  # 15 + (1 unnamed + 1 ID + 1 total) features  11 unnamed (all empty)        

In [None]:
print(train_labels.shape)
train_labels.head() # One hot encoded labels 
# 9 labels with 104 categories

In [None]:
# Save the train labels to local disk
#train_labels.to_csv('train_labels.csv',index=False)

In [None]:
###########################
### create  Train_data ####
###########################
# this file contains only features which are given during the testing

In [None]:
# these are the labels in dataset each contains different categories
# These nine columns are the labels (each holds several categories), not
# features, so they are removed from the training frame one by one.
for target_column in ('Function', 'Use', 'Sharing', 'Reporting', 'Student_Type',
                      'Position_Type', 'Object_Type', 'Pre_K', 'Operating_Status'):
    del raw_data_train[target_column]

In [None]:
raw_data_train.shape

In [None]:
raw_data_train.head()  # Job_Title_Description contains words with 'comma' 
# We need to remove them to make a 'Bag of words'

In [None]:
raw_data_train.info()

In [None]:
# Here you can see that only 'Unnamed: 0' (which is actually the ID) contains
# all 400,277 non-null values; the other columns have some missing values.

# 'object' is the text data type; int and float are numerical data types.
# 'Unnamed: 26' is an empty column.

In [None]:
# Remove commas and double quotes from every text (object-dtype) column.
# A comma becomes a space, not an empty string: that keeps "a,b" as two
# tokens for the bag-of-words features and matches the clean-up applied to
# the test data, so train and test rows hash to the same features.
# regex=False makes both replacements literal.
for column in raw_data_train.columns.tolist():
    if raw_data_train[column].dtype == 'object':
        raw_data_train[column] = (
            raw_data_train[column]
            .str.replace(',', ' ', regex=False)
            .str.replace('"', '', regex=False)
        )

In [None]:
sns.heatmap(raw_data_train.isnull())
plt.show() # light pink indicates null values

In [None]:
del raw_data_train['Unnamed: 26']

In [None]:
raw_data_train.isna().sum() # Now it's better

In [None]:
#raw_data_train.to_csv('train_data.csv',index=False)

In [None]:
###########################
## Create test data file ##
###########################

In [None]:
raw_data_test.shape

In [None]:
raw_data_train.shape

In [None]:
print(raw_data_train.columns.shape) # 15 + (1 unnamed + 1 ID + 1 total) features  9 labels
print(raw_data_test.columns.shape)  # 15 + (1 unnamed + 1 ID + 1 total) features  11 unnamed (all empty)        

In [None]:
# Collect and print the columns that are in the test data but not in the
# training data.
l = [name for name in raw_data_test.columns if name not in raw_data_train.columns]
for name in l:
    print(name)  # these columns are not in train data sets

In [None]:
# order of columns  must be same 
train_col_order = raw_data_train.columns.tolist()

In [None]:
raw_data_test.info()

In [None]:
# Text clean-up for the test set: in every object-dtype column turn commas
# into spaces and drop double quotes.
text_columns = [name for name in raw_data_test.columns.tolist()
                if raw_data_test[name].dtype == 'object']
for name in text_columns:
    without_commas = raw_data_test[name].str.replace(',',' ')
    raw_data_test[name] = without_commas.str.replace('"','')


In [None]:
raw_data_test[train_col_order].head()

In [None]:
#raw_data_test[train_col_order].to_csv('test_data.csv',index=False)

In [None]:
#train_data.iloc[:1000,:].to_csv('train_data.csv',index = False)
#test_data.iloc[:1000,:].to_csv('test_data.csv',index = False)

# Time to create our Online Model for Semi-Structured Text Classification

In [2]:
import math
import pickle
import random
import re
import sys
import zlib
from datetime import datetime
from math import exp, log, sqrt

In [None]:
#  if you use console screen to get input from user

# what we want from console python.py epochs probability_threshold
# trainprediction.py  4  0.5
#  if len(sys.argv) != 3:  
#   print('Usage: pypy Online.py <epochs> <use_example_probability>')
#   print('epochs is number of passes over the training data')
#   print('use_example_probability is the probability of using an example in an epoch')
#   sys.exit(0)

In [None]:
#epochs = int(sys.argv[1])
#print("Number of epochs:",epochs)

In [None]:
# This is a cheap way to add randomness to the order of training examples
# but use with caution as it does not guarantee all training examples will be seen.
# Use 1 if you want to train in order examples appear in the file
# use_example_probability = float(sys.argv[2]) 
#print ("Use Example Probability:",use_example_probability)

In [None]:
# Specify which original features to keep and discard in the model
#####################################
## Feature_name  ##  columns_index ##
#####################################
# ID's                   =   00     #
# Object_Description     =   01     #
# Text_2                 =   02     #
# SubFund_Description    =   03     #
# Job_Title_Description  =   04     #
# Text_3                 =   05     #
# Text_4                 =   06     #
# Sub_Object_Description =   07     #
# Location_Description   =   08     #
# FTE                    =   09     #
# Function_Description   =   10     #
# Facility_or_Department =   11     #
# Position_Extra         =   12     #
# Total                  =   13     #
# Program_Description    =   14     #
# Fund_Description       =   15     #
# Text_1                 =   16     #
#####################################

# We have to prepare a script to which we just pass test_data with noise, and the script needs to take only these columns

In [None]:
# Now restart the kernel so that all other variables are deleted

In [3]:
# File locations and run settings used by the training / prediction cells.
train = 'train_data.csv'  # path to training file
# NOTE(review): the (commented-out) save earlier in this notebook writes
# 'train_labels.csv' (with an "s") - confirm which name exists on disk.
label = 'train_label.csv'  # path to label file of training data
test  = 'test_data.csv'  # path to testing file

# epochs: number of passes over the training data.
epochs = 4
# use_example_probability: probability of using an example in an epoch
# (1 trains on every example, in file order).
use_example_probability = 0.5


In [4]:
train_data = pd.read_csv('train_data.csv')
test_data  = pd.read_csv('test_data.csv')

In [5]:
print(train_data.shape)
print(test_data.shape)

(1000, 17)
(1000, 17)


In [None]:
plt.figure(figsize=(8,8))
sns.heatmap(train_data.isna())
plt.show()

In [None]:
# Map every training column to its number of distinct values.
d = {name: len(train_data[name].unique()) for name in train_data.columns.tolist()}
d

In [None]:
sorted(d.items(), key = lambda kv : kv[1])

In [6]:
# preparing import features
originals = list(range(17))
originals  # all columns

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]

In [7]:
# Found that removing index 5 (Text_3) and index 7 (Sub_Object_Description) generally helped.
# Filtering instead of list.remove() keeps this cell safe to re-run: running
# it a second time leaves the list unchanged instead of raising ValueError.
originals = [index for index in originals if index not in (5, 7)]
originals # new columns

[0, 1, 2, 3, 4, 6, 8, 9, 10, 11, 12, 13, 14, 15, 16]

In [8]:
from collections import Counter

In [9]:
# Interaction features, given as column indices into a raw CSV row.
# pairs:   each inner list is a group; data() hashes every two-column
#          combination inside a group (e.g. [1,2,3,4] yields 6 pair features).
# triples: each inner list names three columns that data() hashes together
#          as a single feature.
pairs = [[1,2,3,4],[6,8],[4,12],[1,4,8,10]]
triples = [[1,4,12]]

In [10]:
print('pairs',pairs)
print('triples',triples)

pairs [[1, 2, 3, 4], [6, 8], [4, 12], [1, 4, 8, 10]]
triples [[1, 4, 12]]


In [11]:
D = 2**18 # number of weights used for each model (hash_it maps features into 0..D-1); we have 104 models
alpha = 0.10 # learning rate for sgd optimization
print(D,alpha)
epochs , use_example_probability

262144 0.1


(4, 0.5)

In [23]:
# utilities ############################################

# Turns a numeric feature into a categorical level by binning it.
# INPUT:
#     feat: the numerical predictor (anything float() accepts)
#     b: list of bin boundaries, tested in the order given
# OUTPUT:
#     the position of the first boundary that is strictly greater than feat,
#     or len(b) when no boundary is greater (levels therefore run 0..len(b))
def boundary(feat,b):
    value = float(feat)
    return next((level for level, edge in enumerate(b) if value < edge), len(b))

# Our hashing function
# Uses CRC-32 instead of the built-in hash(): Python 3 salts str hashes per
# process, so hash()-based indices change between kernel sessions and would
# not line up with weights trained (or pickled) in an earlier session.
# INPUT:
#     s: the string or number (converted with str() before hashing)
# OUTPUT:
#     an integer between 0 and D-1
def hash_it(s):
    return zlib.crc32(str(s).encode('utf-8')) % D

In [None]:
########################################
###  function, generator definition  ###
########################################

In [12]:
# Exploring candidate split points for FTE (column index 9): for every
# candidate edge, print how many FTE values lie between edge and edge + 1.
def _show_fte_window_counts(edges):
    for edge in edges:
        print(edge, '=====>', train_data.FTE.between(edge, edge + 1).sum())

# Ten evenly spaced edges across the observed FTE range ...
_show_fte_window_counts(np.linspace(train_data.FTE.min(), train_data.FTE.max(), 10))

print()

# ... followed by a hand-picked list of edges.
_show_fte_window_counts([0.0,0.00431,0.131,0.911,1,50])



0.0 =====> 332
0.11349913633333333 =====> 176
0.22699827266666667 =====> 162
0.34049740900000003 =====> 158
0.45399654533333333 =====> 154
0.5674956816666666 =====> 140
0.6809948180000001 =====> 132
0.7944939543333334 =====> 125
0.9079930906666667 =====> 118
1.021492227 =====> 1

0.0 =====> 332
0.00431 =====> 245
0.131 =====> 172
0.911 =====> 118
1 =====> 110
50 =====> 0


In [13]:
# Bin boundaries for the two numeric columns: b13 for Total (index 13) and
# b9 for FTE (index 9). data() defines its own local lists with these same
# values, so editing these module-level copies does not change the features.
b13 = [-706.968,-8.879,
    7.85,41.972,
    73.798,109.55,
    160.786,219.736,
    318.619,461.23,
    646.73,938.36,
    1317.584,2132.933,
    3652.662,6659.524,
    18551.459,39754.287,
    64813.342,129700000]
b9 = [0.0,0.00431,0.131,0.911,1,50]


In [29]:
# B. Bounded logloss
# INPUT:
#     p: our prediction
#     y: real answer
# OUTPUT
#     logarithmic loss of p given y, with p first clipped into
#     [10e-15, 1 - 10e-15] so the logarithm never sees 0

def logloss(p, y):
    eps = 10e-15
    clipped = max(min(p, 1. - eps), eps)
    if y == 1.:
        return -log(clipped)
    return -log(1. - clipped)

# C. Get probability estimation on x
# INPUT:
#     x: features, as a list of weight indices (each active feature has value 1)
#     w: weights
# OUTPUT:
#     probability of p(y = 1 | x; w)

def predict(x, w):
    score = 0.
    for index in x:
        # x[index] is implicitly 1, so the term w[index] * x[index] is w[index]
        score += w[index] * 1.
    # clamp the score to [-100, 100] before the sigmoid so exp() cannot overflow
    clamped = max(min(score, 100.), -100.)
    return 1. / (1. + exp(-clamped))

# D. Update given model
# INPUT:
# alpha: learning rate
#     w: weights
#     n: sum of previous absolute gradients for a given feature
#        this is used for adaptive learning rate
#     x: feature, a list of indices
#     p: prediction of our model
#     y: answer
#     k: model index; not used by the update, kept (now optional) so existing
#        calls that pass it keep working
# MODIFIES:
#     w: weights
#     n: sum of past absolute gradients
def update(alpha, w, n, x, p, y, k=None):
    # (p - y) * x[i] is the current gradient; if i in x then x[i] = 1.
    gradient = p - y
    if gradient == 0:
        # Nothing to learn from an exactly-right prediction. Returning here
        # also avoids dividing by sqrt(0) for a feature whose n[i] is still 0
        # (the bounded sigmoid does saturate to exactly 1.0).
        return
    magnitude = abs(gradient)
    for i in x:
        # alpha / sqrt(n) is the adaptive learning rate
        n[i] += magnitude
        w[i] -= gradient * alpha / n[i] ** 0.5

In [30]:
# function, generator definitions ############################################

# A. x, y generator
# This is where:
# * All the feature hashes are generated
# * All feature engineering happens
# INPUT:
#     path: path to TrainPredictors.csv or TestData2.csv
#     label_path: (optional) path to TrainLabels.csv
# YIELDS:
#     ID: id of the instance
#     x: list of hashes for predictors
#     y: (if label_path is present) list of float labels, one per label column
# NOTES:
#     Rows are split on ',' directly, so field values must not contain commas.
#     Both files are closed when the generator is exhausted or closed.
def data(path, label_path=None):
    # Boundaries for numerical binning of FTE (9) and Total (13)
    b13 = [-706.968,-8.879,
    7.85,41.972,
    73.798,109.55,
    160.786,219.736,
    318.619,461.23,
    646.73,938.36,
    1317.584,2132.933,
    3652.662,6659.524,
    18551.459,39754.287,
    64813.342,129700000]

    b9 = [0.0,0.00431,0.131,0.911,1,50]

    label_file = None
    try:
        with open(path) as rows:
            for t, line in enumerate(rows):
                # Skip headers (and open the label file, skipping its header too)
                if t == 0:
                    if label_path:
                        label_file = open(label_path)
                        label_file.readline()  # we don't need the headers
                    continue

                # Raw field values of this row; used for the per-column features
                # and, unmodified, for the interaction features below.
                row = line.rstrip().split(',')

                # Intercept term
                x = [0]

                # c is an index for the kept original features (15 of them)
                # TODO: drop c and use m for hashing, c was kept for reproducibility
                # m is the index for all the original features (17 of them)
                # feat is the original raw text or value for feature
                c = 0
                for m, feat in enumerate(row):
                    # Drop unwanted original features
                    if m not in originals:
                        continue

                    if m == 0:
                        ID = int(feat)
                        continue

                    # convert floats into categorical levels
                    # variables 9 (FTE) and 13 (Total) are only numericals
                    if m == 13:
                        if feat == "": feat = 0
                        feat = boundary(feat,b13)
                    if m == 9:
                        if feat == "": feat = -3
                        feat = boundary(feat,b9)

                    # Lowercase and trim so hashes match more often
                    feat = str(feat).strip().lower()

                    # First we hash the original feature.  For example, if the
                    # feature is "special education" and the kept-feature index is 4,
                    # we hash "4_special education"
                    x.append( hash_it(str(c) + '_' + feat) )

                    # Next we break up the original feature value into word parts
                    # i.e. create bag-of-word features here
                    for part in re.split(' |/|-',feat):
                        token = part.strip().lower()
                        if token == '': continue

                        # Hash each token along with its position index, e.g.
                        # "4_special" and "4_education" ...
                        x.append( hash_it(str(c) + '_' + token) )

                        # ... and by itself, ignoring its position, which views all
                        # the feature values of an example as a single document.
                        x.append( hash_it(token) )

                    c = c + 1

                # Up to this point we've been breaking original features down into
                # smaller features. Now we compose original features with each
                # other into larger interaction features.

                # Start with pairs.  Make pairs from interaction groups defined in pairs variable.
                for interactions in pairs:
                    for i in range(len(interactions)):
                        for j in range(i+1,len(interactions)):
                            pair = row[interactions[i]]+"_x_"+row[interactions[j]]
                            x.append( hash_it(pair) )

                # Do the same thing for triples
                for triple in triples:
                    trip = row[triple[0]]+"_x_"+row[triple[1]] + '_x_' +row[triple[2]]
                    x.append( hash_it(trip) )

                if label_path:
                    # first label column is the row id, the rest are the labels
                    y = [float(value) for value in label_file.readline().split(',')[1:]]

                yield (ID, x, y) if label_path else (ID, x)
    finally:
        if label_file is not None:
            label_file.close()


In [31]:
# training and testing #######################################################
start = datetime.now()
# Number of models (one per label column).
DIM = 104
K = range(DIM)
# Per model: a weight vector and a vector of accumulated absolute gradients,
# both of length D.
# NOTE: this rebinds `w`, which the import cell used as an alias for `warnings`.
w = [[0.] * D for _ in range(DIM)]
n = [[0.] * D for _ in range(DIM)]

random.seed(1234)

loss = 0.
rec = 0

# Make as many passes over the training data as the `epochs` setting asks for.
for epoch in range(epochs):
    for ID, x, y in data(train,label):
        # Randomly choose whether or not to train with this example in this epoch
        if random.random() > use_example_probability: continue
        # record counter
        rec += 1
        # get predictions and train on all labels
        for k in K:
            p = predict(x, w[k])
            update(alpha, w[k], n[k], x, p, y[k],k)
            loss += logloss(p, y[k])  # for progressive validation

        # print out progress, so that we know everything is working
        if rec % 50000 == 0:
            print('%s\tencountered: %d\tcurrent logloss: %f' % (
                datetime.now(), rec, (loss/float(DIM))/rec))
#h = ',Function__Aides Compensation,Function__Career & Academic Counseling,
#Function__Communications,Function__Curriculum Development,Function__Data Processing & Information Services,
#Function__Development & Fundraising,Function__Enrichment,Function__Extended Time & Tutoring,Function__Facilities & Maintenance,
#Function__Facilities Planning,"Function__Finance, Budget, Purchasing & Distribution",Function__Food Services,Function__Governance,
#Function__Human Resources,Function__Instructional Materials & Supplies,Function__Insurance,Function__Legal,
#Function__Library & Media,Function__NO_LABEL,Function__Other Compensation,Function__Other Non-Compensation,
#Function__Parent & Community Relations,Function__Physical Health & Services,Function__Professional Development,Function__Recruitment,
#Function__Research & Accountability,Function__School Administration,Function__School Supervision,Function__Security & Safety,
#Function__Social & Emotional,Function__Special Population Program Management & Support,
#Function__Student Assignment,Function__Student Transportation,Function__Substitute Compensation,
#Function__Teacher Compensation,Function__Untracked Budget Set-Aside,Function__Utilities,
#Object_Type__Base Salary/Compensation,Object_Type__Benefits,Object_Type__Contracted Services,Object_Type__Equipment & Equipment Lease,Object_Type__NO_LABEL,Object_Type__Other Compensation/Stipend,Object_Type__Other Non-Compensation,Object_Type__Rent/Utilities,Object_Type__Substitute Compensation,Object_Type__Supplies/Materials,Object_Type__Travel & Conferences,Operating_Status__Non-Operating,"Operating_Status__Operating, Not PreK-12",Operating_Status__PreK-12 Operating,Position_Type__(Exec) Director,Position_Type__Area Officers,Position_Type__Club Advisor/Coach,Position_Type__Coordinator/Manager,Position_Type__Custodian,Position_Type__Guidance Counselor,Position_Type__Instructional Coach,Position_Type__Librarian,Position_Type__NO_LABEL,Position_Type__Non-Position,Position_Type__Nurse,Position_Type__Nurse Aide,Position_Type__Occupational Therapist,Position_Type__Other,Position_Type__Physical Therapist,Position_Type__Principal,Position_Type__Psychologist,Position_Type__School Monitor/Security,Position_Type__Sec/Clerk/Other Admin,Position_Type__Social Worker,Position_Type__Speech Therapist,Position_Type__Substitute,Position_Type__TA,Position_Type__Teacher,Position_Type__Vice Principal,Pre_K__NO_LABEL,Pre_K__Non PreK,Pre_K__PreK,Reporting__NO_LABEL,Reporting__Non-School,Reporting__School,Sharing__Leadership & Management,Sharing__NO_LABEL,Sharing__School Reported,Sharing__School on Central Budgets,Sharing__Shared Services,Student_Type__Alternative,Student_Type__At Risk,Student_Type__ELL,Student_Type__Gifted,Student_Type__NO_LABEL,Student_Type__Poverty,Student_Type__PreK,Student_Type__Special Education,Student_Type__Unspecified,Use__Business Services,Use__ISPD,Use__Instruction,Use__Leadership,Use__NO_LABEL,Use__O&M,Use__Pupil Services & Enrichment,Use__Untracked Budget Set-Aside'


In [42]:
#predict(x,w[103]) # weight of all 103 columns
#len(x) # 72 hashes
#len(w[0]) #262144 weights

262144

In [None]:
# write out weights
print('writing weights to file')
# pickle writes bytes, so the file has to be opened in binary mode
with open('weights.pkl', 'wb') as f:
    pickle.dump(w, f)

output = './submission1234.csv'

# Submission header: the label file's header with its first column name
# removed (leaving a leading comma, i.e. an unnamed id column). The models in
# w are in the same order as the label columns, so the probabilities written
# below line up with this header.
with open(label) as label_file:
    _, separator, label_names = label_file.readline().rstrip('\r\n').partition(',')
h = separator + label_names

with open(output, 'w') as outfile:
    outfile.write(h + '\n')
    # one line per test row: the id followed by one probability per model
    for ID, x in data(test):
        outfile.write(str(ID))
        for k in K:
            p = predict(x, w[k])
            outfile.write(',%s' % str(p))
        outfile.write('\n')

print('Done, elapsed time: %s' % str(datetime.now() - start))