# Imports

In [38]:
from pathlib import Path

import feather
import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import RobustScaler
from sklearn.pipeline import Pipeline

import utils

# Load Data

In [2]:
PATH = Path('data/processed/')

In [3]:
%ls {PATH}

feature-matrix-stage1  train_num              valid_seq
feature-matrix-stage2  train_seq
labels-dummies         valid_num


In [4]:
X = feather.read_dataframe(PATH / 'feature-matrix-stage2').set_index('index')
X.index.name = ''
y = feather.read_dataframe(PATH / 'labels-dummies').set_index('index')
X.index.name = ''
X.head()

Unnamed: 0,FTE,Total,text
,,,
134338.0,1.0,50471.81,General Fund Teacher-Elementary KINDERGAR...
206341.0,,3477.86,RGN GOB (blank) CONTRACTOR SERVICES UNDESI...
326408.0,1.0,62237.13,General Purpose School TCHER 2ND GRADE Pers...
364634.0,,22.3,"UNALLOC BUDGETS/SCHOOLS Teacher, Short Term ..."
47683.0,,54.166,"NON-PROJECT Teacher, Secondary (High) TEAC..."


In [5]:
y.head()

Unnamed: 0_level_0,Function__Aides Compensation,Function__Career & Academic Counseling,Function__Communications,Function__Curriculum Development,Function__Data Processing & Information Services,Function__Development & Fundraising,Function__Enrichment,Function__Extended Time & Tutoring,Function__Facilities & Maintenance,Function__Facilities Planning,...,Student_Type__Special Education,Student_Type__Unspecified,Use__Business Services,Use__ISPD,Use__Instruction,Use__Leadership,Use__NO_LABEL,Use__O&M,Use__Pupil Services & Enrichment,Use__Untracked Budget Set-Aside
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
134338,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
206341,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
326408,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,1,0,0,0,0,0
364634,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,1,0,0,0,0,0
47683,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,1,0,0,0,0,0


In [6]:
# Name of original features
FEATURES = [
    'FTE', 'Facility_or_Department', 'Function_Description', 'Fund_Description',
    'Job_Title_Description', 'Location_Description', 'Object_Description',
    'Position_Extra', 'Program_Description', 'SubFund_Description',
    'Sub_Object_Description', 'Text_1', 'Text_2', 'Text_3', 'Text_4', 'Total'
]

NUMERICAL_FEATURES = [
    'FTE', 'Total'
]

TEXT_FEATURES = [
    'Facility_or_Department', 'Function_Description', 'Fund_Description',
    'Job_Title_Description', 'Location_Description', 'Object_Description',
    'Position_Extra', 'Program_Description', 'SubFund_Description',
    'Sub_Object_Description', 'Text_1', 'Text_2', 'Text_3', 'Text_4',
]
LABELS = [
    'Function', 'Object_Type', 'Operating_Status', 'Position_Type',
    'Pre_K', 'Reporting', 'Sharing', 'Student_Type', 'Use'
]

In [7]:
(X.index == y.index).all()

True

In [8]:
X_train, X_valid, y_train, y_valid = utils.multilabel_train_test_split(X, y, size=0.1, min_count=14)

In [9]:
X_train.shape, X_valid.shape

((360250, 3), (40027, 3))

# Tokenization

In [10]:
MAX_LEN = 200

In [11]:
# Create and fit tokenizer to texts
tokenizer = Tokenizer()               # Will use all words
tokenizer.fit_on_texts(X_train.text.values)

train_seq = tokenizer.texts_to_sequences(X_train.text.values)
train_seq = pad_sequences(train_seq, MAX_LEN)

valid_seq = tokenizer.texts_to_sequences(X_valid.text.values)
valid_seq = pad_sequences(valid_seq, MAX_LEN)

In [12]:
train_seq.shape, valid_seq.shape

((360250, 200), (40027, 200))

In [13]:
print(f'Vocab size : {len(tokenizer.word_index)}')

Vocab size : 3720


In [44]:
pip_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('std', RobustScaler())
])

In [45]:
train_num = pip_transformer.fit_transform(X_train[NUMERICAL_FEATURES].values)
valid_num = pip_transformer.transform(X_valid[NUMERICAL_FEATURES].values)

In [None]:
joblib.dump(train_seq, PATH / 'train_seq')
joblib.dump(valid_seq, PATH / 'valid_seq')

In [46]:
joblib.dump(train_num, PATH / 'train_num')
joblib.dump(valid_num, PATH / 'valid_num')

['data/processed/valid_num']

In [54]:
joblib.dump(pip_transformer, 'pip_transformer')

['pip_transformer']

In [30]:
joblib.dump(tokenizer, 'tokenizer')

['tokenizer']

In [31]:
joblib.dump(y_train, PATH / 'y_train')
joblib.dump(y_valid, PATH / 'y_valid')

['data/processed/y_valid']

In [32]:
train_num.shape

(360250, 4)

In [35]:
X_train[NUMERICAL_FEATURES].shape

(360250, 2)

In [36]:
col_transformer.transform(X_train[NUMERICAL_FEATURES].values)

array([[ 0.00000000e+00,  1.41579900e+02, -1.32355421e-01,
        -8.90222791e-02],
       [ 0.00000000e+00,  1.16676000e+03, -1.32355421e-01,
         1.96239623e-01],
       [ 1.00000000e+00,  3.36685374e+04,  8.68445216e-01,
         9.24003480e+00],
       ...,
       [ 1.32249537e-01,  8.71930000e+02,             nan,
         1.14201583e-01],
       [ 1.32249537e-01,  5.65154700e+04,             nan,
         1.55973173e+01],
       [ 1.32249537e-01,  5.21624000e+03,             nan,
         1.32302932e+00]])

In [37]:
X_train[NUMERICAL_FEATURES].values

array([[0.00000000e+00, 1.41579900e+02],
       [0.00000000e+00, 1.16676000e+03],
       [1.00000000e+00, 3.36685374e+04],
       ...,
       [           nan, 8.71930000e+02],
       [           nan, 5.65154700e+04],
       [           nan, 5.21624000e+03]])