# Imports

In [46]:
from warnings import filterwarnings

import joblib
import matplotlib.pyplot as plt
import numpy as np
from pathlib import Path
import pandas as pd
import seaborn as sns
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import RobustScaler
from sklearn.feature_selection import SelectKBest, chi2

import utils

%matplotlib inline
%config InlineBackend.format_figure = 'retina'
pd.set_option('display.max_columns', 200)
filterwarnings('ignore')
sns.set()

In [2]:
# Load (or reload) the autoreload extension and set mode 2, so edits to
# imported project modules such as `utils` are picked up without a kernel restart.
%reload_ext autoreload
%autoreload 2

# Load Data

In [14]:
# Directory holding the preprocessed feature and label files read below
# (relative to the notebook's working directory).
PATH = Path('data/processed/')
# Shell escape: list the directory contents as a quick sanity check.
!ls {PATH}

feature-matrix-stage1 feature-matrix-stage2 labels-dummies


In [4]:
# Numeric input columns.
NUMERICAL_FEATURES = ['FTE', 'Total']

# Free-text input columns.
TEXT_FEATURES = [
    'Facility_or_Department',
    'Function_Description',
    'Fund_Description',
    'Job_Title_Description',
    'Location_Description',
    'Object_Description',
    'Position_Extra',
    'Program_Description',
    'SubFund_Description',
    'Sub_Object_Description',
    'Text_1',
    'Text_2',
    'Text_3',
    'Text_4',
]

# All input columns: the union of the two groups above, in sorted order.
FEATURES = sorted(NUMERICAL_FEATURES + TEXT_FEATURES)

# Target variables; each is one-hot encoded into several '<label>__<class>'
# columns in the label matrix.
LABELS = [
    'Function',
    'Object_Type',
    'Operating_Status',
    'Position_Type',
    'Pre_K',
    'Reporting',
    'Sharing',
    'Student_Type',
    'Use',
]

In [24]:
# Load the stage-2 feature matrix and the one-hot label matrix. Both files
# store the row identifier in an 'index' column, which becomes the index.
X = pd.read_feather(PATH / 'feature-matrix-stage2').set_index('index')
X.index.name = ''
y = pd.read_feather(PATH / 'labels-dummies').set_index('index')
# Clear the index name on y too; this line previously reset X's name a
# second time, leaving y's index still labelled 'index'.
y.index.name = ''
X.head()

Unnamed: 0,FTE,Total,text
,,,
134338.0,1.0,50471.81,General Fund Teacher-Elementary KINDERGAR...
206341.0,,3477.86,RGN GOB (blank) CONTRACTOR SERVICES UNDESI...
326408.0,1.0,62237.13,General Purpose School TCHER 2ND GRADE Pers...
364634.0,,22.3,"UNALLOC BUDGETS/SCHOOLS Teacher, Short Term ..."
47683.0,,54.166,"NON-PROJECT Teacher, Secondary (High) TEAC..."


In [25]:
# Preview the label matrix: one 0/1 column per '<label>__<class>' pair.
y.head()

Unnamed: 0_level_0,Function__Aides Compensation,Function__Career & Academic Counseling,Function__Communications,Function__Curriculum Development,Function__Data Processing & Information Services,Function__Development & Fundraising,Function__Enrichment,Function__Extended Time & Tutoring,Function__Facilities & Maintenance,Function__Facilities Planning,"Function__Finance, Budget, Purchasing & Distribution",Function__Food Services,Function__Governance,Function__Human Resources,Function__Instructional Materials & Supplies,Function__Insurance,Function__Legal,Function__Library & Media,Function__NO_LABEL,Function__Other Compensation,Function__Other Non-Compensation,Function__Parent & Community Relations,Function__Physical Health & Services,Function__Professional Development,Function__Recruitment,Function__Research & Accountability,Function__School Administration,Function__School Supervision,Function__Security & Safety,Function__Social & Emotional,Function__Special Population Program Management & Support,Function__Student Assignment,Function__Student Transportation,Function__Substitute Compensation,Function__Teacher Compensation,Function__Untracked Budget Set-Aside,Function__Utilities,Object_Type__Base Salary/Compensation,Object_Type__Benefits,Object_Type__Contracted Services,Object_Type__Equipment & Equipment Lease,Object_Type__NO_LABEL,Object_Type__Other Compensation/Stipend,Object_Type__Other Non-Compensation,Object_Type__Rent/Utilities,Object_Type__Substitute Compensation,Object_Type__Supplies/Materials,Object_Type__Travel & Conferences,Operating_Status__Non-Operating,"Operating_Status__Operating, Not PreK-12",Operating_Status__PreK-12 Operating,Position_Type__(Exec) Director,Position_Type__Area Officers,Position_Type__Club Advisor/Coach,Position_Type__Coordinator/Manager,Position_Type__Custodian,Position_Type__Guidance Counselor,Position_Type__Instructional 
Coach,Position_Type__Librarian,Position_Type__NO_LABEL,Position_Type__Non-Position,Position_Type__Nurse,Position_Type__Nurse Aide,Position_Type__Occupational Therapist,Position_Type__Other,Position_Type__Physical Therapist,Position_Type__Principal,Position_Type__Psychologist,Position_Type__School Monitor/Security,Position_Type__Sec/Clerk/Other Admin,Position_Type__Social Worker,Position_Type__Speech Therapist,Position_Type__Substitute,Position_Type__TA,Position_Type__Teacher,Position_Type__Vice Principal,Pre_K__NO_LABEL,Pre_K__Non PreK,Pre_K__PreK,Reporting__NO_LABEL,Reporting__Non-School,Reporting__School,Sharing__Leadership & Management,Sharing__NO_LABEL,Sharing__School Reported,Sharing__School on Central Budgets,Sharing__Shared Services,Student_Type__Alternative,Student_Type__At Risk,Student_Type__ELL,Student_Type__Gifted,Student_Type__NO_LABEL,Student_Type__Poverty,Student_Type__PreK,Student_Type__Special Education,Student_Type__Unspecified,Use__Business Services,Use__ISPD,Use__Instruction,Use__Leadership,Use__NO_LABEL,Use__O&M,Use__Pupil Services & Enrichment,Use__Untracked Budget Set-Aside
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1,Unnamed: 82_level_1,Unnamed: 83_level_1,Unnamed: 84_level_1,Unnamed: 85_level_1,Unnamed: 86_level_1,Unnamed: 87_level_1,Unnamed: 88_level_1,Unnamed: 89_level_1,Unnamed: 90_level_1,Unnamed: 91_level_1,Unnamed: 92_level_1,Unnamed: 93_level_1,Unnamed: 94_level_1,Unnamed: 95_level_1,Unnamed: 96_level_1,Unnamed: 97_level_1,Unnamed: 98_level_1,Unnamed: 99_level_1,Unnamed: 
100_level_1,Unnamed: 101_level_1,Unnamed: 102_level_1,Unnamed: 103_level_1,Unnamed: 104_level_1
134338,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,1,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0
206341,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0
326408,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0
364634,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0
47683,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0


In [38]:
# Column names of the one-hot label matrix, in column order; reused below to
# label the predicted-probability frames.
new_labels = y.columns

In [40]:
# For each target variable, record the positional range of its one-hot columns
# within `new_labels`, so scoring can slice out each label's block of classes.
indices_df = pd.DataFrame({'labels': new_labels})
labels_indices = {}
for k in LABELS:
    # Match on '<label>__' (the separator used in the column names) rather than
    # the bare label, so a label that is a prefix of another label cannot pick
    # up that label's columns.
    indices = indices_df[indices_df.labels.str.startswith(k + '__')].index
    if indices.empty:
        raise ValueError('no label columns found for {!r}'.format(k))
    # Assumes each label's columns are contiguous in `new_labels`.
    labels_indices[k] = range(indices.min(), indices.max() + 1)

# Sample Data

We will use 10% of the data to allow faster iteration between models. Of that sample, 80% will be used for training and 20% for validation. Once we are happy with a model, we can refit it on the full training data from the first step and make the submission.

In [32]:
# First split: hold out a 10% sample (returned as the *_valid pair) to iterate on.
# NOTE(review): `min_count` presumably guarantees a minimum number of rows per
# label column in the held-out part; confirm in utils.
X_train, X_valid, y_train, y_valid = utils.multilabel_train_test_split(
    X, y, size=0.1, min_count=5
)
X_train.shape, y_train.shape, X_valid.shape, y_valid.shape

((360250, 3), (360250, 104), (40027, 3), (40027, 104))

In [33]:
# Second split: divide the 10% sample into 80% train / 20% validation.
# Caution: this cell reads X_valid / y_valid and then rebinds them, so running
# it again without re-running the previous cell splits the already-reduced
# validation set a second time.
X_train, X_valid, y_train, y_valid = utils.multilabel_train_test_split(
    X_valid, y_valid, size=0.2, min_count=5
)
X_train.shape, y_train.shape, X_valid.shape, y_valid.shape

((32022, 3), (32022, 104), (8005, 3), (8005, 104))

# Models

## Model with CountVectorizer

In [35]:
# Use a ColumnTransformer to apply different preprocessing to different
# feature groups.
preprocess_pip = ColumnTransformer(transformers=[
    # Numeric columns: fill missing values with the median, then scale with
    # statistics that are robust to outliers.
    (
        'numeric',
        Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='median')),
            ('scaler', RobustScaler()),
        ]),
        NUMERICAL_FEATURES,
    ),
    # Text column: unigram + bigram counts. The token pattern only matches
    # alphanumeric runs that are followed by whitespace. The column is selected
    # with a plain string (not a list) so the vectorizer receives a 1-D series.
    (
        'text',
        Pipeline(steps=[
            ('vectorizer', CountVectorizer(token_pattern='[a-zA-Z0-9]+(?=\\s+)',
                                           ngram_range=(1, 2))),
        ]),
        'text',
    ),
])

In [36]:
# Full model: preprocessing followed by one independent logistic regression
# per one-hot label column (one-vs-rest).
clf_pip = Pipeline(steps=[
    ('preprocess', preprocess_pip),
    ('clf', OneVsRestClassifier(estimator=LogisticRegression(solver='lbfgs'))),
])
clf_pip.fit(X_train, y_train)

Pipeline(memory=None,
     steps=[('preprocess', ColumnTransformer(n_jobs=None, remainder='drop', sparse_threshold=0.3,
         transformer_weights=None,
         transformers=[('numeric', Pipeline(memory=None,
     steps=[('imputer', SimpleImputer(copy=True, fill_value=None, missing_values=nan,
       strategy='median', ver...e=None, solver='lbfgs',
          tol=0.0001, verbose=0, warm_start=False),
          n_jobs=None))])

In [41]:
# Predicted probability for every label column on the validation rows.
preds = clf_pip.predict_proba(X_valid)
preds_df = pd.DataFrame(preds, index=X_valid.index, columns=new_labels)
# Returns (per-label scores, overall score).
# NOTE(review): presumably a log-loss style metric where lower is better; confirm in utils.
utils.score(preds_df, y_valid, labels_indices)

({'Function': 0.2572355743633417,
  'Object_Type': 0.08596722909183901,
  'Operating_Status': 0.08237535858177053,
  'Position_Type': 0.14132249755224868,
  'Pre_K': 0.04548361022711255,
  'Reporting': 0.11421006548353846,
  'Sharing': 0.1619726829558784,
  'Student_Type': 0.11742291315091484,
  'Use': 0.20147123287607543},
 0.13416235158696888)

## Model with CountVectorizer and KBest

In [47]:
# Use a ColumnTransformer to apply different preprocessing to different
# feature groups.
preprocess_pip = ColumnTransformer([
    # Numeric columns: median imputation followed by outlier-robust scaling.
    ('numeric', Pipeline([
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', RobustScaler())
    ]), NUMERICAL_FEATURES),
    # Text column: unigram + bigram counts, then keep the 300 n-grams with the
    # highest chi-squared score against the labels.
    ('text', Pipeline([
        ('vectorizer', CountVectorizer(token_pattern='[a-zA-Z0-9]+(?=\\s+)',
                                       ngram_range=(1, 2))),
        # `k` is passed by keyword: it is keyword-only in current scikit-learn,
        # and the keyword form also works on older releases.
        ('dim_reduction', SelectKBest(chi2, k=300))
    ]), 'text')
])

In [48]:
# Same one-vs-rest logistic regression as before, now on top of the
# preprocessing that includes chi-squared feature selection.
clf_pip = Pipeline(steps=[
    ('preprocess', preprocess_pip),
    ('clf', OneVsRestClassifier(estimator=LogisticRegression(solver='lbfgs'))),
])
clf_pip.fit(X_train, y_train)

Pipeline(memory=None,
     steps=[('preprocess', ColumnTransformer(n_jobs=None, remainder='drop', sparse_threshold=0.3,
         transformer_weights=None,
         transformers=[('numeric', Pipeline(memory=None,
     steps=[('imputer', SimpleImputer(copy=True, fill_value=None, missing_values=nan,
       strategy='median', ver...e=None, solver='lbfgs',
          tol=0.0001, verbose=0, warm_start=False),
          n_jobs=None))])

In [49]:
# Score the feature-selected model on the validation rows.
preds = clf_pip.predict_proba(X_valid)
preds_df = pd.DataFrame(preds, index=X_valid.index, columns=new_labels)
# Returns (per-label scores, overall score).
utils.score(preds_df, y_valid, labels_indices)

({'Function': 0.7048236992299501,
  'Object_Type': 0.3343551008388288,
  'Operating_Status': 0.1653124039375883,
  'Position_Type': 0.5191189198626318,
  'Pre_K': 0.12004188570089643,
  'Reporting': 0.35038044452002426,
  'Sharing': 0.46479290036366566,
  'Student_Type': 0.3530041320278832,
  'Use': 0.5299454618257322},
 0.3935305498119112)

## Model with CountVectorizer and RF (KBest disabled)

In [61]:
# Use a ColumnTransformer to apply different preprocessing to different
# feature groups.
preprocess_pip = ColumnTransformer([
    # Numeric columns: median imputation followed by outlier-robust scaling.
    ('numeric', Pipeline([
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', RobustScaler())
    ]), NUMERICAL_FEATURES),
    # Text column: unigram + bigram counts. No SelectKBest step is applied
    # here; the random forest receives the full n-gram vocabulary.
    ('text', Pipeline([
        ('vectorizer', CountVectorizer(token_pattern='[a-zA-Z0-9]+(?=\\s+)',
                                       ngram_range=(1, 2)))
    ]), 'text')
])

In [62]:
# Full model: preprocessing followed by one random forest per one-hot label
# column (one-vs-rest).
clf_pip = Pipeline([
    ('preprocess', preprocess_pip),
    ('clf', OneVsRestClassifier(RandomForestClassifier(n_estimators=50,
                                                       max_features=0.25,
                                                       min_samples_leaf=5,
                                                       # Fixed seed so the fit and
                                                       # the score below are
                                                       # repeatable across runs.
                                                       random_state=42)))
])
clf_pip.fit(X_train, y_train)

Pipeline(memory=None,
     steps=[('preprocess', ColumnTransformer(n_jobs=None, remainder='drop', sparse_threshold=0.3,
         transformer_weights=None,
         transformers=[('numeric', Pipeline(memory=None,
     steps=[('imputer', SimpleImputer(copy=True, fill_value=None, missing_values=nan,
       strategy='median', ver...b_score=False, random_state=None, verbose=0,
            warm_start=False),
          n_jobs=None))])

In [63]:
# Score the random-forest model on the validation rows.
preds = clf_pip.predict_proba(X_valid)
preds_df = pd.DataFrame(preds, index=X_valid.index, columns=new_labels)
# Returns (per-label scores, overall score).
utils.score(preds_df, y_valid, labels_indices)

({'Function': 0.41346258427345967,
  'Object_Type': 0.1033858206471625,
  'Operating_Status': 0.08852676916583312,
  'Position_Type': 0.23224683095647836,
  'Pre_K': 0.04920368553450598,
  'Reporting': 0.13318878061534695,
  'Sharing': 0.20770508002407007,
  'Student_Type': 0.13132071206461257,
  'Use': 0.2654980489379407},
 0.1805042569132678)

In [64]:
# Build a submission from the currently fitted `clf_pip` using the raw test CSV.
# NOTE(review): at this point `clf_pip` is the random-forest model fitted on
# the sampled training split, not refit on the full training data; confirm
# that this is intended.
utils.make_submission(
    clf_pip,
    PATH.parent / 'raw/test_data.csv',
    TEXT_FEATURES,
    NUMERICAL_FEATURES,
    new_labels,
    title='submission-4',
)

# Test

In [66]:
# Use a ColumnTransformer to apply different preprocessing to different
# feature groups.
preprocess_pip = ColumnTransformer([
    # Numeric columns: median imputation followed by outlier-robust scaling.
    ('numeric', Pipeline([
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', RobustScaler())
    ]), NUMERICAL_FEATURES),
    # Text column: unigram + bigram counts, then keep the 300 n-grams with the
    # highest chi-squared score against the labels.
    ('text', Pipeline([
        ('vectorizer', CountVectorizer(token_pattern='[a-zA-Z0-9]+(?=\\s+)',
                                       ngram_range=(1, 2))),
        # `k` is passed by keyword: it is keyword-only in current scikit-learn,
        # and the keyword form also works on older releases.
        ('dim_reduction', SelectKBest(chi2, k=300))
    ]), 'text')
])

In [69]:
# Full model: preprocessing followed by one L1-regularised logistic regression
# per one-hot label column (one-vs-rest).
clf_pip = Pipeline([
    ('preprocess', preprocess_pip),
    # The solver is set explicitly: an L1 penalty needs a solver that supports
    # it (liblinear or saga). liblinear was the implicit default when this cell
    # was first run; the current default, lbfgs, rejects penalty='l1'.
    ('clf', OneVsRestClassifier(LogisticRegression(C=1.5, penalty='l1',
                                                   solver='liblinear',
                                                   fit_intercept=True)))
])
clf_pip.fit(X_train, y_train)

Pipeline(memory=None,
     steps=[('preprocess', ColumnTransformer(n_jobs=None, remainder='drop', sparse_threshold=0.3,
         transformer_weights=None,
         transformers=[('numeric', Pipeline(memory=None,
     steps=[('imputer', SimpleImputer(copy=True, fill_value=None, missing_values=nan,
       strategy='median', ver...te=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False),
          n_jobs=None))])

In [70]:
# Score the L1-regularised model on the validation rows.
preds = clf_pip.predict_proba(X_valid)
preds_df = pd.DataFrame(preds, index=X_valid.index, columns=new_labels)
# Returns (per-label scores, overall score).
utils.score(preds_df, y_valid, labels_indices)

({'Function': 0.65527004626855,
  'Object_Type': 0.29729544310758826,
  'Operating_Status': 0.15367071760062817,
  'Position_Type': 0.47688401526589713,
  'Pre_K': 0.11016384825495269,
  'Reporting': 0.31459008507274316,
  'Sharing': 0.4233689782937825,
  'Student_Type': 0.3153998610729189,
  'Use': 0.4775899336948185},
 0.3582481031813199)