## It's time to build a model
- Always a good approach to start with a simple model
- Gives us a sense of how challenging a problem is
- Many more things can go wrong in complex models
- How much signal can we pull out using basic methods?
- Train basic model on numeric data only 
 - We want to go from raw data to predictions quickly
- Multi-class logistic regression
 - Train classifier on each label separately and use those to predict
- Format predictions and save to csv 
- Compute a log loss score

### Splitting the multi-class dataset
- Recall: Train-test split
 - Will not work here
 - If we split our dataset randomly, we may end up with labels in our test set that never appeared in our training set.
- Solution: StratifiedShuffleSplit
 - Only works with a single target variable
 - We have many target variables
 - `multilabel_train_test_split()`

In [1]:
import pandas as pd 
import numpy as np

# Load the training data, using the first CSV column as the row index.
df = pd.read_csv('TrainingData.csv', index_col=0)
# Summary statistics of the numeric columns.
df.describe()

Unnamed: 0,FTE,Total
count,126071.0,395722.0
mean,0.426794,13105.86
std,0.573576,368225.4
min,-0.087551,-87466310.0
25%,0.000792,73.7977
50%,0.130927,461.23
75%,1.0,3652.662
max,46.8,129700000.0


In [2]:
# Numeric feature columns used as inputs to the baseline model.
NUMERIC_COLUMNS = ['FTE', 'Total']

In [3]:
# Keep only the numeric columns and replace missing values with the sentinel -1000.
data_to_train = df[NUMERIC_COLUMNS].fillna(-1000)

In [4]:
# Target columns to predict; each one is expanded into dummy columns before training.
LABELS = ['Function',
 'Use',
 'Sharing',
 'Reporting',
 'Student_Type',
 'Position_Type',
 'Object_Type',
 'Pre_K',
 'Operating_Status']

In [5]:
# One-hot encode every label column into binary indicator columns.
labels_to_use = pd.get_dummies(df[LABELS])

In [6]:
from warnings import warn


def multilabel_sample(y, size=1000, min_count=5, seed=None):
    """ Sample row indices from a binary indicator matrix so that every
        label (column) is present in at least `min_count` sampled rows.

        Parameters
        ----------
        y : pandas.DataFrame or numpy.ndarray, shape (n_rows, n_labels)
            Binary indicator matrix; it must contain both 0s and 1s and
            nothing else.
        size : int or float
            Number of rows to sample if `size` > 1, otherwise the fraction
            `size` * len(y), rounded down.
        min_count : int
            Minimum number of sampled rows in which each label is set.
        seed : int or None
            Seed for the random number generator. `None` seeds from fresh
            entropy, so repeated unseeded calls give different samples.

        Returns
        -------
        numpy.ndarray
            Index labels of the sampled rows when `y` is a DataFrame,
            otherwise integer row positions.

        Raises
        ------
        ValueError
            If `y` is not a binary indicator matrix, or if some label has
            fewer than `min_count` positive rows.
    """
    # array_equal copes with any number of unique values, whereas an
    # element-wise `!=` against [0, 1] depends on broadcasting rules.
    try:
        is_binary = np.array_equal(np.unique(y).astype(int), [0, 1])
    except (TypeError, ValueError):
        is_binary = False
    if not is_binary:
        raise ValueError('multilabel_sample only works with binary indicator matrices')

    if (y.sum(axis=0) < min_count).any():
        raise ValueError('Some classes do not have enough examples. Change min_count if necessary.')

    # A size of at most 1 is interpreted as a fraction of the rows.
    if size <= 1:
        size = int(np.floor(y.shape[0] * size))

    if y.shape[1] * min_count > size:
        msg = "Size less than number of columns * min_count, returning {} items instead of {}."
        warn(msg.format(y.shape[1] * min_count, size))
        size = y.shape[1] * min_count

    # RandomState(None) draws its own entropy. The previous fallback,
    # np.random.randint(1), always evaluated to 0, so unseeded calls were
    # silently pinned to seed 0.
    rng = np.random.RandomState(seed)

    if isinstance(y, pd.DataFrame):
        choices = y.index
        y = y.values
    else:
        choices = np.arange(y.shape[0])

    # first, guarantee at least min_count of each label
    per_label_samples = [np.array([], dtype=choices.dtype)]
    for j in range(y.shape[1]):
        label_choices = choices[y[:, j] == 1]
        per_label_samples.append(rng.choice(label_choices, size=min_count, replace=False))

    # a row may have been drawn for several labels; keep each row once
    sample_idxs = np.unique(np.concatenate(per_label_samples))

    # now that we have at least min_count of each, we can just random sample
    sample_count = int(size - sample_idxs.shape[0])

    # get sample_count indices from remaining choices
    remaining_choices = np.setdiff1d(choices, sample_idxs)
    remaining_sampled = rng.choice(remaining_choices,
                                   size=sample_count,
                                   replace=False)

    return np.concatenate([sample_idxs, remaining_sampled])


def multilabel_sample_dataframe(df, labels, size, min_count=5, seed=None):
    """ Return the rows of `df` selected by `multilabel_sample` on `labels`.

        `labels` is the binary indicator matrix handed to `multilabel_sample`
        together with `size`, `min_count` and `seed`; the indices it returns
        are looked up in `df` with `.loc`, so every class in `labels` is
        represented at least `min_count` times in the result.
    """
    return df.loc[
        multilabel_sample(labels, size=size, min_count=min_count, seed=seed)
    ]


def multilabel_train_test_split(X, Y, size, min_count=5, seed=None):
    """ Split a features matrix `X` and a binary label matrix `Y` into
        train and test parts.

        The test rows are chosen with `multilabel_sample(Y, ...)`, so every
        class in `Y` appears at least `min_count` times in the test set;
        all remaining rows form the training set.

        Parameters
        ----------
        X : pandas.DataFrame or numpy.ndarray
            Features, with one row per row of `Y`.
        Y : pandas.DataFrame or numpy.ndarray
            Binary indicator matrix of labels.
        size : int or float
            Test set size, interpreted as in `multilabel_sample`.
        min_count : int
            Minimum number of test rows per label.
        seed : int or None
            Seed forwarded to `multilabel_sample`.

        Returns
        -------
        tuple
            (X_train, X_test, Y_train, Y_test)
    """
    index = Y.index if isinstance(Y, pd.DataFrame) else np.arange(Y.shape[0])

    test_set_idxs = multilabel_sample(Y, size=size, min_count=min_count, seed=seed)

    # Wrap in pd.Index so `.isin` is available for ndarray input as well;
    # a bare ndarray has no `.isin` method.
    test_set_mask = pd.Index(index).isin(test_set_idxs)
    train_set_mask = ~test_set_mask

    return (X[train_set_mask], X[test_set_mask], Y[train_set_mask], Y[test_set_mask])

In [7]:
# Hold out 20% of the rows as the test set; the fixed seed is passed through to the sampler.
X_train, X_test, y_train, y_test = multilabel_train_test_split(
                                                    data_to_train,
                                                    labels_to_use,
                                                    size=0.2, seed=123)

OneVsRestClassifier
 - Treats each column of y independently
 - Fits a separate classifier for each of the columns

In [8]:
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
# Wrap logistic regression so that a separate classifier is fit for each column of y.
clf = OneVsRestClassifier(LogisticRegression())

In [9]:
# Create the new DataFrame: numeric_data_only
numeric_data_only = df[NUMERIC_COLUMNS].fillna(-1000)

# Get labels and convert to dummy variables: label_dummies
label_dummies = pd.get_dummies(df[LABELS])

# Create training and test sets
X_train, X_test, y_train, y_test = multilabel_train_test_split(numeric_data_only,
                                                    label_dummies,
                                                    size=0.2, seed=123)

# Print the info. DataFrame.info() writes its report itself and returns None,
# so it is called directly; wrapping it in print() adds a stray "None" line.
print("X_train info:")
X_train.info()
print("\nX_test info:")
X_test.info()
print("\ny_train info:")
y_train.info()
print("\ny_test info:")
y_test.info()

X_train info:
<class 'pandas.core.frame.DataFrame'>
Int64Index: 320222 entries, 134338 to 415831
Data columns (total 2 columns):
FTE      320222 non-null float64
Total    320222 non-null float64
dtypes: float64(2)
memory usage: 7.3 MB
None

X_test info:
<class 'pandas.core.frame.DataFrame'>
Int64Index: 80055 entries, 206341 to 72072
Data columns (total 2 columns):
FTE      80055 non-null float64
Total    80055 non-null float64
dtypes: float64(2)
memory usage: 1.8 MB
None

y_train info:
<class 'pandas.core.frame.DataFrame'>
Int64Index: 320222 entries, 134338 to 415831
Columns: 104 entries, Function_Aides Compensation to Operating_Status_PreK-12 Operating
dtypes: uint8(104)
memory usage: 34.2 MB
None

y_test info:
<class 'pandas.core.frame.DataFrame'>
Int64Index: 80055 entries, 206341 to 72072
Columns: 104 entries, Function_Aides Compensation to Operating_Status_PreK-12 Operating
dtypes: uint8(104)
memory usage: 8.6 MB
None


In [10]:
%%time
# Fit the one-vs-rest classifier on the numeric training features.
clf.fit(X_train, y_train)

CPU times: user 8min 23s, sys: 2 s, total: 8min 25s
Wall time: 8min 1s


OneVsRestClassifier(estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
          n_jobs=1)

In [11]:
%%time 
# Print the accuracy
# NOTE(review): with a multi-column target, score() is presumably subset
# accuracy (a row counts only if every label column is predicted correctly),
# which would explain a value of 0.0 - confirm against the scikit-learn docs.
print("Accuracy: {}".format(clf.score(X_test, y_test)))

Accuracy: 0.0
CPU times: user 1.31 s, sys: 72 ms, total: 1.39 s
Wall time: 1.09 s


### Making predictions

In [12]:
# Load the holdout data, using the first CSV column as the row index.
holdout = pd.read_csv('TestData.csv', index_col=0)

  interactivity=interactivity, compiler=compiler, result=result)


In [13]:
# Apply the same numeric-column selection and -1000 fill that was used for training.
# Note that this rebinds `holdout`, so the other columns are no longer available.
holdout = holdout[NUMERIC_COLUMNS].fillna(-1000)
holdout.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 50064 entries, 180042 to 249087
Data columns (total 2 columns):
FTE      50064 non-null float64
Total    50064 non-null float64
dtypes: float64(2)
memory usage: 1.1 MB


In [14]:
# Predict probabilities for each label column rather than hard 0/1 labels.
predictions = clf.predict_proba(holdout)
predictions

array([[3.58422797e-02, 6.46624377e-03, 8.29891300e-04, ...,
        1.69612500e-01, 1.99296715e-02, 8.10543000e-01],
       [3.58482728e-02, 6.46610320e-03, 8.29902557e-04, ...,
        1.69607057e-01, 1.99300220e-02, 8.10552551e-01],
       [1.20946821e-01, 9.06528221e-03, 1.53268023e-03, ...,
        9.59263311e-02, 5.10388015e-02, 9.28396081e-01],
       ...,
       [1.22222570e-01, 9.05175340e-03, 1.53411191e-03, ...,
        9.56957120e-02, 5.10986918e-02, 9.28680377e-01],
       [1.22275131e-01, 9.04893421e-03, 1.53377914e-03, ...,
        9.57019699e-02, 5.10808744e-02, 9.28670860e-01],
       [1.22159718e-01, 9.05015147e-03, 1.53365017e-03, ...,
        9.57227211e-02, 5.10754795e-02, 9.28645295e-01]])

### If .predict() was used instead:
 - Output would be 0 or 1
 - Log loss penalizes being confident and wrong
 - Worse performance compared to .predict_proba()

### Submitting your predictions as a csv
- All formatting can be done with the pandas to_csv function

In [15]:
# Label the predicted probabilities: columns are the dummy-encoded label names
# built with a '__' separator, rows are indexed like the holdout set.
predictions_df = pd.DataFrame(
    data=predictions,
    index=holdout.index,
    columns=pd.get_dummies(df[LABELS], prefix_sep='__').columns,
)

In [16]:
# Preview the first rows to check the column names and index.
predictions_df.head()

Unnamed: 0,Function__Aides Compensation,Function__Career & Academic Counseling,Function__Communications,Function__Curriculum Development,Function__Data Processing & Information Services,Function__Development & Fundraising,Function__Enrichment,Function__Extended Time & Tutoring,Function__Facilities & Maintenance,Function__Facilities Planning,...,Object_Type__Rent/Utilities,Object_Type__Substitute Compensation,Object_Type__Supplies/Materials,Object_Type__Travel & Conferences,Pre_K__NO_LABEL,Pre_K__Non PreK,Pre_K__PreK,Operating_Status__Non-Operating,"Operating_Status__Operating, Not PreK-12",Operating_Status__PreK-12 Operating
180042,0.035842,0.006466,0.00083,0.023918,0.008916,0.000173,0.032077,0.024406,0.052099,4.8e-05,...,0.010729,0.036846,0.116126,0.01736,0.831241,0.141031,0.027749,0.169612,0.01993,0.810543
28872,0.035848,0.006466,0.00083,0.023919,0.008916,0.000173,0.032078,0.024406,0.052102,4.8e-05,...,0.010728,0.036959,0.116164,0.017361,0.831233,0.141041,0.027751,0.169607,0.01993,0.810553
186915,0.120947,0.009065,0.001533,0.028599,0.016042,0.01815,0.043858,0.031715,0.113907,0.017293,...,0.005622,0.136221,0.135391,0.016041,0.501655,0.472173,0.098601,0.095926,0.051039,0.928396
412396,0.120381,0.009071,0.001532,0.028573,0.016044,0.01812,0.043808,0.031688,0.113723,0.017261,...,0.00563,0.125189,0.134056,0.016029,0.502143,0.471525,0.098399,0.096029,0.051012,0.928269
427740,0.121725,0.009057,0.001534,0.028634,0.016038,0.01819,0.043926,0.031752,0.114158,0.017338,...,0.005612,0.152629,0.137236,0.016059,0.500987,0.473061,0.098879,0.095785,0.051075,0.92857


In [17]:
# Save the formatted predictions to predictions.csv.
predictions_df.to_csv('predictions.csv')

### A very brief introduction to NLP
- Data for NLP:
 - Text, documents, speech...
- Tokenization
 - Splitting a string into segments
 - Store segments as list
- Example: 'Natural Language Processing' 
 - --> ['Natural', 'Language', 'Processing']

### Tokens and token patterns
- Tokenize on whitespace
 - **PETRO-VEND FUEL AND FLUIDS**
 - **PETRO-VEND | FUEL | AND | FLUIDS**

### Bag of words representation
- Count the number of times a particular token appears
- "Bag of words"
- This approach discards information about word order

### Representing text numerically
- Bag of words 
 - Simple way to represent text in machine learning
 - Discards information about grammar and word order
 - Computes frequency of occurrence
- CountVectorizer()
 - Tokenizes all the strings
 - Builds a 'vocabulary' 
 - Counts the occurrences of each token in the vocabulary

In [19]:
# Look at the first few program descriptions.
df.Program_Description.head()

134338                      KINDERGARTEN
206341     BUILDING IMPROVEMENT SERVICES
326408             Instruction - Regular
364634    GENERAL MIDDLE/JUNIOR HIGH SCH
47683      GENERAL HIGH SCHOOL EDUCATION
Name: Program_Description, dtype: object

In [20]:
from sklearn.feature_extraction.text import CountVectorizer

# Token pattern: a run of non-whitespace characters that is followed by whitespace.
# NOTE(review): because of the lookahead, the last token of a string that has
# no trailing whitespace is never matched - confirm this is intended.
TOKENS_BASIC = '\\S+(?=\\s+)'

# Fill missing descriptions by assigning the result back to the column.
# A chained in-place fill on `df.Program_Description` may act on a temporary
# object and leave `df` unchanged, depending on the pandas version.
df['Program_Description'] = df['Program_Description'].fillna('')

vec_basic = CountVectorizer(token_pattern=TOKENS_BASIC)

In [21]:
# Learn the vocabulary from the program descriptions.
vec_basic.fit(df.Program_Description)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='\\S+(?=\\s+)', tokenizer=None,
        vocabulary=None)

In [22]:
# Message template; {} is filled in with the number of tokens.
msg = 'There are {} tokens in Program_Description if tokens are any non-whitespace'

In [24]:
# Report the size of the vocabulary learned by vec_basic.
print(msg.format(len(vec_basic.get_feature_names())))

There are 434 tokens in Program_Description if tokens are any non-whitespace


In [26]:
# Import CountVectorizer
from sklearn.feature_extraction.text import CountVectorizer

# Create the token pattern: TOKENS_ALPHANUMERIC
# (a run of letters/digits that is followed by whitespace)
TOKENS_ALPHANUMERIC = '[A-Za-z0-9]+(?=\\s+)'

# Fill missing values in df.Position_Extra by assigning the result back.
# A chained in-place fill on `df.Position_Extra` may act on a temporary
# object and leave `df` unchanged, depending on the pandas version.
df['Position_Extra'] = df['Position_Extra'].fillna('')

# Instantiate the CountVectorizer: vec_alphanumeric
vec_alphanumeric = CountVectorizer(token_pattern=TOKENS_ALPHANUMERIC)

# Fit to the data
vec_alphanumeric.fit(df.Position_Extra)

# Print the number of tokens and first 15 tokens
msg = "There are {} tokens in Position_Extra if we split on non-alpha numeric"
print(msg.format(len(vec_alphanumeric.get_feature_names())))
print(vec_alphanumeric.get_feature_names()[:15])

There are 385 tokens in Position_Extra if we split on non-alpha numeric
['1st', '2nd', '3rd', '4th', '56', '5th', '9th', 'a', 'ab', 'accountability', 'adaptive', 'addit', 'additional', 'adm', 'admin']


In [28]:
# Define combine_text_columns()
def combine_text_columns(data_frame, to_drop=NUMERIC_COLUMNS + LABELS):
    """ Join the text columns of each row of `data_frame` into one string.

        Parameters
        ----------
        data_frame : pandas.DataFrame
            Input data; it is not modified.
        to_drop : list of str
            Column names to exclude (by default the numeric and label
            columns). Names that are not present in `data_frame` are ignored.

        Returns
        -------
        pandas.Series
            One space-separated string per row, built from the remaining
            columns. Missing values become empty strings, and any remaining
            non-string values are converted with `str`.
    """
    # Drop non-text columns that are in the df
    drop_names = set(to_drop)
    present = [col for col in data_frame.columns if col in drop_names]
    text_data = data_frame.drop(present, axis=1)

    # Replace nans with blanks, then cast to str so that a leftover
    # non-string value cannot make " ".join raise a TypeError
    text_data = text_data.fillna('').astype(str)

    # Join all text items in a row that have a space in between
    return text_data.apply(lambda row: " ".join(row), axis=1)

In [32]:
%%time
# Import the CountVectorizer
from sklearn.feature_extraction.text import CountVectorizer

# Create the basic token pattern: non-whitespace runs followed by whitespace
TOKENS_BASIC = '\\S+(?=\\s+)'

# Create the alphanumeric token pattern: letter/digit runs followed by whitespace
TOKENS_ALPHANUMERIC = '[A-Za-z0-9]+(?=\\s+)'

# Instantiate basic CountVectorizer: vec_basic
vec_basic = CountVectorizer(token_pattern=TOKENS_BASIC)

# Instantiate alphanumeric CountVectorizer: vec_alphanumeric
vec_alphanumeric = CountVectorizer(token_pattern=TOKENS_ALPHANUMERIC)

# Create the text vector: one combined string per row of df
text_vector = combine_text_columns(df)

# Fit and transform vec_basic (the returned matrix is discarded here;
# only the fitted vocabulary is used below)
vec_basic.fit_transform(text_vector)

# Print number of tokens of vec_basic
print("There are {} tokens in the dataset".format(len(vec_basic.get_feature_names())))

# Fit and transform vec_alphanumeric (the returned matrix is discarded here too)
vec_alphanumeric.fit_transform(text_vector)

# Print number of tokens of vec_alphanumeric
print("There are {} alpha-numeric tokens in the dataset".format(len(vec_alphanumeric.get_feature_names())))

There are 4757 tokens in the dataset
There are 3284 alpha-numeric tokens in the dataset
CPU times: user 41.7 s, sys: 556 ms, total: 42.3 s
Wall time: 42.1 s
