# Intro

This notebook trains classifiers that predict used memory (binned into categories) and then saves a trained model along with the training, validation, and test data it used. The classifiers are a decision tree, a random forest, a gradient-boosted decision tree model, and a logistic regression model.

# Notebook options

The grid_search parameter determines whether the notebook will execute a grid search for each model to find the best combination of hyperparameters, or whether the notebook will just use some pre-set parameters (which were found in previous grid searches).

The save_model parameter determines whether to save a model at the end (currently it saves the random forest model).

In [45]:
# save_model: persist a trained model at the end of the notebook.
save_model = True
# grid_search: run a hyperparameter grid search per model instead of
# using the pre-set parameters found in earlier searches.
grid_search = False

# Imports

In [2]:
import pandas as pd
import numpy as np
import math
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from hypopt import GridSearch
from datetime import datetime
from datetime import date
from preprocessing import get_df
from preprocessing import scale
from persistence import save_trained_model
from evaluation import score_model
from evaluation import print_feature_importances
from evaluation import plot_performance
from evaluation import plot_cm
from evaluation import print_cr
from evaluation import auc

In [3]:
from sklearn.metrics import make_scorer
import itertools

In [4]:
# Load the dataset through the project's `preprocessing.get_df` helper.
# NOTE(review): presumably returns a pandas DataFrame with a `used_mem`
# column (used by the following cells) -- confirm against `preprocessing`.
df = get_df()

In [5]:
# Exclusive upper bounds of the used-memory bins (printed below as MB).
# Bin 0 is everything below the first bound; bin i (i >= 1) covers
# [mem_bin_cutoffs[i-1], mem_bin_cutoffs[i]).
mem_bin_cutoffs = [50, 100, 200, 500, 1000, 2000, 10000, 20000, 50000, 100000]
num_classes = len(mem_bin_cutoffs)

# Number of rows of `df` that fall into each bin, in bin order.
bin_sizes = [int((df.used_mem < mem_bin_cutoffs[0]).sum())]
bin_sizes.extend(
    int(((df.used_mem >= lower) & (df.used_mem < upper)).sum())
    for lower, upper in zip(mem_bin_cutoffs, mem_bin_cutoffs[1:])
)

In [6]:
def print_mem_categories():
    """Print one line per memory bin: its index, its MB range and its row count.

    Reads the notebook-level ``mem_bin_cutoffs`` and ``bin_sizes`` lists.
    """
    # The first bin has no lower bound, so it gets its own format.
    print(f"0: < {mem_bin_cutoffs[0]}MB ({bin_sizes[0]} entries)")
    bounds = zip(mem_bin_cutoffs, mem_bin_cutoffs[1:])
    for idx, (lower, upper) in enumerate(bounds, start=1):
        print(f"{idx}: >= {lower}MB and < {upper}MB ({bin_sizes[idx]} entries)")

In [7]:
# Show how many rows fall into each memory bin.
print_mem_categories()

0: < 50MB (22965 entries)
1: >= 50MB and < 100MB (11126 entries)
2: >= 100MB and < 200MB (8623 entries)
3: >= 200MB and < 500MB (3169 entries)
4: >= 500MB and < 1000MB (1639 entries)
5: >= 1000MB and < 2000MB (2535 entries)
6: >= 2000MB and < 10000MB (248 entries)
7: >= 10000MB and < 20000MB (33 entries)
8: >= 20000MB and < 50000MB (22 entries)
9: >= 50000MB and < 100000MB (0 entries)


In [8]:
def mem_category(row):
    """Return the index of the memory bin that ``row['used_mem']`` falls into.

    Bin 0 is everything below ``mem_bin_cutoffs[0]``; bin i (i >= 1) covers
    ``[mem_bin_cutoffs[i-1], mem_bin_cutoffs[i])``. Returns ``None`` when the
    value matches no bin (at or above the last cutoff, or not comparable,
    e.g. NaN).
    """
    used = row['used_mem']
    if used < mem_bin_cutoffs[0]:
        return 0
    bounds = zip(mem_bin_cutoffs, mem_bin_cutoffs[1:])
    for idx, (lower, upper) in enumerate(bounds, start=1):
        if lower <= used < upper:
            return idx
    return None

In [9]:
# Label every row with its memory-bin index (the classification target).
df['mem_category'] = df.apply(mem_category, axis=1)

In [10]:
# Keep only rows where both request-derived features are present.
df = df.dropna(subset=['rqst_timespan', 'rqst_area_rect'])

# Model inputs.
X_features = [
    'PP', 'SP', 'BR',
    'rqst_timespan', 'rqst_area_rect', 'converted',
    'params_num', 'grid_def_num', 'level_num',
    'ds084.1', 'ds631.1', 'ds083.3', 'ds094.0', 'ds083.2',
]
# Target columns; only 'mem_category' is used as the training label, the
# other two are carried along with each split.
y_features = ['req_mem', 'used_mem', 'mem_category']

X, y = df[X_features], df[y_features]

## Train/validation/test split

In [11]:
# Fractions of the data assigned to the training, validation and test sets.
train_amt, val_amt, test_amt = 0.5, 0.25, 0.25

In [12]:
# Stage 1: set aside everything that is not training data.
X_train, X_target, y_train_full, y_target_full = train_test_split(
    X, y,
    test_size=1 - train_amt,
    random_state=3)

# Stage 2: divide the held-out pool into validation and test sets.
# `test_size` is a fraction of the held-out pool, whose share of the full
# data is (val_amt + test_amt), so the test fraction must be taken relative
# to that pool. Dividing by train_amt is only correct when train_amt == 0.5.
X_val, X_test, y_val_full, y_test_full = train_test_split(
    X_target, y_target_full,
    test_size=test_amt / (val_amt + test_amt),
    random_state=3)

In [13]:
# Flat arrays of the class label for each split.
y_train, y_val, y_test = (
    np.ravel(targets['mem_category'])
    for targets in (y_train_full, y_val_full, y_test_full)
)

## Scaling

In [14]:
# Scale the three feature sets with the project's `preprocessing.scale` helper.
# NOTE(review): presumably the scaler is fitted on the training set only --
# confirm against `preprocessing`.
X_train_norm, X_val_norm, X_test_norm = scale(X_train, X_val, X_test)

In [15]:
def balanced_score(y, y_pred, **kwargs):
    """Class-balanced count of correct predictions.

    Every correctly predicted sample contributes the inverse-frequency weight
    of its true class, ``n_train / (n_observed_classes * train_count[class])``,
    so a hit on a rare class is worth more than a hit on a common one.
    Misclassified samples contribute nothing. The result is an un-normalised
    sum, so it is only comparable between models scored on the same data.

    Parameters
    ----------
    y : array-like of int
        True class labels.
    y_pred : array-like of int
        Predicted class labels, same length as ``y``.
    **kwargs
        ``train_labels`` (optional): integer class labels from which the
        class frequencies are computed. Defaults to the notebook-level
        ``y_train``. Any other keyword argument is accepted and ignored.

    Returns
    -------
    float
        Sum of the class weights of the correctly predicted samples
        (``0.0`` for empty input).
    """
    train_labels = kwargs.get('train_labels')
    if train_labels is None:
        train_labels = y_train
    train_labels = np.asarray(train_labels)

    # Work on plain arrays so positional indexing is used even when a
    # pandas Series with a non-default index is passed in.
    y_true = np.asarray(y)
    y_hat = np.asarray(y_pred)
    if y_true.size == 0:
        return 0.0

    # minlength guarantees a weight slot for every label that occurs in `y`,
    # even one that never occurs in the training labels.
    counts = np.bincount(train_labels, minlength=int(y_true.max()) + 1)
    populated = counts > 0

    # Normalise by the number of classes actually present in the training
    # labels; classes without training samples get weight 0 instead of
    # triggering a division by zero.
    class_weights = np.zeros(counts.shape[0], dtype=float)
    class_weights[populated] = train_labels.shape[0] / (
        np.count_nonzero(populated) * counts[populated])

    hits = y_true == y_hat
    return float(class_weights[y_true[hits]].sum())

# Scorer wrapper around `balanced_score` for use in the grid searches below;
# a higher score is better.
balanced_scorer = make_scorer(balanced_score, greater_is_better=True)

## Trees, Forest, Gradient Boosts

### Tree

In [16]:
if not grid_search:
    # Pre-set hyperparameters.
    tree = DecisionTreeClassifier(max_depth=19,
                                  class_weight='balanced',
                                  random_state=3)
    tree.fit(X_train_norm, y_train)
else:
    # Search over tree depth, scoring each candidate on the validation set
    # with the class-balanced scorer.
    tree_param_grid = {
        'random_state': [3],
        'max_depth': range(2, 20),
        'class_weight': ['balanced'],
    }
    tree_gs = GridSearch(model=DecisionTreeClassifier(),
                         param_grid=tree_param_grid,
                         parallelize=False)
    tree_gs.fit(X_train_norm, y_train, X_val_norm, y_val,
                scoring=balanced_scorer)
    print(tree_gs.best_params)
    tree = tree_gs.best_estimator_

### Forest

In [17]:
if not grid_search:
    # Pre-set hyperparameters.
    forest = RandomForestClassifier(n_estimators=100,
                                    max_depth=12,
                                    min_samples_split=5,
                                    class_weight='balanced',
                                    random_state=3)
    forest.fit(X_train_norm, y_train)
else:
    # Search over depth, split size, ensemble size and class weighting,
    # scoring each candidate on the validation set with the class-balanced
    # scorer.
    forest_param_grid = {
        'random_state': [3],
        'max_depth': range(2, 15),
        'min_samples_split': range(2, 8),
        'n_estimators': [100, 200, 300, 500],
        'class_weight': ['balanced', 'balanced_subsample'],
    }
    forest_gs = GridSearch(model=RandomForestClassifier(),
                           param_grid=forest_param_grid,
                           parallelize=False)
    forest_gs.fit(X_train_norm, y_train, X_val_norm, y_val,
                  scoring=balanced_scorer)
    print(forest_gs.best_params)
    forest = forest_gs.best_estimator_

### Gradient boosted trees

In [18]:
if grid_search:
    # Search over tree depth and number of boosting stages.
    gboost_param_grid = {'random_state': [3],
                         'max_depth': range(2, 15),
                         'n_estimators': [100, 200, 300, 500]}
    gboost_gs = GridSearch(model=GradientBoostingClassifier(),
                           param_grid=gboost_param_grid,
                           parallelize=False)
    # NOTE(review): unlike the tree and forest searches, this one uses the
    # search's default scoring rather than `balanced_scorer` -- confirm that
    # this is intentional.
    gboost_gs.fit(X_train_norm, y_train, X_val_norm, y_val)
    # Read the results from `gboost_gs` and bind the winner to `gboost`, the
    # same name the pre-set branch defines, so that the model exists
    # whichever branch runs.
    print(gboost_gs.best_params)
    gboost = gboost_gs.best_estimator_
else:
    # Pre-set hyperparameters.
    gboost = GradientBoostingClassifier(random_state=3,
                                        max_depth=14,
                                        n_estimators=150)
    gboost.fit(X_train_norm, y_train)

## Logistic regression

In [19]:
# Logistic-regression classifier with class-balanced weighting; the
# iteration cap is set explicitly to 1000.
logist = LogisticRegression(class_weight='balanced', max_iter=1000)
logist.fit(X_train_norm, y_train)

LogisticRegression(class_weight='balanced', max_iter=1000)

In [25]:
if save_model:
    # Persist the random forest under the name 'class_forest', together with
    # the filtered frame, the feature list and the three data splits.
    save_trained_model(
        'class_forest',
        forest, df,
        X_features,
        X_train_norm, y_train_full,
        X_val_norm, y_val_full,
        X_test_norm, y_test_full,
    )