# Useful Functions:

In [57]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import roc_auc_score
%pylab inline

Populating the interactive namespace from numpy and matplotlib


In [58]:
# Module-level working DataFrame. The helpers in this file that take no
# DataFrame argument read (and in some cases modify) this global `X`.
# It starts out empty here.
X = pd.DataFrame()

In [59]:
# Here is a simple function to show descriptive stats on the categorical variables
def describe_categorical(X):
    """
    Show ``.describe()`` output for the object-dtype columns of ``X`` only.

    The columns whose dtype compares equal to "object" are selected, their
    summary table is rendered to HTML and displayed through IPython.
    Nothing is returned.
    """
    from IPython.display import HTML, display

    object_columns = X.columns[X.dtypes == "object"]
    summary_html = X[object_columns].describe().to_html()
    display(HTML(summary_html))

In [60]:
# For a df named 'X', this function fills the missing values of each listed column with that column's mean.
def fix_numerical_missings(numerical_variables):
    """
    Fill missing values in numeric columns of the global DataFrame ``X``.

    Parameters
    ----------
    numerical_variables = iterable of column labels in ``X``; each column's
        missing values are replaced with that column's own mean.

    The global ``X`` is modified; nothing is returned.
    """
    for var in numerical_variables:
        # Assign the filled column back instead of calling fillna(inplace=True)
        # on X[var]: the in-place form writes to an intermediate Series, which
        # pandas warns about and which does not update X under Copy-on-Write.
        X[var] = X[var].fillna(X[var].mean())

In [61]:
# For a df named 'X', this function will mark all categorical missing values with "Missing" across the entire data set.
# and also will one hot encode the variables using 'get_dummies'. This function is applied to the entire df 'X'

def fix_categorical_missings(categorical_variables):
    """
    Label missing categoricals as "Missing" and one-hot encode them in ``X``.

    Parameters
    ----------
    categorical_variables = iterable of column labels in the global ``X``.

    For every listed column, missing values are replaced by the string
    "Missing", dummy columns are created with ``pd.get_dummies`` (prefixed with
    the column name) and appended to ``X``, and the original column is dropped.
    The global ``X`` is rebound to the resulting DataFrame; nothing is returned.
    """
    # X is rebound below, so it has to be declared global. Without this the
    # assignment makes X a local name and the first read of X raises
    # UnboundLocalError.
    global X

    for variable in categorical_variables:
        # Fill missing data with the word "Missing"
        filled = X[variable].fillna("Missing")
        # Create the dummy columns
        dummies = pd.get_dummies(filled, prefix=variable)
        # Replace the source column with its dummies (appended at the end)
        X = pd.concat([X.drop([variable], axis=1), dummies], axis=1)

In [62]:
# This will clean a categorical variable by replacing every value with only its first letter, or "None" when the value is missing.
# This can be a very useful tactic when applied to variables such as 'cabin' from the Titanic dataset.

# Change the Cabin variable to be only the first letter or None

def clean_first_letter_only(X, var_name):
    """
    Reduce every value of column ``var_name`` in ``X`` to its first character.

    Parameters
    ----------
    X = the DataFrame to modify (the column is overwritten in place).
    var_name = label of the column to clean.

    Values that cannot be indexed (for example None or NaN) and empty strings
    become the string "None". Nothing is returned.
    """
    def first_letter(value):
        try:
            return value[0]
        except (TypeError, IndexError):
            # TypeError: missing values (None / float NaN) are not subscriptable.
            # IndexError: an empty string has no first character.
            return "None"

    X[var_name] = X[var_name].apply(first_letter)

In [63]:
# This function will take a feature name, a string or char to replace, and what to replace it with. Entire column.
# Note: this is set with '.astype(float)' , adjust this to match the feature data type.

def strip_replacein_feature(var_name, string_to_replace, replace_with_string):
    """
    Replace a literal substring throughout one column of the global ``X``.

    Parameters
    ----------
    var_name = label of the string column in ``X`` to rewrite.
    string_to_replace = literal text to look for (not a regular expression).
    replace_with_string = text to substitute for each occurrence.

    After the replacement the column is converted with ``.astype(float)``;
    adjust that cast to match the feature's data type. Nothing is returned.
    """
    # regex=False pins literal matching: the default of `regex` differs between
    # pandas versions, and characters such as "$" or "." are regex metacharacters.
    replaced = X[var_name].str.replace(string_to_replace, replace_with_string, regex=False)
    X[var_name] = replaced.astype(float)

In [64]:
# Look at all the columns in the dataset
def printall(X, max_rows=10):
    """
    Display ``X`` as an HTML table so that all of its columns are visible.

    ``max_rows`` is forwarded to ``DataFrame.to_html``. Nothing is returned.
    """
    from IPython.display import HTML, display

    table_html = X.to_html(max_rows=max_rows)
    display(HTML(table_html))

In [65]:
def graph_variable_importance(model):
    """
    Plot every feature importance of ``model`` as a horizontal bar chart.

    The importances are labelled with the columns of the global ``X`` and
    drawn in ascending order. Nothing is returned.
    """
    importances = pd.Series(model.feature_importances_, index=X.columns)
    ranked = importances.sort_values()
    ranked.plot(kind="barh", figsize=(7, 6))

In [66]:
# For a df named 'X', returns a list of only the numerical feature labels.
def get_numerical_only():
    """
    Return the labels of the columns of the global ``X`` whose dtype is not "object".
    """
    column_dtypes = X.dtypes
    non_object = column_dtypes[column_dtypes != "object"]
    return list(non_object.index)

In [67]:
# For a df named 'X', returns a list of only the categorical feature labels.
def get_categoricals_only():
    """
    Return the labels of the columns of the global ``X`` whose dtype is "object".
    """
    # The former bare `X[categorical_variables].shape` expression was removed:
    # its value was discarded, so it only built a throwaway DataFrame.
    categorical_variables = list(X.dtypes[X.dtypes == 'object'].index)
    return categorical_variables

In [68]:
# Manual parameter tuning for RandomForestRegressor() -- n_estimators
def get_best_njobs():
    """
    Sweep ``n_estimators`` for a RandomForestRegressor and plot the results.

    Despite the name, the value being tuned is the number of trees; ``n_jobs``
    stays fixed at -1. For each candidate a forest is fitted on the globals
    ``X`` and ``y``, the ROC AUC of its out-of-bag predictions is printed, and
    finally the scores are drawn as a line plot indexed by tree count.
    Nothing is returned.
    """
    tree_counts = [30, 50, 100, 200, 500, 1000, 2000]
    c_stats = []

    for n_trees in tree_counts:
        forest = RandomForestRegressor(n_trees, oob_score=True, n_jobs=-1, random_state=42)
        forest.fit(X, y)
        print (n_trees, "trees")
        c_stat = roc_auc_score(y, forest.oob_prediction_)
        print ("C-stat: ", c_stat)
        c_stats.append(c_stat)
        print ("")

    pd.Series(c_stats, index=tree_counts).plot()

In [69]:
# Manual parameter tuning for RandomForestRegressor() -- max_features
def get_best_max_features(xlim=(.85, .88)):
    """
    Sweep ``max_features`` for a RandomForestRegressor and plot the results.

    Parameters
    ----------
    xlim = (low, high) limits of the score axis of the bar chart. The default
        keeps the original hard-coded window; pass a wider range (or None) if
        your scores fall outside it.

    For each option a 1000-tree forest is fitted on the globals ``X`` and
    ``y``, the ROC AUC of its out-of-bag predictions is printed, and the scores
    are drawn as a horizontal bar chart. Nothing is returned.
    """
    results = []
    # "auto" is intentionally not listed: for regressors it meant "use all
    # features" (the same as None), and scikit-learn 1.3 removed it, so
    # passing it makes fit() raise on current versions.
    max_features_options = [None, "sqrt", "log2", 0.9, 0.2]

    for max_features in max_features_options:
        model = RandomForestRegressor(n_estimators=1000, oob_score=True, n_jobs=-1,
                                      random_state=42, max_features=max_features)
        model.fit(X, y)
        print (max_features, "option")
        roc = roc_auc_score(y, model.oob_prediction_)
        print ("C-stat: ", roc)
        results.append(roc)
        print ("")

    pd.Series(results, max_features_options).plot(kind="barh", xlim=xlim)

In [70]:
# Manual parameter tuning for RandomForestRegressor() -- min_samples_leaf
def get_best_min_samples_leaf():
    """
    Sweep ``min_samples_leaf`` (1 through 10) for a RandomForestRegressor.

    For each value a 1000-tree forest is fitted on the globals ``X`` and ``y``,
    the ROC AUC of its out-of-bag predictions is printed, and the scores are
    drawn as a line plot indexed by ``min_samples_leaf``. Nothing is returned.
    """
    results = []
    min_samples_leaf_options = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]

    for min_samples in min_samples_leaf_options:
        model = RandomForestRegressor(n_estimators=1000,
                                      oob_score=True,
                                      n_jobs=-1,
                                      random_state=42,
                                      # None = consider all features. This is what
                                      # "auto" meant for regressors; "auto" itself
                                      # was removed in scikit-learn 1.3.
                                      max_features=None,
                                      min_samples_leaf=min_samples)
        model.fit(X, y)
        print (min_samples, "min samples")
        roc = roc_auc_score(y, model.oob_prediction_)
        print ("C-stat: ", roc)
        results.append(roc)
        print ("")

    pd.Series(results, min_samples_leaf_options).plot()

In [71]:
# A much more complex version of plotting for feature importances, with aggregated view.
# Provided by Mike Bernico -- only verified to work with RandomForestRegressor() models.

def graph_feature_importances(model, feature_names, autoscale=True, headroom=0.05, width=10, summarized_columns=None):
    """
    By Mike Bernico

    Graphs the feature importances of a random decision forest using a horizontal bar chart.
    Probably works but untested on other sklearn.ensembles.

    Parameters
    ----------
    model = The fitted ensemble whose ``feature_importances_`` you would like graphed.
    feature_names = A list of the names of those features, displayed on the Y axis.
    autoscale = True (Automatically adjust the X axis size to the largest feature + headroom) / False = scale from 0 to 1
    headroom = used with autoscale, .05 default
    width = figure width in inches
    summarized_columns = a list of column prefixes to summarize on, for dummy variables
        (e.g. ["day_"] would summarize all day_ vars). A feature is folded into a
        prefix when the prefix occurs anywhere in the feature name.

    Returns nothing; the chart is drawn with pandas' plotting.

    Example (run from the notebook, NOT from inside this function):
        graph_feature_importances(model, X.columns, summarized_columns=categorical_variables)
    """

    if autoscale:
        x_scale = model.feature_importances_.max() + headroom
    else:
        x_scale = 1

    feature_dict = dict(zip(feature_names, model.feature_importances_))

    if summarized_columns:
        # some dummy columns need to be summarized
        for col_name in summarized_columns:
            # collect the keys first so the dict is not mutated while iterating it
            matching_keys = [name for name in feature_dict if col_name in name]
            # remove every matching feature and store their total under col_name
            feature_dict[col_name] = sum(feature_dict.pop(name) for name in matching_keys)

    results = pd.Series(feature_dict)
    results = results.sort_values()
    results.plot(kind="barh", figsize=(width, len(results) / 4), xlim=(0, x_scale))
    # NOTE: the example call that used to sit here was indented into the
    # function body, which made the function call itself without end.

In [72]:
# Get c-stat for a RandomForestRegressor
def get_cstat():
    """
    Return the c-statistic (ROC AUC) of the global ``model``'s out-of-bag
    predictions measured against the global ``y``.
    """
    return roc_auc_score(y, model.oob_prediction_)

Parameters to test

 * ### Parameters that will make your model better
  * <b>n_estimators</b>: The number of trees in the forest. Choose as high of a number as your computer can handle.
  * <b>max_features</b>: The number of features to consider when looking for the best split. Try ["auto", None, "sqrt", "log2", 0.9, and 0.2].
  * <b>min_samples_leaf</b>: The minimum number of samples in newly created leaves. Try [1, 2, 3]. If 3 is the best, try higher numbers, such as 1 through 10.
 * ### Parameters that will make it easier to train your model
  * <b>n_jobs</b>: Determines whether multiple processors are used to train and test the model. Set this to -1 and use %%timeit to compare against a run with it set to 1; -1 should be much faster (especially when many trees are trained).

In [75]:
# Params: list form ['v1','v2','v3'], and contain the possible values of parameters to consider in a grid search
# with n_jobs=-1 (full processing power). This returns the estimator object to be stored in memory.
# Ex: estimator = gridSearchRFC(n_estimators_param, max_features_param, min_samples_split_param, min_samples_leaf_param)

def gridSearchRFC(n_estimators_param, max_features_param, min_samples_split_param, min_samples_leaf_param):
    """
    Grid-search a RandomForestClassifier on the globals ``X`` and ``y``.

    Parameters
    ----------
    n_estimators_param, max_features_param, min_samples_split_param,
    min_samples_leaf_param = the candidate values for the corresponding
        RandomForestClassifier parameter. Each may be a list/tuple of values
        (e.g. [100, 500, 1000]) or a single value (e.g. "sqrt"), which is
        treated as a one-element list.

    Returns
    -------
    The fitted ``GridSearchCV`` object (use ``.best_estimator_`` for the best
    model). Both the forest and the search run with ``n_jobs=-1``.
    """
    # Imported here because the notebook's import cell only brings in
    # RandomForestRegressor and roc_auc_score.
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.model_selection import GridSearchCV

    def as_candidates(value):
        # A non-string iterable is already a list of candidates. Wrapping it
        # again would hand GridSearchCV a nested list as the single candidate.
        if isinstance(value, str) or not hasattr(value, "__iter__"):
            return [value]
        return list(value)

    ### Grid Search
    param_grid = dict(n_estimators=as_candidates(n_estimators_param),
                      max_features=as_candidates(max_features_param),
                      min_samples_split=as_candidates(min_samples_split_param),
                      min_samples_leaf=as_candidates(min_samples_leaf_param))

    rfc = RandomForestClassifier(n_jobs=-1)

    estimator = GridSearchCV(rfc, param_grid, cv=None, n_jobs=-1)
    estimator.fit(X, y)
    return estimator

In [81]:
def get_kfolds_ci(num_k_folds):
    """
    K-Fold Cross Validation to use on the best_rfc model from a grid search
    (best_rfc = estimator.best_estimator_).

    Parameters
    ----------
    num_k_folds = the number of K-Folds to use, that is how many "chunks" to
        break the data into. If unsure, use K = 10.

    Returns
    -------
    (lower_bound, upper_bound) of the 95% confidence interval for the mean
    cross-validation score. The interval is also printed.

    Notes
    -----
    Reads the globals ``best_rfc``, ``data`` and ``y``.
    The interval is mean +/- t * s / sqrt(k), where s is the sample standard
    deviation of the fold scores (ddof=1) and t is the two-sided 95% critical
    value of Student's t with k - 1 degrees of freedom (about 2.262 for
    k = 10, which used to be hard-coded for every k).

    Setup
    -----
    Prior to running, follow a setup similar to this example preparation code:

        data = pd.read_csv("somedata.csv")
        y = data.pop("y_col_name")
        X_train, X_test, y_train, y_test = train_test_split(data, y, test_size=.2, random_state=42)
        estimator = gridSearchRFC(n_estimators_param, max_features_param, min_samples_split_param, min_samples_leaf_param)
        best_rfc = estimator.best_estimator_
        get_kfolds_ci(10)
    """
    # Local imports: `math` is not imported by the notebook's import cell, and
    # sklearn.cross_validation was removed from scikit-learn in favour of
    # sklearn.model_selection.
    import math

    from scipy import stats
    from sklearn.model_selection import cross_val_score

    scores = cross_val_score(best_rfc, data, y, cv=num_k_folds)
    n_scores = scores.shape[0]
    mean_score = scores.mean()
    std_error = scores.std(ddof=1) / math.sqrt(n_scores)
    # Two-sided 95% interval -> 0.975 quantile of t with n - 1 degrees of freedom.
    t_critical = stats.t.ppf(0.975, df=n_scores - 1)
    ci = t_critical * std_error
    lower_bound = mean_score - ci
    upper_bound = mean_score + ci

    print("95% Confidence Interval for K-Folds = ", num_k_folds, ":")
    print ("Score is %f +/-  %f" % (mean_score, ci))
    return lower_bound, upper_bound