In [31]:
# Enable IPython's autoreload extension in "reload everything" mode (2) so that
# imported modules are re-imported before each cell runs.
# NOTE(review): presumably this is here so edits made through the %%file cells
# are picked up without restarting the kernel — confirm.
%load_ext autoreload
%autoreload 2

In [17]:
%%file forall.py

import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score


# Error text raised by X_y_split, keyed by a short description of the problem.
error_messages = {
    "No clear target in training data":
        ("The training data must have "
         "exactly one more column than "
         "the test data."),
    "Training data has too many columns":
        ("The training data has more "
         "than one column different than "
         "the testing data: %s")
}


def X_y_split(X_train, X_test):
    """
    Determines which variable is the target
    and which are the features. Returns just
    the X and y data in the training dataset
    as a tuple.

    The target is taken to be the single column that
    is present in X_train but absent from X_test.

    Example usage:
    X, y = learn.X_y_split(X_train, X_test)

    Parameters
    ----------
    X_train: pandas dataframe
        The data that has the target in it.

    X_test: pandas dataframe
        The data that does not have the target in it.

    Returns
    -------
    (X, y): tuple
        X is a new dataframe holding every column of X_train
        except the target; y is the target column of X_train.
        X_train itself is not modified.

    Raises
    ------
    ValueError
        If X_train does not have exactly one more column than
        X_test, or if the column that differs between the two
        cannot be identified unambiguously.
    """
    n_train_cols = X_train.shape[1]
    n_test_cols = X_test.shape[1]

    if n_train_cols != n_test_cols + 1:
        msg = error_messages["No clear target in training data"]
        raise ValueError(msg)

    test_columns = set(X_test.columns)
    # The original compared X_test's columns with themselves, so the
    # target could never be found; the training columns belong here.
    train_columns = set(X_train.columns)
    target_columns = train_columns - test_columns
    if len(target_columns) > 1:
        key = "Training data has too many columns"
        msg_ = error_messages[key]
        msg = msg_ % str(target_columns)
        raise ValueError(msg)
    if not target_columns:
        # The column counts differ by one but no training-only column
        # name exists (e.g. duplicated column names), so there is
        # nothing that can be singled out as the target.
        msg = error_messages["No clear target in training data"]
        raise ValueError(msg)

    target = next(iter(target_columns))
    X = X_train.drop(target, axis=1)
    y = X_train[target]
    return X, y


Overwriting forall.py


In [8]:
%%file utils.py

import pandas as pd
from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split


def make_regression_data(source="boston",
                         missing_data=None,
                         categorical=None,
                         outliers=None):
    """
    Utility function to assist in loading different
    sample datasets. Returns training data (that
    contains the target) and testing data (that
    does not contain the target).

    Note: load_boston was removed from scikit-learn in
    version 1.2, so the "boston" source needs an older
    scikit-learn release.

    Parameters
    ----------
    source: string, optional (default="boston")
        The specific dataset to load. Options:
        - "boston": Boston housing dataset

    missing_data: bool or NoneType (default=None)
        To be implemented
        Determines if there is missing data

    categorical: bool or NoneType (default=None)
        To be implemented
        Determines if there is categorical data

    outliers: bool or NoneType (default=None)
        To be implemented
        Determines if there are outliers in the dataset

    Returns
    -------
    (X_train, X_test): tuple of pandas dataframes
        X_train holds the feature columns plus a "target"
        column; X_test holds only the feature columns.

    Raises
    ------
    ValueError
        If source is not one of the supported datasets.
    """
    # Fail loudly instead of silently returning the Boston data
    # for a dataset name that is not supported.
    if source != "boston":
        raise ValueError("Unknown source %r; supported sources: 'boston'"
                         % (source,))

    boston_data = load_boston()
    X = pd.DataFrame(data=boston_data.data,
                     columns=boston_data.feature_names)
    y = pd.Series(data=boston_data.target)
    # Fixed random_state keeps the 50/50 split identical between calls.
    X_train, X_test, y_train, _ = train_test_split(X,
                                                   y,
                                                   test_size=.5,
                                                   random_state=42)
    # assign() builds a new frame rather than writing into the frame
    # returned by train_test_split, which can trigger pandas'
    # SettingWithCopyWarning.
    X_train = X_train.assign(target=y_train)
    return X_train, X_test
    

Overwriting utils.py


In [19]:
%%file ../tests/model_tests.py
import unittest
from learn import utils

class TestUtils(unittest.TestCase):
    """Checks for the sample-data helpers in the utils module."""

    def test_making_regression_data_simple(self):
        """Training data has exactly one more column than the test data."""
        X_train, X_test = utils.make_regression_data()
        train_cols = X_train.columns
        test_cols = X_test.columns
        # assertEqual replaces assertEquals, a deprecated alias that was
        # removed in Python 3.12.
        self.assertEqual(len(train_cols), len(test_cols) + 1)

class TestXYSplit(unittest.TestCase):
    # Placeholder test case: it defines no test methods yet.
    # TODO: presumably meant to cover X_y_split — confirm and add tests.
    pass

# Run every test case defined in this module when the file is executed directly.
if __name__ == '__main__':
    unittest.main()

Overwriting ../tests/model_tests.py


In [20]:
!python -m unittest


----------------------------------------------------------------------
Ran 0 tests in 0.000s

OK


In [8]:
# Load the Boston housing data into a feature frame X and a target series y.
# NOTE(review): load_boston, pd and train_test_split are not imported in any
# notebook cell visible here — confirm an earlier cell brings them into scope.
boston_data = load_boston()

X = pd.DataFrame(data=boston_data.data, columns=boston_data.feature_names)
y = pd.Series(data=boston_data.target)

# Hold out half of the rows for testing; the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.5, random_state=42)

In [35]:
import learn

In [36]:
# NOTE(review): the recorded output shows this call raising ValueError. The
# X_train produced by the train_test_split cell holds only feature columns
# (same columns as X_test, no target) — confirm whether this cell is meant to
# demonstrate the error or should pass a frame that includes the target.
learn.X_y_split(X_train=X_train, X_test=X_test)

ValueError: The training data must have exactly one more column than the test data

In [22]:
# Fit a 100-tree random forest; oob_score=True makes the out-of-bag
# predictions available as model.oob_prediction_, which a later cell reads.
# NOTE(review): no random_state is passed, so the fitted model may differ
# between runs. RandomForestRegressor is also not imported in any notebook
# cell visible here — confirm an earlier cell provides it.
model = RandomForestRegressor(n_estimators=100, oob_score=True)
model.fit(X_train, y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_split=1e-07, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=100, n_jobs=1, oob_score=True, random_state=None,
           verbose=0, warm_start=False)

In [24]:
# Out-of-bag predictions for the rows the model was fitted on.
y_oob = model.oob_prediction_
# Predictions for the held-out test rows.
y_hat = model.predict(X_test)