In [1]:
%pylab inline

import pandas as pd

Populating the interactive namespace from numpy and matplotlib


In [2]:
# Load the competition training quotes and turn the quote dates (read as
# strings) into real timestamps, then preview the first five rows.
train_set = pd.read_csv('data/competition_data/train_set.csv')
train_set['quote_date'] = pd.to_datetime(train_set['quote_date'])
train_set.head()

Unnamed: 0,tube_assembly_id,supplier,quote_date,annual_usage,min_order_quantity,bracket_pricing,quantity,cost
0,TA-00002,S-0066,2013-07-07,0,0,Yes,1,21.905933
1,TA-00002,S-0066,2013-07-07,0,0,Yes,2,12.341214
2,TA-00002,S-0066,2013-07-07,0,0,Yes,5,6.601826
3,TA-00002,S-0066,2013-07-07,0,0,Yes,10,4.68777
4,TA-00002,S-0066,2013-07-07,0,0,Yes,25,3.541561


In [40]:
class SupplierFeaturizer(object):
    """
    Make binary indicator features for the most common suppliers.

    Usage: call build() on a training set to choose which suppliers get a
    feature, then call get_features() on any data set that has a 'supplier'
    column.

    Attributes (both None until build() has been called):
      feature_names: list of column names, 'supplier-<supplier id>', one per
        kept supplier.
      supplier_to_feature_index: dict mapping a kept supplier id to the
        position of its column in the feature matrix.
    """
    def __init__(self, min_count=10):
        # A supplier gets its own feature only if it occurs at least
        # min_count times (inclusive) in the set passed to build().
        self.min_count = min_count
        self.feature_names = None
        self.supplier_to_feature_index = None

    def build(self, train_set):
        """
        Choose the suppliers that get a feature.

        A supplier is kept when it occurs at least min_count times in
        train_set['supplier']. Columns are ordered as value_counts() orders
        the suppliers. Any previously built state is replaced.
        """
        counts = train_set['supplier'].value_counts()
        kept_suppliers = list(counts[counts >= self.min_count].index)
        self.feature_names = [
            'supplier-{}'.format(supplier) for supplier in kept_suppliers]
        self.supplier_to_feature_index = dict(
            (supplier, position)
            for position, supplier in enumerate(kept_suppliers))

    def get_features(self, data_set):
        """
        Return a boolean DataFrame with one row per row of data_set (same
        index) and one column per kept supplier.

        A cell is True when the row's supplier is the column's supplier.
        Rows whose supplier has no feature are all False. build() must have
        been called first.
        """
        feats = np.zeros((len(data_set), len(self.feature_names)), dtype=bool)
        # Look up every row's column position in one vectorized pass;
        # suppliers without a feature come back as NaN.
        col_positions = np.asarray(
            data_set['supplier'].map(self.supplier_to_feature_index),
            dtype=float)
        has_feature = ~np.isnan(col_positions)
        row_positions = np.nonzero(has_feature)[0]
        feats[row_positions, col_positions[has_feature].astype(int)] = True
        return pd.DataFrame(feats, index=data_set.index, columns=self.feature_names)

In [41]:
# Fit the supplier featurizer on the training quotes and preview the first
# five rows of the resulting indicator columns.
featurizer = SupplierFeaturizer()
featurizer.build(train_set)
feats = featurizer.get_features(train_set)
feats.head()

Unnamed: 0,supplier-S-0066,supplier-S-0041,supplier-S-0072,supplier-S-0054,supplier-S-0026,supplier-S-0013,supplier-S-0058,supplier-S-0064,supplier-S-0062,supplier-S-0014,...,supplier-S-0105,supplier-S-0005,supplier-S-0031,supplier-S-0027,supplier-S-0042,supplier-S-0070,supplier-S-0043,supplier-S-0018,supplier-S-0080,supplier-S-0092
0,True,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,True,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,True,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,True,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,True,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [60]:
# A model doing ridge regression with an arbitrary featurizer.

from sklearn import linear_model

class RidgeRegressionLearner(object):
    """
    Ridge regression on top of an arbitrary featurizer.

    The featurizer must provide build(train_set) and get_features(data_set);
    the object returned by get_features() must expose the feature matrix as
    its .values attribute.

    Args:
      featurizer: feature builder as described above. learn() calls its
        build() method, so its state is replaced on every learn().
      alpha: passed to linear_model.Ridge as its alpha argument.
      label_col: name of the target column in the training set.
      log_transform: when True (the default) the model is fit on
        log(1 + label) and predictions are mapped back with exp(p) - 1;
        when False the model is fit on the raw label. Use the same setting
        for learn() and predict().
    """
    def __init__(self, featurizer, alpha, label_col, log_transform=True):
        self.featurizer = featurizer
        self.alpha = alpha
        self.label_col = label_col
        self.log_transform = log_transform
        self.reset()

    def reset(self):
        """Discard the fitted model; learn() must be called again before predict()."""
        self.model = None

    def learn(self, train_set):
        """Build the featurizer on train_set and fit a fresh Ridge model to it."""
        self.featurizer.build(train_set)
        X = self.featurizer.get_features(train_set).values
        y = train_set[self.label_col].values
        if self.log_transform:
            # log1p(y) is log(1 + y), computed accurately even for y near 0.
            y = np.log1p(y)
        self.model = linear_model.Ridge(alpha=self.alpha)
        self.model.fit(X, y)

    def predict(self, test_set):
        """Return the predicted label for every row of test_set, in the label's original units."""
        X = self.featurizer.get_features(test_set).values
        y = self.model.predict(X)
        if self.log_transform:
            # expm1(p) is exp(p) - 1, the inverse of the log1p applied in learn().
            y = np.expm1(y)
        return y

In [61]:
# Fit a ridge model that predicts 'cost' from the supplier indicator features.
featurizer = SupplierFeaturizer()
learner = RidgeRegressionLearner(
    featurizer=featurizer, alpha=0.5, label_col='cost')
learner.learn(train_set)

In [62]:
# Show the fitted model's coefficient vector and its intercept.
learner.model.coef_, learner.model.intercept_

(array([-0.42208769, -0.39341496, -0.50098472, -0.2977123 ,  0.76855134,
        -0.62176915, -0.19976114,  0.43048992, -0.31998538,  0.03002705,
        -0.75837908, -0.09586676, -0.53797469,  0.44813174, -0.64952044,
        -0.39518215, -1.21689243, -0.76774562, -0.33987412, -0.09064254,
        -1.3581313 ,  0.57120965,  0.37612342]), 2.57636247706944)

In [63]:
# Predict the label for the training rows themselves (in-sample predictions).
learner.predict(train_set)

array([  7.62163542,   7.62163542,   7.62163542, ...,   7.87241938,
         8.54843343,  27.35813155])

In [64]:
from common import calc_loss
from common import cross_validation_eval

# Evaluate a fresh learner with cross_validation_eval and report the mean and
# standard deviation of the losses it returns.
learner = RidgeRegressionLearner(featurizer, alpha=0.5, label_col='cost')
losses = cross_validation_eval(learner, train_set, 'cost')
# Single-argument print(...) prints the same text as the Python 2 print
# statement and is also valid Python 3.
print("{}: loss avg {} std {}".format(
    learner, np.mean(losses), np.std(losses)))

<__main__.RidgeRegressionLearner object at 0x51f8590>: loss avg 0.793266622048 std 0.0106469131208
