In [89]:
import sys
import pandas as pd
import pickle
import numpy as np
from sklearn.preprocessing import Imputer,StandardScaler,PolynomialFeatures


In [90]:
# Semicolon-delimited input file. Forward slashes are used so the relative
# path resolves on both Windows and POSIX systems (a backslash is an ordinary
# file-name character outside Windows).
# inputFileName = '../data/orders_train_short.txt'
inputFileName = '../data/orders_train_10k.txt'
# skipinitialspace drops the blanks that follow each ';' separator.
data = pd.read_csv(inputFileName, delimiter=';', skipinitialspace=True)


In [91]:
# Restrict the frame to the listed columns, then discard every row that has a
# missing value in any of them. Note that dropna() keeps the original index
# labels, so the resulting index may contain gaps.
keptColumns = [
    'colorCode',
    'quantity',
    'price',
    'rrp',
    'deviceID',
    'paymentMethod',
    'voucherAmount',
]
data = data[keptColumns].copy().dropna()

In [92]:

def constructPolynomialFeatures(data):
    """Append pairwise interaction features of the standardized numeric columns.

    The columns 'quantity', 'price' and 'rrp' are standardized with
    ``StandardScaler`` and expanded with
    ``PolynomialFeatures(interaction_only=True, include_bias=False)``.
    The first-order terms are discarded because those columns already exist
    in ``data``; only the interaction terms are appended.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns 'quantity', 'price' and 'rrp', convertible
        to float. The frame is not modified. Its index is assumed to hold
        unique labels, since the result is assembled with an index join.

    Returns
    -------
    pandas.DataFrame
        ``data`` plus one column per interaction term, row-aligned with the
        input. Column names follow the pattern ``'quantity^1 + price^1'``;
        despite the ``' + '`` separator each such column holds the *product*
        of the standardized features (the naming is kept for compatibility
        with code that already refers to these columns).
    """
    features = ['quantity', 'price', 'rrp']

    # Standardize the selected columns. `.values.astype(float)` replaces the
    # removed `DataFrame.as_matrix()` and the removed `np.float` alias.
    dataMatrix = data[features].values.astype(float)
    dataMatrix = StandardScaler().fit_transform(dataMatrix)

    # Expand into first-order terms plus pairwise interaction terms.
    polynomialFeatures = PolynomialFeatures(interaction_only=True, include_bias=False)
    newColumnsMatrix = polynomialFeatures.fit_transform(dataMatrix)

    # fit_transform only returns a bare matrix, so rebuild a readable name for
    # each output column from the exponent rows exposed in `powers_`.
    newColumnsNames = []
    for entry in polynomialFeatures.powers_:
        newFeature = [
            feat + '^' + str(coef)
            for feat, coef in zip(features, entry)
            if coef > 0
        ]
        newColumnsNames.append(' + '.join(newFeature) if newFeature else "1")

    # Reuse the caller's index: `join` aligns on index labels, and `data` may
    # have a gapped index (e.g. after dropna()). A fresh 0..n-1 index here
    # would attach features to the wrong rows or leave NaNs behind.
    newColumnsDataFrame = pd.DataFrame(
        newColumnsMatrix, columns=newColumnsNames, index=data.index
    )

    # Drop the features which are themselves to the power 1 (they already
    # exist in `data`, unscaled). `axis` is passed by keyword because newer
    # pandas versions no longer accept it positionally.
    newColumnsToBeDeleted = [featureName + "^1" for featureName in features]
    newColumnsDataFrame = newColumnsDataFrame.drop(newColumnsToBeDeleted, axis=1)

    return data.join(newColumnsDataFrame)


In [94]:
# Build the augmented frame: the kept columns plus the interaction features
# returned by constructPolynomialFeatures.
enhancedData = constructPolynomialFeatures(data)

In [95]:
# Bare expression: the notebook renders the augmented frame as the cell output.
enhancedData

Unnamed: 0,colorCode,quantity,price,rrp,deviceID,paymentMethod,voucherAmount,quantity^1 + price^1,quantity^1 + rrp^1,price^1 + rrp^1
0,1972,1,10.00,29.99,2,BPRG,0.0,0.028195,0.014185,0.666057
1,3854,1,20.00,39.99,2,BPRG,0.0,0.018804,0.006309,0.197576
2,2974,1,35.00,49.99,4,BPRG,0.0,0.004718,-0.001567,-0.012309
3,1992,1,49.99,49.99,4,BPRG,0.0,-0.009359,-0.001567,0.024419
4,1968,1,10.00,35.99,2,PAYPALVC,0.0,0.028195,0.009460,0.444171
5,1972,1,10.00,35.99,2,PAYPALVC,0.0,0.028195,0.009460,0.444171
6,1001,1,25.00,39.99,2,PAYPALVC,0.0,0.014108,0.006309,0.148240
7,3976,1,15.00,39.99,2,PAYPALVC,0.0,0.023499,0.006309,0.246911
8,2493,0,0.00,59.99,2,PAYPALVC,0.0,8.984711,-2.257286,-0.591048
9,1001,1,89.99,89.99,3,BPRG,0.0,-0.046922,-0.033071,2.584226


In [None]:
# Show the first rows of the augmented frame. `head` must be called; without
# the parentheses the cell only displays the bound-method repr.
enhancedData.head()