In [169]:
import sys
import pandas as pd
import pickle
import numpy as np

from sklearn.cluster import KMeans

#make zaCode visible from this notebook
if '..' not in sys.path:
    sys.path.insert(0, '..')
    
from zaCode import DatasetManipulator
from zaCode import ClassifierTrainer
from zaCode import Validator
from zaCode import FileManager

In [170]:
initData = FileManager.get10kTrainingData()

In [171]:
initData = DatasetManipulator.dropMissingValues(initData)
initData = DatasetManipulator.normalizeSize(initData)
initData = DatasetManipulator.performColorCodeEngineering(initData)
initData = DatasetManipulator.performColorCodeEngineering(initData)

trainDataInit, testDataInit = DatasetManipulator.performTrainTestSplit(initData,0.25)

In [172]:
def getCustomerClusteringDataframe(trainData):
    
    medianColumns = ['colorCode','productGroup','deviceID','paymentMethod']
    meanColumns = ['normalisedSizeCode','price','rrp','quantity']
    
    medianData = trainData[medianColumns].groupby(trainData['customerID'])
    meanData = trainData[meanColumns].groupby(trainData['customerID'])
    
    dataMedianByCustomer = medianData.median()
    dataMeanByCustomer = meanData.mean()

    clusteringTrainData = dataMedianByCustomer.join(dataMeanByCustomer)
    
    return clusteringTrainData

In [178]:
def getKnownCustomerIDToPercentageReturnDict(trainData):
    print("Constructing PercentageReturn feature....")

    # avoid chain indexing warning
    trainDataCopy = trainData.copy()
    testDataCopy = testData.copy()

    # construct the dictionary only on the information in the training set
    dataByCustomer = trainDataCopy[['quantity', 'returnQuantity']].groupby(trainDataCopy['customerID'])

    dataSummedByCustomer = dataByCustomer.apply(sum)
    dataSummedByCustomer['percentageReturned'] = dataSummedByCustomer['returnQuantity'] / dataSummedByCustomer[
        'quantity'].apply(lambda x: max(1, x))

    dataSummedByCustomer = dataSummedByCustomer.drop(['returnQuantity', 'quantity'], 1)

    customerIDtoPercentageReturnDict = dataSummedByCustomer.to_dict().get('percentageReturned')
    
    return customerIDtoPercentageReturnDict
    

In [198]:
def getFullCustomerIDToPercentageReturnDict(clusteringTrainData,clusteringTestData,knownCustomerIdToPercentageReturnDict):
   
    n_clusters = 100
    
    #compute the clusters based on the training data
    clusteringTrainDataValues = clusteringTrainData.values
    kMeans = KMeans(n_clusters=n_clusters)
    kMeans.fit(clusteringTrainDataValues)
    labels = kMeans.labels_
    centroids = kMeans.cluster_centers_
    
    #append the cluster index column to the dataframe
    trainDataCopy = clusteringTrainData.copy()
    trainDataCopy.loc[:, 'clusterIndex'] = labels
    trainDataCopy.loc[:, 'percentageReturned'] = trainDataCopy.index.map((lambda custId: knownCustomerIdToPercentageReturnDict[custId]))
    
    
    testDataCopy = clusteringTestData.copy()
    
    clusterLabelToPercentageReturnDict = {}

    #for each cluster, compute it's percentage return average based on the percReturn of the train data
    for i in range(n_clusters):
        customersInCluster = trainDataCopy[trainDataCopy['clusterIndex'] == i]
        average = customersInCluster['percentageReturned'].mean()
        clusterLabelToPercentageReturnDict[i] = average
        
    print(clusterLabelToPercentageReturnDict)
    
    #predict in which cluster the entries in the test data will be
    predictedTestLabels = kMeans.predict(testDataCopy)
    testDataCopy.loc[:, 'clusterIndex'] = predictedTestLabels
    
    #set the percReturn of that entry to the mean of that belonging cluster
    testDataCopy.loc[:, 'percentageReturned'] = testDataCopy['clusterIndex'].apply(lambda clusterIndex: clusterLabelToPercentageReturnDict[clusterIndex])
        
    
    testCustomerIdToPercentageReturnDict = testDataCopy.to_dict().get('percentageReturned')
    
    #merge the 2 dictionaries
    knownCustomerIdToPercentageReturnDict.update(testCustomerIdToPercentageReturnDict)
    
    return knownCustomerIdToPercentageReturnDict

In [199]:
clusteringTrainData = getCustomerClusteringDataframe(trainDataInit)
clusteringTestData = getCustomerClusteringDataframe(testDataInit)

knownCustomerIdToPercentageReturnDict = getKnownCustomerIDToPercentageReturnDict(trainDataInit)

knownCustomerIdToPercentageReturnDict= getFullCustomerIDToPercentageReturnDict(clusteringTrainData,clusteringTestData,knownCustomerIdToPercentageReturnDict)

Constructing PercentageReturn feature....
Known: Unique customers: (2450,)
Known: Dict Length: 2450
{0: 0.4564516129032258, 1: 0.19230769230769232, 2: 0.4940199335548172, 3: 0.5323529411764706, 4: 0.4375, 5: 0.7222222222222222, 6: 0.9, 7: 0.5710144927536233, 8: 0.53125, 9: 0.36974637681159417, 10: 0.5694170771756979, 11: 0.557936507936508, 12: 0.2802721088435374, 13: 0.6666666666666666, 14: 0.47238372093023245, 15: 0.34175734175734174, 16: 1.0, 17: 0.385863697705803, 18: 0.6604238354238355, 19: 0.75, 20: 0.23990310764504316, 21: 0.44098780007870914, 22: 0.5173611111111112, 23: 1.0, 24: 0.48553275920297195, 25: 0.5729476405946996, 26: 0.5102564102564102, 27: 0.7933333333333333, 28: 0.5833333333333334, 29: 0.5438596491228069, 30: 0.5009872241579559, 31: 0.5094202898550725, 32: 0.4, 33: 0.2952380952380952, 34: 0.5222222222222223, 35: 1.0, 36: 0.5361077481840193, 37: 0.5195011337868481, 38: 0.5072649572649574, 39: 0.625, 40: 0.5114942528735632, 41: 0.8333333333333333, 42: 0.5, 43: 0.430769

In [201]:
testDataInit.loc[:, 'percentageReturned'] = testDataInit['customerID'].apply(lambda custId: knownCustomerIdToPercentageReturnDict[custId])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


In [212]:
print (testDataInit[(testDataInit.percentageReturned > 0.5) & (testDataInit.returnQuantity == 1)].shape)
print (testDataInit[(testDataInit.percentageReturned > 0.5) & (testDataInit.returnQuantity == 0)].shape)

print (testDataInit[(testDataInit.percentageReturned < 0.5) & (testDataInit.returnQuantity == 1)].shape)
print (testDataInit[(testDataInit.percentageReturned < 0.5) & (testDataInit.returnQuantity == 0)].shape)

(741, 16)
(432, 16)
(647, 16)
(663, 16)


In [214]:
testDataInit['percentageReturned'].values

array([ 0.43347545,  0.43347545,  0.64295635, ...,  0.50726496,
        0.32362698,  0.69444444])