Preliminary class construction for the ASSEMBLE.AdaBoost model, based on "Exploiting Unlabeled Data in Ensemble Methods" by Bennett et al.

In [51]:
from sklearn import preprocessing as pp
from sklearn.model_selection import train_test_split as ttsplit
import math as m
import pandas as pd
import numpy as np

In [151]:
# Read the semicolon-separated fraud dataset (first column becomes the index)
# and keep only the first 100 rows to keep experiments fast.
data = pd.read_csv('mle_fraud_test.csv', sep=';', index_col=0).iloc[:100]
data

Unnamed: 0_level_0,user_id,order_created_datetime,amount,total_amount_14days,email_handle_length,email_handle_dst_char,total_nb_orders_player,player_seniority,total_nb_play_sessions,geographic_distance_risk,transaction_status
order_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,499120842,15/01/2019 00:00,23.754760,0.000000,11,10,3,2000,84,0,LEGIT
2,425189022,15/01/2019 00:00,17.903987,20.805420,13,9,29,1035,40,0,LEGIT
3,975417854,15/01/2019 00:00,16.450540,17.887367,13,11,4,780,12,0,LEGIT
4,1836800872,15/01/2019 00:00,16.450540,0.000000,13,10,4,444,66,0,LEGIT
5,3411980586,15/01/2019 00:00,16.530277,17.892307,5,5,46,145,95,0,LEGIT
...,...,...,...,...,...,...,...,...,...,...,...
96,591348478,15/01/2019 00:07,17.903987,17.909567,10,8,4,2203,242,0,LEGIT
97,3821161912,15/01/2019 00:07,16.602007,0.000000,10,7,0,56,10,0,LEGIT
98,2682042314,15/01/2019 00:07,17.903987,39.745480,10,8,92,384,30,0,LEGIT
99,3772603188,15/01/2019 00:08,16.530277,0.000000,11,8,3,77,14,0,LEGIT


In [168]:
class fraudDetectionInputData():
    """Hold a fraud dataset split into labelled/unlabelled training and test parts.

    Labels are integer-encoded by ``fraudDetectedToInt``: 1 for 'FRAUD',
    0 for 'LEGIT' and -1 for anything else (treated as "unlabelled").

    Attributes:
        labelledTrainingData / unlabelledTrainingData: training features,
            split on whether the row's label is -1.
        labelledTestData / unlabelledTestData: same split for the test set.
        labels: encoded labels of the whole imported dataset.
        trainingLabels: labels of the labelled training rows; after
            ``buildPseudoClasses`` the pseudo-labels of the unlabelled
            training rows are appended after them.
        testLabels: labels of the labelled test rows.
        isLabelled: boolean mask over the training rows (True = real label).
        distanceMatrix: list of lists, one row per unlabelled training
            sample and one column per labelled training sample (L1 distance).
        norm: norm name handed to ``sklearn.preprocessing.normalize``.
    """

    def __init__(self):
        self.labelledTrainingData = pd.DataFrame()
        self.unlabelledTrainingData = pd.DataFrame()

        self.labelledTestData = pd.DataFrame()
        self.unlabelledTestData = pd.DataFrame()

        # Explicit dtypes: an empty Series without one has an ambiguous
        # dtype (and emits a warning on some pandas versions).
        self.labels = pd.Series(dtype=int)
        self.trainingLabels = pd.Series(dtype=int)
        self.testLabels = pd.Series(dtype=int)

        self.isLabelled = pd.Series(dtype=bool)

        self.distanceMatrix = [[]]
        self.norm = 'l1'

    def importData(self, data, test_rate, random_state=None):
        """Encode labels, drop identifier columns and split into train/test.

        Args:
            data: DataFrame containing at least the columns 'user_id',
                'order_created_datetime' and 'transaction_status'.
                It is not modified.
            test_rate: forwarded as ``test_size`` to ``train_test_split``.
            random_state: optional seed forwarded to ``train_test_split``
                to make the split reproducible.

        Raises:
            KeyError: if one of the expected columns is missing.
        """
        # fraudDetectedToInt returns a fresh frame, so the caller's data is safe.
        tmp = self.fraudDetectedToInt(data)
        tmp = tmp.drop(columns=['user_id', 'order_created_datetime'])

        self.labels = tmp['transaction_status']
        features = tmp.drop(columns='transaction_status')

        # Assumes uniform distribution of unlabelled data in the dataset,
        # could be refined with rearranging unlabelled data in training/test sets.
        XTrain, XTest, YTrain, YTest = ttsplit(features,
                                               self.labels,
                                               test_size=test_rate,
                                               random_state=random_state)

        trainHasLabel = YTrain != -1
        testHasLabel = YTest != -1

        self.labelledTrainingData = XTrain[trainHasLabel]
        self.unlabelledTrainingData = XTrain[~trainHasLabel]

        self.labelledTestData = XTest[testHasLabel]
        self.unlabelledTestData = XTest[~testHasLabel]

        # Keep the known labels; buildPseudoClasses reads (and extends) them.
        self.trainingLabels = YTrain[trainHasLabel]
        self.testLabels = YTest[testHasLabel]

        self.isLabelled = trainHasLabel

    def fraudDetectedToInt(self, data):
        """Return a copy of ``data`` with 'transaction_status' integer-encoded.

        'FRAUD' becomes 1, 'LEGIT' becomes 0 and every other value
        (including missing ones) becomes -1. The input frame is left untouched.
        """
        tmp = data.copy()
        status = tmp['transaction_status']
        tmp['transaction_status'] = np.where(status == 'FRAUD', 1,
                                             np.where(status == 'LEGIT', 0, -1))
        return tmp

    def normalizeData(self, labelled, unlabelled):
        """Normalise labelled and unlabelled features together, column-wise.

        Both frames are stacked so that each column is scaled with the same
        factor (``self.norm`` norm along axis 0), then split back.

        Returns:
            (labelledNorm, unlabelledNorm): normalised frames that keep the
            original indices.
        """
        nLabelled = len(labelled)

        combined = pd.concat([labelled, unlabelled])
        scaled = pp.normalize(combined.values, norm=self.norm, axis=0)
        combined = pd.DataFrame(data=scaled,
                                columns=combined.columns,
                                index=combined.index)

        # Split by position rather than by index label so that duplicate or
        # overlapping indices between the two frames cannot mix the rows up.
        labelledNorm = combined.iloc[:nLabelled]
        unlabelledNorm = combined.iloc[nLabelled:]

        return labelledNorm, unlabelledNorm

    def buildDistanceMatrix(self):
        """Compute L1 distances from every unlabelled to every labelled training row.

        The result is stored in ``self.distanceMatrix`` as a list of lists
        with one row per unlabelled training sample and one column per
        labelled training sample.
        """
        labelled = self.labelledTrainingData
        # NOTE(review): the previous version measured distances against
        # unlabelledTestData, but the pseudo-classes derived from this matrix
        # extend the *training* labels, so the unlabelled training rows are
        # used here.
        # Align columns on the labelled frame, as Series subtraction would.
        unlabelled = self.unlabelledTrainingData.reindex(columns=labelled.columns)

        labelledValues = labelled.to_numpy(dtype=float)
        unlabelledValues = unlabelled.to_numpy(dtype=float)

        # Broadcast to shape (u, l, n_features); nansum mirrors pandas'
        # default of skipping missing values when summing.
        gaps = np.abs(unlabelledValues[:, np.newaxis, :]
                      - labelledValues[np.newaxis, :, :])
        self.distanceMatrix = np.nansum(gaps, axis=2).tolist()

    def buildPseudoClasses(self):
        """Give each unlabelled training row the label of its nearest labelled row.

        Uses ``self.distanceMatrix`` (rows: unlabelled, columns: labelled).
        ``self.trainingLabels`` is replaced by the known labels followed by
        the pseudo-labels; calling the method again does not append twice.

        Raises:
            ValueError: if the matrix has no labelled column, or if fewer
                training labels are stored than the matrix has columns.
        """
        distances = self.distanceMatrix
        if not distances:
            # No unlabelled sample: nothing to pseudo-label.
            return

        nLabelled = len(distances[0])
        nUnlabelled = len(distances)

        if nLabelled == 0:
            raise ValueError("distance matrix has no labelled samples; "
                             "call buildDistanceMatrix() on imported data first")
        if len(self.trainingLabels) < nLabelled:
            raise ValueError("trainingLabels holds fewer labels than the "
                             "distance matrix has labelled columns")

        # Only the first nLabelled entries are real labels; anything after
        # them is pseudo-labels from an earlier call and gets recomputed.
        known = self.trainingLabels.iloc[:nLabelled]

        # Positional lookup: the labels are indexed by order id, not 0..l-1.
        nearest = np.argmin(np.asarray(distances, dtype=float), axis=1)

        if len(self.unlabelledTrainingData) == nUnlabelled:
            pseudoIndex = self.unlabelledTrainingData.index
        else:
            # Matrix was supplied independently of the stored frames.
            pseudoIndex = pd.RangeIndex(nLabelled, nLabelled + nUnlabelled)

        pseudo = pd.Series(known.to_numpy()[nearest],
                           index=pseudoIndex,
                           name=known.name)
        self.trainingLabels = pd.concat([known, pseudo])

In [169]:
# Create the input container, split the loaded data (20% held out as the
# test set) and pre-compute the distance matrix.
fdid = fraudDetectionInputData()
fdid.importData(data, test_rate=0.2)
fdid.buildDistanceMatrix()

  # Remove the CWD from sys.path while we load stuff.
  # This is added back by InteractiveShellApp.init_path()
  del sys.path[0]


In [170]:
# Show the boolean mask over the training rows: True where the encoded
# transaction_status is not -1, i.e. the row carries a real label.
fdid.isLabelled

order_id
25     True
54     True
43     True
62     True
75    False
      ...  
71    False
66     True
57     True
50     True
85     True
Name: transaction_status, Length: 80, dtype: bool

In [None]:
class assembleAdaboost():
    """Skeleton of the ASSEMBLE.AdaBoost semi-supervised boosting model.

    Attributes:
        alpha: model parameter (default 1); not used by the methods
            implemented so far.
        beta: share of the initial misclassification cost placed on the
            labelled samples; the unlabelled samples share ``1 - beta``.
        lab: number of labelled training samples.
        ulab: number of unlabelled training samples.
        pretrainedClassifiers: classifiers added through ``addModel``.
        newClassifiers / currentClassifier: placeholders, ``None`` until set.
        missclassificationCost: one cost per training sample
            (``lab + ulab`` entries), used as the sampling distribution.
        weights: classifier weights (empty until training is implemented).
    """

    def __init__(self, a=1, b=0.5, l=1, u=1):
        """Store the hyper-parameters and create empty model state.

        Args:
            a: value for ``alpha``.
            b: value for ``beta``.
            l: number of labelled samples.
            u: number of unlabelled samples.
        """
        self.alpha = a
        self.beta = b
        self.lab = l
        self.ulab = u

        self.pretrainedClassifiers = []
        # No estimator is chosen yet; these are filled in during training.
        self.newClassifiers = None
        self.currentClassifier = None
        self.missclassificationCost = [0 for _ in range(self.lab + self.ulab)]
        self.weights = []

    def initMissclassificationCost(self, isLabelled):
        """Initialise the per-sample misclassification costs.

        Labelled samples get ``beta / lab`` each and unlabelled samples get
        ``(1 - beta) / ulab`` each.

        Args:
            isLabelled: one-dimensional sequence (list, array or Series) of
                truthy/falsy flags, one per training sample, in sample order.

        Raises:
            ValueError: if the number of flags differs from ``lab + ulab``.
        """
        # Convert to a plain array so that a Series indexed by something
        # other than 0..n-1 is still read in positional order.
        flags = np.asarray(isLabelled, dtype=bool)
        expected = self.lab + self.ulab
        if flags.ndim != 1 or flags.size != expected:
            raise ValueError(
                f"expected {expected} labelled/unlabelled flags, got shape {flags.shape}"
            )

        self.missclassificationCost = [
            self.beta / self.lab if hasLabel else (1 - self.beta) / self.ulab
            for hasLabel in flags
        ]

    def sampleFromDistribution(self):
        """Draw ``lab`` sample indices according to the misclassification costs.

        Returns:
            numpy array of ``lab`` indices in ``[0, lab + ulab)``, drawn with
            replacement with probability proportional to each sample's cost.

        Raises:
            ValueError: if the cost vector has the wrong length or no
                positive mass.
        """
        size = self.lab + self.ulab
        costs = np.asarray(self.missclassificationCost, dtype=float)
        if costs.size != size:
            raise ValueError(
                f"missclassificationCost has {costs.size} entries, expected {size}"
            )

        total = costs.sum()
        if total <= 0:
            raise ValueError("missclassificationCost must contain positive mass; "
                             "call initMissclassificationCost() first")

        # Normalise so the costs remain a valid probability vector even if
        # they no longer sum exactly to one.
        return np.random.choice(np.arange(size), self.lab, p=costs / total)

    def addModel(self, model):
        """Append ``model`` to the list of classifiers held by the ensemble."""
        # NOTE(review): the original appended to an undefined name `models`;
        # pretrainedClassifiers is the only classifier list on this class,
        # so it is used here - confirm this is the intended container.
        self.pretrainedClassifiers.append(model)
        
    