# Train with the Test split?
This is what we want to do:

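1. predict batches of the test set with a classifier trained on the current training set, add each pseudo-labeled batch to the training set, and repeat with the next batch (see section 6)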
TODO: create list of contents

### 1.) set run-config and hyperparameters

In [1]:
# False: train on the reduced train split and evaluate on the validation split at the end.
# True:  train on the full train set; the validation cell at the end is skipped.
FINAL_SUBMISSION = False

# Number of test rows that are pseudo-labeled and added to the training set per iteration.
TEST_BATCH_SIZE = 10_000

### 2.) import python modules

In [2]:
import pandas as pd
import numpy as np
from xgboost import XGBClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn import metrics
from sklearn.preprocessing import MinMaxScaler
from IPython.display import display

### 3.) define helper functions

In [3]:
def calc_scores(y_test, y_pred):
    """
    Score binary fraud predictions against the true labels.

    Parameters:
        y_test: true labels (0 = no fraud, 1 = fraud)
        y_pred: predicted labels, same length as y_test

    Returns:
        (accuracy, dmc_score, confusion_matrix) where confusion_matrix is a nested
        2x2 list (rows = true class, columns = predicted class) and dmc_score is the
        sum of its cells weighted by [[0, -25], [-5, 5]].
    """
    accuracy = metrics.accuracy_score(y_test, y_pred)
    # Pin the label order: without it the matrix shrinks to 1x1 when only one class
    # occurs, and broadcasting against the 2x2 payoff matrix yields a wrong score.
    matrix = metrics.confusion_matrix(y_test, y_pred, labels=[0, 1])
    cell_payoffs = np.array([[0, -25], [-5, 5]])
    dmc_score = np.sum(matrix * cell_payoffs)
    return accuracy, dmc_score, matrix.tolist()

def find_nearest_neighbor_index(row, dataset):
    """
    Return the positional index of the row in `dataset` closest to `row`.

    Distance is the squared Euclidean distance, measured after min-max scaling
    with a scaler fitted on `dataset`.
    """
    min_max = MinMaxScaler()
    candidates_scaled = min_max.fit_transform(dataset)
    query_scaled = min_max.transform([row])[0]

    squared_distances = []
    for candidate in candidates_scaled:
        squared_distances.append(np.sum((query_scaled - candidate)**2))
    return np.argmin(squared_distances)

def get_classifier(name):
    """
    Return a new, unfitted classifier for the given short name.

    Parameters:
        name: 'xgb' for an XGBClassifier with default settings, or
              'svc' for a LinearSVC with the fixed hyperparameters below.

    Raises:
        KeyError: if `name` is not one of the known keys.
    """
    # Factories instead of instances: only the requested estimator gets constructed.
    factories = {
        'xgb': lambda: XGBClassifier(),
        'svc': lambda: LinearSVC(C=0.8669055747631755, class_weight=None, dual=False,
                                 fit_intercept=True, intercept_scaling=1.1311617930050963,
                                 loss='squared_hinge', max_iter=20000, multi_class='ovr', penalty='l2',
                                 random_state=None, tol=0.0039333067038518875, verbose=0),
    }
    return factories[name]()


### 4.) import datasets

In [4]:
# Naming convention for variables: datasetname_columntype_transformstatus_dataframeornot
train_Xy_original_df = pd.read_csv(
    "../data/train.csv" if FINAL_SUBMISSION else "../data/train_new.csv",
    sep="|",
)
# TODO: for faster testing, restrict the test set with e.g. .iloc[0:301]
test_X_original_df = pd.read_csv("../data/test.csv", sep="|")

# Split the training frame into the label column and the feature columns
train_y_original_df = train_Xy_original_df[["fraud"]].copy()
train_X_original_df = train_Xy_original_df.drop(columns="fraud")

### 5.) prepare input data

In [5]:
class dataTransformer:
    """
    for scaling, data transformations (new features, one-hot encoding, categorical, ...)

    Every instance owns its own MinMaxScaler, so fitting one transformer does not
    change the scaling applied by another instance.
    """

    def __init__(self):
        # Per-instance scaler: as a class attribute it would be shared (and re-fitted)
        # by every dataTransformer instance.
        self.scaler = MinMaxScaler()

    def fitScaler(self, df):
        """Fit the scaler on `df` (cast to float64) and return self to allow chaining."""
        self.scaler.fit(df.astype(np.float64))
        return self

    def applyScale(self, df):
        """Return a scaled copy of `df`, keeping its index and column names."""
        return pd.DataFrame(self.scaler.transform(df), df.index, df.columns)

    def inverseScale(self, df):
        """Return `df` mapped back to the original value ranges, keeping index and columns."""
        return pd.DataFrame(self.scaler.inverse_transform(df), df.index, df.columns)

    def addFeatures(self,df):
        """
        Return a new DataFrame with the engineered feature columns added.

        The input frame is left untouched, so callers do not need a defensive copy.
        """
        #TODO: Choose relevant features
        df = df.assign(
            totalScannedLineItems=df['scannedLineItemsPerSecond'] * df['totalScanTimeInSeconds']
        )
        #df['avgTimePerScan'] = 1/ df['scannedLineItemsPerSecond']
        #df['avgValuePerScan'] = df['avgTimePerScan'] * df['valuePerSecond']
        #df['withoutRegisPerPosition'] = df['scansWithoutRegistration'] / df['totalScannedLineItems'] #equivalent to lineItemVoidsPerPosition?
        #df['quantiModPerPosition'] = df['quantityModifications'] / df['totalScannedLineItems']
        #df['lineItemVoidsPerTotal'] = df['lineItemVoids'] / df['grandTotal']
        #df['withoutRegisPerTotal'] = df['scansWithoutRegistration'] / df['grandTotal']
        #df['quantiModPerTotal'] = df['quantityModifications'] / df['grandTotal']
        #df['lineItemVoidsPerTime'] = df['lineItemVoids'] / df['totalScanTimeInSeconds']
        #df['withoutRegisPerTime'] = df['scansWithoutRegistration'] / df['totalScanTimeInSeconds']
        #df['quantiModPerTime'] = df['quantityModifications'] / df['totalScanTimeInSeconds']
        #df['valuePerScannedLineItem'] = df['valuePerSecond'] / df['scannedLineItemsPerSecond']
        return df

    def transform(self, df):
        """
        All in one: Apply all transform methods
            1.) addFeatures
            2.) applyScale
        """
        return self.applyScale(self.addFeatures(df))
    

transformer = dataTransformer()
# Fit the scaler on the train and test features together so both are scaled to the same ranges.
# pd.concat replaces DataFrame.append, which is deprecated and was removed in pandas 2.0.
transformer.fitScaler(transformer.addFeatures(pd.concat([train_X_original_df, test_X_original_df], sort=False)))
train_X_transformed_df = transformer.transform(train_X_original_df.copy())
test_X_transformed_df  = transformer.transform(test_X_original_df.copy())

test_X_transformed_df.head(2)

Unnamed: 0,trustLevel,totalScanTimeInSeconds,grandTotal,lineItemVoids,scansWithoutRegistration,quantityModifications,scannedLineItemsPerSecond,valuePerSecond,lineItemVoidsPerPosition,totalScannedLineItems
0,0.6,0.254645,0.884888,0.363636,0.8,0.8,0.000481,0.0019,0.051948,0.206897
1,0.4,0.548087,0.589959,0.636364,0.6,0.2,0.000878,0.000589,0.023569,0.896552


### 6.) iterative model training using pseudo-labeling
Predict batches of the test set, add them to the previous training set, and use this new training set to predict the next batch.

In [6]:
def get_extended_pltrain_for_batch(testbatch_X_transformed_df, pltrain_X_transformed_df, pltrain_y_original_df):
    """
    Pseudo-label one test batch and append it to the pseudo-labeled train (pltrain) set.

    Parameters:
        testbatch_X_transformed_df: transformed features of the batch to label
        pltrain_X_transformed_df:   transformed features of the current pltrain set
        pltrain_y_original_df:      labels of the current pltrain set (column 'fraud')

    Returns:
        (pltrainnew_X_transformed_df, pltrainnew_y_original_df): the pltrain set extended
        by the batch and its predicted labels, both with a fresh 0..n-1 index.
    """
    #TODO: Use KNN to get best classifier

    # Nothing to label: hand back the current set (re-indexed like the normal path)
    # instead of letting the classifier fail on a batch with zero rows.
    if len(testbatch_X_transformed_df) == 0:
        return (pltrain_X_transformed_df.reset_index(drop=True),
                pltrain_y_original_df.reset_index(drop=True))

    # train a classifier on the pseudo labeled train (pltrain) dataset
    clf = get_classifier('svc')
    clf.fit(pltrain_X_transformed_df, pltrain_y_original_df.fraud)

    # predict labels for batch
    testbatch_y_original = clf.predict(testbatch_X_transformed_df)
    testbatch_y_original_df = pd.DataFrame({'fraud': testbatch_y_original},
                                           index=testbatch_X_transformed_df.index)

    # add batch to pltrain. The index must be ignored because ids in test also start with 0.
    # pd.concat replaces DataFrame.append, which is deprecated and was removed in pandas 2.0.
    pltrainnew_X_transformed_df = pd.concat([pltrain_X_transformed_df, testbatch_X_transformed_df], ignore_index=True)
    pltrainnew_y_original_df = pd.concat([pltrain_y_original_df, testbatch_y_original_df], ignore_index=True)
    return pltrainnew_X_transformed_df, pltrainnew_y_original_df
    
    

n_test_rows = len(test_X_transformed_df)
print("total test size:",n_test_rows,", with a batchsize of",TEST_BATCH_SIZE," we will need",int(np.ceil(n_test_rows/TEST_BATCH_SIZE)),"iterations:")

#initialize pseudo labeled train (pltrain) dataset with the transformed training data
pltrain_X_transformed_df = train_X_transformed_df.copy()
pltrain_y_original_df = train_y_original_df.copy()

# Iterate through the test set in consecutive batches. The stop index is clamped to the
# end of the test set, so the last (possibly smaller) batch takes the same code path and
# a test set with fewer rows than one batch is handled as well.
for batch_start in range(0, n_test_rows, TEST_BATCH_SIZE):
    batch_stop = min(batch_start + TEST_BATCH_SIZE, n_test_rows)
    print("iteration",batch_start//TEST_BATCH_SIZE+1,"\twith batch from",batch_start,"\t to", batch_stop,", training with",len(pltrain_y_original_df),"samples")
    # get batch from test set
    testbatch_X_transformed_df = test_X_transformed_df.iloc[batch_start:batch_stop]

    # extend pseudo labeled train (pltrain) dataset by predicting the batch
    pltrain_X_transformed_df, pltrain_y_original_df = get_extended_pltrain_for_batch(testbatch_X_transformed_df, pltrain_X_transformed_df, pltrain_y_original_df)

#combine x and y columns dataframes to one big dataframe
pltrain_Xy_transandorig_df = pltrain_X_transformed_df.assign(fraud = pltrain_y_original_df.fraud.values)

print("training with pseudo labeling completed, last iteration used",len(pltrain_Xy_transandorig_df),"samples.")

display(pltrain_Xy_transandorig_df.head(1))

total test size: 498121 , with a batchsize of 10000  we will need 50 iterations:
iteration 1 	with batch from 0 	 to 10000 , training with 1503 samples
iteration 2 	with batch from 10000 	 to 20000 , training with 11503 samples
iteration 3 	with batch from 20000 	 to 30000 , training with 21503 samples
iteration 4 	with batch from 30000 	 to 40000 , training with 31503 samples
iteration 5 	with batch from 40000 	 to 50000 , training with 41503 samples
iteration 6 	with batch from 50000 	 to 60000 , training with 51503 samples
iteration 7 	with batch from 60000 	 to 70000 , training with 61503 samples
iteration 8 	with batch from 70000 	 to 80000 , training with 71503 samples
iteration 9 	with batch from 80000 	 to 90000 , training with 81503 samples
iteration 10 	with batch from 90000 	 to 100000 , training with 91503 samples
iteration 11 	with batch from 100000 	 to 110000 , training with 101503 samples
iteration 12 	with batch from 110000 	 to 120000 , training with 111503 samples
it

Unnamed: 0,trustLevel,totalScanTimeInSeconds,grandTotal,lineItemVoids,scansWithoutRegistration,quantityModifications,scannedLineItemsPerSecond,valuePerSecond,lineItemVoidsPerPosition,totalScannedLineItems,fraud
0,0.6,0.451913,0.665667,0.636364,0.4,0.6,0.000223,0.000806,0.106061,0.172414,0


### 7.) pseudo-label the test set and create new classifier based on this
First, we predict the labels of the original test data using the extended pltrain set from the cell above; second, we use these pseudo-labels to train a new classifier on the test data.

In [7]:
# Fit a fresh classifier on the fully extended pseudo labeled train (pltrain) dataset
pltrain_clf = get_classifier('svc')
pltrain_clf.fit(pltrain_X_transformed_df, pltrain_y_original_df["fraud"])

# Pseudo-label the complete (transformed) original test set with it
pltest_y_original = pltrain_clf.predict(test_X_transformed_df)

# Put features and pseudo-labels side by side and show the first row as a sanity check
pltest_Xy_transandorig_df = test_X_transformed_df.assign(fraud=pltest_y_original)
display(pltest_Xy_transandorig_df.head(1))

# Second classifier: fitted only on the test features and their pseudo-labels (pltest)
pltest_clf = get_classifier('svc')
pltest_clf.fit(test_X_transformed_df, pltest_y_original);

Unnamed: 0,trustLevel,totalScanTimeInSeconds,grandTotal,lineItemVoids,scansWithoutRegistration,quantityModifications,scannedLineItemsPerSecond,valuePerSecond,lineItemVoidsPerPosition,totalScannedLineItems,fraud
0,0.6,0.254645,0.884888,0.363636,0.8,0.8,0.000481,0.0019,0.051948,0.206897,0


### 8.) evaluate our new classifier with the original training set

In [8]:
# Let the classifier trained on the pseudo-labeled test set predict the original training rows,
# then score those predictions against the true training labels.
trainpred_y_original = pltest_clf.predict(train_X_transformed_df)
calc_scores(train_y_original_df["fraud"].values, trainpred_y_original)

(0.9906852960745176, 220, [[1419, 3], [11, 70]])

### 9.) combine the pseudo labeled test set with the original train data to train our final classifier

In [9]:
# Already done in step 7: pltrain_clf was fitted on the original train data
# extended by all pseudo-labeled test batches, so it is reused as the final classifier.
final_clf = pltrain_clf

### 10.) predict labels for the test set using our final classifier

In [10]:
# Already done in step 7: pltest_y_original holds the labels that pltrain_clf
# (our final classifier) predicted for the transformed test set.
test_y_pred = pltest_y_original

### 11.) generate output file needed for submission

In [11]:
# Write the predicted test labels as a single "fraud" column (no index column) ...
pd.DataFrame({"fraud": test_y_pred}).to_csv("HS_Karlsruhe_1.csv", index=False)
# ... and read the file back to preview its first rows
pd.read_csv("HS_Karlsruhe_1.csv").head(5)

Unnamed: 0,fraud
0,0
1,0
2,0
3,0
4,0


### 12.) evaluate our new classifier with the validation set
Now, at the very end, we can also test our final model on a validation split that has not been used before, just for comparison.

**For the final submission, the following code will not be run, and the full train set (incl. this val split) will be used above.**

In [12]:
if not FINAL_SUBMISSION:
    # Load the validation split and separate its features from the true labels
    val_Xy_original_df = pd.read_csv("../data/val_new.csv", sep="|")
    val_X_transformed_df = transformer.transform(val_Xy_original_df.drop(columns="fraud"))

    # Predict with the final classifier and print (accuracy, dmc_score, confusion_matrix)
    valpred_y_original = final_clf.predict(val_X_transformed_df)
    print(calc_scores(val_Xy_original_df["fraud"].values, valpred_y_original))

(0.9946808510638298, 95, [[353, 0], [2, 21]])
