# Train with the Test split?
This is what we want to do:

1. predict batches with ...
TODO: create a table of contents

### 1.) set run-config and hyperparameters

In [1]:
# Run configuration shared by the cells below.
# FINAL_SUBMISSION selects which training CSV is loaded in the import cell and
# whether the last cell scores the final classifier on the validation split.
FINAL_SUBMISSION = False # will perform a test on a validation split if set to False

TEST_BATCH_SIZE = 200 # Number of Test entries to add to the training set for the next iteration
ITER_PRINT_EVERY = 50 # Which Iterations to print (every nth)

### 2.) import python modules

In [2]:
import pandas as pd
import numpy as np
from xgboost import XGBClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn import metrics
from sklearn.preprocessing import MinMaxScaler
from IPython.display import display
from sklearn.model_selection import train_test_split

### 3.) define helper functions

In [3]:
def calc_scores(y_test, y_pred):
    """Score binary fraud predictions.

    Args:
        y_test: Ground-truth labels (0 = no fraud, 1 = fraud).
        y_pred: Predicted labels, same length as ``y_test``.

    Returns:
        Tuple ``(accuracy, dmc_score, confusion_matrix)`` where
        ``confusion_matrix`` is a nested 2x2 list ``[[TN, FP], [FN, TP]]``
        (rows are true labels, columns are predictions) and ``dmc_score`` is
        that matrix weighted with the cost matrix ``[[0, -25], [-5, 5]]``.
    """
    accuracy = metrics.accuracy_score(y_test, y_pred)
    # Pin the label order: without `labels`, a batch that contains only one
    # class yields a 1x1 matrix which would silently broadcast against the
    # 2x2 cost matrix and produce a meaningless score.
    conf = metrics.confusion_matrix(y_test, y_pred, labels=[0, 1])
    cost_matrix = np.array([[0, -25], [-5, 5]])
    dmc_score = np.sum(conf * cost_matrix)
    return accuracy, dmc_score, conf.tolist()

def find_nearest_neighbor(row_scaled, dataset_scaled):
    """Find the row of ``dataset_scaled`` closest to ``row_scaled``.

    Args:
        row_scaled: The query sample, either a 1-D feature vector or a 2-D
            one-row container (array, Series, DataFrame); for 2-D input the
            first row is used.
        dataset_scaled: 2-D collection of candidate rows (array or DataFrame)
            with the same number of features as the query.

    Returns:
        Tuple ``(idx, distance)``: the positional index of the nearest row
        and its *squared* Euclidean distance to the query.

    Raises:
        ValueError: If ``dataset_scaled`` is empty or not 2-D.
    """
    query = np.asarray(row_scaled, dtype=np.float64)
    if query.ndim > 1:
        query = query[0]

    # Convert to an array first: iterating a DataFrame yields column labels,
    # not rows, so a plain `for row in dataset_scaled` would be wrong.
    candidates = np.asarray(dataset_scaled, dtype=np.float64)
    if candidates.ndim != 2 or candidates.shape[0] == 0:
        raise ValueError("dataset_scaled must be a non-empty 2-D collection of rows")

    diffs = np.sum((candidates - query) ** 2, axis=1)
    # np.argmin already returns a scalar; indexing it with [0] raises.
    idx = int(np.argmin(diffs))
    return idx, diffs[idx]

def get_classifier(name):
    """Return a fresh, unfitted classifier for the given short name.

    Args:
        name: ``'xgb'`` for an ``XGBClassifier`` with default settings or
            ``'svc'`` for the tuned ``LinearSVC``.

    Returns:
        A newly constructed estimator instance.

    Raises:
        KeyError: If ``name`` is not one of the known classifier names.
    """
    # Map names to factories so that only the requested estimator is built;
    # the previous dict literal instantiated every classifier on each call.
    factories = {
        'xgb': XGBClassifier,
        'svc': lambda: LinearSVC(C=0.8669055747631755, class_weight=None, dual=False,
                                 fit_intercept=True, intercept_scaling=1.1311617930050963,
                                 loss='squared_hinge', max_iter=20000, multi_class='ovr', penalty='l2',
                                 random_state=None, tol=0.0039333067038518875, verbose=0),
    }
    return factories[name]()

def name_best_classifier_for_sample(idx, validation_set):
    """Name the classifier that handled validation sample ``idx`` best.

    Args:
        idx: Positional index of the sample inside ``validation_set``.
        validation_set: DataFrame with the columns ``fraud``,
            ``lsvc_predict``, ``xgb_predict``, ``lsvc_proba`` and ``xgb_proba``.

    Returns:
        ``"lsvc"`` or ``"xgboost"`` for the classifier that predicted the
        sample correctly (when both did, the one with the strictly higher
        probability wins, ties go to ``"xgboost"``), or ``None`` when neither
        classifier was right.
    """
    sample = validation_set.iloc[idx]
    truth = sample.fraud

    lsvc_hit = sample.lsvc_predict == truth
    xgb_hit = sample.xgb_predict == truth

    # Neither classifier got this sample right
    if not (lsvc_hit or xgb_hit):
        return None

    # Both were right: prefer the more confident one
    if lsvc_hit and xgb_hit:
        return "lsvc" if sample.lsvc_proba > sample.xgb_proba else "xgboost"

    # Exactly one of them was right
    return "lsvc" if lsvc_hit else "xgboost"

### 4.) Import Data

In [13]:
# Load the labelled data: the full train.csv for the final submission, otherwise train_new.csv.
trainandknn_Xy_original_df = pd.read_csv("../data/train.csv", sep="|") if FINAL_SUBMISSION else pd.read_csv("../data/train_new.csv", sep="|")
# Split the labelled data 75/25 into the training part and the part used for the nearest-neighbour lookup.
# NOTE(review): no random_state is passed, so the split presumably differs between runs - confirm this is intended.
train_Xy_original_df, knn_Xy_original_df = train_test_split(trainandknn_Xy_original_df,train_size=0.75) # if FINAL_SUBMISSION else 0.8**2) #small
# Unlabelled test data that gets pseudo-labelled further below.
test_X_original_df  = pd.read_csv("../data/test.csv", sep="|") #.iloc[0:301] #TODO: For faster testing we use less data from the test set

#Only for test routines
val_Xy_original_df = pd.read_csv("../data/val_new.csv", sep="|")
train_Xy_complete_original_df = pd.read_csv("../data/train.csv", sep="|")



### 5.) Prepare Input X and Label Y Data

In [12]:
#convention for variables names: datasetname_columntype_transformstatus_dataframeornot
# Split every labelled frame into the label column ("fraud") and the feature columns.
train_y_original_df = train_Xy_original_df[["fraud"]].copy()
train_X_original_df = train_Xy_original_df.copy().drop("fraud", axis=1)

knn_y_original_df = knn_Xy_original_df[["fraud"]].copy()
knn_X_original_df = knn_Xy_original_df.copy().drop("fraud", axis=1)

# Only for test routines
# NOTE(review): "originial" is misspelled, but the scaling cell reads val_X_originial_df
# under exactly this name, so it must be renamed in both places at once.
val_y_originial_df = val_Xy_original_df[["fraud"]].copy()
val_X_originial_df = val_Xy_original_df.copy().drop("fraud", axis=1)

# Labels / features of the complete train.csv (does not follow the *_original_df naming convention).
train_y_complete_complete = train_Xy_complete_original_df[["fraud"]].copy()
train_X_complete_complete = train_Xy_complete_original_df.copy().drop("fraud", axis=1)

### 6.) DataTransformer Class and data transformation

In [9]:
class DataTransformer:
    """
    Bundles feature engineering and scaling around a scikit-learn style scaler
    (an object offering fit / transform / inverse_transform).
    """

    def __init__(self, scaler):
        self.scaler = scaler

    def fit_scaler(self, df):
        """Fit the wrapped scaler on a float64 copy of ``df``; returns ``self`` for chaining."""
        as_float = df.astype(np.float64)
        self.scaler.fit(as_float)
        return self

    def apply_scaler(self, df):
        """Return ``df`` scaled by the fitted scaler, keeping index and column labels."""
        scaled_values = self.scaler.transform(df)
        return pd.DataFrame(scaled_values, index=df.index, columns=df.columns)

    def inverse_scale(self, df):
        """Undo the scaling of ``df``, keeping index and column labels."""
        restored_values = self.scaler.inverse_transform(df.copy())
        return pd.DataFrame(restored_values, index=df.index, columns=df.columns)

    def add_features(self, df):
        """Return a copy of ``df`` extended by the engineered feature columns.

        Currently only ``totalScannedLineItems`` is added
        (``scannedLineItemsPerSecond * totalScanTimeInSeconds``).
        """
        # TODO: Choose relevant features. Candidates that are not enabled yet:
        #   avgTimePerScan            = 1 / scannedLineItemsPerSecond
        #   avgValuePerScan           = avgTimePerScan * valuePerSecond
        #   withoutRegisPerPosition   = scansWithoutRegistration / totalScannedLineItems
        #                               (equivalent to lineItemVoidsPerPosition?)
        #   quantiModPerPosition      = quantityModifications / totalScannedLineItems
        #   lineItemVoidsPerTotal     = lineItemVoids / grandTotal
        #   withoutRegisPerTotal      = scansWithoutRegistration / grandTotal
        #   quantiModPerTotal         = quantityModifications / grandTotal
        #   lineItemVoidsPerTime      = lineItemVoids / totalScanTimeInSeconds
        #   withoutRegisPerTime       = scansWithoutRegistration / totalScanTimeInSeconds
        #   quantiModPerTime          = quantityModifications / totalScanTimeInSeconds
        #   valuePerScannedLineItem   = valuePerSecond / scannedLineItemsPerSecond
        return df.assign(
            totalScannedLineItems=df['scannedLineItemsPerSecond'] * df['totalScanTimeInSeconds']
        )

    def transform(self, df):
        """
        All in one: apply all transform methods
            1.) add_features
            2.) apply_scaler
        """
        with_features = self.add_features(df)
        return self.apply_scaler(with_features)
    


Unnamed: 0,trustLevel,totalScanTimeInSeconds,grandTotal,lineItemVoids,scansWithoutRegistration,quantityModifications,scannedLineItemsPerSecond,valuePerSecond,lineItemVoidsPerPosition,totalScannedLineItems
0,0.6,0.254645,0.884888,0.363636,0.8,0.8,0.000481,0.0019,0.051948,0.206897
1,0.4,0.548087,0.589959,0.636364,0.6,0.2,0.000878,0.000589,0.023569,0.896552


In [20]:
scaler = MinMaxScaler()
transformer = DataTransformer(scaler)

# Adding new features to every split (no scaling yet)
train_X_unscaled_df = transformer.add_features(train_X_original_df)
test_X_unscaled_df = transformer.add_features(test_X_original_df)
knn_X_unscaled_df = transformer.add_features(knn_X_original_df)

val_X_unscaled_df = transformer.add_features(val_X_originial_df)
# The "complete" frame has to come from the full train.csv features, not from the 75 % split.
train_X_complete_unscaled_df = transformer.add_features(train_X_complete_complete)

# Fit the scaler on the complete training data plus the test data so that it covers the
# value range of every frame it is applied to below. Both frames already carry the
# engineered features, so add_features is not applied a second time.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
transformer.fit_scaler(pd.concat([train_X_complete_unscaled_df, test_X_unscaled_df], sort=False))
train_X_scaled_df = transformer.apply_scaler(train_X_unscaled_df)
knn_X_scaled_df   = transformer.apply_scaler(knn_X_unscaled_df)


test_X_scaled_df  = transformer.apply_scaler(test_X_unscaled_df)
val_X_scaled_df = transformer.apply_scaler(val_X_unscaled_df)
train_X_complete_scaled_df = transformer.apply_scaler(train_X_complete_unscaled_df)

test_X_scaled_df.head(2)

Unnamed: 0,trustLevel,totalScanTimeInSeconds,grandTotal,lineItemVoids,scansWithoutRegistration,quantityModifications,scannedLineItemsPerSecond,valuePerSecond,lineItemVoidsPerPosition,totalScannedLineItems
0,0.6,0.254645,0.884888,0.363636,0.8,0.8,0.000481,0.0019,0.051948,0.206897
1,0.4,0.548087,0.589959,0.636364,0.6,0.2,0.000878,0.000589,0.023569,0.896552


| Name     | New Features | Scaled |
|----------|--------------|--------|
| orig     |      [ ]     |   [ ]  |
| unscaled |      [X]     |   [ ]  |
| scaled   |      [X]     |   [X]  |

### 5 1/2.) train normally with all available classifiers for classifying knn split

In [6]:
# Start the knn lookup frame from the scaled knn features; the classifier predictions and
# probabilities still have to be added (see TODO).
knnwithprobs_Xy_df = knn_X_scaled_df.copy()
#TODO: save predict_proba to knnwithprobs_Xy_original_df

In [7]:
def test_routine(lsvc_classifier, xgboost_classifier, data_dict, data_transformer, step):
    """Score both fitted classifiers on the validation set and on the training set.

    Args:
        lsvc_classifier: Fitted linear SVC; it is evaluated on the
            ``*_X_transformed`` frames.
        xgboost_classifier: Fitted XGBoost classifier; it is evaluated on the
            ``*_X`` frames.
        data_dict: Mapping with the keys ``val_X``, ``val_X_transformed``,
            ``val_y``, ``train_X``, ``train_X_transformed`` and ``train_y``.
            NOTE(review): ``train_y`` is assumed by analogy with ``val_y``;
            confirm the key name against the caller.
        data_transformer: Currently unused; kept for interface compatibility.
        step: Currently unused; kept for interface compatibility.

    Returns:
        Nested dict ``{"lin_svc"|"xgboost": {"val"|"train":
        {"dmc_score": ..., "conf_matrix": ...}}}``.
    """
    def _evaluate(classifier, features, labels):
        # Predict first, then score: the previous version scored variables that were
        # not assigned yet and reused the lsvc/validation predictions for every entry.
        _, dmc_score, conf_matrix = calc_scores(labels, classifier.predict(features))
        return {"dmc_score": dmc_score, "conf_matrix": conf_matrix}

    results = {
        "lin_svc": {
            "val": _evaluate(lsvc_classifier, data_dict['val_X_transformed'], data_dict['val_y']),
            "train": _evaluate(lsvc_classifier, data_dict['train_X_transformed'], data_dict['train_y']),
        },
        "xgboost": {
            "val": _evaluate(xgboost_classifier, data_dict['val_X'], data_dict['val_y']),
            "train": _evaluate(xgboost_classifier, data_dict['train_X'], data_dict['train_y']),
        },
    }

    return results

In [24]:
def _prediction_confidence(classifier, sample):
    """Return a confidence value for the classifier's prediction on a one-row sample."""
    if hasattr(classifier, "predict_proba"):
        return float(np.max(np.ravel(classifier.predict_proba(sample))))
    # LinearSVC offers no predict_proba. Squash the absolute margin through a logistic
    # function instead; this is an uncalibrated stand-in that only serves the comparison.
    margin = float(np.ravel(classifier.decision_function(sample))[0])
    return float(1.0 / (1.0 + np.exp(-abs(margin))))


def _direct_vote(xgboost, linear_svc, sample_original, sample_transformed):
    """Classify one sample with both models; on disagreement the more confident one wins."""
    xgb_pred = xgboost.predict(sample_original)[0]
    lsvc_pred = linear_svc.predict(sample_transformed)[0]
    if xgb_pred == lsvc_pred:
        return xgb_pred
    xgb_conf = _prediction_confidence(xgboost, sample_original)
    lsvc_conf = _prediction_confidence(linear_svc, sample_transformed)
    return xgb_pred if xgb_conf > lsvc_conf else lsvc_pred


def classify(xgboost, linear_svc, data_to_predict, data_knn_with_probs, transformer):
    """Predict the fraud label for every row of ``data_to_predict``.

    Per row:
      * ``trustLevel >= 3`` is always labelled 0.
      * If the nearest row of the knn set is within a squared distance of
        0.15, the classifier that handled that neighbour best
        (``name_best_classifier_for_sample``) makes the prediction.
      * Otherwise, or if neither classifier got the neighbour right, both
        classifiers vote and the more confident one wins on disagreement.

    Args:
        xgboost: Fitted XGBoost classifier; receives the rows as given.
        linear_svc: Fitted linear SVC; receives the rows after
            ``transformer.apply_scaler``.
        data_to_predict: DataFrame of feature rows with a ``trustLevel`` column.
        data_knn_with_probs: DataFrame holding the knn samples together with
            the columns ``xgb_predict``, ``xgb_proba``, ``lsvc_predict`` and
            ``lsvc_proba``, plus the ``fraud`` column that
            ``name_best_classifier_for_sample`` reads.
            NOTE(review): the only knn frame built in this file
            (``knnwithprobs_Xy_df``) is copied from already scaled data;
            confirm the caller passes unscaled features, otherwise they are
            scaled twice here.
        transformer: Fitted ``DataTransformer`` used for scaling.

    Returns:
        List with one predicted label per row, in row order.
    """
    knn_features = data_knn_with_probs.drop(
        columns=["xgb_predict", "xgb_proba", "lsvc_predict", "lsvc_proba"])
    if "fraud" in knn_features.columns:
        # The label is not a feature and must not reach the scaler.
        knn_features = knn_features.drop(columns="fraud")
    # Plain arrays guarantee that the neighbour search walks over rows.
    data_knn_transformed = transformer.apply_scaler(knn_features).values
    data_to_predict_transformed = transformer.apply_scaler(data_to_predict.copy())

    prediction = []
    # Walk by position: index labels need not be 0..n-1, so they cannot be used with .iloc.
    for position, trust_level in enumerate(data_to_predict["trustLevel"].values):
        if trust_level >= 3:
            prediction.append(0)
            continue

        sample_original = data_to_predict.iloc[[position]]
        sample_transformed = data_to_predict_transformed.iloc[[position]]

        idx_knn, distance_knn = find_nearest_neighbor(sample_transformed.values, data_knn_transformed)

        best_classifier = None
        # Only trust the neighbour's "best classifier" when the neighbour is close enough.
        if distance_knn <= 0.15:
            best_classifier = name_best_classifier_for_sample(idx_knn, data_knn_with_probs)

        if best_classifier == "xgboost":
            prediction.append(xgboost.predict(sample_original)[0])
        elif best_classifier == "lsvc":
            prediction.append(linear_svc.predict(sample_transformed)[0])
        else:
            # Neighbour too far away, or no classifier was right on it: classify directly
            # instead of aborting the whole run with `return None`.
            prediction.append(_direct_vote(xgboost, linear_svc, sample_original, sample_transformed))

    return prediction

In [None]:
def semi_supervised_learning_procedure(test_X, pltrain_X=None, pltrain_y=None, data_transformer=None):
    """Grow a pseudo-labelled training set by predicting ``test_X`` batch by batch.

    Every batch of ``TEST_BATCH_SIZE`` rows (the last one may be smaller) is
    labelled by ``get_extended_pltrain_for_batch`` and appended to the
    training data that is used for the next batch.

    Args:
        test_X: Test features, in the same feature space as ``pltrain_X``.
        pltrain_X: Initial training features; defaults to the module-level
            ``train_X_scaled_df``.
        pltrain_y: Initial training labels (DataFrame with a ``fraud``
            column); defaults to the module-level ``train_y_original_df``.
        data_transformer: Transformer handed through to
            ``get_extended_pltrain_for_batch``; defaults to the module-level
            ``transformer``.

    Returns:
        Tuple ``(pltrain_X, pltrain_y)`` extended by all pseudo-labelled test rows.
    """
    # Keep the running training set in local variables. The previous version assigned
    # to names it also read as globals, which raises UnboundLocalError.
    pltrain_X_df = (train_X_scaled_df if pltrain_X is None else pltrain_X).copy()
    pltrain_y_df = (train_y_original_df if pltrain_y is None else pltrain_y).copy()
    if data_transformer is None:
        data_transformer = transformer

    n_rows = len(test_X)
    n_iterations = int(np.ceil(n_rows / TEST_BATCH_SIZE))

    # A single loop covers the full batches and the smaller tail batch, including
    # test sets that are shorter than one batch.
    for iteration, batch_start in enumerate(range(0, n_rows, TEST_BATCH_SIZE), start=1):
        batch_end = min(batch_start + TEST_BATCH_SIZE, n_rows)
        if batch_end == n_rows:
            print("iteration",iteration,"\twith batch from",batch_start,"\t to", n_rows,", training with",len(pltrain_y_df),"samples")
        elif iteration % ITER_PRINT_EVERY == 0:
            print("iteration",iteration,"\t/",n_iterations,"with batch from",batch_start,"\t to", batch_end,", training with",len(pltrain_y_df),"samples")

        # get batch from test set
        testbatch_X_df = test_X.iloc[batch_start:batch_end]

        # extend pseudo labeled train (pltrain) dataset by predicting the batch
        pltrain_X_df, pltrain_y_df = get_extended_pltrain_for_batch(testbatch_X_df, pltrain_X_df, pltrain_y_df, data_transformer)

    return pltrain_X_df, pltrain_y_df

### 6.) iterative model training using pseudo-labeling
Predict batches of the test set, add them (with their predicted labels) to the previous training set, and use this extended training set to predict the next batch.

In [17]:
def get_extended_pltrain_for_batch(testbatch_X_unscaled_df, pltrain_X_unscaled_df, pltrain_y_df, transformer):
    """Pseudo-label one test batch and append it to the training data.

    A fresh ``'svc'`` classifier is fitted on the current pseudo-labelled
    training set, used to predict the batch, and the batch plus its predicted
    labels is appended to the training set.

    Args:
        testbatch_X_unscaled_df: Feature rows of the batch to label.
        pltrain_X_unscaled_df: Current training features.
        pltrain_y_df: Current training labels (DataFrame with a ``fraud`` column).
        transformer: Currently unused; kept for interface compatibility.

    NOTE(review): despite the ``unscaled`` parameter names, the frames are used
    exactly as passed and the visible caller hands in scaled data. Both frames
    must simply share one feature space - confirm which one is intended.

    Returns:
        Tuple ``(features, labels)`` of the extended training set, both with a
        fresh 0..n-1 index.
    """
    #TODO: Use KNN (neighbour in following dataframe: knnwithprobs_Xy_original_df) to get best classifier: also use transformer.inverse_transform

    # Nothing to label: hand the training data back untouched.
    if len(testbatch_X_unscaled_df) == 0:
        return pltrain_X_unscaled_df, pltrain_y_df

    # train a classifier on the pseudo labeled train (pltrain) dataset,
    # using the arguments rather than same-named globals
    clf = get_classifier('svc')
    clf.fit(pltrain_X_unscaled_df, pltrain_y_df.fraud)

    # predict labels for batch
    testbatch_y_df = pd.DataFrame({"fraud": clf.predict(testbatch_X_unscaled_df)})

    # add batch to pseudo labeled train (pltrain) dataset. needs to ignore index as ids in test also start with 0
    # (pd.concat replaces DataFrame.append, which was removed in pandas 2.0)
    pltrainnew_X_df = pd.concat([pltrain_X_unscaled_df, testbatch_X_unscaled_df], ignore_index=True)
    pltrainnew_y_df = pd.concat([pltrain_y_df, testbatch_y_df], ignore_index=True)
    return pltrainnew_X_df, pltrainnew_y_df
    
    

# NOTE(review): test_X_transformed_df and train_X_transformed_df are not assigned in any
# visible cell; they presumably correspond to test_X_scaled_df / train_X_scaled_df from the
# scaling cell - confirm before running.
n_test_rows = len(test_X_transformed_df)
n_iterations = int(np.ceil(n_test_rows / TEST_BATCH_SIZE))
print("total test size:",n_test_rows,", with a batchsize of",TEST_BATCH_SIZE," we will need",n_iterations,"iterations:")

#initialize pseudo labeled train (pltrain) dataset with the transformed training data
pltrain_X_transformed_df = train_X_transformed_df.copy()
pltrain_y_original_df = train_y_original_df.copy()


# Iterate through the test set in batches. One loop handles the fixed-size batches and the
# smaller last batch; the former separate tail step relied on the loop variable and failed
# with a NameError when the test set held no more than one batch.
for iteration, batch_start in enumerate(range(0, n_test_rows, TEST_BATCH_SIZE), start=1):
    batch_end = min(batch_start + TEST_BATCH_SIZE, n_test_rows)
    if batch_end == n_test_rows:
        # last batch (possibly smaller than TEST_BATCH_SIZE) is always reported
        print("iteration",iteration,"\twith batch from",batch_start,"\t to", n_test_rows,", training with",len(pltrain_y_original_df),"samples")
    elif iteration % ITER_PRINT_EVERY == 0:
        print("iteration",iteration,"\t/",n_iterations,"with batch from",batch_start,"\t to", batch_end,", training with",len(pltrain_y_original_df),"samples")

    # get batch from test set
    testbatch_X_transformed_df = test_X_transformed_df.iloc[batch_start:batch_end]

    # extend pseudo labeled train (pltrain) dataset by predicting the batch
    pltrain_X_transformed_df, pltrain_y_original_df = get_extended_pltrain_for_batch(testbatch_X_transformed_df, pltrain_X_transformed_df, pltrain_y_original_df, transformer)

#combine x and y columns dataframes to one big dataframe
pltrain_Xy_transandorig_df = pltrain_X_transformed_df.assign(fraud = pltrain_y_original_df.fraud.values)

print("training with pseudo labeling completed, last iteration used",len(pltrain_Xy_transandorig_df),"samples.")

display(pltrain_Xy_transandorig_df.head(1))

total test size: 498121 , with a batchsize of 200  we will need 2491 iterations:
iteration 50 	/ 2491 with batch from 9800 	 to 10000 , training with 10927 samples
iteration 100 	/ 2491 with batch from 19800 	 to 20000 , training with 20927 samples
iteration 150 	/ 2491 with batch from 29800 	 to 30000 , training with 30927 samples
iteration 200 	/ 2491 with batch from 39800 	 to 40000 , training with 40927 samples
iteration 250 	/ 2491 with batch from 49800 	 to 50000 , training with 50927 samples
iteration 300 	/ 2491 with batch from 59800 	 to 60000 , training with 60927 samples
iteration 350 	/ 2491 with batch from 69800 	 to 70000 , training with 70927 samples
iteration 400 	/ 2491 with batch from 79800 	 to 80000 , training with 80927 samples


KeyboardInterrupt: 

### 7.) pseudo-label the test set and create new classifier based on this
First, we predict the labels of the original test data using the extended pltrain set from the cell above; second, we use these predicted test labels to train a new classifier.

In [None]:
# train a classifier on the pseudo labeled train (pltrain) dataset
pltrain_clf = get_classifier('svc')
pltrain_clf.fit(pltrain_X_transformed_df, pltrain_y_original_df.fraud)

# predict labels for (transformed) original test set
pltest_y_original = pltrain_clf.predict(test_X_transformed_df)

# combine x and y columns dataframes to one big dataframe
pltest_Xy_transandorig_df = test_X_transformed_df.assign(fraud = pltest_y_original)
display(pltest_Xy_transandorig_df.head(1))

# train a new classifier based on pltest, i.e. only on the test rows and their predicted labels
# (the trailing semicolon only suppresses the notebook's echo of the fitted estimator)
pltest_clf = get_classifier('svc')
pltest_clf.fit(test_X_transformed_df, pltest_y_original);

### 8.) evaluate our new classifier with the original training set

In [None]:
# Score the classifier that was trained on pseudo-labelled test rows against the
# true labels of the original training split.
trainpred_y_original = pltest_clf.predict(train_X_transformed_df)
calc_scores(train_y_original_df.fraud.values, trainpred_y_original)

### 9.) combine the pseudo labeled test set with the original train data to train our final classifier

In [None]:
#--> already done in step 7: pltrain_clf was fitted on the pseudo labeled train (pltrain)
#    dataset built in step 6, so it is reused as the final classifier
final_clf = pltrain_clf

### 10.) predict labels for the test set using our final classifier

In [None]:
#--> already done in step 7: pltest_y_original holds pltrain_clf's predictions for the test set
test_y_pred = pltest_y_original

### 11.) generate output file needed for submission

In [None]:
# Write the predictions as a single "fraud" column without the index ...
pd.DataFrame(test_y_pred, columns=["fraud"]).to_csv("HS_Karlsruhe_1.csv", index=False)
# ... and read the file back to show its first rows as a sanity check.
pd.read_csv("HS_Karlsruhe_1.csv").head(5)

### 12.) evaluate our new classifier with the validation set
Now, at the very end, we can also test our final model on a validation split that has never been used before, just for comparison.

**For the final submission, the following code will not be run, and the full train set (incl. this val split) will be used above.**

In [None]:
# Only evaluated for non-final runs (see FINAL_SUBMISSION in the run-config cell).
if not FINAL_SUBMISSION:
    val_Xy_original_df = pd.read_csv("../data/val_new.csv", sep="|")
    # transformer.transform adds the engineered features and applies the fitted scaler
    valpred_y_original = final_clf.predict(transformer.transform(val_Xy_original_df.drop("fraud", axis=1)))
    # prints the (accuracy, dmc_score, confusion_matrix) tuple
    print(calc_scores(val_Xy_original_df.fraud.values, valpred_y_original))