# Train with the Test split?
This is what we want to do:

1. predict batches with ...
.TODO: create list of content

### 1.) set run-config and hyperparameters

In [1]:
FINAL_SUBMISSION = False # will perform a test on a validation split if set to False

TEST_BATCH_SIZE = 10000 # Number of Test entries to add to the training set for the next iteration
ITER_PRINT_EVERY = 2 # Which Iterations to print (every nth)

### 2.) import python modules

In [2]:
import pandas as pd
import numpy as np
from xgboost import XGBClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn import metrics
from sklearn.preprocessing import MinMaxScaler
from IPython.display import display

### 3.) define helper functions

In [3]:
def calc_scores(y_test, y_pred):
    accuracy = metrics.accuracy_score(y_test, y_pred)
    confusion_matrix = (metrics.confusion_matrix(y_test, y_pred)).tolist()
    dmc_score = np.sum(confusion_matrix * np.array([[0, -25], [-5, 5]]))
    return accuracy, dmc_score, confusion_matrix

def find_nearest_neighbor_index(row, dataset):
    scaler = MinMaxScaler()
    dataset_scaled = scaler.fit_transform(dataset)
    row_scaled = scaler.transform([row])
    diffs = [np.sum((row_scaled[0] - ds_row)**2) for ds_row in dataset_scaled]
    return np.argmin(diffs)

def get_classifier(name):
    return {
        'xgb': XGBClassifier(),
        'svc': LinearSVC(C=0.8669055747631755, class_weight=None, dual=False,
                 fit_intercept=True, intercept_scaling=1.1311617930050963,
                 loss='squared_hinge', max_iter=20000, multi_class='ovr', penalty='l2',
                 random_state=None, tol=0.0039333067038518875, verbose=0)
    }[name]


### 4.) import datasets

In [4]:
#convention for variables names: datasetname_columntype_transformstatus_dataframeornot
trainandknn_Xy_original_df = pd.read_csv("../data/train.csv", sep="|") if FINAL_SUBMISSION else pd.read_csv("../data/train_new.csv", sep="|")
train_Xy_original_df, knn_Xy_original_df = train_test_split(df_train,train_size=0.8 if FINAL_SUBMISSION else 0.8**2) #small
test_X_original_df   = pd.read_csv("../data/test.csv", sep="|") #.iloc[0:301] #TODO: For faster testing we use less data from the test set
train_y_original_df = train_Xy_original_df[["fraud"]].copy()
train_X_original_df = train_Xy_original_df.copy().drop("fraud", axis=1)
knn_y_original_df = knn_Xy_original_df[["fraud"]].copy()
knn_X_original_df = knn_Xy_original_df.copy().drop("fraud", axis=1)

NameError: name 'train_test_split' is not defined

### 5.) prepare input data

In [None]:
class dataTransformer:
    """
    for scaling, data transformations (new features, one-hot encoding, categorical, ...)
    """
    scaler = MinMaxScaler()

    def fitScaler(self, df):
        self.scaler.fit(df.astype(np.float64))
        return self
        
    def applyScale(self, df):
        return pd.DataFrame(self.scaler.transform(df), df.index, df.columns)
    
    def inverseScale(self, df):
        return pd.DataFrame(self.scaler.inverse_transform(df), df.index, df.columns)
    
    def addFeatures(self,df):
        #TODO: Choose relevant features
        df['totalScannedLineItems'] = df['scannedLineItemsPerSecond'] * df['totalScanTimeInSeconds']
        #df['avgTimePerScan'] = 1/ df['scannedLineItemsPerSecond']
        #df['avgValuePerScan'] = df['avgTimePerScan'] * df['valuePerSecond']
        #df['withoutRegisPerPosition'] = df['scansWithoutRegistration'] / df['totalScannedLineItems'] #equivalent to lineItemVoidsPerPosition?
        #df['quantiModPerPosition'] = df['quantityModifications'] / df['totalScannedLineItems']
        #df['lineItemVoidsPerTotal'] = df['lineItemVoids'] / df['grandTotal']
        #df['withoutRegisPerTotal'] = df['scansWithoutRegistration'] / df['grandTotal']
        #df['quantiModPerTotal'] = df['quantityModifications'] / df['grandTotal']
        #df['lineItemVoidsPerTime'] = df['lineItemVoids'] / df['totalScanTimeInSeconds']
        #df['withoutRegisPerTime'] = df['scansWithoutRegistration'] / df['totalScanTimeInSeconds']
        #df['quantiModPerTime'] = df['quantityModifications'] / df['totalScanTimeInSeconds']
        #df['valuePerScannedLineItem'] = df['valuePerSecond'] / df['scannedLineItemsPerSecond']
        return df
    
    def transform(self, df):
        """
        All in one: Apply all transform methods
            1.) addFeatures
            2.) applyScale
        """
        return self.applyScale(self.addFeatures(df))
    

transformer = dataTransformer()
transformer.fitScaler(transformer.addFeatures(train_X_original_df.append(test_X_original_df, sort=False)))
train_X_transformed_df = transformer.transform(train_X_original_df.copy())
knn_X_transformed_df   = transformer.transform(knn_X_original_df.copy())
test_X_transformed_df  = transformer.transform(test_X_original_df.copy())

test_X_transformed_df.head(2)

### 5 1/2.) train normally with all available classifiers for classifying knn split

In [None]:
knnwithprobs_Xy_original_df = knn_Xy_original_df.copy()
#TODO: save predict_proba to knnwithprobs_Xy_original_df

### 6.) iterative model training using pseudo-labeling
predict batches of the test set, add them to the previous training set and use this new training set to predict the next batch.

In [None]:
def get_extended_pltrain_for_batch(testbatch_X_transformed_df, pltrain_X_transformed_df, pltrain_y_original_df, transformer):
    
    #TODO: Use KNN (neighbour in following dataframe: knnwithprobs_Xy_original_df) to get best classifier: also use transformer.inverse_transform
    
    # train a classificator on the pseudo labeled train (pltrain) dataset
    clf = get_classifier('svc')
    clf.fit(pltrain_X_transformed_df, pltrain_y_original_df.fraud)
    
    # predict labels for batch
    testbatch_y_original = clf.predict(testbatch_X_transformed_df)
    testbatch_Xy_transandorig_df = testbatch_X_transformed_df.assign(fraud = testbatch_y_original)
    
    # add batch to pseudo labeled train (pltrain) dataset. needs to ignore index as ids in test also start with 0
    pltrainnew_X_transformed_df = pltrain_X_transformed_df.append(testbatch_X_transformed_df, ignore_index=True)
    pltrainnew_y_original_df = pltrain_y_original_df.append(testbatch_Xy_transandorig_df[['fraud']], ignore_index=True)
    return pltrainnew_X_transformed_df, pltrainnew_y_original_df
    
    

print("total test size:",len(test_X_transformed_df),", with a batchsize of",TEST_BATCH_SIZE," we will need",int(np.ceil(len(test_X_transformed_df)/TEST_BATCH_SIZE)),"iterations:")

#initialize pseudo labeled train (pltrain) dataset with the transformed training data
pltrain_X_transformed_df = train_X_transformed_df.copy()
pltrain_y_original_df = train_y_original_df.copy()


# iterate through fixed-size batches
for i in range(TEST_BATCH_SIZE, len(test_X_transformed_df), TEST_BATCH_SIZE):
    if int(i/TEST_BATCH_SIZE) % ITER_PRINT_EVERY == 0:
        print("iteration",int(i/TEST_BATCH_SIZE),"\t/",int(np.ceil(len(test_X_transformed_df)/TEST_BATCH_SIZE)),"with batch from",i-TEST_BATCH_SIZE,"\t to", i,", training with",len(pltrain_y_original_df),"samples")
    # get batch from test set
    testbatch_X_transformed_df = test_X_transformed_df.iloc[i-TEST_BATCH_SIZE:i]
    
    # extend pseudo labeled train (pltrain) dataset by predicting the batch
    pltrain_X_transformed_df, pltrain_y_original_df = get_extended_pltrain_for_batch(testbatch_X_transformed_df, pltrain_X_transformed_df, pltrain_y_original_df, transformer)
    
    #if i>len(df_test_X_transformed)-1000:
    #    print(i)
    #    display(df_test_X_transformed_batch.head(1))

# use last few rows that cant fill up a complete batch as a smaller batch
print("iteration",int(i/TEST_BATCH_SIZE)+1,"\twith batch from",i,"\t to", len(test_X_transformed_df),", training with",len(pltrain_y_original_df),"samples")
testbatch_X_transformed_df = test_X_transformed_df.iloc[i:len(test_X_transformed_df)]

# extend pseudo labeled train (pltrain) dataset by predicting the small batch
pltrain_X_transformed_df, pltrain_y_original_df = get_extended_pltrain_for_batch(testbatch_X_transformed_df, pltrain_X_transformed_df, pltrain_y_original_df, transformer)

#combine x and y columns dataframes to one big dataframe
pltrain_Xy_transandorig_df = pltrain_X_transformed_df.assign(fraud = pltrain_y_original_df.fraud.values)

print("training with pseudo labeling completed, last iteration used",len(pltrain_Xy_transandorig_df),"samples.")

display(pltrain_Xy_transandorig_df.head(1))

### 7.) pseudo-label the test set and create new classifier based on this
first we predict the original test data labels using the new extended pltrain from above cell and second we use this test data labels to train a new classifier

In [None]:
# train a classificator on the pseudo labeled train (pltrain) dataset
pltrain_clf = get_classifier('svc')
pltrain_clf.fit(pltrain_X_transformed_df, pltrain_y_original_df.fraud)

# predict labels for (transformed) original test set
pltest_y_original = pltrain_clf.predict(test_X_transformed_df)

# combine x and y columns dataframes to one big dataframe
pltest_Xy_transandorig_df = test_X_transformed_df.assign(fraud = pltest_y_original)
display(pltest_Xy_transandorig_df.head(1))

# train a new classifier based on pltest
pltest_clf = get_classifier('svc')
pltest_clf.fit(test_X_transformed_df, pltest_y_original);

### 8.) evaluate our new classifier with the original training set

In [None]:
trainpred_y_original = pltest_clf.predict(train_X_transformed_df)
calc_scores(train_y_original_df.fraud.values, trainpred_y_original)

### 9.) combine the pseudo labeled test set with the original train data to train our final classifier

In [None]:
#--> already done in step 7
final_clf = pltrain_clf

### 10.) predict labels for the test set using our final classifier

In [None]:
#-->already done in step 7
test_y_pred = pltest_y_original

### 11.) generate output file neeeded for submission

In [None]:
pd.DataFrame(test_y_pred, columns=["fraud"]).to_csv("HS_Karlsruhe_1.csv", index=False)
pd.read_csv("HS_Karlsruhe_1.csv").head(5)

### 12.) evaluate our new classifier with the validation set
Now at the very end we can also test our final model on a validation split never used before. just for comparison. 

**For the final submission, the following code should will not be run and the full train (incl. this val split) set will be used above**

In [None]:
if not FINAL_SUBMISSION:
    val_Xy_original_df = pd.read_csv("../data/val_new.csv", sep="|")
    valpred_y_original = final_clf.predict(transformer.transform(val_Xy_original_df.drop("fraud", axis=1)))
    print(calc_scores(val_Xy_original_df.fraud.values, valpred_y_original))