In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import sklearn # machine learning algorithms
print("scikit-learn version: {}". format(sklearn.__version__))
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import log_loss

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

scikit-learn version: 0.24.1
/kaggle/input/tabular-playground-series-jun-2021/sample_submission.csv
/kaggle/input/tabular-playground-series-jun-2021/train.csv
/kaggle/input/tabular-playground-series-jun-2021/test.csv
/kaggle/input/tps6-xgboost-commented/__results__.html
/kaggle/input/tps6-xgboost-commented/my_submission.csv
/kaggle/input/tps6-xgboost-commented/add_feat_train.npy
/kaggle/input/tps6-xgboost-commented/__notebook__.ipynb
/kaggle/input/tps6-xgboost-commented/__output__.json
/kaggle/input/tps6-xgboost-commented/add_feat_test.npy
/kaggle/input/tps6-xgboost-commented/custom.css
/kaggle/input/tps6-xgboost-commented/__results___files/__results___15_0.png
/kaggle/input/tps6-xgboost-commented/__results___files/__results___24_0.png
/kaggle/input/tps6-xgboost-commented/__results___files/__results___21_1.png


# About
The TPS6 data has duplicates in train and test set. I.e. samples with identical features are present in training and testing set. The idea is to use postprocessing on those samples to set the predicted class probablity to the class from the training data.

In [2]:
df_train = pd.read_csv('../input/tabular-playground-series-jun-2021/train.csv')
df_test = pd.read_csv('../input/tabular-playground-series-jun-2021/test.csv')
sample_submission = pd.read_csv('../input/tabular-playground-series-jun-2021/sample_submission.csv') 

# label encode the target column
le = LabelEncoder()
df_train.target = le.fit_transform(df_train.target)

In [3]:
# read in a real submission that shall be altered
submission = pd.read_csv('../input/tps6-xgboost-commented/my_submission.csv')

In [4]:
# check what model predicted for duplicates between train and test, here the ground truth is known
# get the duplicates
feature_cols = [col for col in df_train.columns if col.startswith("feature")]
df_train_temp = df_train.drop(columns="id").drop_duplicates(subset=feature_cols) # remove all duplicates from train
df_test_temp = df_test.drop(columns="id").drop_duplicates() # remove all duplicates from test
df_all_temp = df_train_temp.append(df_test_temp, ignore_index = True) 

dupli_train_part = df_all_temp[df_all_temp.duplicated(subset=feature_cols, keep="last")].copy() # these are the training rows that have a duplicate in the test set
dupli_test_part = df_all_temp[df_all_temp.duplicated(subset=feature_cols, keep="first")].copy() # these are the testing rows that have a duplicate in the train set

# prepare for feature concatenation
target = dupli_train_part.target
dupli_train_part.drop(columns="target", inplace=True)
dupli_test_part.drop(columns="target", inplace=True)
#dupli_test_part.head()

In [5]:
# function that extracts all items from a given row, converts them to string, concatenates them and returns them
def replace(row):
    string = ""
    for item in row:
        string = string + str(item)
    return string

#replace(dupli_train_part.iloc[1])

In [6]:
# apply fuction to each row in df: concatenate all features in a string which allows sorting
concat_train = dupli_train_part.apply(lambda row: replace(row), axis=1).to_frame(name="concat_feat")
concat_test = dupli_test_part.apply(lambda row: replace(row), axis=1).to_frame(name="concat_feat")
concat_train['target'] = target # add target column again
#concat_test

# sort values so that the duplicates are in the correct sequence
concat_train.reset_index(inplace=True, drop=False)
concat_train = concat_train.sort_values(by="concat_feat", ignore_index=True)
concat_train.columns = ["id","concat_feat","target"]
transfer = concat_train.target # get target from train duplicates....
#display(concat_train)

concat_test.reset_index(inplace=True, drop=False)
concat_test = concat_test.sort_values(by="concat_feat", ignore_index=True)
concat_test["target"] = transfer # ... and transfer it to test duplicates
concat_test.columns = ["id","concat_feat","target"]
concat_test.target = concat_test.target.astype(int)
# transform concat_test to be able to use .loc later
concat_test.target = le.inverse_transform(concat_test.target) # get the "Class_x" back
concat_test.id = concat_test.id -200000
concat_test.set_index("id", inplace=True)
concat_test
# these are the indexes and ground truth target based on training set, 
# just remember that training set had "feature duplicates" with different target so only half of them might be true

Unnamed: 0_level_0,concat_feat,target
id,Unnamed: 1_level_1,Unnamed: 2_level_1
21375,0000000000000000000000000000000000000000000000...,Class_2
7225,0000000000000000000000000000000000000000000000...,Class_6
40833,0000000000000000000000000000000000000000000000...,Class_2
43568,0000000000000000000000000000000000000000000000...,Class_6
66847,0000000000000000000000000000000000000000000000...,Class_9
...,...,...
3493,0000000000100000000010000000000000000000000000...,Class_7
39827,0000000000100000000010000000000000000001000000...,Class_9
50451,0000000000100000000010000000000000001001000000...,Class_9
82551,0000000000200000000110000000000000000001000000...,Class_2


In [7]:
# check what was predicted for the duplicates
index_to_replace = concat_test.index
class_labels = ["Class_1","Class_2","Class_3","Class_4","Class_5","Class_6","Class_7","Class_8","Class_9"]
submission.iloc[index_to_replace].head().style.background_gradient(axis=1, subset=class_labels)

Unnamed: 0,id,Class_1,Class_2,Class_3,Class_4,Class_5,Class_6,Class_7,Class_8,Class_9
21375,221375,0.037142,0.04704,0.032607,0.013726,0.017211,0.436236,0.065272,0.252476,0.098291
7225,207225,0.045536,0.047215,0.033617,0.023409,0.02297,0.318886,0.098204,0.300863,0.1093
40833,240833,0.046894,0.098213,0.064004,0.021554,0.01514,0.118845,0.089283,0.395417,0.150651
43568,243568,0.046823,0.158445,0.104087,0.023877,0.012503,0.251795,0.064973,0.20099,0.136508
66847,266847,0.043046,0.035823,0.027343,0.013642,0.008205,0.352822,0.067126,0.36748,0.084513


In [8]:
submission["top_label"] = submission.drop(columns="id").aggregate('idxmax', axis=1)
submission.iloc[index_to_replace]

Unnamed: 0,id,Class_1,Class_2,Class_3,Class_4,Class_5,Class_6,Class_7,Class_8,Class_9,top_label
21375,221375,0.037142,0.047040,0.032607,0.013726,0.017211,0.436236,0.065272,0.252476,0.098291,Class_6
7225,207225,0.045536,0.047215,0.033617,0.023409,0.022970,0.318886,0.098204,0.300863,0.109300,Class_6
40833,240833,0.046894,0.098213,0.064004,0.021554,0.015140,0.118845,0.089283,0.395417,0.150651,Class_8
43568,243568,0.046823,0.158445,0.104087,0.023877,0.012503,0.251795,0.064973,0.200990,0.136508,Class_6
66847,266847,0.043046,0.035823,0.027343,0.013642,0.008205,0.352822,0.067126,0.367480,0.084513,Class_8
...,...,...,...,...,...,...,...,...,...,...,...
3493,203493,0.040378,0.040728,0.042015,0.023587,0.022029,0.297606,0.078024,0.341353,0.114279,Class_8
39827,239827,0.055201,0.289596,0.145743,0.027647,0.014889,0.112353,0.051587,0.153839,0.149144,Class_2
50451,250451,0.039976,0.059311,0.041819,0.021757,0.014569,0.367797,0.076471,0.268725,0.109575,Class_6
82551,282551,0.052744,0.070466,0.051740,0.029819,0.019000,0.249467,0.106195,0.285310,0.135257,Class_8


In [9]:
submission["supposed_target"]="t"
submission.loc[index_to_replace,"supposed_target"]=concat_test.target
submission.iloc[index_to_replace]

Unnamed: 0,id,Class_1,Class_2,Class_3,Class_4,Class_5,Class_6,Class_7,Class_8,Class_9,top_label,supposed_target
21375,221375,0.037142,0.047040,0.032607,0.013726,0.017211,0.436236,0.065272,0.252476,0.098291,Class_6,Class_2
7225,207225,0.045536,0.047215,0.033617,0.023409,0.022970,0.318886,0.098204,0.300863,0.109300,Class_6,Class_6
40833,240833,0.046894,0.098213,0.064004,0.021554,0.015140,0.118845,0.089283,0.395417,0.150651,Class_8,Class_2
43568,243568,0.046823,0.158445,0.104087,0.023877,0.012503,0.251795,0.064973,0.200990,0.136508,Class_6,Class_6
66847,266847,0.043046,0.035823,0.027343,0.013642,0.008205,0.352822,0.067126,0.367480,0.084513,Class_8,Class_9
...,...,...,...,...,...,...,...,...,...,...,...,...
3493,203493,0.040378,0.040728,0.042015,0.023587,0.022029,0.297606,0.078024,0.341353,0.114279,Class_8,Class_7
39827,239827,0.055201,0.289596,0.145743,0.027647,0.014889,0.112353,0.051587,0.153839,0.149144,Class_2,Class_9
50451,250451,0.039976,0.059311,0.041819,0.021757,0.014569,0.367797,0.076471,0.268725,0.109575,Class_6,Class_9
82551,282551,0.052744,0.070466,0.051740,0.029819,0.019000,0.249467,0.106195,0.285310,0.135257,Class_8,Class_2


In [10]:
# check how many predictions favor the same label as has the feature duplicate from train set
#submission[submission.top_label == submission.supposed_target]
# not many, but try to set maximum confidence on those and see what happens
index_to_replace= submission[submission.top_label == submission.supposed_target].index

In [11]:
submission.drop(columns=["top_label","supposed_target"], inplace=True)

In [12]:
# replace values in submission
for col in class_labels:
    submission.loc[index_to_replace,col] = 0.01  # set all to 0.1 first   
    
for ele in index_to_replace:
    submission.loc[ele,concat_test.loc[ele,"target"]] = 0.92 # set "ground truth probability"

# create submission file 2 - very confident
submission.to_csv("my_submission_vc.csv", index=False)
submission.iloc[index_to_replace]

Unnamed: 0,id,Class_1,Class_2,Class_3,Class_4,Class_5,Class_6,Class_7,Class_8,Class_9
1888,201888,0.01,0.92,0.01,0.01,0.01,0.01,0.01,0.01,0.01
7225,207225,0.01,0.01,0.01,0.01,0.01,0.92,0.01,0.01,0.01
11412,211412,0.01,0.01,0.01,0.01,0.01,0.92,0.01,0.01,0.01
12788,212788,0.01,0.92,0.01,0.01,0.01,0.01,0.01,0.01,0.01
15403,215403,0.01,0.92,0.01,0.01,0.01,0.01,0.01,0.01,0.01
16210,216210,0.01,0.92,0.01,0.01,0.01,0.01,0.01,0.01,0.01
16265,216265,0.01,0.92,0.01,0.01,0.01,0.01,0.01,0.01,0.01
24980,224980,0.01,0.92,0.01,0.01,0.01,0.01,0.01,0.01,0.01
38563,238563,0.01,0.01,0.01,0.01,0.01,0.92,0.01,0.01,0.01
43568,243568,0.01,0.01,0.01,0.01,0.01,0.92,0.01,0.01,0.01


In [13]:
# set more conservative values
for col in class_labels:
    submission.loc[index_to_replace,col] = 0.05  # set all to 0.1 first   
    
for ele in index_to_replace:
    submission.loc[ele,concat_test.loc[ele,"target"]] = 0.6 # set "ground truth probability"

# create submission file 3 - conservative
submission.to_csv("my_submission_cons.csv", index=False)
submission.iloc[index_to_replace]

Unnamed: 0,id,Class_1,Class_2,Class_3,Class_4,Class_5,Class_6,Class_7,Class_8,Class_9
1888,201888,0.05,0.6,0.05,0.05,0.05,0.05,0.05,0.05,0.05
7225,207225,0.05,0.05,0.05,0.05,0.05,0.6,0.05,0.05,0.05
11412,211412,0.05,0.05,0.05,0.05,0.05,0.6,0.05,0.05,0.05
12788,212788,0.05,0.6,0.05,0.05,0.05,0.05,0.05,0.05,0.05
15403,215403,0.05,0.6,0.05,0.05,0.05,0.05,0.05,0.05,0.05
16210,216210,0.05,0.6,0.05,0.05,0.05,0.05,0.05,0.05,0.05
16265,216265,0.05,0.6,0.05,0.05,0.05,0.05,0.05,0.05,0.05
24980,224980,0.05,0.6,0.05,0.05,0.05,0.05,0.05,0.05,0.05
38563,238563,0.05,0.05,0.05,0.05,0.05,0.6,0.05,0.05,0.05
43568,243568,0.05,0.05,0.05,0.05,0.05,0.6,0.05,0.05,0.05


In [14]:
proba = [[0.01, 0.94, 0.01, 0.2],[0.01, 0.04, 0.92, 0.03]]
gt = [[0,1,0,0], [0,0,1,0]]
log_loss(gt,proba, normalize=True)

0.14683850888770583