In [1]:
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import os
import warnings
import math
import tensorflow as tf
import gc
from tensorflow.keras.layers import Input, Dense, Reshape
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.metrics import MeanSquaredError
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras import Model
from sklearn.model_selection import KFold

# Notebook-wide display settings: no row/column truncation, floats printed in fixed-point.
pd.set_option("display.max_rows", None)
pd.set_option("display.max_columns", None)
# Use the full option key: abbreviated patterns are resolved by matching against
# all registered options and stop working if another option ever matches too.
pd.set_option("display.float_format", "{:f}".format)
# Silences every Python warning for the rest of the session.
warnings.filterwarnings("ignore")
# Last expression of the cell: shows the GPU devices TensorFlow can see.
tf.config.list_physical_devices("GPU")


[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

In [None]:
# !unzip /content/drive/MyDrive/kaggle/ubiquant-market-prediction.zip -d /content

Archive:  /content/drive/MyDrive/kaggle/ubiquant-market-prediction.zip
  inflating: /content/example_sample_submission.csv  
  inflating: /content/example_test.csv  
  inflating: /content/train.csv      
  inflating: /content/ubiquant/__init__.py  
  inflating: /content/ubiquant/competition.cpython-37m-x86_64-linux-gnu.so  


In [2]:
# Rows per CSV chunk. Kept as an int (math.pow would return a float) because it
# is passed to pandas as a row count.
chunk_size = 10 ** 6
# Directory, relative to the current working directory, holding the pickled chunks.
chunks_path = os.path.join(os.getcwd(), "drive/MyDrive/kaggle/chunks")

In [None]:
def save_df_chunck(df: pd.DataFrame, index: int):
  """Shrink the float feature columns of `df` and pickle it as chunk `index`.

  Every column whose name contains "f_" and whose dtype is a float is cast to
  float16 when its min/max lie strictly inside the float16 range, otherwise to
  float32 when they lie strictly inside the float32 range. `df` is modified in
  place. Memory usage is printed before and after, and the frame is written to
  `chunk_{index}.pkl` inside the module-level `chunks_path`.
  """
  bytes_per_mb = math.pow(1024, 2)
  half_limits = np.finfo(np.float16)
  single_limits = np.finfo(np.float32)
  feature_names = [name for name in df.columns.tolist() if "f_" in name]

  print(f"Memory usage before df: {df.memory_usage().sum() / bytes_per_mb:.2f} MB")
  for name in feature_names:
    if "float" not in str(df[name].dtype):
      continue
    lowest = df[name].min()
    highest = df[name].max()
    # Try the narrowest dtype first; leave the column untouched if neither fits.
    if lowest > half_limits.min and highest < half_limits.max:
      df[name] = df[name].astype(np.float16)
    elif lowest > single_limits.min and highest < single_limits.max:
      df[name] = df[name].astype(np.float32)
  print(f"Memory usage after df: {df.memory_usage().sum() / bytes_per_mb:.2f} MB")

  target_file = os.path.join(chunks_path, f"chunk_{index}.pkl")
  df.to_pickle(target_file)

# Create the chunk directory together with any missing parent directories;
# exist_ok makes re-running the cell harmless.
os.makedirs(chunks_path, exist_ok=True)

# Stream train.csv in fixed-size pieces so the whole file is never held in memory at once.
# chunksize is a row count, so it is passed as an int.
train_csv_path = os.path.join(os.getcwd(), "train.csv")
for i, df in enumerate(pd.read_csv(train_csv_path, chunksize=int(chunk_size))):
  save_df_chunck(df, i)

In [3]:
!pip3 install pickle5
import pickle5 as pickle

Collecting pickle5
  Downloading pickle5-0.0.12-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.whl (256 kB)
[?25l[K     |█▎                              | 10 kB 19.4 MB/s eta 0:00:01[K     |██▋                             | 20 kB 11.5 MB/s eta 0:00:01[K     |███▉                            | 30 kB 9.4 MB/s eta 0:00:01[K     |█████▏                          | 40 kB 8.4 MB/s eta 0:00:01[K     |██████▍                         | 51 kB 4.4 MB/s eta 0:00:01[K     |███████▊                        | 61 kB 5.2 MB/s eta 0:00:01[K     |█████████                       | 71 kB 5.6 MB/s eta 0:00:01[K     |██████████▎                     | 81 kB 5.8 MB/s eta 0:00:01[K     |███████████▌                    | 92 kB 6.4 MB/s eta 0:00:01[K     |████████████▉                   | 102 kB 5.3 MB/s eta 0:00:01[K     |██████████████                  | 112 kB 5.3 MB/s eta 0:00:01[K     |███████████████▍                | 122 kB 5.3 MB/s eta 0:00:01[K     |████████████████▋           

In [5]:
# Pick chunk files by name rather than counting directory entries: a stray file
# or folder in chunks_path would otherwise inflate the count and make the loop
# try to open a chunk_{i}.pkl that does not exist.
chunk_prefix, chunk_suffix = "chunk_", ".pkl"
chunk_indices = sorted(
  int(name[len(chunk_prefix):-len(chunk_suffix)])
  for name in os.listdir(chunks_path)
  if name.startswith(chunk_prefix)
  and name.endswith(chunk_suffix)
  and name[len(chunk_prefix):-len(chunk_suffix)].isdigit()
)
if not chunk_indices:
  raise FileNotFoundError(f"No chunk_<n>.pkl files found in {chunks_path}")

dfs = []
for i in chunk_indices:
  path = os.path.join(chunks_path, f"chunk_{i}.pkl")
  # NOTE(review): unpickling runs arbitrary code from the file; only load chunks you created yourself.
  with open(path, "rb") as fh:
    data = pickle.load(fh)
    dfs.append(data)

train_csv = pd.concat(dfs, axis=0, ignore_index=True)
print(f"train_csv memory usage: {np.round(train_csv.memory_usage().sum() / math.pow(1024,2), 2)} MB")
# Feature columns are the ones whose name contains "f_".
data_columns = list(filter(lambda i: "f_" in i, train_csv.columns.tolist()))
# Processing all rows requires too much system RAM, so only the first sample_rows rows are used.
sample_rows = 100000
df = train_csv[:sample_rows]
print(f"df memory usage: {np.round(df.memory_usage().sum() / math.pow(1024,2), 2)} MB")
print(f"null values: {df.isna().sum().sum()}")

train_csv memory usage: 1893.4 MB
df memory usage: 60.27 MB
null values: 0


In [6]:
# Columns that are not listed in data_columns are removed from the model input.
to_drop = [column for column in df.columns.tolist() if column not in data_columns]

# Feature-only frame; the last expression previews its first three rows.
df_train = df.drop(columns=to_drop)
df_train.head(3)

Unnamed: 0,f_0,f_1,f_2,f_3,f_4,f_5,f_6,f_7,f_8,f_9,f_10,f_11,f_12,f_13,f_14,f_15,f_16,f_17,f_18,f_19,f_20,f_21,f_22,f_23,f_24,f_25,f_26,f_27,f_28,f_29,f_30,f_31,f_32,f_33,f_34,f_35,f_36,f_37,f_38,f_39,f_40,f_41,f_42,f_43,f_44,f_45,f_46,f_47,f_48,f_49,f_50,f_51,f_52,f_53,f_54,f_55,f_56,f_57,f_58,f_59,f_60,f_61,f_62,f_63,f_64,f_65,f_66,f_67,f_68,f_69,f_70,f_71,f_72,f_73,f_74,f_75,f_76,f_77,f_78,f_79,f_80,f_81,f_82,f_83,f_84,f_85,f_86,f_87,f_88,f_89,f_90,f_91,f_92,f_93,f_94,f_95,f_96,f_97,f_98,f_99,f_100,f_101,f_102,f_103,f_104,f_105,f_106,f_107,f_108,f_109,f_110,f_111,f_112,f_113,f_114,f_115,f_116,f_117,f_118,f_119,f_120,f_121,f_122,f_123,f_124,f_125,f_126,f_127,f_128,f_129,f_130,f_131,f_132,f_133,f_134,f_135,f_136,f_137,f_138,f_139,f_140,f_141,f_142,f_143,f_144,f_145,f_146,f_147,f_148,f_149,f_150,f_151,f_152,f_153,f_154,f_155,f_156,f_157,f_158,f_159,f_160,f_161,f_162,f_163,f_164,f_165,f_166,f_167,f_168,f_169,f_170,f_171,f_172,f_173,f_174,f_175,f_176,f_177,f_178,f_179,f_180,f_181,f_182,f_183,f_184,f_185,f_186,f_187,f_188,f_189,f_190,f_191,f_192,f_193,f_194,f_195,f_196,f_197,f_198,f_199,f_200,f_201,f_202,f_203,f_204,f_205,f_206,f_207,f_208,f_209,f_210,f_211,f_212,f_213,f_214,f_215,f_216,f_217,f_218,f_219,f_220,f_221,f_222,f_223,f_224,f_225,f_226,f_227,f_228,f_229,f_230,f_231,f_232,f_233,f_234,f_235,f_236,f_237,f_238,f_239,f_240,f_241,f_242,f_243,f_244,f_245,f_246,f_247,f_248,f_249,f_250,f_251,f_252,f_253,f_254,f_255,f_256,f_257,f_258,f_259,f_260,f_261,f_262,f_263,f_264,f_265,f_266,f_267,f_268,f_269,f_270,f_271,f_272,f_273,f_274,f_275,f_276,f_277,f_278,f_279,f_280,f_281,f_282,f_283,f_284,f_285,f_286,f_287,f_288,f_289,f_290,f_291,f_292,f_293,f_294,f_295,f_296,f_297,f_298,f_299
0,0.932617,0.113708,-0.4021,0.378418,-0.203979,-0.413574,0.96582,1.230469,0.114807,-2.013672,0.004936,0.28418,0.501953,-0.287842,-1.168945,-0.267334,-0.574219,-0.771973,1.012695,-1.230469,1.786133,-2.089844,0.325684,-0.87793,1.048828,0.131714,-0.349609,-1.813477,0.099243,-0.240967,1.604492,0.003637,-0.901855,0.221558,0.609863,-0.73877,2.097656,-0.914062,-0.293945,-0.037994,0.685547,0.0,0.414795,-0.469482,-1.085938,-0.106445,0.059448,0.483154,1.189453,0.506836,0.754395,0.317627,-0.183228,5.164062,0.677246,-0.440674,0.631348,-1.845703,0.227783,0.841309,0.66748,-0.854004,-0.17041,-0.11969,-0.504883,0.662598,1.933594,-0.8125,0.072815,-0.778809,-0.553711,-0.032776,-0.619141,-1.424805,0.148315,0.257812,0.735352,0.563965,-0.272217,1.19043,-0.101379,1.333984,0.191284,0.630859,1.010742,0.161865,0.609375,-0.219482,-0.013069,1.505859,-1.319336,0.448242,-0.635254,-0.075378,1.033203,-0.664062,0.139893,0.186279,0.593262,-0.301514,0.666504,0.713379,1.020508,0.879883,-1.09668,-0.21106,-0.117493,-0.778809,0.065979,0.229736,0.435303,1.233398,-0.865723,0.062347,0.382324,-1.31543,0.786621,0.770508,-0.871094,0.583496,0.115601,0.209595,0.325439,0.469238,0.0,-0.731445,-0.249512,-0.050568,-1.232422,-0.790527,0.493408,0.35791,-0.381592,0.176392,0.748047,-0.741699,0.089661,1.080078,1.137695,1.199219,0.030609,-0.356201,0.247803,1.395508,0.961914,0.078125,-1.163086,0.70752,0.256104,-0.026306,0.329102,0.196045,0.65332,0.069214,0.907715,-0.350586,-1.170898,0.65918,0.9375,0.48291,-0.09491,0.317383,0.126831,0.89209,-1.199219,-0.31543,0.704102,-0.0177,-0.513184,-0.362549,1.0,2.265625,0.360107,0.054474,1.639648,-1.0,0.876953,0.368164,0.991211,-1.224609,-1.208984,-0.879883,1.0,0.035156,0.693359,0.307129,0.143799,0.728027,1.220703,-0.939941,-0.106934,0.0,-0.569336,0.187866,-0.386475,-0.452393,0.269775,-0.56543,-0.262207,-1.473633,0.0,0.428223,1.740234,0.941406,-0.72998,-0.533203,-0.983398,-0.043152,-0.225952,-0.008301,-0.447998,0.55127,-0.153076,-2.292969,0.186646,-0.443115,0.121216,0.787598,1.996094,
-1.286133,-0.157227,1.019531,0.693359,0.797363,-0.192627,0.222778,0.921387,-0.171387,0.021652,1.15332,0.689941,3.251953,0.992188,-0.779785,-0.546387,-0.37915,1.896484,-0.732422,-0.11377,1.12793,-0.007927,0.122314,0.44458,0.48584,-2.302734,0.392822,0.0,-0.779297,-0.893555,0.055939,0.201416,-0.338135,1.27832,1.367188,0.525391,-0.055664,0.145264,0.346924,-0.824219,-1.306641,-1.158203,0.709473,-0.031891,-1.020508,-1.291016,0.038666,0.187134,-0.680176,0.900391,-0.924805,-1.057617,-0.167114,0.0,1.28125,0.258789,-0.237915,-0.742188,-0.324707,0.992676,0.961426,-0.025604,-0.00626,0.473633,0.040131,0.453613,-1.597656,0.301758,0.157471,0.416748,1.505859,0.365967,-1.095703,0.200073,0.819336,0.941406,-0.086792,-1.086914,-1.044922,-0.287598,0.321533
1,0.811035,-0.51416,0.742188,-0.616699,-0.194214,1.771484,1.427734,1.133789,0.114807,-0.219238,-0.351807,0.84668,0.440186,0.499756,0.893066,-0.010216,-0.681641,1.253906,-1.027344,-1.69043,0.011154,0.875488,0.325684,-0.458252,-1.797852,-0.300293,0.584961,0.55127,0.806641,1.235352,-0.984863,-1.084961,3.162109,0.21106,-2.65625,-0.177002,0.486572,1.237305,-0.44751,-0.403564,-0.769531,1.0,-0.516602,-1.307617,-0.092407,0.971191,-0.069336,-0.963379,-0.84082,0.438965,0.317139,0.13855,-1.167969,0.082336,1.05957,-0.440674,0.631348,0.495117,1.163086,0.453369,0.631836,-0.651855,-0.17041,-0.11969,0.186157,-0.29248,-0.549805,-1.629883,0.902832,0.509766,0.377441,0.052094,-0.763184,-1.307617,0.148315,0.777832,-0.383301,-0.099487,-0.119995,1.466797,-0.255371,0.136353,0.191284,-0.404541,-1.220703,-1.442383,1.088867,-0.479248,0.476318,-1.238281,0.13916,-1.238281,0.277344,0.925781,-0.666016,-0.496582,-1.344727,-0.51416,-0.081726,-0.301514,-0.918945,-0.051697,1.020508,0.746582,0.911621,-0.343994,-0.948242,-0.778809,0.065979,-0.22998,-0.153564,-0.722168,-0.947266,1.321289,-0.075256,0.99707,-0.137695,0.724121,-0.722168,0.541992,0.730469,0.478027,-0.662109,0.605957,0.0,0.280518,0.781738,-0.140503,0.324951,0.540527,-0.686523,0.388916,-1.551758,-0.662109,0.312256,-1.089844,0.530762,-0.730469,-0.878906,-1.139648,-1.336914,0.804688,0.247803,-1.292969,-0.892578,0.078125,0.023346,-0.957031,-0.042419,-0.195312,0.329102,-0.181152,0.65332,0.069214,-0.818848,0.308838,0.894531,0.01049,0.904297,1.197266,1.052734,0.915039,-1.083984,-0.541992,-0.898926,-0.043518,-0.375732,0.167969,0.552246,0.772949,1.0,-1.25,0.409912,-0.695312,1.639648,-1.0,0.876953,-1.523438,-0.526855,-0.071045,-0.404785,0.269531,1.0,0.035156,-0.309326,-1.067383,1.087891,-1.145508,-0.02533,-0.398438,-1.173828,0.427002,0.043945,0.510742,1.408203,-0.71875,2.076172,-0.507812,0.942383,1.02832,0.0,1.050781,-1.009766,0.480713,-1.363281,-0.358398,0.0849,-0.012634,-1.496094,-1.260742,-0.452148,-0.253418,-1.107422,0.492188,0.727051,0.400635,-2.
302734,1.443359,-0.231445,0.777344,1.995117,-0.59375,0.737793,-0.841797,-0.191528,0.222778,0.872559,0.611328,-0.113159,1.15332,0.866211,-0.269775,-0.078003,-0.332764,-0.546387,-0.678711,-0.434082,-0.584473,0.332031,-0.646973,-0.176514,0.122314,-0.068176,-0.868164,0.844727,0.863281,1.179688,-0.049469,0.484863,-0.050537,-0.491943,-0.164429,0.133667,-0.952637,-0.387451,0.090637,-0.811035,-0.821289,-0.546875,-0.067871,-0.76709,-0.015457,-0.158325,0.980469,0.799316,0.79834,-0.633301,0.779785,0.171265,1.166016,0.59082,0.11853,0.0,-0.650879,0.852051,0.086182,1.135742,0.299072,-1.583008,-0.481934,0.532227,0.226685,-0.894531,-0.514648,-1.0,0.884277,-0.557617,-0.875488,-0.156128,0.537109,-0.154175,0.912598,-0.734375,0.819336,0.941406,-0.387695,-1.086914,-0.929688,-0.974121,-0.343506
2,0.394043,0.615723,0.567871,-0.60791,0.068909,-1.083008,0.979492,-1.125977,0.114807,-1.035156,0.465088,0.150879,-0.044006,0.091248,-1.168945,-0.451904,-0.467285,0.095276,1.140625,-0.16687,-0.007294,-0.449463,0.325684,-0.682617,0.016266,0.026123,-0.547363,0.55127,-0.261475,-0.169678,0.857422,-0.09845,-0.860352,0.180176,0.609863,0.524902,-0.400391,0.751465,-0.287598,0.568359,0.829102,0.0,-0.253174,-0.235474,0.500977,-0.655762,-0.750977,0.664062,1.189453,1.087891,0.859375,-0.133545,-0.183228,0.082336,-0.060181,-0.440674,0.631348,-0.026169,0.015625,-0.855957,1.052734,-0.526855,-0.17041,-0.11969,0.038177,0.309814,-0.549805,1.638672,-0.476562,-0.90918,-0.544922,-0.546387,0.163452,1.748047,0.148315,0.365234,0.342773,-0.813965,-0.046417,-0.484863,-0.270264,0.136353,-1.441406,-0.079163,-0.530762,0.130981,0.221558,-0.228882,0.700195,0.023468,0.269287,0.039825,-0.963867,-0.790039,0.693359,-0.59375,0.295166,-0.096008,0.447754,-0.301514,1.039062,-0.291748,1.020508,0.529297,-1.09668,0.150757,-0.074646,1.254883,0.065979,0.869141,0.518066,0.494385,-0.02858,-0.478271,-0.733887,0.200806,-0.174561,-0.255127,0.558594,0.586426,-2.177734,-0.765137,0.200317,0.346436,0.0,-0.065796,-0.29834,-0.040955,-0.875977,-0.625,1.005859,-0.471924,0.788574,-0.547363,-0.039612,-0.885254,-0.161743,0.186157,1.137695,0.052643,0.323242,1.15625,0.247803,0.859375,0.873047,0.078125,-0.250977,-0.418213,0.927246,0.648438,0.329102,0.015572,0.65332,0.069214,0.907715,-0.642578,-1.170898,-0.100586,0.122498,-0.247559,0.694336,-0.167358,1.0625,0.718262,-0.844238,-0.105347,0.166626,-0.375732,-0.513184,-0.393555,1.0,0.596191,-0.446289,-0.115051,-0.709961,-1.0,-1.431641,-0.577637,-0.418457,-0.515137,-0.307617,-0.283691,1.0,0.035156,-0.393799,0.288086,0.366455,0.103333,-0.098328,0.133911,0.21228,0.384521,0.501465,-0.339355,0.010094,-0.007168,-1.117188,-0.564941,0.672852,-0.775879,0.0,-0.070007,0.883301,0.579102,0.942871,-0.793457,-0.955078,0.131836,-0.546875,0.905273,-0.452148,0.675293,-0.036041,-1.169922,-0.167603,-0.27
4414,0.114014,0.10321,0.071045,-1.286133,-0.407959,0.137207,-0.399414,0.335693,-0.149292,0.222778,0.156128,-0.171387,0.258301,1.15332,-0.964355,-0.269775,-0.654297,0.180542,-1.220703,0.45752,-0.461182,-0.976562,0.147339,0.060364,-0.271729,0.122314,1.450195,1.163086,0.402588,-0.101135,0.0,0.019119,0.523438,-0.005413,-0.092407,-0.001884,0.259277,0.507324,0.202515,-0.37085,0.828125,0.25708,0.027908,-0.662598,1.066406,-0.52832,0.189453,-1.020508,0.169678,-0.911133,-0.222046,0.536621,-0.161743,-0.114319,0.43335,-0.207153,0.0,0.098938,-0.725098,-0.388184,0.062622,0.260254,0.980957,0.899414,-0.31543,0.150146,0.245605,-1.429688,-1.0,-0.033508,-0.147095,-0.087524,0.09845,-0.528809,-0.138062,0.912598,-0.551758,-1.220703,-1.060547,-0.219116,-1.086914,-0.612305,-0.113953,0.243652


In [7]:
# Investment ids of the working subset, kept as a Series.
df_investment_id = df["investment_id"]

# Adapt the lookup layer on the distinct ids, supplied as a one-column frame.
unique_ids_frame = pd.DataFrame({"investment_id": list(df_investment_id.unique())})
integer_lookup_layer = tf.keras.layers.IntegerLookup()
integer_lookup_layer.adapt(unique_ids_frame)

# Target kept as a single-column DataFrame; the last expression previews three rows.
df_target = df["target"].to_frame()
df_target.head(3)

Unnamed: 0,target
0,-0.300875
1,-0.23104
2,0.568807


In [23]:
def create_model_1():
  """Build and compile the two-branch regression model.

  Feature branch: an input of len(data_columns) values followed by two
  128-unit ReLU layers.
  Id branch: a single investment id mapped through the module-level
  `integer_lookup_layer`, a 32-dimensional embedding, a flatten step and two
  128-unit ReLU layers.
  Both branches are concatenated and reduced by Dense(128) -> Dense(32) -> Dense(1).

  Returns:
    A Model with inputs [investment_id, features], compiled with Adam,
    mean-squared-error loss and a root-mean-squared-error metric.
  """
  activation_func = tf.keras.activations.relu
  investment_id_input = tf.keras.layers.Input(shape=(1,))
  x_input = tf.keras.layers.Input(shape=(len(data_columns),))

  x = tf.keras.layers.Dense(128, activation=activation_func)(x_input)
  x = tf.keras.layers.Dense(128, activation=activation_func)(x)

  investment_id_x = integer_lookup_layer(investment_id_input)
  # Size the embedding table from the lookup layer itself so it always matches
  # the index range that layer can emit (adapted ids plus its out-of-vocabulary slot).
  investment_id_x = tf.keras.layers.Embedding(
    integer_lookup_layer.vocabulary_size(), 32
  )(investment_id_x) # shape (None, 1, 32)
  investment_id_x = tf.keras.layers.Reshape((-1, ))(investment_id_x) # shape (None, 32)
  investment_id_x = tf.keras.layers.Dense(128, activation=activation_func)(investment_id_x)
  investment_id_x = tf.keras.layers.Dense(128, activation=activation_func)(investment_id_x)

  concat = tf.keras.layers.Concatenate(axis=1)([investment_id_x, x])
  concat = tf.keras.layers.Dense(128, activation=activation_func)(concat)
  concat = tf.keras.layers.Dense(32, activation=activation_func)(concat)
  output = tf.keras.layers.Dense(1)(concat)

  model = Model(inputs=[investment_id_input, x_input], outputs=[output])
  model.compile(
    optimizer=tf.keras.optimizers.Adam(),
    loss=tf.keras.losses.MeanSquaredError(),
    # compile() documents `metrics` as a list of metrics.
    metrics=[tf.keras.metrics.RootMeanSquaredError()],
  )
  return model

In [24]:
def create_ds(X: pd.DataFrame, investment_id: pd.Series, y: pd.DataFrame,
              batch_size: int = 32, shuffle_buffer: int = 128):
  """Build a shuffled, batched tf.data pipeline for training or validation.

  Args:
    X: feature columns.
    investment_id: investment ids aligned row-by-row with `X`.
    y: frame holding a "target" column aligned with `X`.
    batch_size: number of rows per batch (default 32).
    shuffle_buffer: size of the shuffle buffer (default 128).

  Returns:
    A dataset yielding ((investment_id, features), target) batches.
  """
  ds = tf.data.Dataset.from_tensor_slices(((investment_id, X), y["target"]))
  ds = ds.shuffle(shuffle_buffer)
  ds = ds.batch(batch_size)
  ds = ds.prefetch(tf.data.AUTOTUNE)
  return ds

# Training settings and the shuffled K-fold splitter.
epochs, random_state, n_splits = 5, 1, 3
kfold = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)
split = kfold.split(df_train[data_columns], df_investment_id)

for i, (train_index, test_index) in enumerate(split):
  # Positional slices of features, targets and ids for this fold.
  X_train = df_train.iloc[train_index]
  X_test = df_train.iloc[test_index]
  y_train = df_target.iloc[train_index]
  y_test = df_target.iloc[test_index]
  investment_id_train = df_investment_id.iloc[train_index]
  investment_id_test = df_investment_id.iloc[test_index]

  ds_train = create_ds(X_train, investment_id_train, y_train)
  ds_test = create_ds(X_test, investment_id_test, y_test)

  # A fresh model per fold, checkpointed to a per-fold path with save_best_only=True.
  model = create_model_1()
  checkpoint = tf.keras.callbacks.ModelCheckpoint(
    filepath = f"model_split_{i}",
    save_best_only=True,
  )
  model.fit(
    ds_train,
    epochs=epochs,
    validation_data=ds_test,
    callbacks=[checkpoint],
  )

  # Drop every reference to this fold's model and datasets before collecting garbage.
  del ds_train, ds_test, model, checkpoint
  gc.collect()

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [33]:
def predict(models, ds):
  """Average the predictions of several models on the same dataset.

  Args:
    models: iterable of objects exposing `predict(ds)`.
    ds: input handed unchanged to every model.

  Returns:
    The element-wise mean over the per-model prediction arrays.
  """
  per_model_outputs = [member.predict(ds) for member in models]
  return np.mean(per_model_outputs, axis=0)

def create_predict_ds(X: pd.DataFrame, investment_id: pd.Series, batch_size: int = 32):
  """Build a batched, unshuffled tf.data pipeline for inference.

  Args:
    X: feature columns.
    investment_id: investment ids aligned row-by-row with `X`.
    batch_size: number of rows per batch (default 32).

  Returns:
    A dataset yielding (investment_id, features) batches in input order.
  """
  ds = tf.data.Dataset.from_tensor_slices((investment_id, X))
  ds = ds.batch(batch_size)
  ds = ds.prefetch(tf.data.AUTOTUNE)
  return ds

# Reload the checkpoint saved for each cross-validation fold.
models = [tf.keras.models.load_model(f"model_split_{i}") for i in range(n_splits)]

df_test = None # insert df here

# create_predict_ds needs the features and the investment ids as separate
# arguments, and its result must not be assigned back to the function name
# (that would make the helper unusable afterwards).
# Assumes df_test has the data_columns features plus an "investment_id" column - TODO confirm.
test_predictions = None
if df_test is not None:
  ds_test = create_predict_ds(df_test[data_columns], df_test["investment_id"])
  test_predictions = predict(models, ds_test)
test_predictions