In [None]:
!pip install pickle5

In [None]:
import numpy as np
import pandas as pd

import pickle5 as pickle
from sklearn.model_selection import train_test_split
from tqdm import tqdm

import tensorflow as tf
from tensorflow import keras

from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.models import Model
from tensorflow.keras import layers

import gc

In [None]:
# NOTE: unpickling can execute code embedded in the file, so only load pickles
# that come from a trusted source.
with open("../input/tobigsmall2/total_data.pkl", 'rb') as f:
    total_data = pickle.load(f)

# Discard the stored index and replace it with the default 0..n-1 integer index.
total_data = total_data.reset_index(drop=True)


In [None]:
# Sanity check of the freshly loaded frame: column names, dtypes, non-null counts, memory use.
total_data.info()

In [None]:
def reduce_mem_usage(df, verbose=True):
    """Down-cast numeric columns of ``df`` to the narrowest dtype that holds their range.

    Only columns whose dtype is int16/int32/int64 or float16/float32/float64 are
    inspected; every other column (int8, unsigned, bool, object, ...) is left as is.
    For each inspected column the observed min and max decide the target dtype:
    the first of int8/int16/int32/int64 (integers) or float16/float32 (floats)
    whose representable range contains both values, bounds included. Float
    columns that fit neither (too large, infinite, or all-NaN/empty) become float64.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. Its columns are reassigned, i.e. it is modified in place.
    verbose : bool, default True
        When true, print the memory footprint after conversion and the relative saving.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.

    Notes
    -----
    Float targets are chosen from the value *range* only. float16 keeps roughly
    three significant decimal digits, so values are rounded when a column is cast
    to it; skip this helper for columns whose precision matters.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    int_targets = (np.int8, np.int16, np.int32, np.int64)
    float_targets = (np.float16, np.float32)

    start_mem = df.memory_usage().sum() / 1024**2

    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if str(col_type)[:3] == 'int':
            # Inclusive bounds: a column whose extreme equals the dtype limit still fits.
            # A NaN min/max (empty column) matches nothing and leaves the column unchanged.
            for target in int_targets:
                limits = np.iinfo(target)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(target)
                    break
        else:
            for target in float_targets:
                limits = np.finfo(target)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(target)
                    break
            else:
                # Out of float32 range, infinite, or NaN min/max: keep full precision.
                df[col] = df[col].astype(np.float64)

    if verbose:
        end_mem = df.memory_usage().sum() / 1024**2
        # Guard the ratio so a frame that reports zero bytes does not yield nan.
        saved_pct = 100 * (start_mem - end_mem) / start_mem if start_mem > 0 else 0.0
        print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
        print('Decreased by {:.1f}%'.format(saved_pct))

    return df


In [None]:
# Shrink numeric columns before building model inputs. The helper reassigns columns on
# the frame it receives and returns that same frame, so this rebinding is a no-op alias.
total_data = reduce_mem_usage(total_data)

In [None]:
# NOTE(review): this import belongs with the other imports at the top of the notebook.
from sklearn.model_selection import KFold
# Free whatever the dtype down-casting left behind before the prediction pass.
gc.collect()
# No model is trained here: the 10-way split is only used to walk total_data in ten
# chunks (each fold's test indices) during prediction. shuffle is left at its default.
kf = KFold(n_splits=10)
# Displays the number of splits as the cell output.
kf.get_n_splits(total_data)

In [None]:
# Fixed 256-element query vector. In the prediction loop it is broadcast to every row
# and passed as the model's first input.
# NOTE(review): the provenance of these numbers is not visible in this notebook —
# presumably an embedding exported from elsewhere; confirm and record the source.
sts =np.asarray([-0.5442, -2.0525, -0.7223, -0.6214,  0.1716, -0.6291, -0.5457,  0.9042,
        -1.0717,  0.0384,  0.7628,  2.5764,  0.2056, -1.3626, -0.1091, -0.3406,
         0.3204,  0.6171,  1.3046, -0.7506,  0.9644, -0.9718, -0.4879,  0.3154,
         0.7393, -0.3439, -1.1796,  0.4301,  2.8622,  1.0615, -0.5415,  0.9517,
         0.1726,  0.6590, -0.2736,  1.8027,  1.5056, -0.8677,  0.0509, -0.6124,
        -0.3626, -0.1705, -0.3911,  2.3072,  0.1888,  0.3702,  0.6663,  0.3871,
        -0.4989, -0.1548,  0.1163,  0.6468, -0.8557, -0.7491, -0.4844,  0.2717,
        -0.1045,  0.2871,  0.6565,  0.3478,  0.6403, -1.4068, -1.3599, -0.0916,
         0.1104, -0.9313, -0.6166, -0.3527,  1.8332,  0.0970, -1.4952,  0.2300,
         2.6711,  0.0620, -1.1795, -1.3079, -0.0095,  0.3101, -0.5438,  0.7995,
        -1.0275, -0.1489,  0.3743, -1.0226, -0.2782,  0.7352, -1.1051,  2.4946,
         1.4250, -1.6190,  0.2086,  0.1807,  0.0067,  0.4777, -1.6598, -0.0389,
        -1.4079,  0.3345, -0.2818,  0.4957, -0.1900, -0.4423,  0.8770, -1.0472,
        -0.7486, -0.7320, -0.1804,  0.9050, -0.8684, -1.4327,  0.5005, -1.2776,
        -1.5275,  0.3331, -1.0739, -0.1848,  0.6329,  1.1931,  0.9396, -1.0068,
         0.9022,  0.8124, -0.0861, -0.1056,  0.6030, -1.6492, -0.2849,  1.3909,
        -0.3802, -1.3738,  0.1878,  0.7794, -0.0931, -0.9017, -0.7826,  0.3928,
         0.6903,  1.5148,  0.1844, -0.8635,  0.4459,  0.2111,  0.9549, -0.8174,
        -0.7336, -2.3808, -0.8078,  0.2235,  0.2257,  0.4191,  0.7266, -0.8983,
        -0.9052,  0.5459, -0.6009,  0.3736,  0.8212, -1.6402,  0.2753, -0.6674,
        -0.0258,  0.1500, -1.1497, -1.9160,  0.9507, -0.7163,  0.9956, -0.9925,
         0.2852, -0.2510,  0.5756, -0.1644,  0.8531, -0.9882,  0.6097, -0.1257,
        -0.1917, -0.5951, -1.5515,  0.3298,  1.1871, -1.2918, -1.9231, -1.2691,
         0.7009,  0.7836, -0.8483, -0.1315, -0.5005, -1.0554, -0.1847, -0.0205,
        -0.5069, -0.4943, -0.3104,  0.8982,  0.5172,  0.4407,  0.4023, -0.5821,
         0.4916,  0.4998,  0.9022, -0.5322,  0.1212,  0.5185,  0.3125,  0.4570,
         0.0929,  1.3717, -0.2327,  0.7583, -0.2546,  1.3048,  0.2577,  0.8066,
         0.7498, -0.4798,  1.7619,  0.8426,  0.3806,  1.0426,  1.1559, -0.3951,
         1.1823, -0.5192, -1.9233,  0.5122, -0.1376, -0.9863, -0.8007,  0.1973,
        -1.8957, -0.6216, -0.0541,  0.6742,  0.4175, -0.1460,  0.6127,  0.2106,
         0.4756,  0.3582, -1.1231,  0.5368, -0.2647,  0.8672, -0.4317, -1.4344,
        -0.2527, -0.6239, -1.2426, -0.3676, -0.3519,  0.9459,  0.6487,  0.8836])

In [None]:

# Score every row of total_data against the fixed query vector `sts`, one KFold test
# chunk at a time, and remember the ten best-scoring rows of each chunk. Model inputs
# are rebuilt and released per chunk so only one chunk's arrays are alive at a time.
similarity = []  # predicted scores of each chunk's top rows
argnum = []      # positions of those rows in total_data (global, not chunk-local)

reconstructed_model = keras.models.load_model("../input/tobigsmall2/ff_model.h5")

for kes, (train_index, test_index) in enumerate(kf.split(total_data), start=1):

    print(kes)  # progress: 1-based chunk counter

    # Same query vector for every row of the chunk (sts is broadcast along axis 0).
    # Alternative query kept from the original: np.asarray(total_data['tag_embedding'])[1]
    x11 = np.full((len(test_index), 256), sts)

    # Each 'mel_embedding' cell is copied into one (48, 20) slot of a dense batch array.
    X2 = np.zeros((len(test_index), 48, 20))
    for num, i in enumerate(total_data['mel_embedding'].iloc[test_index]):
        X2[num] = i

    # Column positions 4..258 and 259 are passed as the third and fourth model inputs.
    # NOTE(review): the positions are hard-coded — confirm against the column layout.
    X3 = np.asarray(total_data.iloc[test_index, 4:259])
    X4 = np.asarray(total_data.iloc[test_index, 259])

    # use_multiprocessing was dropped: it only applies to generator/Sequence input and
    # is rejected by newer Keras releases; with in-memory arrays it had no effect.
    predict = reconstructed_model.predict(x=(x11, X2, X3, X4))

    # Assumes the model yields exactly one score per input row — TODO confirm.
    scores = predict.reshape(-1)
    top_local = np.argsort(scores)[-10:]  # chunk-local positions, ascending by score

    similarity.extend(scores[top_local].tolist())
    # Fix: translate chunk-local positions into row positions of total_data. Storing the
    # raw argsort output made indices from different chunks indistinguishable.
    argnum.extend(test_index[top_local].tolist())

    # `scores` is derived from `predict`, so release it as well before collecting.
    del scores
    del top_local
    del predict
    del x11
    del X2
    del X3
    del X4
    gc.collect()

    
        
   

In [None]:
# Pool the per-chunk candidates, order them by predicted similarity (ascending)
# and keep the last ten rows, i.e. the ten highest scores overall.
candidates = pd.DataFrame({'predsim' : similarity,  'argnum' : argnum})
result = candidates.sort_values('predsim')[-10:]
result

In [None]:
# The 'argnum' values of the ten best candidates as a plain NumPy array
# (same order as `result`, i.e. ascending by predicted similarity).
np.asarray(result['argnum'])