In [152]:
import pandas as pd
from tqdm.notebook import tqdm
import numpy as np
from keras.callbacks import EarlyStopping,ModelCheckpoint

In [153]:
%tensorflow_version 2.x
import tensorflow as tf
device_name = tf.test.gpu_device_name()
if device_name != '/device:GPU:0':
  raise SystemError('GPU device not found')
print('Found GPU at: {}'.format(device_name))

Found GPU at: /device:GPU:0


In [154]:
# Code to read csv file into colaboratory:
!pip install -U -q PyDrive
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

In [156]:
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

In [157]:
downloaded = drive.CreateFile({'id':'1RVKXO2IV-LoJIw8iBgAtZkhwkYuEKgW7'}) # replace the id with id of file you want to access
downloaded.GetContentFile('challenge_data.csv') 

In [158]:
downloaded = drive.CreateFile({'id':'1ybDiG60_B18Zz9tRgcektPp6U-LR6DnR'}) # replace the id with id of file you want to access
downloaded.GetContentFile('test.csv') 


In [159]:
downloaded = drive.CreateFile({'id':'1JDXjuJNDDhrRA7IhitdxWXo5YPE7n8mI'}) # replace the id with id of file you want to access
downloaded.GetContentFile('train.csv') 


In [160]:
from keras import models
from keras import layers
from keras import optimizers
import tensorflow as tf

In [161]:
def apk(actual, predicted, k=3):
    """
    Computes the average precision at k.
    This function computes the average prescision at k between two lists of
    items.
    Parameters
    ----------
    actual : list
             A list of elements that are to be predicted (order doesn't matter)
    predicted : list
                A list of predicted elements (order does matter)
    k : int, optional
        The maximum number of predicted elements
    Returns
    -------
    score : double
            The average precision at k over the input lists
    """
    
    actual = list(actual)
    predicted = list(predicted)
    
    if len(predicted)>k:
        predicted = predicted[:k]

    score = 0.0
    num_hits = 0.0

    for i,p in enumerate(predicted):
        if p in actual and p not in predicted[:i]:
            num_hits += 1.0
            score += num_hits / (i+1.0)
            
    if not actual:
        return 0.0

    return score / min(len(actual), k)

def mapk(actual, predicted, k=3):
    """
    Computes the mean average precision at k.
    This function computes the mean average prescision at k between two lists
    of lists of items.
    Parameters
    ----------
    actual : list
             A list of lists of elements that are to be predicted 
             (order doesn't matter in the lists)
    predicted : list
                A list of lists of predicted elements
                (order matters in the lists)
    k : int, optional
        The maximum number of predicted elements
    Returns
    -------
    score : double
            The mean average precision at k over the input lists
    """
    return np.mean([apk(a,p,k) for a,p in zip(actual, predicted)])

In [162]:
# read in the train file
train = pd.read_csv("train.csv")

In [163]:
train.head()

Unnamed: 0,user_sequence,user_id,challenge_sequence,challenge
0,4576_1,4576,1,CI23714
1,4576_2,4576,2,CI23855
2,4576_3,4576,3,CI24917
3,4576_4,4576,4,CI23663
4,4576_5,4576,5,CI23933


In [164]:
train.shape

(903916, 4)

In [165]:
# how many unique users
train.user_id.nunique()

69532

In [166]:
# how many unique  challenges
train.challenge.nunique()

5348

In [167]:
# read in the test file
test = pd.read_csv("test.csv")

In [168]:
test.head()

Unnamed: 0,user_sequence,user_id,challenge_sequence,challenge
0,4577_1,4577,1,CI23855
1,4577_2,4577,2,CI23933
2,4577_3,4577,3,CI24917
3,4577_4,4577,4,CI24915
4,4577_5,4577,5,CI23714


In [169]:
# how many users in here?
test.user_id.nunique()

39732

In [170]:
# read in the challenge detail files
details = pd.read_csv("challenge_data.csv")

In [171]:
details.head()

Unnamed: 0,challenge_ID,programming_language,challenge_series_ID,total_submissions,publish_date,author_ID,author_gender,author_org_ID,category_id
0,CI23478,2,SI2445,37.0,06-05-2006,AI563576,M,AOI100001,
1,CI23479,2,SI2435,48.0,17-10-2002,AI563577,M,AOI100002,32.0
2,CI23480,1,SI2435,15.0,16-10-2002,AI563578,M,AOI100003,
3,CI23481,1,SI2710,236.0,19-09-2003,AI563579,M,AOI100004,70.0
4,CI23482,2,SI2440,137.0,21-03-2002,AI563580,M,AOI100005,


# First Method - Multi Class Classification

In [172]:
# first label encode the challenges
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

In [173]:
# instantiate label encoder
le = LabelEncoder()

#label encoding to be done on concatenation of train and test
train_test_challenges = pd.concat([train.challenge,test.challenge])

In [174]:
train_test_challenges

0         CI23714
1         CI23855
2         CI24917
3         CI23663
4         CI23933
           ...   
397315    CI23691
397316    CI24138
397317    CI23714
397318    CI24530
397319    CI23612
Name: challenge, Length: 1301236, dtype: object

In [175]:
le.fit(train_test_challenges)

LabelEncoder()

In [176]:
len(le.classes_)

5502

In [177]:
le.transform(le.classes_)

array([   0,    1,    2, ..., 5499, 5500, 5501])

In [178]:
train1 = train.copy()

In [179]:
train1["challenge"] = le.transform(train1["challenge"])

In [180]:
train1

Unnamed: 0,user_sequence,user_id,challenge_sequence,challenge
0,4576_1,4576,1,233
1,4576_2,4576,2,374
2,4576_3,4576,3,1421
3,4576_4,4576,4,182
4,4576_5,4576,5,451
...,...,...,...,...
903911,113839_9,113839,9,1496
903912,113839_10,113839,10,5033
903913,113839_11,113839,11,5056
903914,113839_12,113839,12,2875


In [181]:
# prepare data 

catch = []
for i in tqdm(np.unique(train1["user_id"])): 
    temp_data = train1[train1.user_id == i]
    # get the items he or she has used
    items_used = temp_data.challenge.tolist()
    catch.append({"user_id": i, "sequence_challenges": items_used})

HBox(children=(FloatProgress(value=0.0, max=69532.0), HTML(value='')))




In [182]:
train_valid_sequence = pd.DataFrame(catch)

In [183]:
train_valid_sequence

Unnamed: 0,user_id,sequence_challenges
0,4576,"[233, 374, 1421, 182, 451, 1636, 493, 1627, 14..."
1,4580,"[182, 374, 451, 493, 1039, 233, 167, 300, 186,..."
2,4581,"[2643, 2644, 2645, 2646, 2647, 2648, 2649, 265..."
3,4582,"[374, 1419, 1421, 451, 182, 1462, 493, 233, 14..."
4,4585,"[374, 493, 1421, 1636, 367, 233, 182, 451, 146..."
...,...,...
69527,113833,"[451, 1039, 1799, 1380, 2547, 946, 1575, 1030,..."
69528,113835,"[374, 451, 1759, 288, 1527, 1526, 1379, 1373, ..."
69529,113836,"[746, 1435, 2542, 3402, 1461, 1462, 517, 1434,..."
69530,113837,"[399, 451, 2543, 1370, 1551, 2174, 2169, 1380,..."


In [184]:
import random

In [185]:
random.seed(42)

In [186]:
train_users = random.sample(set(train_valid_sequence.user_id), k = round(0.95*len(train_valid_sequence.user_id)))

In [187]:
test_users = set(train_valid_sequence.user_id).difference(set(train_users))

In [188]:
train_data = train_valid_sequence[train_valid_sequence.user_id.isin(train_users)].reset_index(drop = True)

In [189]:
valid_data = train_valid_sequence[train_valid_sequence.user_id.isin(test_users)].reset_index(drop = True)

In [190]:
# one hot encode

use_sequence_train = [i[:10] for i in train_data["sequence_challenges"]]

target_sequence_train = [i[10:13] for i in train_data["sequence_challenges"]]

In [191]:
use_sequence_valid = [i[:10] for i in valid_data["sequence_challenges"]]

target_sequence_valid = [i[10:13] for i in valid_data["sequence_challenges"]]

In [None]:
# # prepare the training data - one hot encode the training sequence 
# # and one hot encode the target sequence as well
# # but make it three target sequences for each user

# # one_hot_encoded_array = []
# # one_hot_encoded_targets_array = []
# one_hot_encoded = np.zeros((len(train_users)*3, len(le.classes_))).astype("float32")
# one_hot_encoded_targets_array = np.zeros((len(train_users)*3, len(le.classes_))).astype("float32")

# counter = 0
# for i,n in tqdm(enumerate(train_data.user_id)):
    
    
#     sequence = use_sequence_train[i]
#     target_used = target_sequence_train[i]
    
#     for j in sequence:
#         one_hot_encoded[i:i+3,j] = 1
        
    
    
#     for k,l in enumerate(target_used):
#         one_hot_encoded_targets = np.zeros((1, len(le.classes_))).astype("float32")
#         one_hot_encoded_targets[0,l] = 1
# #         one_hot_encoded_array.append(one_hot_encoded.tolist())
#         one_hot_encoded_targets_array[counter,:] = one_hot_encoded_targets
#         counter = counter + 1

In [None]:
# # prepare the training data - one hot encode the training sequence 
# # and one hot encode the target sequence as well
# # but make it three target sequences for each user

# # one_hot_encoded_array_valid = []
# # one_hot_encoded_targets_array = []
# one_hot_encoded_valid = np.zeros((len(test_users)*3, len(le.classes_))).astype("float32")
# one_hot_encoded_targets_array_valid = np.zeros((len(test_users)*3, len(le.classes_))).astype("float32")

# counter = 0
# for i,n in tqdm(enumerate(valid_data.user_id)):
    
    
#     sequence = use_sequence_valid[i]
#     target_used = target_sequence_valid[i]
    
#     for j in sequence:
#         one_hot_encoded_valid[i:i+3,j] = 1
        
    
    
#     for k,l in enumerate(target_used):
#         one_hot_encoded_targets = np.zeros((1, len(le.classes_))).astype("float32")
#         one_hot_encoded_targets[0,l] = 1
# #         one_hot_encoded_array.append(one_hot_encoded.tolist())
#         one_hot_encoded_targets_array_valid[counter,:] = one_hot_encoded_targets
#         counter = counter + 1

In [192]:
import keras

# Second Method - Maybe RNN

In [193]:
use_sequence_train = [i for i in use_sequence_train for j in range(3)]

In [None]:
# use_sequence_train = [i for i in use_sequence_train for j in range(3)]

In [194]:
use_sequence_train = np.array(use_sequence_train)

In [195]:
use_sequence_train.shape

(198165, 10)

In [196]:
target_sequence_train = [[j] for i in target_sequence_train for j in i]

In [None]:
np.array(target_sequence_train).shape

(594495, 1)

In [197]:
target_sequence_train = np.array(target_sequence_train)

In [198]:
target_sequence_train.shape

(198165, 1)

In [None]:
# np.expand_dims(target_sequenddce_train,1).shape

In [None]:
# use_sequence_train = np.expand_dims(np.array(use_sequence_train),2)

In [199]:
use_sequence_valid = [i for i in use_sequence_valid for j in range(3)]

In [None]:
# use_sequence_valid = np.expand_dims(np.array(use_sequence_valid),2)

In [200]:
use_sequence_valid = np.array(use_sequence_valid)

In [201]:
use_sequence_valid.shape

(10431, 10)

In [202]:
use_sequence_valid.shape

(10431, 10)

In [203]:
target_sequence_valid = [[j] for i in target_sequence_valid for j in i]



In [204]:
target_sequence_valid = np.array(target_sequence_valid)

In [205]:
target_sequence_valid.shape

(10431, 1)

In [None]:
# use_sequence_train = np.concatenate((use_sequence_train,use_sequence_train)).astype("float32")

In [None]:
# one_hot_encoded_targets_array = np.concatenate((one_hot_encoded_targets_array,one_hot_encoded_targets_array)).astype("float32")

In [206]:
es_callback = EarlyStopping(monitor="val_loss", patience=2)

In [207]:
use_sequence_train.shape

(198165, 10)

In [208]:
use_sequence_train = np.concatenate((use_sequence_train,use_sequence_train,use_sequence_train))

In [209]:
target_sequence_train = np.concatenate((target_sequence_train,target_sequence_train,target_sequence_train))

In [210]:
target_sequence_train.shape

(594495, 1)

In [211]:
target_sequence_train.shape

(594495, 1)

In [212]:
model = models.Sequential()
model.add(layers.Embedding(input_dim=len(le.classes_), output_dim=1280, input_length=10))
model.add(layers.BatchNormalization())
# model.add(layers.Dropout(0.5))
# model.add(layers.LSTM(512, dropout=0.1, recurrent_dropout=0.5))
model.add(layers.Bidirectional(layers.LSTM(758, dropout=0.1, recurrent_dropout=0.5, return_sequences = False)))
# model.add(layers.BatchNormalization())
# model.add(layers.Dropout(0.5))
# model.add(layers.Bidirectional(layers.LSTM(units=256, dropout=0.1, recurrent_dropout=0.5, return_sequences =False)))
# model.add(layers.BatchNormalization())
model.add(layers.Dropout(0.5))
# model.add(layers.Bidirectional(layers.GRU(units=256, dropout=0.1, recurrent_dropout=0.5)))
# model.add(layers.BatchNormalization())
# model.add(layers.Dropout(0.5))
# model.add(layers.Dense(512))
# model.add(layers.BatchNormalization())
# model.add(layers.Dropout(0.5))
# model.add(layers.Dense(1024))
# model.add(layers.BatchNormalization())
# model.add(layers.Dropout(0.5))
# model.add(layers.Dense(1024))
# model.add(layers.BatchNormalization())
# model.add(layers.Dropout(0.5))
# model.add(layers.BatchNormalization())
model.add(layers.Dense(len(le.classes_), activation='softmax'))
opt = optimizers.Adam(lr=0.0001)
# opt = optimizers.RMSprop(lr=0.0001)
model.compile(loss='sparse_categorical_crossentropy', 
              optimizer=opt)

In [213]:
es_callback = EarlyStopping(monitor="val_loss", patience=2, restore_best_weights = True)

In [214]:
model.fit(use_sequence_train, target_sequence_train,epochs=2000,batch_size=1024,
          validation_data=(use_sequence_valid, target_sequence_valid), 
         callbacks = [es_callback], shuffle=True)

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 594495 samples, validate on 10431 samples
Epoch 1/2000
Epoch 2/2000
Epoch 3/2000
Epoch 4/2000
Epoch 5/2000
Epoch 6/2000
Epoch 7/2000
Epoch 8/2000
Epoch 9/2000
Epoch 10/2000
Epoch 11/2000
Epoch 12/2000
Epoch 13/2000


<keras.callbacks.callbacks.History at 0x7ff962e865f8>

In [215]:
from google.colab import drive
drive.mount('/content/gdrive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/gdrive


In [216]:
model.save("/content/gdrive/My Drive/best_so_far_with_shuffle_gayara.h5")

In [None]:
model.save(r"D:\AV_Rec_Sys\best_so_far_with_shuffle.h5")

In [234]:
es_callback = EarlyStopping(monitor="val_loss", patience=2, restore_best_weights = True)

In [238]:
opt = optimizers.Adam(lr=0.00001)
model.compile(loss='sparse_categorical_crossentropy', 
              optimizer=opt,
              metrics=['accuracy'])

In [239]:
model.fit(use_sequence_train, target_sequence_train,epochs=2000,batch_size=1024,
          validation_data=(use_sequence_valid, target_sequence_valid), 
         callbacks = [es_callback], shuffle=True)

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 594495 samples, validate on 10431 samples
Epoch 1/2000
Epoch 2/2000
Epoch 3/2000


<keras.callbacks.callbacks.History at 0x7ff9586a6f60>

In [240]:
model.save(r"D:\AV_Rec_Sys\best_so_far_with_shuffle_lr_gayara_1.h5")

In [None]:
es_callback = EarlyStopping(monitor="val_loss", patience=2, restore_best_weights = True)

In [None]:
# model.fit(use_sequence_train, one_hot_encoded_targets_array,epochs=20,batch_size=256,
#           validation_data=(use_sequence_valid, one_hot_encoded_targets_array_valid), 
#          callbacks = [es_callback], shuffle=True)

In [None]:
# model.save(r"D:\AV_Rec_Sys\best_so_far_with_shuffle_lr_best_weights.h5")

In [None]:
model = keras.models.load_model(r"D:\AV_Rec_Sys\best_so_far_with_shuffle_final_on_valid.h5")

In [None]:
# model.fit(use_sequence_valid, one_hot_encoded_targets_array_valid,epochs=2,batch_size=256, shuffle=True)

In [None]:
model.save(r"D:\AV_Rec_Sys\best_so_far_with_shuffle_final_on_valid.h5")

In [None]:
# model.fit(use_sequence_valid, one_hot_encoded_targets_array_valid,epochs=5,batch_size=256 )

In [None]:
model = keras.models.load_model(r"D:\AV_Rec_Sys\best_so_far_with_shuffle_lr.h5")

In [None]:
model = keras.models.load_model(r"D:\AV_Rec_Sys\best_so_far_with_shuffle_final_on_valid.h5")

In [None]:
pred_proba = model.predict(np.array(use_sequence_valid))

In [None]:
pred_proba

In [None]:
pred_proba.shape

In [None]:
preds = pred_proba.argsort(axis = 1)[:,-3:][:,::-1]

In [None]:
preds = [i.tolist() for i in preds]

In [None]:
mapk(target_sequence_valid, preds)

In [241]:
test

Unnamed: 0,user_sequence,user_id,challenge_sequence,challenge
0,4577_1,4577,1,374
1,4577_2,4577,2,451
2,4577_3,4577,3,1421
3,4577_4,4577,4,1419
4,4577_5,4577,5,233
...,...,...,...,...
397315,113838_6,113838,6,210
397316,113838_7,113838,7,655
397317,113838_8,113838,8,233
397318,113838_9,113838,9,1039


In [242]:
test["challenge"] = le.transform(test["challenge"])

  mask &= (ar1 != a)


ValueError: ignored

In [243]:
test.head()

Unnamed: 0,user_sequence,user_id,challenge_sequence,challenge
0,4577_1,4577,1,374
1,4577_2,4577,2,451
2,4577_3,4577,3,1421
3,4577_4,4577,4,1419
4,4577_5,4577,5,233


In [244]:
# prepare data 

catch = []
for i in tqdm(np.unique(test["user_id"])): 
    temp_data = test[test.user_id == i]
    # get the items he or she has used
    items_used = temp_data.challenge.tolist()
    catch.append({"user_id": i, "sequence_challenges": items_used})

HBox(children=(FloatProgress(value=0.0, max=39732.0), HTML(value='')))




In [245]:
test_sequence = pd.DataFrame(catch)

In [246]:
test_sequence.head()

Unnamed: 0,user_id,sequence_challenges
0,4577,"[374, 451, 1421, 1419, 233, 182, 1462, 1636, 2..."
1,4578,"[182, 374, 1421, 451, 493, 233, 1636, 1419, 14..."
2,4579,"[3421, 3422, 3423, 3424, 3425, 3426, 3427, 342..."
3,4583,"[182, 374, 493, 233, 367, 451, 1636, 300, 1039..."
4,4584,"[374, 493, 1636, 367, 233, 1421, 447, 2228, 16..."


In [247]:
use_sequence_test = [i[:10] for i in test_sequence["sequence_challenges"]]

In [248]:
np.array(use_sequence_test).shape

(39732, 10)

In [250]:
# test_sequence_temp.shape

In [None]:
# # write a function to use predictions to make further predictions

# preds = []

# for i,n in tqdm(enumerate(test_sequence.user_id)):
#     preds_user = []
#     test_sequence_temp = np.array(test_sequence[test_sequence.user_id == n]["sequence_challenges"].tolist())
# #     tensor_obj = np.expand_dims(np.array(test_sequence_temp),0)
#     first_pred_proba = model.predict(test_sequence_temp)
#     preds_test_1 = first_pred_proba.argsort(axis = 1)[:,-3:][:,::-1][0][0]
    
#     # make new sequence
#     new_sequence = [test_sequence_temp.tolist()[0][1:], [preds_test_1]]
#     new_sequence = [j for i in new_sequence for j in i]
#     new_sequence = np.expand_dims(np.array(new_sequence),0)
    
#     second_pred_proba = model.predict(new_sequence)
#     preds_test_2 = second_pred_proba.argsort(axis = 1)[:,-3:][:,::-1][0][0]
    
#     new_sequence = [test_sequence_temp.tolist()[0][2:], [preds_test_1, preds_test_2]]
#     new_sequence = [j for i in new_sequence for j in i]
#     new_sequence = np.expand_dims(np.array(new_sequence),0)
    
#     third_pred_proba = model.predict(new_sequence)
#     preds_test_3 = third_pred_proba.argsort(axis = 1)[:,-3:][:,::-1][0][0]
    
    
#     preds_user.append([preds_test_1, preds_test_2, preds_test_3])
    
# #     dict1 = {""}
    
#     preds.append(preds_user[0])

In [None]:
# np.array(preds)

In [251]:
pred_proba_test = model.predict(np.array(use_sequence_test))

In [252]:
preds_test = pred_proba_test.argsort(axis = 1)[:,-3:][:,::-1]

In [None]:
preds_test.shape

In [None]:
# preds_test = np.array(preds)

In [253]:
preds_names = [le.inverse_transform(i.tolist()).tolist() for i in preds_test]

In [None]:
preds_names

In [None]:
preds_names

In [254]:
preds_names_flat = [j for i in preds_names for j in i]

In [None]:
preds_names_flat

In [255]:
names1_catch = []
for i in test_sequence.user_id: 
    names1 = [str(i)+ "_"+ str(11 + j) for j in range(3)]
    names1_catch.extend(names1)

In [None]:
names1_catch


In [256]:
preds_own = pd.DataFrame({"user_sequence": names1_catch, "challenge": preds_names_flat})

In [258]:
preds_own.to_csv("/content/gdrive/My Drive/best_so_far_gayara_1695.csv", index = False)