In [1]:
#import riiideducation
#import dask.dataframe as dd
import  pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from tqdm import tqdm
from sklearn.preprocessing import RobustScaler
from sklearn.metrics import explained_variance_score

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

import warnings
import gc
warnings.filterwarnings('ignore')
DEVICE = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [2]:
dir_path = 'riiid_data/'
file_train = 'train.csv'
file_questions = 'questions.csv'
file_lecture = 'lectures.csv'
nrows =  100*1000

In [3]:
train_df = pd.read_csv(
                    dir_path + file_train, 
                    nrows=nrows, 
                    usecols=['row_id', 'timestamp', 'user_id', 'content_id', 
                             'content_type_id', 'task_container_id', 'answered_correctly',
                            'prior_question_elapsed_time','prior_question_had_explanation'],
                    dtype={
                            'row_id': 'int64',
                            'timestamp': 'int64',
                            'user_id': 'int32',
                            'content_id': 'int16',
                            'content_type_id': 'int8',
                            'task_container_id': 'int8',
                            'answered_correctly': 'int8',
                            'prior_question_elapsed_time': 'float32',
                            'prior_question_had_explanation': 'str'
                        }
                   )
#train = train[train.content_type_id == False]
train_df = train_df.sort_values(['timestamp'], ascending=True)
#train_df.drop(['timestamp'], axis=1,   inplace=True)


questions_df = pd.read_csv(
                        dir_path + file_questions, 
                        nrows=nrows,
                        usecols=['question_id','bundle_id','part'], 
                        dtype={
                            'question_id': 'int16',
                            'bundle_id': 'int8',
                            'part': 'int8',
                       }
                    )

lectures_df = pd.read_csv(dir_path + file_lecture)
train_df['prior_question_had_explanation'] = train_df['prior_question_had_explanation'].fillna('False').map({"True":True,"False":False})

In [4]:
# extract lecture

lectures_df['type_of'] = lectures_df['type_of'].replace('solving question', 'solving_question')

lectures_df = pd.get_dummies(lectures_df, columns=['part', 'type_of'])

part_lectures_columns = [column for column in lectures_df.columns if column.startswith('part')]

types_of_lectures_columns = [column for column in lectures_df.columns if column.startswith('type_of_')]

train_lectures = train_df[train_df.content_type_id == True].merge(lectures_df, left_on='content_id', right_on='lecture_id', how='left')

user_lecture_stats_part = train_lectures.groupby('user_id')[part_lectures_columns + types_of_lectures_columns].sum()

# add boolean features
for column in user_lecture_stats_part.columns:
    bool_column = column + '_boolean'
    user_lecture_stats_part[bool_column] = (user_lecture_stats_part[column] > 0).astype(int)
    
del(train_lectures)
train_df = train_df[train_df.content_type_id == False].sort_values('timestamp').reset_index(drop = True)
elapsed_mean = train_df.prior_question_elapsed_time.mean()

In [5]:
# extract train

group1 = train_df.loc[(train_df.content_type_id == False), ['task_container_id', 'user_id']].groupby(['task_container_id']).agg(['count'])
group1.columns = ['avg_questions']

group2 = train_df.loc[(train_df.content_type_id == False), ['task_container_id', 'user_id']].groupby(['task_container_id']).agg(['nunique'])
group2.columns = ['avg_questions']

group3 = group1 / group2
group3['avg_questions_seen'] = group3.avg_questions.cumsum()

results_u_final = train_df.loc[train_df.content_type_id == False, ['user_id','answered_correctly']].groupby(['user_id']).agg(['mean'])
results_u_final.columns = ['answered_correctly_user']

results_u2_final = train_df.loc[train_df.content_type_id == False, ['user_id','prior_question_had_explanation']].groupby(['user_id']).agg(['mean'])
results_u2_final.columns = ['explanation_mean_user']

In [6]:
# extract question
train_df = pd.merge(train_df, questions_df, left_on = 'content_id', right_on = 'question_id', how = 'left')

results_q_final = train_df.loc[train_df.content_type_id == False, ['question_id','answered_correctly']].groupby(['question_id']).agg(['mean'])
results_q_final.columns = ['quest_pct']

results_q2_final = train_df.loc[train_df.content_type_id == False, ['question_id','part']].groupby(['question_id']).agg(['count'])
results_q2_final.columns = ['count']

question2 = pd.merge(questions_df, results_q_final, left_on = 'question_id', right_on = 'question_id', how = 'left')

question2 = pd.merge(question2, results_q2_final, left_on = 'question_id', right_on = 'question_id', how = 'left')

question2.quest_pct = round(question2.quest_pct,5)

prior_mean_user = results_u2_final.explanation_mean_user.mean()

train_df.drop(['timestamp', 'part', 'question_id', 'part', 'bundle_id'], axis=1, inplace=True)

In [7]:
# extract valid
'''
validation = train_df.groupby('user_id').tail(5)
train = train_df[~train_df.index.isin(validation.index)]


results_u_val = train_df[['user_id','answered_correctly']].groupby(['user_id']).agg(['mean'])
results_u_val.columns = ['answered_correctly_user']

results_u2_val = train_df[['user_id','prior_question_had_explanation']].groupby(['user_id']).agg(['mean'])
results_u2_val.columns = ['explanation_mean_user']

validation = pd.merge(validation, group3, left_on=['task_container_id'], right_index= True, how="left")
validation = pd.merge(validation, results_u_val, on=['user_id'], how="left")
validation = pd.merge(validation, results_u2_val, on=['user_id'], how="left")

validation = pd.merge(validation, user_lecture_stats_part, on=['user_id'], how="left")
'''

'\nvalidation = train_df.groupby(\'user_id\').tail(5)\ntrain = train_df[~train_df.index.isin(validation.index)]\n\n\nresults_u_val = train_df[[\'user_id\',\'answered_correctly\']].groupby([\'user_id\']).agg([\'mean\'])\nresults_u_val.columns = [\'answered_correctly_user\']\n\nresults_u2_val = train_df[[\'user_id\',\'prior_question_had_explanation\']].groupby([\'user_id\']).agg([\'mean\'])\nresults_u2_val.columns = [\'explanation_mean_user\']\n\nvalidation = pd.merge(validation, group3, left_on=[\'task_container_id\'], right_index= True, how="left")\nvalidation = pd.merge(validation, results_u_val, on=[\'user_id\'], how="left")\nvalidation = pd.merge(validation, results_u2_val, on=[\'user_id\'], how="left")\n\nvalidation = pd.merge(validation, user_lecture_stats_part, on=[\'user_id\'], how="left")\n'

In [8]:
# extract train
X = train_df.iloc[:,:]
#train_df = train_df[~train_df.index.isin(X.index)]

results_u_X = train_df[['user_id','answered_correctly']].groupby(['user_id']).agg(['mean'])
results_u_X.columns = ['answered_correctly_user']

results_u2_X = train_df[['user_id','prior_question_had_explanation']].groupby(['user_id']).agg(['mean'])
results_u2_X.columns = ['explanation_mean_user']

X = pd.merge(X, group3, left_on=['task_container_id'], right_index= True, how="left")
X = pd.merge(X, results_u_X, on=['user_id'], how="left")
X = pd.merge(X, results_u2_X, on=['user_id'], how="left")
X = pd.merge(X, user_lecture_stats_part, on=['user_id'], how="left")

In [9]:
content_mean = question2.quest_pct.mean()
question2.quest_pct.mean()
#there are a lot of high percentage questions, should use median instead?

#filling questions with no info with a new value
question2.quest_pct = question2.quest_pct.mask((question2['count'] < 3), .65)


#filling very hard new questions with a more reasonable value
question2.quest_pct = question2.quest_pct.mask((question2.quest_pct < .2) & (question2['count'] < 21), .2)

#filling very easy new questions with a more reasonable value
question2.quest_pct = question2.quest_pct.mask((question2.quest_pct > .95) & (question2['count'] < 21), .95)

X = pd.merge(X, question2, left_on = 'content_id', right_on = 'question_id', how = 'left')

# validation = pd.merge(validation, question2, left_on = 'content_id', right_on = 'question_id', how = 'left')
X.part = X.part - 1
# validation.part = validation.part - 1

In [10]:
y = X['answered_correctly']
X = X.drop(['answered_correctly'], axis=1)
#y_val = validation['answered_correctly']
#X_val = validation.drop(['answered_correctly'], axis=1)

# Filling with 0.5 for simplicity; there could likely be a better value
X['answered_correctly_user'].fillna(0.65,  inplace=True)
X['explanation_mean_user'].fillna(prior_mean_user,  inplace=True)
X['quest_pct'].fillna(content_mean, inplace=True)

X['part'].fillna(4, inplace = True)
X['avg_questions_seen'].fillna(1, inplace = True)
X['prior_question_elapsed_time'].fillna(elapsed_mean, inplace = True)
#X['prior_question_had_explanation_enc'].fillna(0, inplace = True)

X['part_1'].fillna(0, inplace = True)
X['part_2'].fillna(0, inplace = True)
X['part_3'].fillna(0, inplace = True)
X['part_4'].fillna(0, inplace = True)
X['part_5'].fillna(0, inplace = True)
X['part_6'].fillna(0, inplace = True)
X['part_7'].fillna(0, inplace = True)
X['type_of_concept'].fillna(0, inplace = True)
X['type_of_intention'].fillna(0, inplace = True)
X['type_of_solving_question'].fillna(0, inplace = True)
X['type_of_starter'].fillna(0, inplace = True)
X['part_1_boolean'].fillna(0, inplace = True)
X['part_2_boolean'].fillna(0, inplace = True)
X['part_3_boolean'].fillna(0, inplace = True)
X['part_4_boolean'].fillna(0, inplace = True)
X['part_5_boolean'].fillna(0, inplace = True)
X['part_6_boolean'].fillna(0, inplace = True)
X['part_7_boolean'].fillna(0, inplace = True)
X['type_of_concept_boolean'].fillna(0, inplace = True)
X['type_of_intention_boolean'].fillna(0, inplace = True)
X['type_of_solving_question_boolean'].fillna(0, inplace = True)
X['type_of_starter_boolean'].fillna(0, inplace = True)

In [11]:
X.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 98182 entries, 0 to 98181
Data columns (total 38 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   row_id                            98182 non-null  int64  
 1   user_id                           98182 non-null  int32  
 2   content_id                        98182 non-null  int16  
 3   content_type_id                   98182 non-null  int8   
 4   task_container_id                 98182 non-null  int8   
 5   prior_question_elapsed_time       98182 non-null  float32
 6   prior_question_had_explanation    98182 non-null  bool   
 7   avg_questions                     98182 non-null  float64
 8   avg_questions_seen                98182 non-null  float64
 9   answered_correctly_user           98182 non-null  float64
 10  explanation_mean_user             98182 non-null  float64
 11  part_1                            98182 non-null  float64
 12  part

In [12]:
cat_columns = ['prior_question_had_explanation','bundle_id','user_id', 'content_id', 'task_container_id']

cont_columns = ['answered_correctly_user', 'explanation_mean_user', 'quest_pct', 'avg_questions_seen','prior_question_elapsed_time', 'part',
                'part_1', 'part_2', 'part_3', 'part_4', 'part_5', 'part_6', 'part_7',
               'type_of_concept', 'type_of_intention', 'type_of_solving_question', 'type_of_starter',
               'part_1_boolean', 'part_2_boolean', 'part_3_boolean', 'part_4_boolean', 'part_5_boolean', 'part_6_boolean', 'part_7_boolean',
               'type_of_concept_boolean', 'type_of_intention_boolean', 'type_of_solving_question_boolean', 'type_of_starter_boolean']

#cont_columns = ['answered_correctly_user', 'explanation_mean_user', 'quest_pct', 'avg_questions_seen','prior_question_elapsed_time', ]

features=cat_columns+cont_columns

In [13]:
def encode(df,cols):
    enc =  {}
    for col in cols:
        print(col)
        lbencoder = LabelEncoder()
        lb = lbencoder.fit(df[col].values)
        df[col]=lb.transform(df[col].values)
        enc[col]=lb
        
    return df,enc

X,enc_dict = encode(X,cat_columns)

scale_dict={}
fix_missing={}


for col in cont_columns:
    scaler = RobustScaler()
    scale_dict[col]=scaler.fit(X[col].values.reshape(-1,1))
    X[col] = scale_dict[col].transform(X[col].values.reshape(-1,1))
    fix_missing[col] = X[col].mode()


prior_question_had_explanation
bundle_id
user_id
content_id
task_container_id


In [14]:
cat_dims = [X[col].nunique() for col in cat_columns]
cat_embs = [(dim, min(50,(dim+1)//2)) for dim in cat_dims]
cat_embs

[(2, 1), (256, 50), (349, 50), (11321, 50), (256, 50)]

In [15]:
class RidDataset(Dataset):
    def __init__(self, df,targets,cat_features,cont_features,mode='train'):
        self.mode = mode
        self.data_cont = df[cont_features].values
        self.data_cat = df[cat_features].values
        if mode=='train':
            self.targets = targets.values 
        print(self.data_cont.shape)
        print(self.data_cat.shape)
        print(self.targets.shape)
    def __len__(self):
        return len(self.data_cont)
    
    def __getitem__(self, idx):
        if self.mode == 'train':
            sample = {'data_cont': self.data_cont[idx], 'data_cat':self.data_cat[idx], 'target' : self.targets[idx]}
            return sample
        elif self.mode == 'test':
            sample = {'data_cont': self.data_cont[idx], 'data_cat':self.data_cat[idx], 'target' : 0}
            #return torch.FloatTensor(self.data_cont[idx]), torch.LongTensor(self.data_cat[idx]), 0
            return sample

In [16]:
def collater(data):
    data_conts = [d_i['data_cont'] for d_i in data]
    
    data_cats = [d_i['data_cat'] for d_i in data]
    
    targets = [d_i['target'] for d_i in data]
    
    data_conts = np.stack(data_conts, axis=0)
    
    data_cats  =np.stack(data_cats, axis=0)
    
    targets  = np.stack(targets, axis=0)
    
    return {'data_cont' : torch.FloatTensor(data_conts), 'data_cat' : torch.LongTensor(data_cats), 'target' :  torch.FloatTensor(targets)} 

In [17]:
# model
'''
class RidModel(nn.Module):
    def __init__(self,emb_dims,no_of_cont):
        super(RidModel, self).__init__()
        
        self.emb = nn.ModuleList([nn.Embedding(x,y) for x,y in emb_dims])
        
        no_of_embs = sum([y for x, y in emb_dims])
        self.no_of_embs = no_of_embs
        self.no_of_cont = no_of_cont
        
        
        self.batch_norm1 = nn.BatchNorm1d(self.no_of_cont)
        self.dropout1 = nn.Dropout(0.2)
        self.dense1 = nn.utils.weight_norm(nn.Linear(no_of_cont, 128))
        
        self.batch_norm2 = nn.BatchNorm1d(128+no_of_embs)
        self.dense2 = nn.utils.weight_norm(nn.Linear(128+no_of_embs, 32))
         
        self.batch_norm3 = nn.BatchNorm1d(32)
        self.dense3 = nn.utils.weight_norm(nn.Linear(32, 16))
        
        self.batch_norm4 = nn.BatchNorm1d(16)
        self.dense4 = nn.utils.weight_norm(nn.Linear(16, 1))
        
       
    def forward(self,cont,cat):
         
        ## cat data part
        x_cat = [emb_layer(cat[:,i]) for i,emb_layer in enumerate(self.emb)]
        x_cat = torch.cat(x_cat,1)
        x_cat = self.dropout1(x_cat)
        ##cont data
        x = self.batch_norm1(cont)
        x = self.dropout1(x)
        x = F.relu(self.dense1(x))
        
        ##concat
        x = torch.cat([x,x_cat],1)
        
        ##rest of NN
        x = self.batch_norm2(x)
        x = F.relu(self.dense2(x))
        
        x = self.batch_norm3(x)
        x = F.relu(self.dense3(x))
        
        
        x = self.batch_norm4(x)
        x = F.sigmoid(self.dense4(x))
        
        return x
'''

'\nclass RidModel(nn.Module):\n    def __init__(self,emb_dims,no_of_cont):\n        super(RidModel, self).__init__()\n        \n        self.emb = nn.ModuleList([nn.Embedding(x,y) for x,y in emb_dims])\n        \n        no_of_embs = sum([y for x, y in emb_dims])\n        self.no_of_embs = no_of_embs\n        self.no_of_cont = no_of_cont\n        \n        \n        self.batch_norm1 = nn.BatchNorm1d(self.no_of_cont)\n        self.dropout1 = nn.Dropout(0.2)\n        self.dense1 = nn.utils.weight_norm(nn.Linear(no_of_cont, 128))\n        \n        self.batch_norm2 = nn.BatchNorm1d(128+no_of_embs)\n        self.dense2 = nn.utils.weight_norm(nn.Linear(128+no_of_embs, 32))\n         \n        self.batch_norm3 = nn.BatchNorm1d(32)\n        self.dense3 = nn.utils.weight_norm(nn.Linear(32, 16))\n        \n        self.batch_norm4 = nn.BatchNorm1d(16)\n        self.dense4 = nn.utils.weight_norm(nn.Linear(16, 1))\n        \n       \n    def forward(self,cont,cat):\n         \n        ## cat data

In [18]:
#X_train,X_valid,y_train,y_valid = train_test_split(X[features],y,test_size=0.15)

In [19]:
#del X,y,train_df
#gc.collect()

In [20]:
#display(X_train, y_train)

In [21]:
# train
'''
nepochs=50
train_set = RidDataset(X_train,y_train,cat_columns,cont_columns,mode="train")
valid_set = RidDataset(X_valid,y_valid,cat_columns,cont_columns,mode="train")
val_auc=[]
dataloaders = {'train':DataLoader(train_set,batch_size=2**10,shuffle=True,drop_last = True, collate_fn = collater),
              "val":DataLoader(valid_set,batch_size=2**10,shuffle=True, drop_last = True, collate_fn = collater)}

model = RidModel(cat_embs,len(cont_columns)).to(DEVICE)
checkpoint_path = 'rid_model.pt'
optimizer = optim.Adam(model.parameters())
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=5, eps=1e-4, verbose=True)
criterion = nn.BCELoss()
best_loss = {'train':np.inf,'val':np.inf}
auc_score = {'train':0,'val':0.0}

for epoch in range(nepochs):
    epoch_loss = {'train': 0.0, 'val': 0.0}

    for phase in ['train', 'val']:
        if phase == 'train':
            model.train()
        else:
            model.eval()

        running_loss = 0.0
        auc=0.0

        for i,data in enumerate(dataloaders[phase]):
            x, y, z = data['data_cont'].to(DEVICE), data['data_cat'].to(DEVICE),data['target'].to(DEVICE)
            optimizer.zero_grad()
            with torch.set_grad_enabled(phase=='train'):
                preds = model(x,y)
                loss = criterion(preds, z)
                try:
                    auc = roc_auc_score(z.detach().cpu().numpy(),preds.detach().cpu().numpy())
                except ValueError:
                    pass
                

                if phase=='train':
                    loss.backward()
                    optimizer.step()
                #print(auc)

            running_loss += loss.item() / len(dataloaders[phase])
            auc += auc/len(dataloaders[phase])

        epoch_loss[phase] = running_loss
        auc_score[phase]=auc

    print("Epoch {}/{}   - loss: {:5.5f}   - val_loss: {:5.5f} -- AUC {:5.4f} --val AUC {:5.4f}".format(epoch+1,
            nepochs, epoch_loss['train'], epoch_loss['val'],auc_score['train'],auc_score['val']))
    val_auc.append(auc_score['val'])
    scheduler.step(epoch_loss['val'])

    if epoch_loss['val'] < best_loss['val']:
        best_loss = epoch_loss
        torch.save(model.state_dict(), checkpoint_path)
'''


'\nnepochs=50\ntrain_set = RidDataset(X_train,y_train,cat_columns,cont_columns,mode="train")\nvalid_set = RidDataset(X_valid,y_valid,cat_columns,cont_columns,mode="train")\nval_auc=[]\ndataloaders = {\'train\':DataLoader(train_set,batch_size=2**10,shuffle=True,drop_last = True, collate_fn = collater),\n              "val":DataLoader(valid_set,batch_size=2**10,shuffle=True, drop_last = True, collate_fn = collater)}\n\nmodel = RidModel(cat_embs,len(cont_columns)).to(DEVICE)\ncheckpoint_path = \'rid_model.pt\'\noptimizer = optim.Adam(model.parameters())\nscheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode=\'min\', factor=0.1, patience=5, eps=1e-4, verbose=True)\ncriterion = nn.BCELoss()\nbest_loss = {\'train\':np.inf,\'val\':np.inf}\nauc_score = {\'train\':0,\'val\':0.0}\n\nfor epoch in range(nepochs):\n    epoch_loss = {\'train\': 0.0, \'val\': 0.0}\n\n    for phase in [\'train\', \'val\']:\n        if phase == \'train\':\n            model.train()\n        else:\n          

In [22]:
from sklearn.metrics import log_loss, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler

from deepctr_torch.inputs import SparseFeat, DenseFeat, get_feature_names
from deepctr_torch.models import *
cat_columns = ['prior_question_had_explanation','bundle_id','user_id', 'content_id', 'task_container_id']

cont_columns = ['answered_correctly_user', 'explanation_mean_user', 'quest_pct', 'avg_questions_seen','prior_question_elapsed_time', 'part',
                'part_1', 'part_2', 'part_3', 'part_4', 'part_5', 'part_6', 'part_7',
               'type_of_concept', 'type_of_intention', 'type_of_solving_question', 'type_of_starter',
               'part_1_boolean', 'part_2_boolean', 'part_3_boolean', 'part_4_boolean', 'part_5_boolean', 'part_6_boolean', 'part_7_boolean',
               'type_of_concept_boolean', 'type_of_intention_boolean', 'type_of_solving_question_boolean', 'type_of_starter_boolean']
#cont_columns = ['answered_correctly_user', 'explanation_mean_user', 'quest_pct', 'avg_questions_seen','prior_question_elapsed_time', ]


In [23]:
'''
def get_xy_fd():
    feature_columns = [SparseFeat('prior_question_had_explanation', X_train['prior_question_had_explanation'].nunique(), embedding_dim=8), 
                       SparseFeat('bundle_id', X_train['bundle_id'].nunique(), embedding_dim=8),
                       SparseFeat('user_id',  X_train['user_id'].nunique(), embedding_dim=8), 
                       SparseFeat('content_id', X_train['content_id'].nunique(), embedding_dim=8),
                       SparseFeat('task_container_id', X_train['task_container_id'].nunique(), embedding_dim=8),
                       DenseFeat('answered_correctly_user', 1),
                       DenseFeat('explanation_mean_user', 1),
                       DenseFeat('quest_pct', 1),
                       DenseFeat('avg_questions_seen', 1),
                       DenseFeat('prior_question_elapsed_time', 1),
                      ]
    behavior_feature_list = [ "bundle_id", 'user_id'] 
    #print(X_train['prior_question_had_explanation'].values)
    feature_dict = {'prior_question_had_explanation': X_train['prior_question_had_explanation'], 
                    'bundle_id': X_train['bundle_id'],
                    'user_id':  X_train['user_id'],
                    'content_id': X_train['content_id'],
                    'task_container_id': X_train['task_container_id'],
                    'answered_correctly_user': X_train['answered_correctly_user'],
                    'explanation_mean_user': X_train['explanation_mean_user'],
                    'quest_pct':X_train['quest_pct'],
                    'avg_questions_seen':X_train['avg_questions_seen'],
                     'prior_question_elapsed_time':X_train['prior_question_elapsed_time'],
                   }
    x = {name: feature_dict[name] for name in get_feature_names(feature_columns)}
    y = y_train.values
    return x, y, feature_columns,behavior_feature_list
x, y, feature_columns,behavior_feature_list = get_xy_fd()
'''


'\ndef get_xy_fd():\n    feature_columns = [SparseFeat(\'prior_question_had_explanation\', X_train[\'prior_question_had_explanation\'].nunique(), embedding_dim=8), \n                       SparseFeat(\'bundle_id\', X_train[\'bundle_id\'].nunique(), embedding_dim=8),\n                       SparseFeat(\'user_id\',  X_train[\'user_id\'].nunique(), embedding_dim=8), \n                       SparseFeat(\'content_id\', X_train[\'content_id\'].nunique(), embedding_dim=8),\n                       SparseFeat(\'task_container_id\', X_train[\'task_container_id\'].nunique(), embedding_dim=8),\n                       DenseFeat(\'answered_correctly_user\', 1),\n                       DenseFeat(\'explanation_mean_user\', 1),\n                       DenseFeat(\'quest_pct\', 1),\n                       DenseFeat(\'avg_questions_seen\', 1),\n                       DenseFeat(\'prior_question_elapsed_time\', 1),\n                      ]\n    behavior_feature_list = [ "bundle_id", \'user_id\'] \n    #prin

In [None]:
fixlen_feature_columns = [SparseFeat(feat, X[feat].nunique())
                          for feat in cat_columns] + [DenseFeat(feat, 1, )
                                                          for feat in cont_columns]
dnn_feature_columns = fixlen_feature_columns
linear_feature_columns = fixlen_feature_columns

feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns)

# 3.generate input data for model

X_train,X_valid,y_train,y_valid = train_test_split(X[features],y,test_size=0.15)
#train, test = train_test_split(X[features], test_size=0.2, random_state=2020)
train_model_input = {name: X_train[name] for name in feature_names}
test_model_input = {name: X_valid[name] for name in feature_names}

device = 'cpu'
use_cuda = True
if use_cuda and torch.cuda.is_available():
    print('cuda ready...')
    device = 'cuda:0'

model = DeepFM(linear_feature_columns=linear_feature_columns, dnn_feature_columns=dnn_feature_columns,
               task='binary',
               l2_reg_embedding=1e-5, device=device)

model.compile("adagrad", "binary_crossentropy",
              metrics=["binary_crossentropy", "auc"], )

model.fit(train_model_input, y_train.values, batch_size=32, epochs=10, verbose=2, validation_split=0.2)

pred_ans = model.predict(test_model_input, 256)
print("")
print("test LogLoss", round(log_loss(y_valid.values, pred_ans), 4))
print("test AUC", round(roc_auc_score(y_valid.values, pred_ans), 4))

cuda ready...
cuda:0
Train on 66763 samples, validate on 16691 samples, 2087 steps per epoch
Epoch 1/10
10s - loss:  0.4961 - binary_crossentropy:  0.4961 - auc:  0.7958 - val_binary_crossentropy:  0.4907 - val_auc:  0.7986
Epoch 2/10
10s - loss:  0.4853 - binary_crossentropy:  0.4853 - auc:  0.8039 - val_binary_crossentropy:  0.4896 - val_auc:  0.7983
Epoch 3/10
10s - loss:  0.4813 - binary_crossentropy:  0.4813 - auc:  0.8080 - val_binary_crossentropy:  0.4928 - val_auc:  0.7951
Epoch 4/10
10s - loss:  0.4757 - binary_crossentropy:  0.4758 - auc:  0.8103 - val_binary_crossentropy:  0.5029 - val_auc:  0.7872
Epoch 5/10
10s - loss:  0.4665 - binary_crossentropy:  0.4666 - auc:  0.8179 - val_binary_crossentropy:  0.5239 - val_auc:  0.7733
Epoch 6/10
10s - loss:  0.4546 - binary_crossentropy:  0.4545 - auc:  0.8271 - val_binary_crossentropy:  0.5505 - val_auc:  0.7602
Epoch 7/10
10s - loss:  0.4422 - binary_crossentropy:  0.4422 - auc:  0.8365 - val_binary_crossentropy:  0.5779 - val_auc