In [None]:
# Copyright 2021 NVIDIA CORPORATION
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

In [1]:
import os
# Restrict this process to GPUs 0-3.
# NOTE(review): presumably this has to be set before TensorFlow / CuPy initialise CUDA,
# which would explain why it sits ahead of the remaining imports -- confirm.
os.environ["CUDA_VISIBLE_DEVICES"]="0,1,2,3" 

import matplotlib.pyplot as plt, time
import tensorflow as tf
import cudf, numpy as np, pandas as pd
# Display up to 100 columns when a DataFrame is rendered in the notebook.
pd.set_option('display.max_columns', 100)
import cupy, gc
from joblib import Parallel, delayed
from tqdm.notebook import tqdm
# Last expression of the cell: echoes the TensorFlow version as the cell output.
tf.__version__

'2.3.0'

In [2]:
# Distribution strategy used later to build the model (`with strategy.scope():`).
# The recorded cell output shows it picked up GPU:0 through GPU:3.
strategy = tf.distribute.MirroredStrategy()

INFO:tensorflow:Using MirroredStrategy with devices ('/job:localhost/replica:0/task:0/device:GPU:0', '/job:localhost/replica:0/task:0/device:GPU:1', '/job:localhost/replica:0/task:0/device:GPU:2', '/job:localhost/replica:0/task:0/device:GPU:3')


In [3]:
#tf.config.optimizer.set_experimental_options({"auto_mixed_precision": True})
#print('Mixed precision enabled')

In [4]:
# Version tag of this run; it is embedded in the artefacts written by this notebook:
# log_{VER}.txt, standardize_{VER}.csv, user_map_{VER} and the saved weights nn{VER}.h5.
VER = 86
# Selects which user group file (group{MODEL_NUM}.npy) is loaded and names the saved
# part ordering (part{MODEL_NUM}_c).
MODEL_NUM = 1

In [5]:
# Every column produced by the GIBA preprocessing step.
# The order of the original listing is preserved; entries are only regrouped visually.
ALL = [
    # tweet-level fields
    'hashtags', 'tweet_id', 'media', 'links', 'domains', 'tweet_type', 'language', 'timestamp',
    # a_* user columns
    'a_user_id', 'a_follower_count', 'a_following_count', 'a_is_verified', 'a_account_creation',
    # b_* user columns
    'b_user_id', 'b_follower_count', 'b_following_count', 'b_is_verified', 'b_account_creation',
    'b_follows_a',
    # engagement labels
    'reply', 'retweet', 'retweet_comment', 'like',
    # text column
    'text',
    # tw_len_* columns
    'tw_len_media', 'tw_len_photo', 'tw_len_video', 'tw_len_gif', 'tw_len_quest', 'tw_len_token',
    # tw_count_* / misc text statistics
    'tw_count_capital_words', 'tw_count_excl_quest_marks', 'tw_count_special1', 'tw_count_hash',
    'tw_last_quest', 'tw_len_retweet', 'tw_len_rt', 'tw_count_at', 'tw_count_words',
    'tw_count_char', 'tw_rt_count_words', 'tw_rt_count_char',
    # tw_original_* / tw_rt_* / tw_word* columns
    'tw_original_user0', 'tw_original_user1', 'tw_original_user2', 'tw_rt_user0',
    'tw_original_http0', 'tw_word0', 'tw_word1', 'tw_word2', 'tw_word3', 'tw_word4',
    'tw_tweet',
    # grouping and date parts
    'group', 'dt_day', 'dt_dow', 'dt_minute',
    # list-length columns
    'len_hashtags', 'len_links', 'len_domains',
]

# Features

In [6]:
# Raw numeric columns requested from the parquet parts, assembled from three groups.
# 'a_account_creation', 'b_account_creation', 'a_is_verified' and 'b_is_verified' were
# commented out of the original list and remain excluded here.
_USER_FEATS = [
    'b_follows_a',
    'a_follower_count', 'a_following_count',
    'b_follower_count', 'b_following_count',
]

_CONTEXT_FEATS = ['dt_dow', 'dt_minute', 'len_hashtags', 'len_links', 'len_domains']

_TEXT_FEATS = [
    'tw_len_media', 'tw_len_photo', 'tw_len_video', 'tw_len_gif',
    'tw_len_quest', 'tw_len_token',
    'tw_count_capital_words', 'tw_count_excl_quest_marks',
    'tw_count_special1', 'tw_count_hash', 'tw_last_quest',
    'tw_len_retweet', 'tw_len_rt', 'tw_count_at', 'tw_count_words', 'tw_count_char',
    'tw_rt_count_words', 'tw_rt_count_char',
]

FEATS = _USER_FEATS + _CONTEXT_FEATS + _TEXT_FEATS

# Second group of requested columns: user ids plus the tweet_type / language / media codes.
FEATS2 = ['a_user_id', 'b_user_id', 'tweet_type', 'language', 'media']

In [7]:
# The four engagement labels.
TARS = ['reply', 'retweet', 'retweet_comment', 'like']
# Columns read from every parquet part: labels first, then FEATS2, then FEATS.
COLS = [*TARS, *FEATS2, *FEATS]

# Build NN

In [8]:
EMB_SIZE = 96       # width of the user embedding and of the two tanh projection layers
EMB_SIZE2 = 96*2    # width of the tweet-token embedding
TOK_SIZE = 48       # number of token ids fed per tweet (second model input)

def build_model():
    """Build and compile the two-input engagement-prediction network.

    Reads the module-level names FEATURES, CAT_FEATS, NORM_FEATS and IDX at call time
    (they are defined in other cells), plus EMB_SIZE, EMB_SIZE2 and TOK_SIZE.

    Inputs of the returned model
        inp  : shape (len(FEATURES),). Columns follow the order of FEATURES: the
               categorical columns first (a_user_id, b_user_id, tweet_type, language,
               media, dt_dow), then the NORM_* columns. The negative slices below rely
               on the NORM_* columns being the trailing len(NORM_FEATS) columns.
        inp2 : shape (TOK_SIZE,). Token ids of the tweet.

    Returns
        A compiled tf.keras Model with a 4-unit sigmoid output, binary cross-entropy
        loss and an Adam optimizer (learning rate 1e-3).

    NOTE: the order in which layers are created is kept exactly as in the original
    code. Weights are restored from an HDF5 file elsewhere in this notebook, and that
    kind of restore is presumably sensitive to layer order -- do not reorder.
    """
    inp = tf.keras.layers.Input(shape=(len(FEATURES),))
    inp2 = tf.keras.layers.Input(shape=(TOK_SIZE,))

    embeds = []
    # Index 0 is shared by a_user_id and b_user_id; the +1 leaves room for id 0,
    # which the training cell assigns to users missing from IDX.
    embeds.append( tf.keras.layers.Embedding(len(IDX)+1,EMB_SIZE) ) # USER_ID
    
    embeds.append( tf.keras.layers.Embedding(3,2) ) # TWEET_TYPE
    embeds.append( tf.keras.layers.Embedding(66,10) ) # LANGUAGE
    embeds.append( tf.keras.layers.Embedding(14,4) ) # MEDIA
    embeds.append( tf.keras.layers.Embedding(7,4) ) # DT_DOW
       
    # USERS
    a_user = embeds[0](inp[:,0])
    b_user = embeds[0](inp[:,1])
    
    # USER INTERACTION
    # a-side vector = a_user embedding + the first two NORM_* columns
    # (NORM_LOG_a_follower_count, NORM_LOG_a_following_count) + the last column
    # (NORM_b_follows_a), projected to EMB_SIZE and compared with b_user.
    # normalize=True L2-normalises both operands, i.e. the Dot output is a cosine similarity.
    a_embed = tf.keras.layers.Concatenate()([a_user,inp[:,-len(NORM_FEATS):-len(NORM_FEATS)+2],inp[:,-1:]])
    a_embed = tf.keras.layers.Dense(EMB_SIZE,activation='tanh')(a_embed)
    a_dot_b = tf.keras.layers.Dot(axes=-1,normalize=True)([a_embed,b_user])
        
    # CAT FEATURE EMBEDDINGS
    # Columns 2..len(CAT_FEATS)-1 (tweet_type, language, media, dt_dow) use embeds[1..4].
    embeds2 = []    
    for k in range(2,len(CAT_FEATS)):
        embeds2.append( embeds[k-1](inp[:,k]) )
    x1 = tf.keras.layers.Concatenate()(embeds2)
        
    # TWEET TOKEN EMBEDDINGS
    # One shared embedding applied per token position, then averaged over positions.
    # NOTE(review): 119548 is presumably the tokenizer vocabulary size (+1) -- confirm.
    embeds3 = []
    word_emb = tf.keras.layers.Embedding(119548,EMB_SIZE2)
    for k in range(TOK_SIZE):
        embeds3.append( word_emb(inp2[:,k]) )
    x2 = tf.keras.layers.Average()(embeds3)
    
    # USER INTERACT WITH TWEET
    # Tweet vector = categorical embeddings + averaged token embedding + every NORM_*
    # column except the last three (the two b_* counts and b_follows_a).
    tweet_embed = tf.keras.layers.Concatenate()([x1,x2,inp[:,-len(NORM_FEATS):-3]])
    tweet_embed = tf.keras.layers.Dense(EMB_SIZE,activation='tanh')(tweet_embed)
    b_dot_tweet = tf.keras.layers.Dot(axes=-1,normalize=True)([tweet_embed,b_user])

    # NUMERICAL FEATURES
    x = tf.keras.layers.Concatenate()(
        [a_user,b_user,a_dot_b,b_dot_tweet,x1,x2,inp[:,-len(NORM_FEATS):]])
    
    HIDDEN_SIZE = 256+64
    LAYERS = 3
    
    # MLP trunk: Dense -> BatchNorm -> ReLU, repeated LAYERS times.
    for k in range(LAYERS):
        x = tf.keras.layers.Dense(HIDDEN_SIZE)(x)
        x = tf.keras.layers.BatchNormalization()(x)
        x = tf.keras.layers.Activation('relu')(x)
        
    # CONCAT WITH MATRIX FACTORIZATION
    # The two similarity scores are fed again, directly into the output layer.
    x = tf.keras.layers.Concatenate()([a_dot_b,b_dot_tweet,x])
    
    # One sigmoid unit per target. dtype='float32' is set explicitly on this layer;
    # presumably to keep the output in full precision if mixed precision is enabled.
    x = tf.keras.layers.Dense(4,activation='sigmoid',dtype='float32')(x)
    model = tf.keras.models.Model(inputs=[inp,inp2],outputs=x)
    
    # `learning_rate` is the documented argument name; `lr` is only a deprecated alias.
    opt = tf.keras.optimizers.Adam(learning_rate=1e-3)
    loss = tf.keras.losses.BinaryCrossentropy()
    model.compile(loss=loss, optimizer = opt)
    return model

In [9]:
# Model input columns. build_model() indexes them by position: the categorical
# columns come first, the NORM_* columns occupy the tail.
_CATEGORICAL_COLS = ['a_user_id', 'b_user_id', 'tweet_type', 'language', 'media', 'dt_dow']

_NORMALISED_COLS = [
    'NORM_LOG_a_follower_count', 'NORM_LOG_a_following_count',
    'NORM_dt_minute', 'NORM_len_hashtags', 'NORM_len_links', 'NORM_len_domains',
    'NORM_tw_len_media', 'NORM_tw_len_photo', 'NORM_tw_len_video', 'NORM_tw_len_gif',
    'NORM_tw_len_quest', 'NORM_tw_len_token',
    'NORM_tw_count_capital_words', 'NORM_tw_count_excl_quest_marks',
    'NORM_tw_count_special1', 'NORM_tw_count_hash', 'NORM_tw_last_quest',
    'NORM_tw_len_retweet', 'NORM_tw_len_rt', 'NORM_tw_count_at',
    'NORM_tw_count_words', 'NORM_tw_count_char',
    'NORM_tw_rt_count_words', 'NORM_tw_rt_count_char',
    'NORM_LOG_b_follower_count', 'NORM_LOG_b_following_count',
    'NORM_b_follows_a',
]

FEATURES = _CATEGORICAL_COLS + _NORMALISED_COLS

# Split by the 'NORM' marker in the column name.
CAT_FEATS = [col for col in FEATURES if 'NORM' not in col]
NORM_FEATS = [col for col in FEATURES if 'NORM' in col]
# Array loaded from the group file selected by MODEL_NUM. build_model() uses
# len(IDX)+1 as the size of the user embedding table, and the training cell maps
# user ids found in IDX to 1..len(IDX) (0 for anything not in IDX).
IDX = np.load(f'../jun-5-2021-RECSYS/group{MODEL_NUM}.npy')

In [10]:
# Build (and compile) the model inside the distribution strategy's scope.
with strategy.scope():
    model = build_model()
    
# Restore weights from an existing checkpoint before training continues.
# NOTE(review): the checkpoint number is hard-coded to 85 while VER is 86 -- presumably
# this warm-starts from the previous version's weights; confirm before changing VER.
model.load_weights('nn%i.h5'%85)

INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Redu

In [11]:
# Number of part files to iterate over; the values 0..NUM_PARTS-1 are later
# formatted into 'part-%.5i' file names by the training cell.
NUM_PARTS = 218
# Fixed seed so that Restart & Run All reproduces the same part order.
# Change it deliberately to obtain a different order.
PARTS_SEED = 42

# Shuffle with a local generator so the global NumPy random state is left untouched.
parts2 = np.arange(NUM_PARTS)
np.random.RandomState(PARTS_SEED).shuffle(parts2)
print(len(parts2))
parts2

218


array([ 19, 144, 177, 130,  26, 153, 195,  56, 207, 192, 137,  48, 173,
        59, 122, 187,  85,  27,  73, 108,  81,  89, 151, 157,  58,  36,
        10, 141, 119,   9,  13, 154,  71, 135, 194,  80,  21, 212,  50,
        70, 203, 176, 156, 148, 186,   0,  47, 160, 181, 106, 172,  97,
        11, 103,  88,  15, 149,  45, 158,  46,  52, 124,   8, 120, 175,
       128,  42, 188,   5,  67, 216,   3, 121,  96, 113, 184, 211, 166,
       115,  83, 155, 146, 109,   4, 214, 197,  93, 183,  72, 164, 178,
        91,  34, 116, 170,  12,  51,  17, 101, 190,  60, 127,  39, 145,
        53, 105,  22, 213,  62,  75, 165,  33, 138,  20,  84, 204, 210,
       111, 107, 174,  69, 191, 100,  99, 112, 133, 206,  61,  44, 123,
         6,  55,  86, 179, 152,  63, 202, 196, 147,  76, 117,  23,  78,
         2, 205, 150,  57, 132,  54,  32, 143, 168, 215,  25, 198,  31,
        30, 102,  74, 134,  14,  68,  64,  98,   1,  40, 129,  79, 199,
        37,  43, 180, 104, 189, 169,  28, 136,  95, 209, 118,  1

In [12]:
# Persist the shuffled part order used for this run, keyed by MODEL_NUM
# (np.save appends the '.npy' extension to the given name).
np.save(f'part{MODEL_NUM}_c',parts2)

# Train NN

In [None]:
# ---------------------------------------------------------------------------
# Setup that is identical for every chunk. It is done once here instead of
# being repeated inside the loop (file reads and the user-id dict are costly).
# ---------------------------------------------------------------------------
PARTS_PER_CHUNK = 15                               # part files loaded per model.fit call
NUM_CHUNKS = -(-len(parts2) // PARTS_PER_CHUNK)    # ceiling division (15 for 218 parts)
PARQUET_DIR = '/raid/RecSys/recsys2021/parquet7/'
TOKEN_DIR = '/raid/RecSys/recsys2021/tweet_tokens/'
LOG_PATH = f'log_{VER}.txt'

TARGETS = ['reply','retweet','retweet_comment','like']

# Count columns that are replaced by log1p(count) under the name 'LOG_<col>'.
LOG_FEATS = ['a_follower_count','a_following_count','b_follower_count','b_following_count']

# Columns that are standardised into 'NORM_<col>'. The order matters: build_model()
# slices the normalised block by position (a_* counts first, b_* columns last).
NORM_FEATS = ['LOG_a_follower_count','LOG_a_following_count',
              'dt_minute','len_hashtags','len_links','len_domains']

NORM_FEATS += ['tw_len_media', 'tw_len_photo','tw_len_video', 'tw_len_gif', 
      'tw_len_quest', 'tw_len_token',
      'tw_count_capital_words', 'tw_count_excl_quest_marks',
      'tw_count_special1', 'tw_count_hash', 'tw_last_quest', 
      'tw_len_retweet', 'tw_len_rt', 'tw_count_at', 'tw_count_words', 'tw_count_char',
      'tw_rt_count_words', 'tw_rt_count_char']

NORM_FEATS += ['LOG_b_follower_count','LOG_b_following_count','b_follows_a']

CAT_FEATS = ['a_user_id','b_user_id','tweet_type','language','media','dt_dow']
NUM_FEATS2 = ['NORM_'+feat for feat in NORM_FEATS]
FEATURES = CAT_FEATS + NUM_FEATS2

# Per-feature mean / std used for standardisation; a copy is stored next to this
# run's other artefacts.
norm_stats = pd.read_csv('../jun-5-2021-RECSYS/standardize_ALL.csv').set_index('feature')
norm_stats.reset_index().to_csv('standardize_%i.csv'%VER,index=False)

# Known users: ids listed in IDX are mapped to 1..len(IDX); everything else becomes 0.
IDX = np.load(f'../jun-5-2021-RECSYS/group{MODEL_NUM}.npy')
print('users',len(IDX))
print( IDX[:5] )
user_map = {x:y for x,y in zip(IDX,1+np.arange(len(IDX)))}
np.save('user_map_%i'%VER,IDX)

# ---------------------------------------------------------------------------
# One epoch over all parts, PARTS_PER_CHUNK files at a time.
# ---------------------------------------------------------------------------
for PART_NUM in range(NUM_CHUNKS):
    print('#'*25)
    print('### PART',PART_NUM)
    print('#'*25)
    
    # `with` guarantees the log file is closed even if a later step raises.
    with open(LOG_PATH,'a') as log_file:
        log_file.write('#############################\n')
        log_file.write(f'### PART {PART_NUM}\n')

    # Part ids of this chunk; the final chunk may hold fewer than PARTS_PER_CHUNK.
    chunk_parts = parts2[PART_NUM*PARTS_PER_CHUNK:(PART_NUM+1)*PARTS_PER_CHUNK]

    # LOAD TRAIN FEATURES
    frames = []
    for part_id in chunk_parts:
        frame = pd.read_parquet(PARQUET_DIR+'part-%.5i.parquet'%part_id,columns=COLS)
        frames.append(frame)
        print( frame.shape,', ',end='')
    
    train = pd.concat(frames,axis=0,ignore_index=True)
    del frame, frames
    gc.collect()
    print( train.shape ,', ',end='')

    # LOAD TRAIN TOKENS (same parts, same order; only the first TOK_SIZE ids are kept)
    token_blocks = []
    for part_id in chunk_parts:
        block = np.load(TOKEN_DIR+'part-%.5i.npy'%part_id)[:,:TOK_SIZE]
        token_blocks.append(block)
        print( block.shape ,', ',end='')
    
    train_tokens = np.concatenate(token_blocks,axis=0)
    del block, token_blocks
    gc.collect()
    print( train_tokens.shape ,', ',end='')

    # Both inputs are matched row by row, so their lengths must agree.
    assert len(train) == len(train_tokens), 'feature rows and token rows are misaligned'

    # LOG PROCESS: replace each raw count by float32 log1p(count)
    for feat in LOG_FEATS:
        name = 'LOG_'+feat
        print(name,', ',end='')
        train[name] = np.log1p( train[feat].values, dtype='float32' )
        del train[feat]
    gc.collect()

    # NORM PROCESS: (x - mean) / std with the stored statistics; the source column is dropped
    for feat in NORM_FEATS:
        name = 'NORM_'+feat
        print(name,', ',end='')
        mn = norm_stats.loc[feat,'mean']
        st = norm_stats.loc[feat,'std']
        train[name] = ((train[feat].values - mn) /st).astype('float32')
        del train[feat]
    
    gc.collect()

    # USER PROCESS: ids not present in user_map fall back to 0
    train['a_user_id'] = train.a_user_id.map(user_map).fillna(0).astype('int32')
    train['b_user_id'] = train.b_user_id.map(user_map).fillna(0).astype('int32')

    # A single pass over this chunk (no validation data is supplied).
    hh = model.fit([train[FEATURES],train_tokens],train[TARGETS],
          epochs=1, verbose=1, batch_size=1024*8)
    
    lss = hh.history['loss'][0]
    with open(LOG_PATH,'a') as log_file:
        log_file.write(f'### LOSS {lss}\n')
        log_file.write('#############################\n')
    
    # Release the chunk before the next one is loaded.
    del train, train_tokens
    gc.collect()

#########################
### PART 0
#########################
(3256402, 37) , (2990814, 37) , (2992279, 37) , (2991089, 37) , (3242655, 37) , (2990724, 37) , (2991761, 37) , (3178602, 37) , (2874007, 37) , (2989905, 37) , (2991578, 37) , (3199173, 37) , (2991379, 37) , (3167030, 37) , (2991787, 37) , (45839185, 37) , (3256402, 48) , (2990814, 48) , (2992279, 48) , (2991089, 48) , (3242655, 48) , (2990724, 48) , (2991761, 48) , (3178602, 48) , (2874007, 48) , (2989905, 48) , (2991578, 48) , (3199173, 48) , (2991379, 48) , (3167030, 48) , (2991787, 48) , (45839185, 48) , LOG_a_follower_count , LOG_a_following_count , LOG_b_follower_count , LOG_b_following_count , NORM_LOG_a_follower_count , NORM_LOG_a_following_count , NORM_dt_minute , NORM_len_hashtags , NORM_len_links , NORM_len_domains , NORM_tw_len_media , NORM_tw_len_photo , NORM_tw_len_video , NORM_tw_len_gif , NORM_tw_len_quest , NORM_tw_len_token , NORM_tw_count_capital_words , NORM_tw_count_excl_quest_marks , NORM_tw_count_spec

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





In [None]:
# Write the trained weights to nn{VER}.h5, the checkpoint for this version of the run.
model.save_weights('nn%i.h5'%VER)