In [85]:
import pandas as pd

import random
import json

import os

In [86]:
core = 10
path_train_read = './KuaiRand/log_standard_4_08_to_4_21_1k.csv'
path_train = 'train_data.json'
path_all_data_pred = 'train_data_pred.json'
path_test_read = './KuaiRand/log_standard_4_22_to_5_08_1k.csv'
path_test = 'test_data.json'
path_validation = 'validation_data.json'

In [87]:
# train data func
def dataset_filtering(interaction, core):
    # filter the cold users and items within 10 interactions
    user_id_dic = {}  # record the number of interaction for each user and item
    item_id_dic = {}
    for(user_id, item_id, time_ms, click, like, follow, comment, forward, longview) in interaction:
        try: user_id_dic[user_id] += 1
        except: user_id_dic[user_id] = 1
        try: item_id_dic[item_id] += 1
        except: item_id_dic[item_id] = 1
    print('# Original training dataset')
    print('  User:', len(user_id_dic), 'Item:', len(item_id_dic), 'Interaction:', len(interaction), 'Sparsity:', 100 - len(interaction) * 100.0 / len(user_id_dic) / len(item_id_dic), '%')
    sort_user = []
    sort_item = []
    for user_id in user_id_dic:
        sort_user.append((user_id, user_id_dic[user_id]))
    for item_id in item_id_dic:
        sort_item.append((item_id, item_id_dic[item_id]))
    
    sort_user.sort(key=lambda x: x[1])
    sort_item.sort(key=lambda x: x[1])
    print('Fitering (core = ' + str(core) + ') ... ', end = 'number of remained interactions: ')
    
    while sort_user[0][1] < core or sort_item[0][1] < core:
        # find out all users and items with less than core recorders
        user_LessThanCore = set()
        item_LessThanCore = set()
        for pair in sort_user:
            if pair[1] < core: user_LessThanCore.add(pair[0])
            else: break
        for pair in sort_item:
            if pair[1] < core: item_LessThanCore.add(pair[0])
            else: break
        # reconstruct the interaction record, remove the cool one
        interaction_filtered = []
        for (user_id, item_id, time_ms, click, like, follow, comment, forward, longview) in interaction:
            if not (user_id in user_LessThanCore or item_id in item_LessThanCore):
                interaction_filtered.append((user_id, item_id, time_ms, click, like, follow, comment, forward, longview))
        # update the record
        interaction = interaction_filtered
        # count the number of each user and item in new data, check if all cool users and items are removed
        # reset all memory variables
        user_id_dic = {}  # record the number of interaction for each user and item
        item_id_dic = {}
        for (user_id, item_id, time_ms, click, like, follow, comment, forward, longview) in interaction:
            try: user_id_dic[user_id] += 1
            except: user_id_dic[user_id] = 1
            try: item_id_dic[item_id] += 1
            except: item_id_dic[item_id] = 1

        sort_user = []
        sort_item = []
        for user_id in user_id_dic:
            sort_user.append((user_id, user_id_dic[user_id]))
        for item_id in item_id_dic:
            sort_item.append((item_id, item_id_dic[item_id]))
        sort_user.sort(key=lambda x: x[1])
        sort_item.sort(key=lambda x: x[1])
        print (len(interaction), end = ' ')
    print()
    print ('# Filtered training dataset')
    print ('  User:', len(user_id_dic), 'Item:', len(item_id_dic), 'Interaction:', len(interaction), 'Sparsity:', 100 - len(interaction) * 100.0 / len(user_id_dic) / len(item_id_dic), '%')
    
    return interaction

In [88]:
# train data func
def index_encoding(interaction):
    # mapping id into number
    # after filtering the dataset, we need to re-encode the index of users and items
    user_id_set = set()
    item_id_set = set()

    for (user_id, item_id, time_ms, click, like, follow, comment, forward, longview) in interaction:
        user_id_set.add(user_id)
        item_id_set.add(item_id)
    user_num2id = list(user_id_set)
    item_num2id = list(item_id_set)
    user_num2id.sort()
    item_num2id.sort()
    # user_id2num maps id to number, and user_num2id dictionary is not needed, user_ID
    user_id2num = {}
    for num in range(0, len(user_id_set)):
        user_id2num[user_num2id[num]] = num
    item_id2num = {}
    for num in range(0, len(item_id_set)):
        item_id2num[item_num2id[num]] = num
    interaction_number = []
    for (user_id, item_id, time_ms, click, like, follow, comment, forward, longview) in interaction:
        interaction_number.append((user_id2num[user_id], item_id2num[item_id], time_ms, click, like, follow, comment, forward, longview))
    interaction = interaction_number
    return interaction, user_id2num, item_id2num

In [89]:
# train & validation & test func
def user_action_list_making(Interaction_train, user_id2num):
    user_num = len(user_id2num)
    user_action_list = []
    for i in range(user_num):
        user_action_list.append([]);
    for (user_id, item_id, time_ms, click, like, follow, comment, forward, longview) in Interaction_train:
        user_action_list[user_id].append((item_id, time_ms));
        
    print('user_action_list init success, waiting filter...')
    
    Interaction_train_with_action = []
    for (user_id, item_id, time_ms, click, like, follow, comment, forward, longview) in Interaction_train:
        history = user_action_list[user_id] # [(item_id, time_ms), ]
        history.sort(key=lambda x: x[1])     # time_stamp rank, small->big
        
        user_real_action = []
        for (p1, _) in history :
            if (p1 == item_id):
                break
            else:
                user_real_action.append(p1)
        
        Interaction_train_with_action.append((user_id, item_id, time_ms, click, like, follow, comment, forward, longview, user_real_action))
    return Interaction_train_with_action

In [90]:
# train & validation & test func
# click action_list; non-click sample's action filter with 'if (time_ms < ts)'
def user_action_list_making_with_all_sample(Interaction_train, user_id2num, is_click = True):
    user_num = len(user_id2num)
    user_action_list = []
    for i in range(user_num):
        user_action_list.append([]);
    for (user_id, item_id, time_ms, click, like, follow, comment, forward, longview) in Interaction_train:
        if is_click:
            if click == 1:
                user_action_list[user_id].append((item_id, time_ms))
        else:
            user_action_list[user_id].append((item_id, time_ms))
    print("len(user_action_list) = ", len(user_action_list))
    # print("user_action_list[:3] = ", user_action_list[:3])
    print('user_action_list init success, waiting filter...')
    
    Interaction_train_with_action = []
    for (user_id, item_id, time_ms, click, like, follow, comment, forward, longview) in Interaction_train:
        history = user_action_list[user_id] # [(item_id, time_ms), ]
        history.sort(key=lambda x: x[1])     # time_stamp rank, small->big
        
        user_real_action = []
        for (p1, ts) in history :
            if (time_ms < ts):
                user_real_action.append(p1)
            else:
                break
        Interaction_train_with_action.append((user_id, item_id, time_ms, click, like, follow, comment, forward, longview, user_real_action))
    return Interaction_train_with_action

In [91]:
# validation & test func
def filter_test_or_validation_data(data, user_id2num, item_id2num):
    # filter test && validation data
    new_data = []
    for (user_id, item_id, time_ms, click, like, follow, comment, forward, longview) in data:
        if user_id in user_id2num and item_id in item_id2num:
            new_data.append((user_id2num[user_id], item_id2num[item_id], time_ms, click, like, follow , comment, forward, longview))
    return new_data

# write json
def write_data(path, data):
    f = open(path, 'w')
    jsObj = json.dumps(data)
    f.write(jsObj)
    f.close()

In [92]:
# 1 load data
data_train = []
# origin = pd.read_csv(path_train_read)
cols = ['user_id', 'video_id', 'tab', 'time_ms',
        'is_click', 'is_like', 'is_follow', 'is_comment', 'is_forward', 'long_view']
raw_data_train = pd.read_csv(path_train_read, usecols=cols)
raw_data_train.tail()

Unnamed: 0,user_id,video_id,time_ms,is_click,is_like,is_follow,is_comment,is_forward,long_view,tab
5055979,999,694615,1650552207405,0,0,0,0,0,0,1
5055980,999,1240364,1650552207405,0,0,0,0,0,0,1
5055981,999,2514654,1650552207405,1,0,0,0,0,1,1
5055982,999,4214495,1650552339920,0,0,0,0,0,0,1
5055983,999,2897178,1650552339920,1,0,0,0,0,1,1


In [93]:
# 2 click sample
raw_data_train_with_click = raw_data_train[(raw_data_train.is_click ==1)]
raw_data_train_with_click.head(10)

Unnamed: 0,user_id,video_id,time_ms,is_click,is_like,is_follow,is_comment,is_forward,long_view,tab
4,0,2528540,1649467982289,1,0,0,0,0,0,0
23,0,4067506,1649477390308,1,0,0,0,0,1,1
37,0,3556496,1649673423739,1,0,0,0,0,1,0
126,0,909,1649673704264,1,0,0,0,0,1,1
131,0,2407158,1649673704264,1,0,0,0,0,0,1
158,0,434186,1649675512388,1,0,0,0,0,1,1
213,0,420001,1649676359584,1,0,0,0,0,0,0
254,0,3998230,1649676395781,1,0,0,0,0,0,0
292,0,3322810,1649793902235,1,0,0,0,0,0,1
296,0,182968,1649793923341,1,0,0,0,0,1,1


In [94]:
# 3 add list
train_data_click = []
row = len(raw_data_train_with_click['user_id'])
print("row=", row)
          
for index, row in raw_data_train_with_click.iterrows():
    train_data_click.append((row['user_id'], row['video_id'], row['time_ms'], int(row['is_click']), int(row['is_like']), int(row['is_follow']), int(row['is_comment']), int(row['is_forward']), int(row['long_view'])))

row= 1917934


In [95]:
# 4 train data filter: frequence
train_data_click = dataset_filtering(train_data_click, core)

# Original training dataset
  User: 983 Item: 877897 Interaction: 1917934 Sparsity: 99.77775265814853 %
Fitering (core = 10) ... number of remained interactions: 474721 474623 474515 
# Filtered training dataset
  User: 960 Item: 25701 Interaction: 474515 Sparsity: 98.07678122122356 %


In [96]:
# 5 train data index_encoding
train_data_click, user_id2num, item_id2num = index_encoding(train_data_click)
print(train_data_click[0:10])

[(0, 52, 1649673704264, 1, 0, 0, 0, 0, 1), (0, 14331, 1649673704264, 1, 0, 0, 0, 0, 0), (0, 12833, 1649936855141, 1, 0, 0, 0, 0, 1), (0, 17144, 1649936894064, 1, 0, 0, 0, 0, 0), (0, 845, 1649936975456, 1, 0, 0, 0, 0, 1), (0, 7479, 1649937088780, 1, 0, 0, 0, 0, 1), (0, 17137, 1649937088780, 1, 0, 0, 0, 0, 1), (0, 17041, 1650330704859, 1, 0, 0, 0, 0, 1), (0, 15726, 1650338460613, 1, 0, 0, 0, 0, 1), (0, 12890, 1650338617363, 1, 0, 0, 0, 0, 1)]


In [97]:
# 6 make action_list
train_data_click_with_action = user_action_list_making(train_data_click, user_id2num)
print(train_data_click_with_action[0:10])

user_action_list init success, waiting filter...
[(0, 52, 1649673704264, 1, 0, 0, 0, 0, 1, []), (0, 14331, 1649673704264, 1, 0, 0, 0, 0, 0, [52]), (0, 12833, 1649936855141, 1, 0, 0, 0, 0, 1, [52, 14331]), (0, 17144, 1649936894064, 1, 0, 0, 0, 0, 0, [52, 14331, 12833]), (0, 845, 1649936975456, 1, 0, 0, 0, 0, 1, [52, 14331, 12833, 17144]), (0, 7479, 1649937088780, 1, 0, 0, 0, 0, 1, [52, 14331, 12833, 17144, 845]), (0, 17137, 1649937088780, 1, 0, 0, 0, 0, 1, [52, 14331, 12833, 17144, 845, 7479]), (0, 17041, 1650330704859, 1, 0, 0, 0, 0, 1, [52, 14331, 12833, 17144, 845, 7479, 17137]), (0, 15726, 1650338460613, 1, 0, 0, 0, 0, 1, [52, 14331, 12833, 17144, 845, 7479, 17137, 17041]), (0, 12890, 1650338617363, 1, 0, 0, 0, 0, 1, [52, 14331, 12833, 17144, 845, 7479, 17137, 17041, 15726])]


In [98]:
# 7 fianl train data: del time_ms
final_data_train = []
for (user_id, item_id, time_ms, click, like, follow, comment, forward, longview, user_real_action) in train_data_click_with_action:
    final_data_train.append((user_id, item_id, click, like, follow, comment, forward, longview, user_real_action))
print(final_data_train[0:10])

[(0, 52, 1, 0, 0, 0, 0, 1, []), (0, 14331, 1, 0, 0, 0, 0, 0, [52]), (0, 12833, 1, 0, 0, 0, 0, 1, [52, 14331]), (0, 17144, 1, 0, 0, 0, 0, 0, [52, 14331, 12833]), (0, 845, 1, 0, 0, 0, 0, 1, [52, 14331, 12833, 17144]), (0, 7479, 1, 0, 0, 0, 0, 1, [52, 14331, 12833, 17144, 845]), (0, 17137, 1, 0, 0, 0, 0, 1, [52, 14331, 12833, 17144, 845, 7479]), (0, 17041, 1, 0, 0, 0, 0, 1, [52, 14331, 12833, 17144, 845, 7479, 17137]), (0, 15726, 1, 0, 0, 0, 0, 1, [52, 14331, 12833, 17144, 845, 7479, 17137, 17041]), (0, 12890, 1, 0, 0, 0, 0, 1, [52, 14331, 12833, 17144, 845, 7479, 17137, 17041, 15726])]


In [99]:
print("fianl tain data, row", len(final_data_train))

fianl tain data, row 474515


In [100]:
# 8 wirte train data
write_data(path_train, final_data_train)

In [101]:
# 9 other 
with open(path_train) as f:
    line = f.readline()
    data = json.loads(line)
    f.close()
    user_num = len(data)
print("confirm data, num=",user_num)
data[-5:]

confirm data, num= 474515


[[959,
  24812,
  1,
  0,
  0,
  0,
  0,
  0,
  [23088,
   8583,
   24220,
   16989,
   2454,
   12390,
   1579,
   2050,
   23887,
   1538,
   339,
   3156,
   6683,
   20744,
   24199,
   4758,
   1797,
   19195,
   22781,
   13372,
   8141,
   7833,
   17804,
   13398,
   2041,
   22997,
   20261,
   23589,
   14925,
   453,
   17360,
   9149,
   8667,
   4846,
   23779,
   4284,
   25473,
   4067,
   3744,
   197,
   8995,
   18793,
   492,
   22158,
   10499,
   8024,
   19233,
   3457,
   23415,
   14064,
   20100,
   8820,
   11323,
   4789,
   1168,
   5364,
   20938,
   6194,
   9641,
   18078,
   15558,
   24674,
   7922,
   4565,
   16419,
   21326,
   21652,
   2316,
   25478,
   18933,
   20344,
   3148,
   10620,
   661,
   22984,
   15641,
   22514,
   2847,
   19800,
   5022,
   17725,
   15021,
   7186,
   9381,
   3058,
   6973,
   21784,
   9071,
   15030,
   17929,
   3699,
   16331,
   12567,
   22606,
   25038,
   1090,
   24230,
   24408,
   18034,
   7115,
   25

In [102]:
# 10 other: distribution
# count user action_list length distribution
dic_action_list_length = {}
for (user_id, item_id, click, like, follow, comment, forward, longview, user_real_action) in final_data_train:
    length = len(user_real_action)
    try: dic_action_list_length[length] += 1
    except: dic_action_list_length[length] = 1

for key, value in dic_action_list_length.items():
    print ("length=", key, "number=", value)
sorted(dic_action_list_length)

length= 0 number= 978
length= 1 number= 969
length= 2 number= 962
length= 3 number= 960
length= 4 number= 964
length= 5 number= 958
length= 6 number= 964
length= 7 number= 963
length= 8 number= 954
length= 9 number= 958
length= 10 number= 960
length= 11 number= 954
length= 12 number= 954
length= 13 number= 960
length= 14 number= 951
length= 15 number= 953
length= 16 number= 951
length= 17 number= 958
length= 18 number= 946
length= 19 number= 955
length= 20 number= 943
length= 21 number= 944
length= 22 number= 946
length= 23 number= 940
length= 24 number= 942
length= 25 number= 936
length= 26 number= 930
length= 27 number= 933
length= 28 number= 928
length= 29 number= 933
length= 30 number= 928
length= 31 number= 933
length= 32 number= 925
length= 33 number= 927
length= 34 number= 926
length= 35 number= 921
length= 36 number= 921
length= 37 number= 920
length= 38 number= 912
length= 39 number= 912
length= 40 number= 908
length= 41 number= 910
length= 42 number= 910
length= 43 number= 90

[0,
 1,
 2,
 3,
 4,
 5,
 6,
 7,
 8,
 9,
 10,
 11,
 12,
 13,
 14,
 15,
 16,
 17,
 18,
 19,
 20,
 21,
 22,
 23,
 24,
 25,
 26,
 27,
 28,
 29,
 30,
 31,
 32,
 33,
 34,
 35,
 36,
 37,
 38,
 39,
 40,
 41,
 42,
 43,
 44,
 45,
 46,
 47,
 48,
 49,
 50,
 51,
 52,
 53,
 54,
 55,
 56,
 57,
 58,
 59,
 60,
 61,
 62,
 63,
 64,
 65,
 66,
 67,
 68,
 69,
 70,
 71,
 72,
 73,
 74,
 75,
 76,
 77,
 78,
 79,
 80,
 81,
 82,
 83,
 84,
 85,
 86,
 87,
 88,
 89,
 90,
 91,
 92,
 93,
 94,
 95,
 96,
 97,
 98,
 99,
 100,
 101,
 102,
 103,
 104,
 105,
 106,
 107,
 108,
 109,
 110,
 111,
 112,
 113,
 114,
 115,
 116,
 117,
 118,
 119,
 120,
 121,
 122,
 123,
 124,
 125,
 126,
 127,
 128,
 129,
 130,
 131,
 132,
 133,
 134,
 135,
 136,
 137,
 138,
 139,
 140,
 141,
 142,
 143,
 144,
 145,
 146,
 147,
 148,
 149,
 150,
 151,
 152,
 153,
 154,
 155,
 156,
 157,
 158,
 159,
 160,
 161,
 162,
 163,
 164,
 165,
 166,
 167,
 168,
 169,
 170,
 171,
 172,
 173,
 174,
 175,
 176,
 177,
 178,
 179,
 180,
 181,
 182,
 183,
 184,


In [103]:
x1 = []
y1 = []
for key, value in dic_action_list_length.items():
    x1.append(key)
    y1.append(value)
len(x1)
len(y1)

2703

In [104]:
import plotly.offline as py                    #保存图表，相当于plotly.plotly as py，同时增加了离线功能
py.init_notebook_mode(connected=True)          #离线绘图时，需要额外进行初始化
import plotly.graph_objects as go # 引入plotly底层绘图库

trace = go.Scatter(
     x = x1,
     y = y1,
     showlegend=True
)
data = [trace]

py.iplot(data)

In [105]:
dic_table = {}
for key, value in dic_action_list_length.items():
    if (key <= 100):
        try:dic_table['0-100'] += value;
        except: dic_table['0-100'] = value
    elif (key <= 200):
        try:dic_table['100-200'] += value;
        except: dic_table['100-200'] = value
    elif (key <= 300):
        try:dic_table['200-300'] += value;
        except: dic_table['200-300'] = value
    elif (key <= 400):
        try:dic_table['300-400'] += value;
        except: dic_table['300-400'] = value
    elif (key <= 500):
        try:dic_table['400-500'] += value;
        except: dic_table['400-500'] = value
    elif (key <= 1000):
        try:dic_table['500-1000'] += value;
        except: dic_table['500-1000'] = value
    elif (key <= 2000):
        try:dic_table['1000-2000'] += value;
        except: dic_table['1000-2000'] = value
    elif (key <= 2500):
        try:dic_table['2000-2500'] += value;
        except: dic_table['2000-2500'] = value
    elif (key <= 3000):
        try:dic_table['2500-3000'] += value;
        except: dic_table['2500-3000'] = value

for key, value in dic_table.items():
    print("ation_list length:", key, "log num:", value)

ation_list length: 0-100 log num: 90524
ation_list length: 100-200 log num: 75354
ation_list length: 200-300 log num: 61435
ation_list length: 300-400 log num: 50755
ation_list length: 400-500 log num: 40718
ation_list length: 500-1000 log num: 107543
ation_list length: 1000-2000 log num: 45432
ation_list length: 2000-2500 log num: 2420
ation_list length: 2500-3000 log num: 334


In [111]:
# 1 click 1 & 0
# 2 get data
all_data = raw_data_train
print("len(all_data) = ", len(all_data))
all_data.head()

len(all_data) =  5055984


Unnamed: 0,user_id,video_id,time_ms,is_click,is_like,is_follow,is_comment,is_forward,long_view,tab
0,0,4354972,1649467982289,0,0,0,0,0,0,0
1,0,1329429,1649467982289,0,0,0,0,0,0,0
2,0,346081,1649467982289,0,0,0,0,0,0,0
3,0,2058916,1649467982289,0,0,0,0,0,0,0
4,0,2528540,1649467982289,1,0,0,0,0,0,0


In [112]:
# 3 add list  
# && filter data with user_id2num & item_id2num  
# && replace with user_id2num[row['user_id']]
train_data_all = []
row = len(all_data['user_id'])
print("row =", row)

print("prepare generating train_data_all....")
set_user_id = set()
set_item_id = set()
for key, _ in user_id2num.items():
    set_user_id.add(key)
for key, _ in item_id2num.items():
    set_item_id.add(key)

print("len(set_user_id) = ", len(set_user_id), ", len(set_item_id) = ", len(set_item_id))

for index, row in all_data.iterrows():
    if row['user_id'] in set_user_id and row['video_id'] in set_item_id:
        train_data_all.append((user_id2num[row['user_id']], item_id2num[row['video_id']], row['time_ms'], int(row['is_click']), int(row['is_like']), int(row['is_follow']), int(row['is_comment']), int(row['is_forward']), int(row['long_view'])))
print ("len(train_data_all) = ", len(train_data_all))
print ("train_data_all[0:5] = ", train_data_all[0:5])

row = 5055984
prepare generating train_data_all....
len(train_data_all) =  843819
train_data_all[0:5] =  [(0, 25588, 1649467982289, 0, 0, 0, 0, 0, 0), (0, 7695, 1649477382190, 0, 0, 0, 0, 0, 0), (0, 17787, 1649673604040, 0, 0, 0, 0, 0, 0), (0, 21144, 1649673604040, 0, 0, 0, 0, 0, 0), (0, 465, 1649673604040, 0, 0, 0, 0, 0, 0)]


In [113]:
# 4 5
# 6 make action_list
print("len(user_id2num) = ", len(user_id2num))
train_data_all_with_action = user_action_list_making_with_all_sample(train_data_all, user_id2num, is_click = True)
print(train_data_all_with_action[0:10])

len(user_id2num) =  960
len(user_action_list) =  960
user_action_list init success, waiting filter...
[(0, 25588, 1649467982289, 0, 0, 0, 0, 0, 0, [52, 14331, 12833, 17144, 845, 7479, 17137, 17041, 15726, 12890, 19735, 12100, 6653, 1115, 6087, 11201, 18722, 22934, 12704, 7560, 8437, 9117, 12001, 23885, 6613, 16527, 2505, 20550, 18240, 7065, 20557, 25195, 10150, 25261, 7864, 7186, 9559, 19792, 15069, 17692, 2285, 10558, 13741]), (0, 7695, 1649477382190, 0, 0, 0, 0, 0, 0, [52, 14331, 12833, 17144, 845, 7479, 17137, 17041, 15726, 12890, 19735, 12100, 6653, 1115, 6087, 11201, 18722, 22934, 12704, 7560, 8437, 9117, 12001, 23885, 6613, 16527, 2505, 20550, 18240, 7065, 20557, 25195, 10150, 25261, 7864, 7186, 9559, 19792, 15069, 17692, 2285, 10558, 13741]), (0, 17787, 1649673604040, 0, 0, 0, 0, 0, 0, [52, 14331, 12833, 17144, 845, 7479, 17137, 17041, 15726, 12890, 19735, 12100, 6653, 1115, 6087, 11201, 18722, 22934, 12704, 7560, 8437, 9117, 12001, 23885, 6613, 16527, 2505, 20550, 18240, 7065, 

In [114]:
# 7 fianl train data: del time_ms
final_all_data_pred = []
for (user_id, item_id, time_ms, click, like, follow, comment, forward, longview, user_real_action) in train_data_all_with_action:
    final_all_data_pred.append((user_id, item_id, click, like, follow, comment, forward, longview, user_real_action))
print(final_all_data_pred[0:5])

[(0, 25588, 0, 0, 0, 0, 0, 0, [52, 14331, 12833, 17144, 845, 7479, 17137, 17041, 15726, 12890, 19735, 12100, 6653, 1115, 6087, 11201, 18722, 22934, 12704, 7560, 8437, 9117, 12001, 23885, 6613, 16527, 2505, 20550, 18240, 7065, 20557, 25195, 10150, 25261, 7864, 7186, 9559, 19792, 15069, 17692, 2285, 10558, 13741]), (0, 7695, 0, 0, 0, 0, 0, 0, [52, 14331, 12833, 17144, 845, 7479, 17137, 17041, 15726, 12890, 19735, 12100, 6653, 1115, 6087, 11201, 18722, 22934, 12704, 7560, 8437, 9117, 12001, 23885, 6613, 16527, 2505, 20550, 18240, 7065, 20557, 25195, 10150, 25261, 7864, 7186, 9559, 19792, 15069, 17692, 2285, 10558, 13741]), (0, 17787, 0, 0, 0, 0, 0, 0, [52, 14331, 12833, 17144, 845, 7479, 17137, 17041, 15726, 12890, 19735, 12100, 6653, 1115, 6087, 11201, 18722, 22934, 12704, 7560, 8437, 9117, 12001, 23885, 6613, 16527, 2505, 20550, 18240, 7065, 20557, 25195, 10150, 25261, 7864, 7186, 9559, 19792, 15069, 17692, 2285, 10558, 13741]), (0, 21144, 0, 0, 0, 0, 0, 0, [52, 14331, 12833, 17144, 845

In [115]:
print("final_all_data_pred, row", len(final_all_data_pred))

final_all_data_pred, row 843819


In [121]:
# 8 wirte train data
write_data(path_all_data_pred, final_all_data_pred)

In [122]:
# 9 other 
with open(path_all_data_pred) as f:
    line = f.readline()
    data_pred = json.loads(line)
    f.close()
    user_num = len(data_pred)
print("confirm data, num=",user_num)
data_pred[0:5]

confirm data, num= 843819


[[0,
  25588,
  0,
  0,
  0,
  0,
  0,
  0,
  [52,
   14331,
   12833,
   17144,
   845,
   7479,
   17137,
   17041,
   15726,
   12890,
   19735,
   12100,
   6653,
   1115,
   6087,
   11201,
   18722,
   22934,
   12704,
   7560,
   8437,
   9117,
   12001,
   23885,
   6613,
   16527,
   2505,
   20550,
   18240,
   7065,
   20557,
   25195,
   10150,
   25261,
   7864,
   7186,
   9559,
   19792,
   15069,
   17692,
   2285,
   10558,
   13741]],
 [0,
  7695,
  0,
  0,
  0,
  0,
  0,
  0,
  [52,
   14331,
   12833,
   17144,
   845,
   7479,
   17137,
   17041,
   15726,
   12890,
   19735,
   12100,
   6653,
   1115,
   6087,
   11201,
   18722,
   22934,
   12704,
   7560,
   8437,
   9117,
   12001,
   23885,
   6613,
   16527,
   2505,
   20550,
   18240,
   7065,
   20557,
   25195,
   10150,
   25261,
   7864,
   7186,
   9559,
   19792,
   15069,
   17692,
   2285,
   10558,
   13741]],
 [0,
  17787,
  0,
  0,
  0,
  0,
  0,
  0,
  [52,
   14331,
   12833,
   17144,
   845