# Converting Data to NeuMF format

In [1]:
import pandas as pd
import numpy as np

# read question, answers, and users
questions = pd.read_csv('Data/Questions.csv')
answers = pd.read_csv('Data/Answers.csv')
users = pd.read_csv('Data/Users.csv')

In [2]:
# find answerers of each question
question_userid = pd.merge(questions, answers[['parentid', 'owneruserid']],
                           left_on='id',
                           right_on='parentid')

In [3]:
# use only ids to consider implicit feedback
question_userid = question_userid[['id', 'owneruserid_y']]
question_userid.columns = ['question_id', 'answerer_id']

users_id = users[['id']]

# add label for this related user-question
question_userid['rating'] = 3

In [4]:
temp = question_userid.sort_values(by=['answerer_id','question_id'])
key_answerer_id = temp['answerer_id'].unique()
key_question_id = temp['question_id'].unique()

key_answerer_id = pd.DataFrame(data=key_answerer_id).reset_index()
key_answerer_id.columns = ['index_u', 'answerer_id']

key_question_id = pd.DataFrame(data=key_question_id).reset_index()
key_question_id.columns = ['index_q', 'question_id']

key_interaction = pd.merge(temp, key_answerer_id, on='answerer_id')
key_interaction = pd.merge(key_interaction, key_question_id, on='question_id')

interaction = key_interaction[['index_q', 'index_u', 'rating']]
interaction

Unnamed: 0,index_q,index_u,rating
0,0,0,3
1,1,0,3
2,2,0,3
3,3,0,3
4,4,0,3
...,...,...,...
392793,392793,5447,3
392794,392794,5447,3
392795,392795,5447,3
392796,392796,5447,3


## Train-test split

In [5]:
# sample the question to train and test set
from sklearn.model_selection import train_test_split

tpropn = 0.05
train_question_userid, test_question_userid = train_test_split(interaction, test_size=tpropn)

In [6]:
test_question_userid.describe()

Unnamed: 0,index_q,index_u,rating
count,19640.0,19640.0,19640.0
mean,195688.63167,2592.080143,3.0
std,113814.897981,1572.370357,0.0
min,3.0,0.0,3.0
25%,97178.5,1213.0,3.0
50%,194924.0,2520.0,3.0
75%,294772.25,3929.0,3.0
max,392789.0,5447.0,3.0


In [7]:
from IPython.display import clear_output

# number of negative sampling
neg_sample_num = 99

colnames = ['key']
for i in range(neg_sample_num):
    text = 'neg'+str(i)
    colnames.append(text)

neg_samples = pd.DataFrame(columns=colnames)

k = 0
for question_user_id in test_question_userid.values:
    k += 1
    if k%400 == 0:
        clear_output(wait=True)
        print(k*100/19640,'%', '='*int(k*100/19640))
    rand_users = test_question_userid[['index_u']].sample(neg_sample_num).values

    key_tuple = (question_user_id[0], question_user_id[1])
    row = dict()
    for h in range(len(colnames)):
        if h == 0:
            row[colnames[h]] = key_tuple
        else:
            row[colnames[h]] = rand_users[h-1][0]
            
    neg_samples = neg_samples.append(row, ignore_index=True)
    
neg_samples



Unnamed: 0,key,neg0,neg1,neg2,neg3,neg4,neg5,neg6,neg7,neg8,...,neg89,neg90,neg91,neg92,neg93,neg94,neg95,neg96,neg97,neg98
0,"(78457, 1010)",4849,2419,2520,1823,470,1310,3008,4645,2411,...,4712,435,110,1840,4775,5174,1937,3025,4398,629
1,"(151525, 1982)",145,3986,3143,286,5415,3023,4110,4256,4216,...,2218,4874,2022,1257,4613,4004,1418,4723,165,1776
2,"(5187, 58)",3273,1099,3664,5076,5204,2268,1035,2035,3403,...,5425,4615,688,3229,2994,797,3686,3701,4309,3807
3,"(229375, 3058)",122,3761,2220,636,4859,3071,1048,5217,1048,...,3814,3648,2494,3531,2520,3299,4975,3841,259,4294
4,"(89651, 1105)",3329,3676,1168,2971,2894,264,979,598,2999,...,3318,4853,185,2984,5234,1473,119,2839,5443,2738
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19635,"(293185, 3905)",2054,28,2336,2469,3929,5443,3704,4223,1575,...,1442,273,3431,5272,2901,4686,3563,1276,2411,3204
19636,"(20806, 222)",949,1318,4490,1189,3338,553,4982,785,3542,...,750,1108,108,1337,1512,4313,120,185,1885,4108
19637,"(122945, 1525)",308,2554,1048,1356,1271,4954,3600,729,1281,...,1886,5380,3550,4531,648,4968,1352,800,3597,1519
19638,"(388251, 5368)",2281,194,2379,348,1823,1574,598,2442,2251,...,2022,3662,2990,187,1679,155,1276,1783,1341,2468


In [8]:
test_question_userid.shape

(19640, 3)

In [9]:
neg_samples.head(1)

Unnamed: 0,key,neg0,neg1,neg2,neg3,neg4,neg5,neg6,neg7,neg8,...,neg89,neg90,neg91,neg92,neg93,neg94,neg95,neg96,neg97,neg98
0,"(78457, 1010)",4849,2419,2520,1823,470,1310,3008,4645,2411,...,4712,435,110,1840,4775,5174,1937,3025,4398,629


In [10]:
neg_samples.describe()

Unnamed: 0,key,neg0,neg1,neg2,neg3,neg4,neg5,neg6,neg7,neg8,...,neg89,neg90,neg91,neg92,neg93,neg94,neg95,neg96,neg97,neg98
count,19640,19640,19640,19640,19640,19640,19640,19640,19640,19640,...,19640,19640,19640,19640,19640,19640,19640,19640,19640,19640
unique,19640,3977,4033,3978,3997,3971,4033,4045,3989,3969,...,3995,3994,4011,4021,3991,3999,4002,4001,4035,3975
top,"(32417, 348)",1048,1048,1048,1048,1048,1048,1048,1048,1048,...,1048,1048,1048,1048,1048,1048,1048,1048,1048,1048
freq,1,268,224,255,229,242,253,229,227,250,...,227,228,231,224,229,262,250,237,226,243


## Writing files

In [11]:
train_question_userid

Unnamed: 0,index_q,index_u,rating
215673,215673,2877,3
146276,146276,1906,3
137500,137500,1760,3
301580,301580,4031,3
327308,327308,4436,3
...,...,...,...
372462,372462,5058,3
361524,361524,4878,3
379922,379922,5199,3
125623,125623,1575,3


In [13]:
# train file
train_question_userid.to_csv('Data/so.train.rating', sep='\t', index=False, header=False)
# test file
test_question_userid.to_csv('Data/so.test.rating', sep='\t', index=False, header=False)

# negative samples
neg_samples.to_csv('Data/so.test.negative', sep='\t', index=False, header=False)