# Data Splitting

In [1]:
import numpy as np
import pandas as pd
from numpy import save, load

# Seed NumPy's global RNG so the shuffle that defines the train/test split is
# reproducible when the notebook is run top to bottom.
SEED = 1593523459
np.random.seed(SEED)

In [2]:
# Dataset selector: 'K' loads ml-100k.data, 'M' loads ml-1M.dat.
TRIGGER = 'M'

if TRIGGER == 'K':
    valfrac = 0.25   # fraction of the ratings held out as the test split
    transpose = False  # NOTE(review): not read by any cell visible here; kept for compatibility
    data = np.loadtxt('../datasets/main/ml-100k.data', skiprows=0, delimiter='\t').astype('int32')

elif TRIGGER == 'M':
    valfrac = 0.1    # fraction of the ratings held out as the test split
    transpose = False  # NOTE(review): not read by any cell visible here; kept for compatibility
    # Fields in this file are separated by '::'. Recent NumPy releases only accept
    # a single-character delimiter in np.loadtxt, so the file is parsed with pandas
    # instead (a multi-character `sep` needs the python parsing engine).
    data = pd.read_csv(
        '../datasets/main/ml-1M.dat', sep='::', header=None, engine='python'
    ).to_numpy().astype('int32')

else:
    # Fail here instead of with a NameError on `data` in a later cell.
    raise ValueError("Unknown TRIGGER {!r}; expected 'K' or 'M'".format(TRIGGER))

In [3]:
# Peek at the raw int32 array; its four columns are later labelled
# user, item, rating, timestamp when it is wrapped in a DataFrame.
data

array([[        1,      1193,         5, 978300760],
       [        1,       661,         3, 978302109],
       [        1,       914,         3, 978301968],
       ...,
       [     6040,       562,         5, 956704746],
       [     6040,      1096,         4, 956715648],
       [     6040,      1097,         4, 956715569]])

In [4]:
# Wrap the raw array in a DataFrame and sort the ratings by user, then by item.
# The index is rebuilt as 0..n-1 so that row labels coincide with row positions;
# the label-based drops of the shuffled indices later in the notebook rely on that.
data_csv = (
    pd.DataFrame(data, columns=['user', 'item', 'rating', 'timestamp'])
    .sort_values(['user', 'item'], ascending=[True, True])
    .reset_index(drop=True)
)
data_csv

Unnamed: 0,user,item,rating,timestamp
0,1,1,5,978824268
1,1,48,5,978824351
2,1,150,5,978301777
3,1,260,4,978300760
4,1,527,5,978824195
...,...,...,...,...
1000204,6040,3683,4,960971696
1000205,6040,3703,4,964828575
1000206,6040,3735,4,960971654
1000207,6040,3751,4,964828782


In [5]:
# Sorted lists of the distinct user ids (column 0) and item ids (column 1)
users, items = (np.unique(data[:, col]).tolist() for col in (0, 1))

In [6]:
# Dataset dimensions: distinct users, distinct items (movies), total ratings
n_u, n_m = len(users), len(items)
n_r = len(data)  # one row of `data` per rating

print("USERS: {}\t ITEMS: {}\t RATINGS: {}".format(n_u, n_m, n_r))

USERS: 6040	 ITEMS: 3706	 RATINGS: 1000209


In [7]:
# Split sizes: `valfrac` of all ratings (rounded down) go to the test set,
# everything else goes to the training set
test_size = int(valfrac * n_r)
train_size = n_r - test_size

In [8]:
# Build the row positions 0..n_r-1 and shuffle them in place.
# The shuffle draws from NumPy's global RNG, which is seeded in the first cell,
# so the resulting order (and hence the split) is only reproducible when the
# notebook is executed from the top; re-running this cell alone gives a new order.
idx = np.arange(n_r)
np.random.shuffle(idx)

In [9]:
# The first `test_size` shuffled positions form the test split;
# all remaining positions form the training split
test_idx, train_idx = idx[:test_size], idx[test_size:]

In [10]:
# Two independent copies of the sorted frame, one to carve each split from
train_data_csv, test_data_csv = data_csv.copy(), data_csv.copy()

In [11]:
# Show the full sorted frame again for comparison with the two splits derived from it
data_csv

Unnamed: 0,user,item,rating,timestamp
0,1,1,5,978824268
1,1,48,5,978824351
2,1,150,5,978301777
3,1,260,4,978300760
4,1,527,5,978824195
...,...,...,...,...
1000204,6040,3683,4,960971696
1000205,6040,3703,4,964828575
1000206,6040,3735,4,960971654
1000207,6040,3751,4,964828782


In [12]:
# Training split: every row of the full frame except those selected for testing.
# Derived directly from `data_csv` so that re-running this cell does not raise a
# KeyError for labels a previous run already removed.
train_data_csv = data_csv.drop(index=test_idx)
assert len(train_data_csv) == len(data_csv) - len(test_idx)
train_data_csv

Unnamed: 0,user,item,rating,timestamp
0,1,1,5,978824268
1,1,48,5,978824351
2,1,150,5,978301777
3,1,260,4,978300760
4,1,527,5,978824195
...,...,...,...,...
1000203,6040,3671,4,997454367
1000204,6040,3683,4,960971696
1000205,6040,3703,4,964828575
1000207,6040,3751,4,964828782


In [13]:
# Test split: every row of the full frame except those assigned to training.
# Derived directly from `data_csv` so that re-running this cell does not raise a
# KeyError for labels a previous run already removed.
test_data_csv = data_csv.drop(index=train_idx)
assert len(test_data_csv) == len(data_csv) - len(train_idx)
test_data_csv

Unnamed: 0,user,item,rating,timestamp
13,1,783,4,978824291
44,1,2762,4,978302091
46,1,2797,4,978302039
54,2,95,2,978300143
58,2,235,3,978299351
...,...,...,...,...
1000162,6040,3016,2,956716157
1000188,6040,3362,4,997453796
1000189,6040,3388,1,956716407
1000199,6040,3524,1,956716263


In [14]:
# Convert both split frames to plain NumPy arrays so they can be saved as .npy
train_data, test_data = (np.array(frame) for frame in (train_data_csv, test_data_csv))

In [15]:
# Destination files for the two splits. The suffix follows the dataset chosen by
# TRIGGER, so a 'K' run no longer writes its splits over the 1M files.
# NOTE(review): the '100k' file names are new — confirm any downstream loader uses them.
split_suffix = {'K': '100k', 'M': '1M'}[TRIGGER]
test_path = '../datasets/main/test_set_{}.npy'.format(split_suffix)
train_path = '../datasets/main/train_set_{}.npy'.format(split_suffix)

In [16]:
# Persist both splits as .npy files ...
save(train_path, train_data)
save(test_path, test_data)

# ... and read them straight back so the round trip can be verified below
train_data_load, test_data_load = load(train_path), load(test_path)

In [17]:
# Round-trip check for the test split. np.array_equal returns a plain False on a
# shape mismatch, where the element-wise `==` comparison would fail instead.
np.array_equal(test_data, test_data_load)

True

In [18]:
# Round-trip check for the training split. np.array_equal returns a plain False on
# a shape mismatch, where the element-wise `==` comparison would fail instead.
np.array_equal(train_data, train_data_load)

True

In [19]:
# Rebuild a labelled frame from the reloaded training array to inspect it
new_data_csv = pd.DataFrame(
    data=train_data_load,
    columns=['user', 'item', 'rating', 'timestamp'],
)
new_data_csv

Unnamed: 0,user,item,rating,timestamp
0,1,1,5,978824268
1,1,48,5,978824351
2,1,150,5,978301777
3,1,260,4,978300760
4,1,527,5,978824195
...,...,...,...,...
900184,6040,3671,4,997454367
900185,6040,3683,4,960971696
900186,6040,3703,4,964828575
900187,6040,3751,4,964828782


In [20]:
# Inspect the test split as read back from disk
test_data_load

array([[        1,       783,         4, 978824291],
       [        1,      2762,         4, 978302091],
       [        1,      2797,         4, 978302039],
       ...,
       [     6040,      3388,         1, 956716407],
       [     6040,      3524,         1, 956716263],
       [     6040,      3735,         4, 960971654]])