In [2]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import sklearn.model_selection
from sklearn.model_selection import train_test_split


In [28]:
dataset = pd.read_csv('../ml-100k/u.data', sep='\t', names=['user_id', 'item_id', 'rating', 'timestamp'])
len(dataset)



100000

In [29]:
# rank records based on the timestamp
dataset = dataset.sort_values(by='timestamp')
dataset.head()


Unnamed: 0,user_id,item_id,rating,timestamp
214,259,255,4,874724710
83965,259,286,4,874724727
43027,259,298,4,874724754
21396,259,185,4,874724781
82655,259,173,4,874724843


In [30]:
# save the sorted dataset
dataset.to_csv('u_sorted.data', sep='\t', index=False)

In [31]:
# load the sorted dataset
dataset = pd.read_csv('u_sorted.data', sep='\t', index_col=False)
len(dataset)

100000

In [32]:
# since some users give ratings at the same time, we can't use timestamp
# I will shuffle the datasetand assume a new timestamp for each record
dataset = dataset.sample(frac=1).reset_index(drop=True)
dataset.head()


Unnamed: 0,user_id,item_id,rating,timestamp
0,293,193,3,888905990
1,774,447,1,888557715
2,55,7,3,878176047
3,263,79,4,891298047
4,774,758,1,888559036


In [33]:
len(dataset)

100000

In [34]:
# add a column for the new timestamp
dataset['timestamp'] = np.arange(len(dataset))
dataset.head()

Unnamed: 0,user_id,item_id,rating,timestamp
0,293,193,3,0
1,774,447,1,1
2,55,7,3,2
3,263,79,4,3
4,774,758,1,4


In [35]:
# save the shuffled dataset
dataset.to_csv('shuffle_assume_timestamp/u_shuffled.data', sep='\t', index=False)


In [36]:
# load the shuffled dataset
dataset = pd.read_csv('shuffle_assume_timestamp/u_shuffled.data', sep='\s+', index_col=False)


In [37]:
print(len(dataset))

100000


In [40]:


# assume there are 10 time windows
n_windows = 10
window_size = len(dataset) // n_windows
print(window_size)
# split the dataset into 10 time windows in order
# then for each time window, split it into training and testing set by 80% and 20%
# save the training and testing set into files

for i in range(0, n_windows):
    start = i * window_size
    end = (i + 1) * window_size
    print(start, end)
    dataset[:end].to_csv('shuffle_assume_timestamp/u' + str(i) + '.data', sep='\t', index=False)
    # train, test = train_test_split(dataset[start:end], test_size=0.2)
    # train.to_csv('shuffle_assume_timestamp/u' + str(i) + '_train.data', sep='\t', index=False)
    # test.to_csv('shuffle_assume_timestamp/u' + str(i) + '_test.data', sep='\t', index=False)
    




9000
0 9000
9000 18000
18000 27000
27000 36000
36000 45000
45000 54000
54000 63000
63000 72000
72000 81000
81000 90000


In [39]:
# read u9 data and print length
dataset = pd.read_csv('shuffle_assume_timestamp/u8.data', sep='\t', index_col=False)
print(len(dataset))

90000
