In [2]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import sklearn.model_selection
from sklearn.model_selection import train_test_split


In [27]:
dataset = pd.read_csv('../ml-100k/u.data', sep='\t', names=['user_id', 'item_id', 'rating', 'timestamp'])
dataset.head()



Unnamed: 0,user_id,item_id,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


In [28]:
# rank records based on the timestamp
dataset = dataset.sort_values(by='timestamp')
dataset.head()


Unnamed: 0,user_id,item_id,rating,timestamp
214,259,255,4,874724710
83965,259,286,4,874724727
43027,259,298,4,874724754
21396,259,185,4,874724781
82655,259,173,4,874724843


In [29]:
# save the sorted dataset
dataset.to_csv('u_sorted.data', sep='\t', index=False)

In [30]:
# load the sorted dataset
dataset = pd.read_csv('u_sorted.data', sep='\t', names=['user_id', 'item_id', 'rating', 'timestamp'], index_col=False)
dataset[1000:2000]

Unnamed: 0,user_id,item_id,rating,timestamp
1000,195,779,2,874825826
1001,195,771,2,874825826
1002,195,1415,1,874825827
1003,756,258,3,874826502
1004,756,300,4,874826502
...,...,...,...,...
1995,21,370,1,874951293
1996,21,424,1,874951293
1997,21,767,1,874951314
1998,21,758,1,874951314


In [31]:
# since some users give ratings at the same time, we can't use timestamp
# I will shuffle the datasetand assume a new timestamp for each record
dataset = dataset.sample(frac=1).reset_index(drop=True)
dataset.head()


Unnamed: 0,user_id,item_id,rating,timestamp
0,517,25,2,892659923
1,222,448,3,878183565
2,320,433,4,884751730
3,385,194,3,879441538
4,92,115,3,875654125


In [32]:
len(dataset)

100001

In [33]:
# add a column for the new timestamp
dataset['timestamp'] = np.arange(len(dataset))
dataset.head()

Unnamed: 0,user_id,item_id,rating,timestamp
0,517,25,2,0
1,222,448,3,1
2,320,433,4,2
3,385,194,3,3
4,92,115,3,4


In [34]:
# save the shuffled dataset
dataset.to_csv('shuffle_assume_timestamp/u_shuffled.data', sep='\t', index=False)


In [15]:
# load the shuffled dataset
dataset = pd.read_csv('shuffle_assume_timestamp/u_shuffled.data', sep='\s+', index_col=False)


In [16]:
dataset.head()

Unnamed: 0,user_id,movie_id,rating,timestamp
0,517,25,2,0
1,222,448,3,1
2,320,433,4,2
3,385,194,3,3
4,92,115,3,4


In [17]:


# assume there are 10 time windows
n_windows = 10
window_size = len(dataset) // n_windows
# split the dataset into 10 time windows in order
# then for each time window, split it into training and testing set by 80% and 20%
# save the training and testing set into files

for i in range(n_windows):
    start = i * window_size
    end = (i + 1) * window_size
    dataset[:end].to_csv('shuffle_assume_timestamp/u' + str(i) + '.data', sep='\t', index=False)
    # train, test = train_test_split(dataset[start:end], test_size=0.2)
    # train.to_csv('shuffle_assume_timestamp/u' + str(i) + '_train.data', sep='\t', index=False)
    # test.to_csv('shuffle_assume_timestamp/u' + str(i) + '_test.data', sep='\t', index=False)
    


