In [1]:
import numpy as np
import pandas as pd
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')

## Data Manipulation

#### Data Sampling

`n_c`: number of customers in the dataset.  
`n_m`: number of movies in the dataset.  
`n`: number of ratings.  
`target`: dataset with size $n_c\times n_m$.  
`df_probe`: validation dataset with size $100\times 3$.  
`df_target`: training dataset with size $n\times 3$.

training data

In [2]:
df = pd.read_csv('./Data/train.csv')
df_sorted = df.sort_values(by = 'CustomerID')

# choose part of the dataset
df_cut = df_sorted.iloc[:1000818,:]
target_cut = df_cut.pivot_table(index = 'CustomerID', columns = 'MovieID', values = 'Rating')
target_cut.head()

MovieID,1.0,2.0,3.0,4.0,5.0,6.0,8.0,11.0,12.0,13.0,...,17761.0,17762.0,17763.0,17764.0,17765.0,17766.0,17767.0,17768.0,17769.0,17770.0
CustomerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
6,,,,,,,,,,,...,,,,,,,,,,
7,,,,,,,5.0,,,,...,,,,3.0,,,,,,
8,,,,,,,,,,,...,,,,,,,,,,
10,,,,,,,,,,,...,,,,,,,,,,
25,,,,,,,,,,,...,,,,,,,,,,


In [3]:
df_movie = pd.read_csv('./Data/df_movie.csv')
target = target_cut.T.iloc[[i in list(df_movie.MovieID) for i in target_cut.T.index],:].T

# some of the target rows are all nan, we should rule them out
allnan = []
for i in range(target.shape[0]):
    if np.sum(~np.isnan(target.iloc[i,:])) != 0:
        allnan.append(i)
target = target.iloc[allnan,:]

validation data

In [4]:
df_cut_filter = df_cut.iloc[[i in list(df_movie.MovieID) for i in df_cut.MovieID],:]
df_probe = df_cut_filter.sample(n = 200, random_state = 1)
df_probe = df_probe.loc[[i in list(target.T.index) for i in df_probe.MovieID],:].iloc[:100,:]
df_probe.head()

Unnamed: 0,CustomerID,Rating,Date,MovieID
72298147,16786,2.0,2004-03-25,13074.0
53507935,17890,4.0,2004-09-17,9733.0
56574032,13210,3.0,2005-11-08,10358.0
76018689,20830,4.0,2004-09-22,13748.0
70227021,14225,3.0,2004-11-09,12732.0


Transform `target` into `df_target` which has the same form as `df_probe`.

In [5]:
df_target = []
u_id = target.index
i_id = target.T.index

with tqdm(total = target.shape[0]) as pbar:
    for i in range(target.shape[0]):
        for j in range(target.shape[1]):
            if not np.isnan(target.iloc[i,j]):
                df_target.append([u_id[i], i_id[j], target.iloc[i,j]])
        pbar.update(1)

100%|██████████| 4642/4642 [04:03<00:00, 19.03it/s]


In [6]:
df_target = pd.DataFrame(df_target, columns = ['CustomerID', 'MovieID', 'Rating'])
df_probe = df_probe[['CustomerID', 'MovieID', 'Rating']]

#### Save Useful Datasets

`df_cut`: Sampled data from `train.csv`.  
`target`: dataset with size $n_c\times n_m$.  
`df_probe`: validation dataset with size $100\times 3$.  
`df_target`: training dataset with size $n\times 3$.

In [7]:
df_cut.to_csv('./Data/df_cut.csv', index = False)
target.reset_index().to_csv('./Data/target.csv', index = False)
df_probe.to_csv('./Data/df_probe.csv', index = False)
df_target.to_csv('./Data/df_target.csv', index = False)