In [1]:
import numpy as np
import pandas as pd
from interactions_data import load_wide, make_long, make_wide, create_index
from sklearn.model_selection import train_test_split

## Load data in wide format   
  - assign index name
  - reset index
  - shift all user IDs by 1
  - drop columns that are all NaNs
  - drop rows that are all NaNs
  - reindex column names, starting from 1

In [5]:
# Directory prefix for every data file read or written in this notebook.
# Kept as a plain string with a trailing slash because later cells build
# file names with `path + '<name>.csv'`.
path = "data/jester/"

In [6]:
# Read the raw ratings file through the project loader.
raw_csv = path + 'jester-data-3.csv'
wide = load_wide(raw_csv)

In [7]:
# Preview the first five rows of the wide table.
wide.head(n=5)

Unnamed: 0,user_id,0,1,2,3,4,5,6,7,8,...,130,131,132,133,134,135,136,137,138,139
0,0,0.21875,-9.28125,-9.28125,-6.78125,0.875,-9.65625,-9.03125,-7.46875,-8.71875,...,,,,,,,,,,
1,1,-9.6875,9.9375,9.53125,9.9375,0.40625,3.71875,9.65625,-2.6875,-9.5625,...,,,,,,,,,,
2,2,-9.84375,-9.84375,-7.21875,-2.03125,-9.9375,-9.96875,-9.875,-9.8125,-9.78125,...,,,,,,,,,,
3,3,6.90625,4.75,-5.90625,-0.40625,-4.03125,3.875,6.21875,5.65625,6.09375,...,,,,,,,,,,
4,4,-0.03125,-9.09375,-0.40625,7.5,-7.21875,-9.4375,0.125,-9.15625,3.65625,...,,,,,,,,,,


In [8]:
# Size of the wide table as (rows, columns).
wide.shape

(50692, 141)

## Convert to long format
  - use `df.melt()` function
  - drop rows with NaN ratings

In [9]:
# Reshape the wide table into long form with the project helper.
# NOTE(review): the three strings presumably name the id, variable and value
# columns of the result (user_id / joke_id / rating) — confirm against make_long.
long = make_long(wide, 'user_id', 'joke_id', 'rating')

In [10]:
# Show five randomly chosen rows; the fixed seed keeps the preview reproducible.
long.sample(n=5, random_state=42)

Unnamed: 0,user_id,joke_id,rating
65206,14514,1,2.15625
176590,24514,3,4.90625
4397609,38097,86,5.875
6008712,27056,118,5.46875
6145708,11976,121,5.53125


In [11]:
# Size of the long table as (rows, columns).
long.shape

(1728847, 3)

In [12]:
# Save the long-format ratings, leaving the row index out of the file.
long_csv = path + 'long.csv'
long.to_csv(long_csv, index=False)

### Create test set

In [9]:
# Hold out 25% of the ratings as a test set.
# Stratifying on user_id spreads each user's ratings across both splits in
# roughly the same 75/25 proportion; the fixed seed makes the split repeatable.
# The frame is passed once: train_test_split returns exactly (train, test) for
# a single input, so no throwaway duplicate outputs are produced.
train, test = train_test_split(
    long,
    test_size=0.25,
    random_state=42,
    stratify=long['user_id'],
)

In [10]:
# Size of the training split as (rows, columns).
train.shape

(1296635, 3)

In [11]:
# Size of the test split as (rows, columns).
test.shape

(432212, 3)

In [12]:
# Save both long-format splits to the data directory, leaving the row index out.
for split_frame, split_file in ((train, 'train_long.csv'), (test, 'test_long.csv')):
    split_frame.to_csv(path + split_file, index=False)

## Load data in long format

In [15]:
# Read the long-format training split back from disk.
train_csv = path + 'train_long.csv'
train = pd.read_csv(train_csv, index_col=False)

In [19]:
# Read the long-format test split back from disk.
test_csv = path + 'test_long.csv'
test = pd.read_csv(test_csv, index_col=False)

In [20]:
# Preview the first five rows of the test split.
test.head(n=5)

Unnamed: 0,user_id,joke_id,rating
0,30173,5,-9.4375
1,22868,133,-3.0
2,8434,71,-4.5625
3,30260,2,2.5625
4,29438,8,1.84375


## Convert to wide format
  - use `pd.pivot_table()` function
  - reorder columns
  - reset index
  - delete column index name

In [14]:
# Pivot each long-format split back to a wide table and save it as CSV.
# One loop replaces two copy-pasted pipelines, so both splits are always
# processed the same way.
for split_long, wide_file in ((train, 'train.csv'), (test, 'test.csv')):
    split_wide = make_wide(split_long, 'rating', 'user_id', 'joke_id')
    # user_id is dropped and the index is not written, so rows in the output
    # file are identified only by their position.
    split_wide.drop(columns='user_id').to_csv(path + wide_file, index=False)