# Import and format custom datasets for SCHOLAR

In [1]:
import os, sys, scipy, json
from scipy import sparse
import codecs
import numpy as np
import pandas as pd
import file_handling as fh
from sklearn.model_selection import train_test_split

## load data

In [7]:
if sys.platform == "darwin":
    #raw_data_path = "/Users/maximilianahrens/OneDrive - Nexus365/00_datasets/booking/booking_btr/"
    output_dir = raw_data_path + "scholar/"
else:
    raw_data_path = "/nfs/home/maxa/data/synth_btr/"
    output_dir = raw_data_path + "scholar/"
print(raw_data_path, "\n",output_dir)

/nfs/home/maxa/data/synth_btr/ 
 /nfs/home/maxa/data/synth_btr/scholar/


## vocab

In [122]:
vocab_df = pd.read_csv(raw_data_path +'/synth_vocab.csv', header = None)

In [265]:
vocab_df = vocab_df.astype(str)
vocab_df

Unnamed: 0,0
0,1
1,2
2,3
3,4
4,5
5,6
6,7
7,8
8,9


In [266]:
vocab = list(vocab_df[0])
len(vocab)

9

In [267]:
fh.write_to_json(vocab, output_dir + "train.vocab.json", indent=2, sort_keys=True)

## dtm

In [126]:
x_bow_raw = pd.read_csv(raw_data_path + "/synth_dtm_dense.csv", header = None).values

In [153]:
x_bow_raw

array([[ 0,  0,  0, ...,  6,  9,  9],
       [ 0,  0,  0, ..., 13, 10, 16],
       [ 0,  0,  0, ...,  6,  6,  0],
       ...,
       [15, 18, 15, ...,  0,  1,  1],
       [13, 16, 21, ...,  0,  0,  0],
       [18, 19, 13, ...,  0,  0,  0]])

In [128]:
x_bow = np.matrix(x_bow_raw)

In [129]:
x_bow.shape

(10000, 9)

In [130]:
# insample
sparse_Xtr = sparse.coo_matrix(x_bow).tocsr()
fh.save_sparse(sparse_Xtr, os.path.join(raw_data_path, "scholar", "train.npz"))

## doc IDs

In [131]:
# insample
train_ids = ["train_" + str(x) for x in list(range(x_bow.shape[0]))]

In [132]:
fh.write_to_json(train_ids, output_dir + "train.ids.json", indent=2, sort_keys=True)

In [133]:
synth_data = pd.read_csv(raw_data_path + "synth_data.csv",header = 0)

In [134]:
synth_data.shape

(10000, 6)

In [135]:
synth_data.head(3)

Unnamed: 0,doc_id,y,x1,Z_bar1,Z_bar2,Z_bar3
0,1,-0.129954,0.0,0.0,0.52,0.48
1,2,0.032899,0.0,0.0,0.22,0.78
2,3,-0.432909,0.0,0.0,0.76,0.24


## labels

In [136]:
train_y = synth_data.y

In [137]:
train_y.index = train_ids
train_y.to_csv(output_dir + "train.target.csv")

In [138]:
train_y

train_0      -0.129954
train_1       0.032899
train_2      -0.432909
train_3       0.052523
train_4      -0.103555
                ...   
train_9995   -0.399947
train_9996   -0.652622
train_9997   -0.596538
train_9998   -0.332729
train_9999   -0.652086
Name: y, Length: 10000, dtype: float64

## covariates

In [139]:
train_covars = synth_data[["x1"]].astype("float32") # choose which features to include

In [140]:
train_covars.index = train_ids

In [141]:
train_covars.to_csv(output_dir + "train.covars.csv")

In [142]:
train_covars.shape

(10000, 1)

In [144]:
train_covars

Unnamed: 0,x1
train_0,0.00
train_1,0.00
train_2,0.00
train_3,0.00
train_4,0.00
...,...
train_9995,0.28
train_9996,0.28
train_9997,0.30
train_9998,0.26


# For train-test split

In [145]:
output_dir_tt = "/nfs/home/maxa/data/synth_btr/scholar_split/"
try:
    os.mkdir(output_dir_tt)
except:
    pass

## vocab

In [268]:
vocab_df = pd.read_csv(raw_data_path +'/synth_vocab.csv', header = None)

In [269]:
vocab_df = vocab_df.astype(str)
vocab_df

Unnamed: 0,0
0,1
1,2
2,3
3,4
4,5
5,6
6,7
7,8
8,9


In [270]:
vocab = list(vocab_df[0])
len(vocab)

9

In [271]:
fh.write_to_json(vocab, output_dir_tt + "train.vocab.json", indent=2, sort_keys=True)

## Randomly assign train-test set

### join DTM and X

In [212]:
train_covars_org = synth_data[["x1"]].astype("float32") # choose which features to include
train_covars = np.array(train_covars_org.x1).reshape(train_covars_org.shape[0],1)
train_covars_org

Unnamed: 0,x1
0,0.00
1,0.00
2,0.00
3,0.00
4,0.00
...,...
9995,0.28
9996,0.28
9997,0.30
9998,0.26


In [213]:
x_bow_raw.shape, train_covars_org.shape

((10000, 9), (10000, 1))

In [214]:
X = np.concatenate((x_bow_raw,train_covars_org),axis=1)
X.shape

(10000, 10)

In [215]:
x_bow_raw

array([[ 0,  0,  0, ...,  6,  9,  9],
       [ 0,  0,  0, ..., 13, 10, 16],
       [ 0,  0,  0, ...,  6,  6,  0],
       ...,
       [15, 18, 15, ...,  0,  1,  1],
       [13, 16, 21, ...,  0,  0,  0],
       [18, 19, 13, ...,  0,  0,  0]])

In [227]:
X_train, X_test, train_y, test_y = train_test_split(X, synth_data.y, test_size=0.25, random_state=10)

In [244]:
x_bow_train = np.array(X_train[:,:9]).astype(int)
x_bow_test = np.array(X_test[:,:9]).astype(int)
train_covars = pd.Series(X_train[:,9])
test_covars = pd.Series(X_test[:,9])

In [246]:
test_covars

0       0.00
1       0.12
2       0.06
3       0.00
4       0.14
        ... 
2495    0.06
2496    0.06
2497    0.08
2498    0.18
2499    0.20
Length: 2500, dtype: float64

## dtm

In [247]:
x_bow_train.shape, x_bow_test.shape

((7500, 9), (2500, 9))

In [248]:
# train
sparse_Xtr = sparse.coo_matrix(x_bow_train).tocsr()
fh.save_sparse(sparse_Xtr, os.path.join(raw_data_path, "scholar_split", "train.npz"))
# test
sparse_Xte = sparse.coo_matrix(x_bow_test).tocsr()
fh.save_sparse(sparse_Xte, os.path.join(raw_data_path, "scholar_split", "test.npz"))

## doc IDs

In [249]:
train_ids = ["train_" + str(x) for x in list(range(x_bow_train.shape[0]))]
test_ids = ["test_" + str(x) for x in list(range(x_bow_test.shape[0]))]

In [250]:
fh.write_to_json(train_ids, output_dir_tt + "train.ids.json", indent=2, sort_keys=True)
fh.write_to_json(test_ids, output_dir_tt + "test.ids.json", indent=2, sort_keys=True)

## labels

In [251]:
train_y.index = train_ids
train_y.to_csv(output_dir_tt + "train.target.csv")

In [252]:
train_y

train_0      -0.294585
train_1      -0.177712
train_2      -0.131219
train_3      -0.470653
train_4      -0.005867
                ...   
train_7495   -0.670180
train_7496   -0.057686
train_7497   -0.011384
train_7498   -0.343680
train_7499   -0.259444
Name: y, Length: 7500, dtype: float64

In [253]:
test_y.index = test_ids
test_y.to_csv(output_dir_tt + "test.target.csv")

In [254]:
test_y

test_0      -0.094081
test_1      -0.662811
test_2       0.190503
test_3       0.283396
test_4      -0.646839
               ...   
test_2495   -0.308992
test_2496   -0.215948
test_2497   -0.213456
test_2498   -0.067916
test_2499   -0.444408
Name: y, Length: 2500, dtype: float64

## covariates

In [255]:
train_covars.index = train_ids
train_covars.to_csv(output_dir_tt + "train.covars.csv")
test_covars.index = test_ids
test_covars.to_csv(output_dir_tt + "test.covars.csv")

In [256]:
train_covars.shape, test_covars.shape

((7500,), (2500,))

In [261]:
train_covars

train_0       0.02
train_1       0.12
train_2       0.04
train_3       0.08
train_4       0.06
              ... 
train_7495    0.32
train_7496    0.26
train_7497    0.00
train_7498    0.24
train_7499    0.06
Length: 7500, dtype: float64