# Import and format custom datasets for SCHOLAR

In [166]:
import os, sys, scipy, json
from scipy import sparse
import codecs
import numpy as np
import pandas as pd
import file_handling as fh
from sklearn.model_selection import train_test_split
import tensorflow as tf

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


## load data

In [15]:
# Resolve the dataset root (raw_data_path) and the default SCHOLAR output folder.
# When the SYNTH_BTR_DIR environment variable is set it overrides the built-in
# location, so the notebook can run away from the cluster.
_data_root_override = os.environ.get("SYNTH_BTR_DIR")
if _data_root_override:
    # os.path.join(..., "") appends the trailing separator that the string
    # concatenations in later cells rely on.
    raw_data_path = os.path.join(_data_root_override, "")
elif sys.platform == "darwin":
    # The macOS path below is commented out, so this branch used to die with a
    # NameError on the next line; fail with an actionable message instead.
    #raw_data_path = "/Users/maximilianahrens/OneDrive - Nexus365/00_datasets/booking/booking_btr/"
    raise RuntimeError(
        "No default data directory is configured for macOS; "
        "set the SYNTH_BTR_DIR environment variable to the synth_btr folder."
    )
else:
    raw_data_path = "/nfs/home/maxa/data/synth_btr/"
# Same output folder rule on every platform.
output_dir = raw_data_path + "scholar/"
print(raw_data_path, "\n",output_dir)

/nfs/home/maxa/data/synth_btr/ 
 /nfs/home/maxa/data/synth_btr/scholar/


In [23]:
# Switch the output folder to the multi-target regression variant and make sure
# it exists.
output_dir = raw_data_path + "scholar_multireg/"
# exist_ok=True keeps the cell re-runnable; unlike the former bare
# `except: pass`, real failures (e.g. permission errors) are no longer hidden.
os.makedirs(output_dir, exist_ok=True)

## vocab

In [24]:
# Read the vocabulary file from the data folder; the CSV has no header row.
vocab_csv = raw_data_path +'/synth_vocab.csv'
vocab_df = pd.read_csv(vocab_csv, header=None)

In [25]:
# Cast every vocabulary entry to str; the trailing expression displays the frame.
vocab_df = vocab_df.astype(str)
vocab_df

Unnamed: 0,0
0,1
1,2
2,3
3,4
4,5
5,6
6,7
7,8
8,9


In [26]:
# Column 0 holds the terms; keep them as a plain Python list and show its length.
vocab = [term for term in vocab_df[0]]
len(vocab)

9

In [27]:
# Write the vocabulary list to <output_dir>/train.vocab.json through the
# file_handling helper module (imported as fh).
fh.write_to_json(vocab, output_dir + "train.vocab.json", indent=2, sort_keys=True)

## Document-term matrix (DTM)

In [214]:
# Load the dense document-term matrix (CSV without header row) as a NumPy array.
dtm_csv = raw_data_path + "/synth_dtm_dense.csv"
x_bow_raw = pd.read_csv(dtm_csv, header=None).values

In [215]:
x_bow_raw

array([[ 0,  0,  0, ...,  6,  9,  9],
       [ 0,  0,  0, ..., 13, 10, 16],
       [ 0,  0,  0, ...,  6,  6,  0],
       ...,
       [15, 18, 15, ...,  0,  1,  1],
       [13, 16, 21, ...,  0,  0,  0],
       [18, 19, 13, ...,  0,  0,  0]])

In [216]:
# Working copy of the document-term matrix. A plain ndarray replaces np.matrix,
# which NumPy no longer recommends; slicing, .shape and sparse.coo_matrix()
# behave the same for this 2-D input. np.array() copies, as np.matrix() did.
x_bow = np.array(x_bow_raw)

In [217]:
x_bow.shape

(10000, 9)

In [219]:
# Keep only the first 2500 rows of the document-term matrix.
# NOTE(review): this overwrites x_bow in place, so every later cell that reads
# x_bow (the .npz exports and the train_ids construction) sees 2500 rows when
# the notebook is run top to bottom — confirm that is intended.
x_bow = x_bow[:2500]

In [220]:
x_bow.shape

(2500, 9)

In [32]:
# insample
sparse_Xtr = sparse.coo_matrix(x_bow).tocsr()
fh.save_sparse(sparse_Xtr, os.path.join(raw_data_path, "scholar", "train.npz"))

In [218]:
# insample
sparse_Xtr = sparse.coo_matrix(x_bow).tocsr()
fh.save_sparse(sparse_Xtr, os.path.join(raw_data_path, "scholar_multireg", "train.npz"))

In [221]:
# insample
sparse_Xtr = sparse.coo_matrix(x_bow).tocsr()
fh.save_sparse(sparse_Xtr, os.path.join(raw_data_path, "scholar_multireg", "test.npz"))

## doc IDs

In [33]:
# insample
# One document id ("train_0", "train_1", ...) per row currently held in x_bow.
n_train_docs = x_bow.shape[0]
train_ids = ["train_" + str(doc_idx) for doc_idx in range(n_train_docs)]

In [34]:
# Write the document ids to <output_dir>/train.ids.json through the
# file_handling helper module (imported as fh).
fh.write_to_json(train_ids, output_dir + "train.ids.json", indent=2, sort_keys=True)

In [35]:
# Load the synthetic data table; the first line of the CSV is the header.
synth_csv = raw_data_path + "synth_data.csv"
synth_data = pd.read_csv(synth_csv, header=0)

In [36]:
synth_data.shape

(10000, 6)

In [37]:
synth_data.head(3)

Unnamed: 0,doc_id,y,x1,Z_bar1,Z_bar2,Z_bar3
0,1,-0.129954,0.0,0.0,0.52,0.48
1,2,0.032899,0.0,0.0,0.22,0.78
2,3,-0.432909,0.0,0.0,0.76,0.24


## labels

In [58]:
# First regression target: the y column of the synthetic data.
train_y = synth_data.y

In [59]:
# Second target: y squared, shifted by +0.5.
train_y2 = train_y**2 + 0.5

In [60]:
# Third target: y to the fourth power, shifted by -2.
train_y3 = train_y**4 - 2

In [61]:
# Place the three target series next to each other, one column per target.
train_y = pd.concat([train_y, train_y2, train_y3], axis=1)

In [63]:
# Name the target columns y1..y3 (modifies train_y in place).
train_y.columns = ["y1","y2","y3"]

In [64]:
train_y

Unnamed: 0,y1,y2,y3
0,-0.129954,0.516888,-1.999715
1,0.032899,0.501082,-1.999999
2,-0.432909,0.687410,-1.964877
3,0.052523,0.502759,-1.999992
4,-0.103555,0.510724,-1.999885
...,...,...,...
9995,-0.399947,0.659957,-1.974414
9996,-0.652622,0.925915,-1.818596
9997,-0.596538,0.855858,-1.873365
9998,-0.332729,0.610709,-1.987744


In [65]:
# Label the target rows with the document ids, then export them as CSV.
train_target_csv = output_dir + "train.target.csv"
train_y.index = train_ids
train_y.to_csv(train_target_csv)

In [66]:
train_y

Unnamed: 0,y1,y2,y3
train_0,-0.129954,0.516888,-1.999715
train_1,0.032899,0.501082,-1.999999
train_2,-0.432909,0.687410,-1.964877
train_3,0.052523,0.502759,-1.999992
train_4,-0.103555,0.510724,-1.999885
...,...,...,...
train_9995,-0.399947,0.659957,-1.974414
train_9996,-0.652622,0.925915,-1.818596
train_9997,-0.596538,0.855858,-1.873365
train_9998,-0.332729,0.610709,-1.987744


## covariates

In [115]:
covar_columns = ["x1"]  # choose which features to include
train_covars = synth_data[covar_columns].astype("float32")

In [116]:
# Derived covariate: the selected covariate(s) squared, shifted by -3.
train_covars2 = train_covars**2 - 3

In [117]:
# Put the raw covariate and its transform side by side, name the two columns,
# and display the result.
train_covars = pd.concat([train_covars, train_covars2], axis="columns")
train_covars.columns = ["x1","x2"]
train_covars

Unnamed: 0,x1,x2
0,0.00,-3.0000
1,0.00,-3.0000
2,0.00,-3.0000
3,0.00,-3.0000
4,0.00,-3.0000
...,...,...
9995,0.28,-2.9216
9996,0.28,-2.9216
9997,0.30,-2.9100
9998,0.26,-2.9324


In [118]:
# Label the covariate rows with the document ids (modifies train_covars in place).
train_covars.index = train_ids

In [119]:
# Export the covariates to <output_dir>/train.covars.csv.
train_covars.to_csv(output_dir + "train.covars.csv")

In [120]:
train_covars.shape

(10000, 2)

In [121]:
train_covars

Unnamed: 0,x1,x2
train_0,0.00,-3.0000
train_1,0.00,-3.0000
train_2,0.00,-3.0000
train_3,0.00,-3.0000
train_4,0.00,-3.0000
...,...,...
train_9995,0.28,-2.9216
train_9996,0.28,-2.9216
train_9997,0.30,-2.9100
train_9998,0.26,-2.9324


# Train-test split

In [145]:
# Folder that receives the train/test-split files; create it when missing.
output_dir_tt = "/nfs/home/maxa/data/synth_btr/scholar_split/"
# exist_ok=True keeps the cell re-runnable; unlike the former bare
# `except: pass`, real failures (e.g. permission errors) surface here instead
# of at the first write into the folder.
os.makedirs(output_dir_tt, exist_ok=True)

In [122]:
# Keep the split outputs in the scholar_split folder. This cell used to alias
# output_dir_tt to output_dir (scholar_multireg at this point in the notebook),
# which would send the split's ids/targets/covariates to a different folder
# than the split .npz matrices and than the later reads of
# scholar_split/test.target.csv and scholar_split/test.covars.csv.
output_dir_tt = raw_data_path + "scholar_split/"

## vocab

In [268]:
# Re-read the vocabulary file (CSV without header row) for the split export.
vocab_csv = raw_data_path +'/synth_vocab.csv'
vocab_df = pd.read_csv(vocab_csv, header=None)

In [269]:
# Cast every vocabulary entry to str; the trailing expression displays the frame.
vocab_df = vocab_df.astype(str)
vocab_df

Unnamed: 0,0
0,1
1,2
2,3
3,4
4,5
5,6
6,7
7,8
8,9


In [270]:
# Column 0 holds the terms; keep them as a plain Python list and show its length.
vocab = [term for term in vocab_df[0]]
len(vocab)

9

In [271]:
# Write the vocabulary list to <output_dir_tt>/train.vocab.json through the
# file_handling helper module (imported as fh).
fh.write_to_json(vocab, output_dir_tt + "train.vocab.json", indent=2, sort_keys=True)

## Randomly assign train-test set

### join DTM and X

In [212]:
# Select the covariate column as float32, and also keep x1 as an (n_docs, 1) array.
train_covars_org = synth_data[["x1"]].astype("float32") # choose which features to include
n_docs = train_covars_org.shape[0]
train_covars = np.array(train_covars_org.x1).reshape(n_docs, 1)
train_covars_org

Unnamed: 0,x1
0,0.00
1,0.00
2,0.00
3,0.00
4,0.00
...,...
9995,0.28
9996,0.28
9997,0.30
9998,0.26


In [213]:
x_bow_raw.shape, train_covars_org.shape

((10000, 9), (10000, 1))

In [214]:
# Append the covariate column to the right of the document-term matrix so both
# are shuffled and split together.
X = np.hstack((x_bow_raw, train_covars_org))
X.shape

(10000, 10)

In [215]:
x_bow_raw

array([[ 0,  0,  0, ...,  6,  9,  9],
       [ 0,  0,  0, ..., 13, 10, 16],
       [ 0,  0,  0, ...,  6,  6,  0],
       ...,
       [15, 18, 15, ...,  0,  1,  1],
       [13, 16, 21, ...,  0,  0,  0],
       [18, 19, 13, ...,  0,  0,  0]])

In [227]:
# Hold out 25% of the rows as a test set; the fixed random_state makes the split repeatable.
X_train, X_test, train_y, test_y = train_test_split(
    X,
    synth_data.y,
    test_size=0.25,
    random_state=10,
)

In [244]:
# Undo the column-wise join made before the split: the first n_vocab columns
# are the word counts and the column right after them is the covariate.
# n_vocab is taken from the document-term matrix instead of hard-coding 9, so
# the cell keeps working when the vocabulary size changes.
n_vocab = x_bow_raw.shape[1]
x_bow_train = np.array(X_train[:, :n_vocab]).astype(int)
x_bow_test = np.array(X_test[:, :n_vocab]).astype(int)
train_covars = pd.Series(X_train[:, n_vocab])
test_covars = pd.Series(X_test[:, n_vocab])

In [246]:
test_covars

0       0.00
1       0.12
2       0.06
3       0.00
4       0.14
        ... 
2495    0.06
2496    0.06
2497    0.08
2498    0.18
2499    0.20
Length: 2500, dtype: float64

## Document-term matrix (DTM)

In [247]:
x_bow_train.shape, x_bow_test.shape

((7500, 9), (2500, 9))

In [248]:
# train
sparse_Xtr = sparse.coo_matrix(x_bow_train).tocsr()
fh.save_sparse(sparse_Xtr, os.path.join(raw_data_path, "scholar_split", "train.npz"))
# test
sparse_Xte = sparse.coo_matrix(x_bow_test).tocsr()
fh.save_sparse(sparse_Xte, os.path.join(raw_data_path, "scholar_split", "test.npz"))

## doc IDs

In [249]:
# Sequential ids for each split: "train_<row>" and "test_<row>".
train_ids = ["train_" + str(row) for row in range(len(x_bow_train))]
test_ids = ["test_" + str(row) for row in range(len(x_bow_test))]

In [250]:
# Write the id list of each split to <output_dir_tt>/<split>.ids.json (train first, then test).
for split_name, split_ids in (("train", train_ids), ("test", test_ids)):
    fh.write_to_json(split_ids, output_dir_tt + split_name + ".ids.json", indent=2, sort_keys=True)

## labels

In [251]:
# Label the training targets with their ids, then export them as CSV.
train_target_csv = output_dir_tt + "train.target.csv"
train_y.index = train_ids
train_y.to_csv(train_target_csv)

In [252]:
train_y

train_0      -0.294585
train_1      -0.177712
train_2      -0.131219
train_3      -0.470653
train_4      -0.005867
                ...   
train_7495   -0.670180
train_7496   -0.057686
train_7497   -0.011384
train_7498   -0.343680
train_7499   -0.259444
Name: y, Length: 7500, dtype: float64

In [253]:
# Label the test targets with their ids, then export them as CSV.
test_target_csv = output_dir_tt + "test.target.csv"
test_y.index = test_ids
test_y.to_csv(test_target_csv)

In [254]:
test_y

test_0      -0.094081
test_1      -0.662811
test_2       0.190503
test_3       0.283396
test_4      -0.646839
               ...   
test_2495   -0.308992
test_2496   -0.215948
test_2497   -0.213456
test_2498   -0.067916
test_2499   -0.444408
Name: y, Length: 2500, dtype: float64

## covariates

In [255]:
# Label each covariate series with its split's ids and export it as CSV
# (train first, then test).
for split_covars, split_ids, file_name in (
    (train_covars, train_ids, "train.covars.csv"),
    (test_covars, test_ids, "test.covars.csv"),
):
    split_covars.index = split_ids
    split_covars.to_csv(output_dir_tt + file_name)

In [256]:
train_covars.shape, test_covars.shape

((7500,), (2500,))

In [261]:
train_covars

train_0       0.02
train_1       0.12
train_2       0.04
train_3       0.08
train_4       0.06
              ... 
train_7495    0.32
train_7496    0.26
train_7497    0.00
train_7498    0.24
train_7499    0.06
Length: 7500, dtype: float64

In [131]:
# Read back the single-target test labels of the split; the first CSV column
# becomes the index.
test_y_multi = pd.read_csv("/nfs/home/maxa/data/synth_btr/scholar_split/test.target.csv", index_col = 0)

In [135]:
# Add the two derived targets: y squared plus 0.5, and y to the fourth minus 2.
y_test_col = test_y_multi.y
test_y_multi["y2"] = y_test_col**2 + 0.5
test_y_multi["y3"] = y_test_col**4 - 2

In [138]:
# Rename the three target columns to y1..y3 (modifies test_y_multi in place).
test_y_multi.columns = ["y1","y2","y3"]

In [140]:
# Export the multi-target test labels into the scholar_multireg folder.
test_y_multi.to_csv("/nfs/home/maxa/data/synth_btr/scholar_multireg/test.target.csv")

In [141]:
# Read back the test covariates of the split; the first CSV column becomes the index.
test_x_multi = pd.read_csv("/nfs/home/maxa/data/synth_btr/scholar_split/test.covars.csv", index_col = 0)

In [146]:
# Derived covariate: the covariate squared, shifted by -3.
# The source column is addressed as "0" — presumably the header written for the
# unnamed covariate Series when test.covars.csv was exported; verify if the
# export changes.
test_x_multi["x2"] = test_x_multi["0"]**2 - 3

In [148]:
# Rename the two covariate columns to x1 and x2 (modifies test_x_multi in place).
test_x_multi.columns = ["x1","x2"]

In [149]:
test_x_multi

Unnamed: 0,x1,x2
test_0,0.00,-3.0000
test_1,0.12,-2.9856
test_2,0.06,-2.9964
test_3,0.00,-3.0000
test_4,0.14,-2.9804
...,...,...
test_2495,0.06,-2.9964
test_2496,0.06,-2.9964
test_2497,0.08,-2.9936
test_2498,0.18,-2.9676


In [150]:
# Export the two-column test covariates into the scholar_multireg folder.
test_x_multi.to_csv("/nfs/home/maxa/data/synth_btr/scholar_multireg/test.covars.csv")

In [162]:
# Use 1.5 times the true third target as its prediction.
# NOTE(review): test_y_multi_pred is not created in any cell visible in this
# notebook; it has to exist in the kernel already.
# Item assignment replaces attribute assignment: `frame.col = ...` only updates
# a column that already exists and otherwise just sets a Python attribute.
test_y_multi_pred["y3"] = test_y_multi["y3"] * 1.5

In [163]:
# Per-target mean squared error (one value per column). The reduction axis is
# explicit so the result does not depend on how np.sum(DataFrame) is dispatched
# by the installed pandas version.
((test_y_multi - test_y_multi_pred) ** 2).sum(axis=0) / float(test_y_multi_pred.shape[0])

y1    0.250000
y2    0.168888
y3    0.959500
dtype: float64

In [224]:
task_loss = 0.

In [243]:
# Per-target mean squared error as a Series (one entry per column); axis=0 is
# explicit so the result does not depend on how np.sum(DataFrame) is dispatched
# by the installed pandas version. The second line shows the result type.
task_loss = ((test_y_multi - test_y_multi_pred) ** 2).sum(axis=0) / float(test_y_multi_pred.shape[0])
type(task_loss)

pandas.core.series.Series

In [245]:
# Per-target population variance (ddof=0, the np.var default), with the
# reduction axis spelled out instead of relying on np.var(DataFrame) dispatch.
test_y_multi.var(axis=0, ddof=0).values

array([0.07864316, 0.02889749, 0.01494024])

In [242]:
1-0.25/0.078643

-2.178922472438742

In [244]:
# Type of the per-target 1 - loss/variance score; the variance is the explicit
# per-column population variance (ddof=0, the np.var default).
type(1-(task_loss/test_y_multi.var(axis=0, ddof=0)))

pandas.core.series.Series

In [236]:
np.sum(task_loss)

1.3783883189849462

In [234]:
# Plain NumPy copies of the true and predicted target tables.
Y, Y_pred = (np.array(frame) for frame in (test_y_multi, test_y_multi_pred))

In [237]:
pd.DataFrame(Y)

Unnamed: 0,0,1,2
0,-0.094081,0.508851,-1.999922
1,-0.662811,0.939318,-1.806999
2,0.190503,0.536292,-1.998683
3,0.283396,0.580313,-1.993550
4,-0.646839,0.918401,-1.824940
...,...,...,...
2495,-0.308992,0.595476,-1.990884
2496,-0.215948,0.546634,-1.997825
2497,-0.213456,0.545563,-1.997924
2498,-0.067916,0.504613,-1.999979


In [235]:
# Scalar loss: squared errors summed over all rows and targets, divided by the
# number of test rows.
residuals = Y - Y_pred
n_test_rows = float(test_y_multi_pred.shape[0])
task_loss = np.sum(residuals**2) / n_test_rows
task_loss

1.3783883189849462

In [235]:
# Scalar loss: squared errors summed over all rows and targets, divided by the
# number of test rows.
# NOTE(review): this cell is an exact repeat of the preceding one.
task_loss = np.sum((Y - Y_pred )**2) / float(test_y_multi_pred.shape[0])
task_loss

1.3783883189849462

In [172]:
# Wrap the true and predicted target tables as TensorFlow constants and display
# them. This rebinds Y and Y_pred, which held NumPy arrays before.
Y = tf.constant(test_y_multi)
Y_pred = tf.constant(test_y_multi_pred)
Y, Y_pred

(<tf.Tensor 'Const_4:0' shape=(2500, 3) dtype=float64>,
 <tf.Tensor 'Const_5:0' shape=(2500, 3) dtype=float64>)

In [190]:
res = tf.reduce_sum(tf.squared_difference(Y, Y_pred),1) # per-row sum of squared errors over the targets (a sum, not a mean)

In [191]:
tf.squared_difference(Y, Y_pred)

<tf.Tensor 'SquaredDifference_5:0' shape=(2500, 3) dtype=float64>

In [192]:
# Same per-row sum of squared errors, but keepdims=True keeps the reduced axis
# so the result has one column instead of being flattened.
out= tf.reduce_sum(tf.squared_difference(Y, Y_pred), axis=1, keepdims=True)

In [193]:
out

<tf.Tensor 'Sum_4:0' shape=(2500, 1) dtype=float64>

In [194]:
# Scratch check of tf.print on a small range tensor, directed to stderr.
# NOTE(review): the recorded cell output is a PrintV2 operation, not the tensor
# values — presumably the op still has to be run for anything to be printed.
tensor = tf.range(10)
tf.print(tensor, output_stream=sys.stderr)

<tf.Operation 'PrintV2' type=PrintV2>

In [195]:
tensor

<tf.Tensor 'range:0' shape=(10,) dtype=int32>

In [196]:
tf.print(tensor)

<tf.Operation 'PrintV2_1' type=PrintV2>

In [197]:
# Re-open the exported multireg test matrix file to inspect its stored arrays.
dtm2 = np.load("/nfs/home/maxa/data/synth_btr/scholar_multireg/test.npz")

In [206]:
dtm2["data"].shape

(19980,)