# Import and format custom datasets for SCHOLAR

In [1]:
import os, sys, scipy, json
from scipy import sparse
import codecs
import numpy as np
import pandas as pd
import file_handling as fh

# Semi-synthetic Booking data

## load data and save in SCHOLAR format

In [2]:
# Pick the machine-specific data root; every other path is derived from it.
if sys.platform == "darwin":
    # macOS: local copy of the dataset.
    raw_data_path = "/Users/maximilianahrens/OneDrive - Nexus365/00_datasets/booking/booking_btr/"
else:
    # Every other platform uses the /nfs location.
    raw_data_path = "/nfs/home/maxa/data/semisynth_btr/"
# Derive the SCHOLAR output folder once, after raw_data_path is guaranteed to
# exist (the darwin branch previously used raw_data_path before assigning it,
# which raised a NameError).
output_dir = raw_data_path + "scholar/"
print(raw_data_path, "\n",output_dir)

/nfs/home/maxa/data/semisynth_btr/ 
 /nfs/home/maxa/data/semisynth_btr/scholar/


## vocab

In [3]:
# Read the vocabulary CSV (first row is the header).
# keep_default_na=False stops pandas from converting tokens that look like
# missing-value markers (e.g. "nan", "null", "na") into float NaN, which would
# put non-string entries into the vocabulary.
vocab_df = pd.read_csv(raw_data_path +'preprocessed/booking_synth_vocab.csv', header = 0, keep_default_na = False)

In [4]:
vocab_df

Unnamed: 0,x1
0,aaabsolut
1,aback
2,abandon
3,abbbey
4,abbey
...,...
16332,zud
16333,zuid
16334,zur
16335,zurich


In [93]:
# Pull the "x1" column out as a plain Python list and show its length.
vocab = vocab_df["x1"].tolist()
len(vocab)

16337

In [94]:
# Write the vocabulary list to train.vocab.json inside the output directory.
vocab_json_path = output_dir + "train.vocab.json"
fh.write_to_json(vocab, vocab_json_path, indent=2, sort_keys=True)

## dtm

In [6]:
# Load the document-term matrix CSV (first row is the header) and keep only
# the underlying NumPy array.
dtm_csv_path = raw_data_path + "preprocessed/booking_synth_dtm.csv"
x_bow_raw = pd.read_csv(dtm_csv_path, header=0).values

In [7]:
# Keep the counts as a regular 2-D ndarray. NumPy no longer recommends the
# np.matrix class; row slicing, .shape and scipy.sparse.coo_matrix all work
# the same way on an ndarray.
x_bow = np.asarray(x_bow_raw)

In [8]:
x_bow.shape

(50000, 16337)

In [62]:
# insample: convert the full document-term matrix to CSR and save it as train.npz
train_npz_path = os.path.join(raw_data_path, "scholar", "train.npz")
sparse_Xtr = sparse.coo_matrix(x_bow).tocsr()
fh.save_sparse(sparse_Xtr, train_npz_path)

## doc IDs

In [9]:
# insample: one ID per row of the document-term matrix ("train_0", "train_1", ...)
n_train_docs = x_bow.shape[0]
train_ids = ["train_{}".format(row) for row in range(n_train_docs)]

In [80]:
# Write the document IDs to train.ids.json inside the output directory.
ids_json_path = output_dir + "train.ids.json"
fh.write_to_json(train_ids, ids_json_path, indent=2, sort_keys=True)

In [10]:
# Load the semi-synthetic Booking sample; the first CSV row holds the column names.
sample_csv_path = raw_data_path + "booking_semisynth_sample.csv"
semisynth_data = pd.read_csv(sample_csv_path, header=0)

In [11]:
semisynth_data.shape

(50000, 18)

In [12]:
semisynth_data.head(3)

Unnamed: 0,Average_Score,Reviewer_Nationality,Review_Total_Negative_Word_Counts,Total_Number_of_Reviews,Review_Total_Positive_Word_Counts,Total_Number_of_Reviews_Reviewer_Has_Given,Reviewer_Score,Leisure,Couple,text,text_clean,doc_idx,sentiment,pos_prop,av_score,synth_y,bar_rest,conf
0,8.1,Hungary,17,6373,12,6,10.0,0,0,Everything was great Cozy clean and just perf...,everyth great cozi clean just perfect breakfas...,1,1.062683,0.413793,-0.501244,3.547939,0,0
1,7.5,United Kingdom,20,2197,4,21,5.4,0,1,Good location . We travel through to Kensing...,good locat travel kensington everi week great ...,2,0.645357,0.166667,-1.574335,0.105846,0,0
2,7.5,United Kingdom,26,2176,3,7,4.2,0,1,Not Much. Room was dirty carpet very tired w...,much room dirti carpet tire wifi stop work who...,3,-1.209426,0.103448,-1.574335,-0.681302,0,1


## labels

In [13]:
# The synthetic outcome column is the prediction target.
train_y = semisynth_data["synth_y"]

In [14]:
# Label each target value with its document ID, then write the targets to CSV.
train_y.index = train_ids
target_csv_path = output_dir + "train.target.csv"
train_y.to_csv(target_csv_path)

In [15]:
train_y

train_0         3.547939
train_1         0.105846
train_2        -0.681302
train_3         1.719629
train_4         3.303026
                 ...    
train_49995    11.059970
train_49996     9.632751
train_49997     1.672547
train_49998     2.189951
train_49999     5.829085
Name: synth_y, Length: 50000, dtype: float64

## covariates

In [16]:
# choose which features to include
covar_columns = ["Leisure", "av_score"]
train_covars = semisynth_data[covar_columns].astype("float32")

In [17]:
# Replace the row labels of the covariate table with the document IDs.
train_covars.index = train_ids

In [18]:
# Write the covariate table to train.covars.csv inside the output directory.
covars_csv_path = output_dir + "train.covars.csv"
train_covars.to_csv(covars_csv_path)

In [19]:
train_covars.shape

(50000, 2)

In [20]:
train_covars

Unnamed: 0,Leisure,av_score
train_0,0.0,-0.501244
train_1,0.0,-1.574335
train_2,0.0,-1.574335
train_3,0.0,-0.501244
train_4,0.0,-2.468577
...,...,...
train_49995,1.0,0.035301
train_49996,1.0,2.002634
train_49997,0.0,-0.143547
train_49998,1.0,0.035301


# Train-test split

In [46]:
# Output folder for the train/test split files. Derived from raw_data_path so
# it follows the platform-specific data root instead of a fixed /nfs path.
output_dir_tt = raw_data_path + "scholar_split/"

## vocab

In [63]:
# Read the vocabulary CSV (first row is the header).
# keep_default_na=False stops pandas from converting tokens that look like
# missing-value markers (e.g. "nan", "null", "na") into float NaN, which would
# put non-string entries into the vocabulary.
vocab_df = pd.read_csv(raw_data_path +'preprocessed/booking_synth_vocab.csv', header = 0, keep_default_na = False)

In [64]:
vocab_df

Unnamed: 0,x1
0,aaabsolut
1,aback
2,abandon
3,abbbey
4,abbey
...,...
16332,zud
16333,zuid
16334,zur
16335,zurich


In [65]:
# Pull the "x1" column out as a plain Python list and show its length.
vocab = vocab_df["x1"].tolist()
len(vocab)

16337

In [66]:
# Write the vocabulary list to train.vocab.json inside the split output directory.
vocab_json_path_tt = output_dir_tt + "train.vocab.json"
fh.write_to_json(vocab, vocab_json_path_tt, indent=2, sort_keys=True)

## dtm

In [47]:
x_bow.shape

(50000, 16337)

In [48]:
# Row index separating train from test: the first 80% of the rows are training data.
train_fraction = 0.8
n_docs = x_bow.shape[0]
cut = int(train_fraction * n_docs)
cut

40000

In [49]:
# Rows before `cut` form the training matrix, the remaining rows the test matrix.
x_bow_train, x_bow_test = x_bow[:cut], x_bow[cut:]
x_bow_train.shape, x_bow_test.shape

((40000, 16337), (10000, 16337))

In [27]:
# Save both sparse matrices into output_dir_tt, the folder that also receives
# the split's vocab, ID, target and covariate files, so all split artefacts
# end up in one place even when output_dir_tt and raw_data_path differ.
# train
sparse_Xtr = sparse.coo_matrix(x_bow_train).tocsr()
fh.save_sparse(sparse_Xtr, os.path.join(output_dir_tt, "train.npz"))
# test
sparse_Xte = sparse.coo_matrix(x_bow_test).tocsr()
fh.save_sparse(sparse_Xte, os.path.join(output_dir_tt, "test.npz"))

## doc IDs

In [50]:
# Per-split document IDs, each numbered from 0: "train_<row>" and "test_<row>".
n_train_rows, n_test_rows = x_bow_train.shape[0], x_bow_test.shape[0]
train_ids = ["train_{}".format(row) for row in range(n_train_rows)]
test_ids = ["test_{}".format(row) for row in range(n_test_rows)]

In [51]:
# Write the train and test ID lists as JSON into the split output directory.
train_ids_path = output_dir_tt + "train.ids.json"
test_ids_path = output_dir_tt + "test.ids.json"
fh.write_to_json(train_ids, train_ids_path, indent=2, sort_keys=True)
fh.write_to_json(test_ids, test_ids_path, indent=2, sort_keys=True)

## labels

In [52]:
# Load the semi-synthetic Booking sample; the first CSV row holds the column names.
sample_csv_path = raw_data_path + "booking_semisynth_sample.csv"
semisynth_data = pd.read_csv(sample_csv_path, header=0)

In [53]:
semisynth_data.shape

(50000, 18)

In [54]:
semisynth_data.head(2)

Unnamed: 0,Average_Score,Reviewer_Nationality,Review_Total_Negative_Word_Counts,Total_Number_of_Reviews,Review_Total_Positive_Word_Counts,Total_Number_of_Reviews_Reviewer_Has_Given,Reviewer_Score,Leisure,Couple,text,text_clean,doc_idx,sentiment,pos_prop,av_score,synth_y,bar_rest,conf
0,8.1,Hungary,17,6373,12,6,10.0,0,0,Everything was great Cozy clean and just perf...,everyth great cozi clean just perfect breakfas...,1,1.062683,0.413793,-0.501244,3.547939,0,0
1,7.5,United Kingdom,20,2197,4,21,5.4,0,1,Good location . We travel through to Kensing...,good locat travel kensington everi week great ...,2,0.645357,0.166667,-1.574335,0.105846,0,0


In [55]:
# Take the synthetic outcome column and split it at the same row as the
# document-term matrix.
data_y = semisynth_data["synth_y"]
train_y, test_y = data_y[:cut], data_y[cut:]

In [56]:
# Label the training targets with their document IDs, then write them to CSV.
train_y.index = train_ids
train_target_path = output_dir_tt + "train.target.csv"
train_y.to_csv(train_target_path)

In [57]:
train_y

train_0         3.547939
train_1         0.105846
train_2        -0.681302
train_3         1.719629
train_4         3.303026
                 ...    
train_39995     6.130856
train_39996     1.963663
train_39997     1.792113
train_39998    10.060682
train_39999     9.967898
Name: synth_y, Length: 40000, dtype: float64

In [58]:
# Label the test targets with their document IDs, then write them to CSV.
test_y.index = test_ids
test_target_path = output_dir_tt + "test.target.csv"
test_y.to_csv(test_target_path)

In [59]:
test_y

test_0        9.388680
test_1        9.416876
test_2        2.281902
test_3        3.232687
test_4        4.279789
               ...    
test_9995    11.059970
test_9996     9.632751
test_9997     1.672547
test_9998     2.189951
test_9999     5.829085
Name: synth_y, Length: 10000, dtype: float64

## covariates

In [60]:
# choose which features to include
covar_columns = ["Leisure", "av_score"]
data_covars = semisynth_data[covar_columns].astype("float32")
# Split the covariates at the same row as the document-term matrix.
train_covars, test_covars = data_covars[:cut], data_covars[cut:]

In [61]:
# Label each covariate table with its split's document IDs and write it to CSV.
train_covars.index = train_ids
train_covars_path = output_dir_tt + "train.covars.csv"
train_covars.to_csv(train_covars_path)
test_covars.index = test_ids
test_covars_path = output_dir_tt + "test.covars.csv"
test_covars.to_csv(test_covars_path)

In [62]:
train_covars.shape, test_covars.shape

((40000, 2), (10000, 2))