# Import and format custom datasets for SCHOLAR

In [None]:
import os, sys, scipy, json
from scipy import sparse
import codecs
import numpy as np
import pandas as pd
import file_handling as fh

# Semi-synthetic data

## load data and save in SCHOLAR format

In [None]:
# Resolve the dataset root for this machine and derive the output folder from it.
# Setting the SCHOLAR_RAW_DATA_PATH environment variable overrides the
# per-platform defaults below, so the cell need not be edited to run elsewhere.
if sys.platform == "darwin":
    # This assignment used to be commented out, which made the macOS branch fail
    # with a NameError as soon as output_dir was derived from raw_data_path.
    # NOTE(review): path restored from the old comment — confirm it is still the
    # right dataset location on macOS.
    default_raw_data_path = "/Users/maximilianahrens/OneDrive - Nexus365/00_datasets/booking/booking_btr/"
else:
    default_raw_data_path = "/nfs/home/maxa/data/semisynth_btr/"
# os.path.join(path, "") guarantees a trailing separator; other cells build file
# names by plain string concatenation onto raw_data_path / output_dir.
raw_data_path = os.path.join(os.environ.get("SCHOLAR_RAW_DATA_PATH") or default_raw_data_path, "")
output_dir = raw_data_path + "scholar/"
print(raw_data_path, "\n",output_dir)

## vocab

In [None]:
# Read the vocabulary table; the first row of the CSV is the header.
# keep_default_na=False stops pandas from converting vocabulary tokens that look
# like missing-value markers (e.g. "null", "nan", "n/a") into float NaN, which
# would otherwise leak into the exported vocabulary list. Genuinely empty cells
# are read as empty strings instead of NaN as a consequence.
vocab_df = pd.read_csv(raw_data_path +'preprocessed/booking_synth_vocab.csv', header = 0, keep_default_na=False)

In [None]:
# Show the loaded vocabulary table as the cell output for a quick visual check.
vocab_df

In [93]:
# Pull the "x1" column out as a plain Python list; the trailing expression
# reports how many vocabulary entries it contains.
vocab = vocab_df["x1"].tolist()
len(vocab)

16337

In [94]:
# Export the vocabulary list as JSON into output_dir.
vocab_json_path = output_dir + "train.vocab.json"
fh.write_to_json(vocab, vocab_json_path, indent=2, sort_keys=True)

## Document-term matrix (DTM)

In [16]:
# Load the DTM CSV (header in the first row) and keep only the underlying
# NumPy array of cell values.
dtm_csv_path = raw_data_path + "preprocessed/booking_synth_dtm.csv"
x_bow_raw = pd.read_csv(dtm_csv_path, header=0).values

In [19]:
# x_bow_raw comes from DataFrame.values and is therefore already a NumPy array;
# np.asarray hands it back without copying. The previous np.matrix wrapper was
# dropped: NumPy documents that class as no longer recommended, and by default
# it makes a full copy of the dense data. Only `.shape` and conversion to a
# scipy sparse matrix are applied to x_bow in this notebook, both of which
# behave the same on a 2-D ndarray.
x_bow = np.asarray(x_bow_raw)

In [21]:
# Report the dimensions of the bag-of-words array.
# NOTE(review): the output recorded beneath this cell is a pair of shapes, which
# a single `.shape` cannot produce — it looks stale; re-run the cell to refresh.
x_bow.shape

((40000, 16337), (10000, 16337))

In [62]:
# insample
# Convert the dense bag-of-words array to CSR format and store it as train.npz.
sparse_Xtr = sparse.coo_matrix(x_bow).tocsr()
# Write into output_dir like every other export in this notebook, instead of
# rebuilding the "scholar" folder location separately from raw_data_path; the
# resulting path is the same as long as output_dir is raw_data_path + "scholar/".
fh.save_sparse(sparse_Xtr, os.path.join(output_dir, "train.npz"))

## doc IDs

In [67]:
# insample
# One identifier per row of x_bow, of the form "train_<row number>".
n_train_docs = x_bow.shape[0]
train_ids = [f"train_{row}" for row in range(n_train_docs)]

In [80]:
# Export the document IDs as JSON into output_dir.
train_ids_json_path = output_dir + "train.ids.json"
fh.write_to_json(train_ids, train_ids_json_path, indent=2, sort_keys=True)

In [68]:
# Load the semi-synthetic sample table; the first row of the CSV holds the
# column names.
semisynth_csv_path = raw_data_path + "booking_semisynth_sample.csv"
semisynth_data = pd.read_csv(semisynth_csv_path, header=0)

In [69]:
# Report (rows, columns) of the loaded sample table.
semisynth_data.shape

(50000, 18)

In [70]:
# Preview the first three rows to check the columns and their values.
semisynth_data.head(3)

Unnamed: 0,Average_Score,Reviewer_Nationality,Review_Total_Negative_Word_Counts,Total_Number_of_Reviews,Review_Total_Positive_Word_Counts,Total_Number_of_Reviews_Reviewer_Has_Given,Reviewer_Score,Leisure,Couple,text,text_clean,doc_idx,sentiment,pos_prop,av_score,synth_y,bar_rest,conf
0,8.1,Hungary,17,6373,12,6,10.0,0,0,Everything was great Cozy clean and just perf...,everyth great cozi clean just perfect breakfas...,1,1.062683,0.413793,-0.501244,3.547939,0,0
1,7.5,United Kingdom,20,2197,4,21,5.4,0,1,Good location . We travel through to Kensing...,good locat travel kensington everi week great ...,2,0.645357,0.166667,-1.574335,0.105846,0,0
2,7.5,United Kingdom,26,2176,3,7,4.2,0,1,Not Much. Room was dirty carpet very tired w...,much room dirti carpet tire wifi stop work who...,3,-1.209426,0.103448,-1.574335,-0.681302,0,1


## labels

In [73]:
# Target values: the "Reviewer_Score" column of the sample table.
# Bracket access avoids the pitfalls of attribute-style column lookup, and
# .copy() gives train_y its own data and index, so the in-place re-indexing done
# in the following cell cannot touch the column object held by semisynth_data
# (whether that object is shared depends on the pandas version).
train_y = semisynth_data["Reviewer_Score"].copy()

In [74]:
# Label each target value with its document ID, then export the series as CSV
# into output_dir.
target_csv_path = output_dir + "train.target.csv"
train_y.index = train_ids
train_y.to_csv(target_csv_path)

In [75]:
# Show the re-indexed target series as the cell output.
train_y

train_0        10.0
train_1         5.4
train_2         4.2
train_3         6.3
train_4         6.3
               ... 
train_49995     9.2
train_49996     9.6
train_49997     7.9
train_49998     7.9
train_49999     9.6
Name: Reviewer_Score, Length: 50000, dtype: float64

## covariates

In [76]:
# choose which features to include
covar_columns = ["Leisure", "Average_Score"]
# Select those columns and cast them to 32-bit floats.
train_covars = semisynth_data[covar_columns].astype("float32")

In [77]:
# Re-label the covariate rows with the document IDs in train_ids (in place).
train_covars.index = train_ids

In [78]:
# Export the covariate table as CSV into output_dir.
covars_csv_path = output_dir + "train.covars.csv"
train_covars.to_csv(covars_csv_path)

In [79]:
# Report (rows, columns) of the covariate table.
train_covars.shape

(50000, 2)