# Import and format custom datasets for SCHOLAR

In [91]:
import os, sys, scipy, json
from scipy import sparse
import codecs
import numpy as np
import pandas as pd
import file_handling as fh

# Semi-synthetic data

In [92]:
# gamma selects which semi-synthetic sample is loaded further down and is
# embedded in the output directory name. Supported values: 1.0 | 0.5 | 0.0
gamma = 0.0

## load data and save in SCHOLAR format

In [93]:
# Resolve the input directory and the gamma-specific SCHOLAR output directory.
# Both paths keep their trailing "/" because later cells build file names by
# plain string concatenation.
if sys.platform == "darwin":
    # No macOS data location was ever configured. Fail with a clear message
    # instead of the NameError that the print below used to raise.
    raise RuntimeError(
        "No data path configured for macOS; define raw_data_path and "
        "output_dir before running this notebook locally."
    )
raw_data_path = "/nfs/home/maxa/data/semisynth_yelp/"
output_dir = raw_data_path + "scholar_gamma" + str(gamma) + "/"
print(raw_data_path, "\n", output_dir)

/nfs/home/maxa/data/semisynth_yelp/ 
 /nfs/home/maxa/data/semisynth_yelp/scholar_gamma0.0/


## vocab

The vocabulary is not created in this section; see the empirical data.

## dtm

In [64]:
# Read the headerless CSV into a NumPy array (presumably the document-term
# matrix, one row per document — confirm against the preprocessing step).
x_bow_raw = pd.read_csv("/nfs/home/maxa/data/yelp_btr/preprocessed/yelp_dtm.csv", header = None).values

In [65]:
# Keep the counts as a plain 2-D ndarray. np.matrix is no longer recommended
# by NumPy; row slicing and sparse.coo_matrix behave the same on an ndarray.
x_bow = np.asarray(x_bow_raw)

# The first 80% of the rows form the training split, the remainder the test split.
cut = int(0.8 * x_bow.shape[0])

x_bow_train = x_bow[:cut]
x_bow_test = x_bow[cut:]
x_bow_train.shape, x_bow_test.shape

In [73]:
# Sanity check: dimensions of the full matrix before it is written out.
x_bow.shape

(50000, 24680)

In [74]:
# Show the active gamma before anything is written.
# NOTE(review): the saved output of this cell (0.5) disagrees with gamma = 0.0
# set at the top — re-run the notebook top to bottom before trusting outputs.
gamma

0.5

In [75]:
# train: the complete matrix is written as the training set; no test split is
# saved in this section. The file goes into output_dir so it lands next to the
# ids/target/covariate files instead of a separately rebuilt path.
sparse_Xtr = sparse.coo_matrix(x_bow).tocsr()
fh.save_sparse(sparse_Xtr, os.path.join(output_dir, "train.npz"))

## doc IDs

In [94]:
# Number of documents, taken from the matrix that is saved as train.npz so the
# generated ids cannot drift from the actual row count.
n_obs = x_bow.shape[0]

In [95]:
# in-sample document ids: "train_0", "train_1", ..., one per observation
train_ids = [f"train_{i}" for i in range(n_obs)]

In [96]:
# Store the training document ids as JSON in the output directory.
fh.write_to_json(
    train_ids,
    output_dir + "train.ids.json",
    indent=2,
    sort_keys=True,
)

In [98]:
# Load the semi-synthetic sample that matches gamma. The file name is built
# from the validated value, so an unsupported gamma (e.g. 0.7, which int()
# used to truncate to 0) fails loudly instead of loading the wrong file or
# leaving semisynth_data undefined.
supported_gammas = (0.0, 0.5, 1.0)
if float(gamma) not in supported_gammas:
    raise ValueError(
        "gamma must be one of {}, got {!r}".format(supported_gammas, gamma)
    )
gamma_tag = "{:.1f}".format(float(gamma))  # "0.0" | "0.5" | "1.0"
semisynth_data = pd.read_csv(
    raw_data_path + "yelp_semisynth_sample_gamma" + gamma_tag + ".csv",
    header=0,
)
print("gamma: " + gamma_tag)

gamma: 0.0


In [99]:
# Sanity check: rows and columns of the loaded sample.
semisynth_data.shape

(50000, 18)

In [100]:
# Peek at the first three rows to check the available columns.
semisynth_data.head(3)

Unnamed: 0,business_id,user_id,review_id,stars,name_u,review_count_u,stars_av_u,name_b,stars_av_b,review_count_b,text_clean,sentiment,US,PrUS,synth_y,doc_idx,pos_score,harvard_score
0,__8j8yhsmE98wNWHJNyAgw,z3gSZ8lkZLZyxSHFNuzM0A,ewqmng8J2Q66QXDyWc_3lw,1,K'Lee,3,2.0,Urawa Sushi,3.0,91,mindblown time order food never deliveri bad o...,-0.699038,0.0,0.5,2.444489,1,0.105263,-0.643889
1,__8j8yhsmE98wNWHJNyAgw,llgsKCdEdDnSU5LZnL92Jw,ngCyo4dT0YEJKlHcp3UQow,4,Dasa,59,4.1,Urawa Sushi,3.0,91,order ubereat night love menu huge tunasalmon ...,0.444332,1.0,0.5,3.719348,2,0.363636,1.898863
2,__8j8yhsmE98wNWHJNyAgw,gV9Y1fHKqnMRExRErIvFag,uE71Nvq6sKYtE4w_vyJngw,1,Eric,163,3.32,Urawa Sushi,3.0,91,easili worst sushi place ever order absolut id...,-0.189458,0.0,0.5,2.944093,3,0.136905,-0.113219


## labels

In [101]:
# The synth_y column is the target exported for SCHOLAR.
train_y = semisynth_data["synth_y"]

In [102]:
# Label every target with its document id, then export it.
train_y.index = train_ids
train_y.to_csv(f"{output_dir}train.target.csv")

In [103]:
# Inspect the exported targets (index should now be the train_* ids).
train_y

train_0        2.444489
train_1        3.719348
train_2        2.944093
train_3        1.616667
train_4        4.137294
                 ...   
train_49995    2.140177
train_49996    4.371907
train_49997    1.298292
train_49998    1.433057
train_49999    5.225657
Name: synth_y, Length: 50000, dtype: float64

## covariates

In [104]:
# Covariates to export; edit this list to choose which features are included.
covar_columns = ["US", "stars_av_b"]
train_covars = semisynth_data[covar_columns].astype("float32")

In [105]:
# Use the document ids as row labels so the rows line up with train.ids.json.
train_covars.index = train_ids

In [106]:
# Export the covariates next to the other training files.
train_covars.to_csv(output_dir + "train.covars.csv")

In [107]:
# Sanity check: one row per document, one column per selected covariate.
train_covars.shape

(50000, 2)

In [108]:
# Inspect the exported covariates.
train_covars

Unnamed: 0,US,stars_av_b
train_0,0.0,3.0
train_1,1.0,3.0
train_2,0.0,3.0
train_3,1.0,3.0
train_4,0.0,3.0
...,...,...
train_49995,1.0,3.0
train_49996,1.0,3.0
train_49997,0.0,3.0
train_49998,1.0,3.0


# For train-test split

In [46]:
# Output directory for the train/test-split files of this section.
# NOTE(review): this points below semisynth_btr, whereas raw_data_path is set to
# the semisynth_yelp directory at the top of the notebook — confirm the intended root.
output_dir_tt = "/nfs/home/maxa/data/semisynth_btr/scholar_split/"

## vocab

In [63]:
# Read the vocabulary table. keep_default_na=False stops pandas from turning
# literal tokens such as "nan", "null" or "na" into missing values, which
# would otherwise end up as NaN entries in the exported vocabulary.
# NOTE(review): this reads a booking_* file below raw_data_path, which is set to
# the semisynth_yelp directory at the top of the notebook — confirm the location.
vocab_df = pd.read_csv(
    raw_data_path + 'preprocessed/booking_synth_vocab.csv',
    header=0,
    keep_default_na=False,
)

In [64]:
# Inspect the vocabulary table; the terms are in column "x1".
vocab_df

Unnamed: 0,x1
0,aaabsolut
1,aback
2,abandon
3,abbbey
4,abbey
...,...
16332,zud
16333,zuid
16334,zur
16335,zurich


In [65]:
# Flatten the term column into a plain Python list and report its size.
vocab = [term for term in vocab_df["x1"]]
len(vocab)

16337

In [66]:
# Store the vocabulary as JSON in the split output directory.
fh.write_to_json(
    vocab,
    output_dir_tt + "train.vocab.json",
    indent=2,
    sort_keys=True,
)

## dtm

In [47]:
# Sanity check: dimensions of the matrix that is split below.
# NOTE(review): the saved output (50000, 16337) differs from the (50000, 24680)
# shown earlier, and x_bow is not reloaded in this section — confirm which
# matrix is meant to be in memory here.
x_bow.shape

(50000, 16337)

In [48]:
# Split point: the number of leading rows (80%) that go to the training set.
cut = int(x_bow.shape[0] * 0.8)
cut

40000

In [49]:
# Rows before the split point are training data, the rest is test data.
x_bow_train, x_bow_test = x_bow[:cut], x_bow[cut:]
x_bow_train.shape, x_bow_test.shape

((40000, 16337), (10000, 16337))

In [27]:
# Write both splits into output_dir_tt so the matrices sit in the same
# directory as the vocab/ids/target/covariate files of this section
# (previously they were written to raw_data_path/scholar_split instead).
# train
sparse_Xtr = sparse.coo_matrix(x_bow_train).tocsr()
fh.save_sparse(sparse_Xtr, os.path.join(output_dir_tt, "train.npz"))
# test
sparse_Xte = sparse.coo_matrix(x_bow_test).tocsr()
fh.save_sparse(sparse_Xte, os.path.join(output_dir_tt, "test.npz"))

## doc IDs

In [50]:
# One id per row of each split: "train_<i>" and "test_<i>", both starting at 0.
train_ids = [f"train_{i}" for i in range(x_bow_train.shape[0])]
test_ids = [f"test_{i}" for i in range(x_bow_test.shape[0])]

In [51]:
# Store the ids of both splits as JSON, training file first.
for ids, file_name in ((train_ids, "train.ids.json"), (test_ids, "test.ids.json")):
    fh.write_to_json(ids, output_dir_tt + file_name, indent=2, sort_keys=True)

## labels

In [52]:
# Load the semi-synthetic sample used for the train/test split.
# NOTE(review): this reads a booking_* file below raw_data_path, which is set to
# the semisynth_yelp directory at the top of the notebook — confirm the location.
semisynth_data = pd.read_csv(raw_data_path + "booking_semisynth_sample.csv",header = 0)

In [53]:
# Sanity check: the row count should match the number of rows in x_bow.
semisynth_data.shape

(50000, 18)

In [54]:
# Peek at the first two rows to check the available columns.
semisynth_data.head(2)

Unnamed: 0,Average_Score,Reviewer_Nationality,Review_Total_Negative_Word_Counts,Total_Number_of_Reviews,Review_Total_Positive_Word_Counts,Total_Number_of_Reviews_Reviewer_Has_Given,Reviewer_Score,Leisure,Couple,text,text_clean,doc_idx,sentiment,pos_prop,av_score,synth_y,bar_rest,conf
0,8.1,Hungary,17,6373,12,6,10.0,0,0,Everything was great Cozy clean and just perf...,everyth great cozi clean just perfect breakfas...,1,1.062683,0.413793,-0.501244,3.547939,0,0
1,7.5,United Kingdom,20,2197,4,21,5.4,0,1,Good location . We travel through to Kensing...,good locat travel kensington everi week great ...,2,0.645357,0.166667,-1.574335,0.105846,0,0


In [55]:
# Split the synth_y target at the same row position as the matrix.
data_y = semisynth_data["synth_y"]
train_y, test_y = data_y.iloc[:cut], data_y.iloc[cut:]

In [56]:
# Label the training targets with their document ids and export them.
train_y.index = train_ids
train_y.to_csv(f"{output_dir_tt}train.target.csv")

In [57]:
# Inspect the exported training targets.
train_y

train_0         3.547939
train_1         0.105846
train_2        -0.681302
train_3         1.719629
train_4         3.303026
                 ...    
train_39995     6.130856
train_39996     1.963663
train_39997     1.792113
train_39998    10.060682
train_39999     9.967898
Name: synth_y, Length: 40000, dtype: float64

In [58]:
# Label the test targets with their document ids and export them.
test_y.index = test_ids
test_y.to_csv(f"{output_dir_tt}test.target.csv")

In [59]:
# Inspect the exported test targets.
test_y

test_0        9.388680
test_1        9.416876
test_2        2.281902
test_3        3.232687
test_4        4.279789
               ...    
test_9995    11.059970
test_9996     9.632751
test_9997     1.672547
test_9998     2.189951
test_9999     5.829085
Name: synth_y, Length: 10000, dtype: float64

## covariates

In [60]:
# Covariates to export; edit the list to choose which features are included.
covar_columns = ["Leisure", "av_score"]
data_covars = semisynth_data[covar_columns].astype("float32")
# Split at the same row position as the matrix and the targets.
train_covars, test_covars = data_covars.iloc[:cut], data_covars.iloc[cut:]

In [61]:
# Label the rows of both splits with their document ids ...
train_covars.index = train_ids
test_covars.index = test_ids

# ... then export each split.
train_covars.to_csv(f"{output_dir_tt}train.covars.csv")
test_covars.to_csv(f"{output_dir_tt}test.covars.csv")

In [62]:
# Sanity check: row counts should match the train/test split sizes.
train_covars.shape, test_covars.shape

((40000, 2), (10000, 2))