# Import and format custom datasets for SCHOLAR

In [80]:
import os, sys, scipy, json
from scipy import sparse
import codecs
import numpy as np
import pandas as pd
import file_handling as fh

## train.npz

In [8]:
# Folder with the processed IMDB files of the SCHOLAR repository that the next cells inspect.
# NOTE(review): absolute, machine-specific path — adjust before running on another machine.
data_dir = "/Users/maximilianahrens/OneDrive - Nexus365/nlp_models/scholar/data/imdb/processed/"

In [12]:
# Load train.npz through the project's file_handling helper.
# (The author left scipy.sparse.load_npz(data_dir + "train.npz") as a
# commented-out alternative; whether both return the same type is not shown here.)
train_npz_path = data_dir + "train.npz"
trainnpz = fh.load_sparse(train_npz_path)

In [15]:
# Densify the loaded matrix so it can be inspected directly.
train_dtm = trainnpz.todense()

In [17]:
# Print the dimensions, then show the dense matrix as the cell output.
print(train_dtm.shape)
train_dtm

(25000, 2000)


matrix([[0., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]])

## train.vocab

In [20]:
# Load the vocabulary file stored next to train.npz.
# presumably one entry per matrix column — TODO confirm against the file.
vocab = fh.read_json(data_dir + "train.vocab.json")

In [None]:
# Display the loaded vocabulary.
vocab

## labels and covariates

In [26]:
# Read the regression targets; the first CSV line is the header and the
# first column (the document ids) becomes the index.
label = pd.read_csv(data_dir + "train.sentiment_reg.csv", header=0, index_col=0)

In [27]:
# Display the label frame (ids as index, one `reg` column).
label

Unnamed: 0,reg
train_127,3.725033
train_126,5.864601
train_125,3.994651
train_124,6.008353
train_123,1.329393
...,...
train_12420,7.776060
train_12419,3.366420
train_12418,4.071716
train_12417,9.920750


## load booking data and save in SCHOLAR format

In [60]:
# Choose the location of the raw booking data: one path on macOS ("darwin"),
# another on every other platform.
raw_data_path = (
    "/Users/maximilianahrens/OneDrive - Nexus365/00_datasets/booking/booking_btr/"
    if sys.platform == "darwin"
    else "/nfs/home/maxa/data/booking_btr/"
)
# SCHOLAR-formatted files go into the "scholar/" subfolder of the raw data folder.
output_dir = raw_data_path + "scholar/"
print(raw_data_path, "\n",output_dir)

/Users/maximilianahrens/OneDrive - Nexus365/00_datasets/booking/booking_btr/ 
 /Users/maximilianahrens/OneDrive - Nexus365/00_datasets/booking/booking_btr/scholar/


## vocab

In [69]:
# Read the booking vocabulary (no header line); only the first column is used.
# dtype=str together with keep_default_na=False keeps every token as the literal
# string found in the file. With pandas' defaults, tokens that look like
# missing-value markers ("nan", "null", "NA", ...) are converted to float NaN
# and would end up as NaN entries in the vocabulary list.
vocab_df = pd.read_csv(raw_data_path +'preprocessed/booking_vocab.csv', header = None,
                       dtype = str, keep_default_na = False)
vocab = list(vocab_df[0])
len(vocab)

6968

In [70]:
# Save the booking vocabulary under the same file name the SCHOLAR example data uses
# (train.vocab.json), inside the output folder.
fh.write_to_json(vocab, output_dir + "train.vocab.json", indent=2, sort_keys=True)

## dtm (document-term matrix)

In [101]:
# Load the booking document-term matrix from CSV (no header row) as a NumPy array.
x_bow_raw = pd.read_csv(raw_data_path + "preprocessed/booking_dtm.csv", header = None).values

In [116]:
# Keep the document-term matrix as a regular 2-D ndarray. NumPy no longer
# recommends np.matrix, and wrapping the array in it also made an extra copy.
# The operations applied to x_bow in the following cells (.shape, row slicing,
# sparse.coo_matrix) behave the same on an ndarray.
x_bow = np.asarray(x_bow_raw)

In [117]:
# Row index that separates the first 80% of the rows (train) from the rest (test).
cut = int(x_bow.shape[0] * 0.8)
cut

40000

In [119]:
# Row-wise split at `cut`: leading rows become train, remaining rows become test.
x_bow_train, x_bow_test = x_bow[:cut], x_bow[cut:]
(x_bow_train.shape, x_bow_test.shape)

((40000, 6968), (10000, 6968))

In [120]:
# train
sparse_Xtr = sparse.coo_matrix(x_bow_train).tocsr()
fh.save_sparse(sparse_Xtr, os.path.join(raw_data_path, "scholar", "train.npz"))
# test
sparse_Xte = sparse.coo_matrix(x_bow_test).tocsr()
fh.save_sparse(sparse_Xte, os.path.join(raw_data_path, "scholar", "test.npz"))

## doc IDs

In [141]:
# One id per matrix row: "train_0", "train_1", ... and "test_0", "test_1", ...
train_ids = ["train_{}".format(row) for row in range(x_bow_train.shape[0])]
test_ids = ["test_{}".format(row) for row in range(x_bow_test.shape[0])]

In [143]:
# Store each id list as <split>.ids.json in the output folder.
for _split, _ids in (("train", train_ids), ("test", test_ids)):
    fh.write_to_json(_ids, output_dir + _split + ".ids.json", indent=2, sort_keys=True)

In [None]:
# Draft loading of targets (as float32 arrays) and numeric features.
# The "labels" and "covariates" cells further down reload the same files and
# reassign train_y_org / test_y_org, so nothing below depends on this cell.
# It used `data_path`, which is not defined anywhere in the visible notebook and
# would raise NameError on a fresh run; the files are addressed through
# raw_data_path instead, which already points at the booking_btr folder.
train_y_org = pd.read_csv(raw_data_path + "booking_train_targets_sample.csv",
                          header = 0, ).reviewer_score_demeaned.values.astype("float32")
test_y_org = pd.read_csv(raw_data_path + "booking_test_targets_sample.csv",
                         header = 0).reviewer_score_demeaned.values.astype("float32")
# load numfeatures
train_numfeat = pd.read_csv(raw_data_path + "booking_normed_train_features_sample.csv",header = 0)
test_numfeat = pd.read_csv(raw_data_path + "booking_normed_test_features_sample.csv",header = 0)

## labels

In [155]:
# Load the train / test target tables (first CSV line is the header).
# raw_data_path already ends with "/", so the file name is appended without a
# leading slash; the earlier form produced ".../booking_btr//booking_..." and
# differed from how the covariate files are addressed.
train_y_org = pd.read_csv(raw_data_path + "booking_train_targets_sample.csv",
                          header = 0)
test_y_org = pd.read_csv(raw_data_path + "booking_test_targets_sample.csv",
                         header = 0)

In [162]:
# Take the demeaned reviewer score, key it by the train document ids and export it.
# NOTE(review): assumes the target file's row order matches the rows of the
# train part of the document-term matrix — confirm.
train_y = train_y_org["reviewer_score_demeaned"]
train_y.index = train_ids
train_target_file = output_dir + "train.target.csv"
train_y.to_csv(train_target_file)

In [165]:
# Take the demeaned reviewer score, key it by the test document ids and export it.
# NOTE(review): assumes the target file's row order matches the rows of the
# test part of the document-term matrix — confirm.
test_y = test_y_org["reviewer_score_demeaned"]
test_y.index = test_ids
test_target_file = output_dir + "test.target.csv"
test_y.to_csv(test_target_file)

## covariates

In [177]:
# Load the train / test feature tables (first CSV line is the header).
train_covars_raw = pd.read_csv(raw_data_path + "booking_normed_train_features_sample.csv",header = 0)
test_covars_raw = pd.read_csv(raw_data_path + "booking_normed_test_features_sample.csv",header = 0)

In [178]:
# Keep only the first five feature columns as covariates, stored as float32.
# Adjust the column slice to choose a different set of features.
train_covars = train_covars_raw.iloc[:, 0:5].astype("float32")
test_covars = test_covars_raw.iloc[:, 0:5].astype("float32")

In [180]:
# Key the covariate rows by the document ids generated for the matrix rows.
# NOTE(review): assumes the feature files' row order matches the 80/20 row
# split of the document-term matrix — confirm.
train_covars.index = train_ids
test_covars.index = test_ids

In [182]:
# Write the covariate tables (ids as index) to the output folder.
train_covars.to_csv(output_dir + "train.covars.csv")
test_covars.to_csv(output_dir + "test.covars.csv")

In [183]:
# Sanity check: row counts should equal the number of train / test ids.
train_covars.shape, test_covars.shape

((40000, 5), (10000, 5))