# Import and format custom datasets for SCHOLAR

In [None]:
import os, sys, scipy, json
from scipy import sparse
import codecs
import numpy as np
import pandas as pd
import file_handling as fh

## train.npz

In [None]:
# Directory the cells below read train.npz, train.vocab.json and
# train.sentiment_reg.csv from (machine-specific absolute path).
data_dir = "/Users/maximilianahrens/OneDrive - Nexus365/nlp_models/scholar/data/imdb/processed/"

In [None]:
# Load train.npz through the file_handling helper.
# (scipy.sparse.load_npz(data_dir + "train.npz") was tried as an alternative.)
train_npz_path = data_dir + "train.npz"
trainnpz = fh.load_sparse(train_npz_path)

In [None]:
# Densify the loaded matrix for inspection. toarray() returns a plain
# ndarray, whereas todense() returns the discouraged np.matrix type.
# NOTE(review): assumes fh.load_sparse returns a scipy sparse matrix — confirm.
train_dtm = trainnpz.toarray()

In [None]:
# Print the dimensions of the dense matrix, then display the matrix itself
# as the cell output.
print(train_dtm.shape)
train_dtm

## train.vocab

In [None]:
# Read the vocabulary file that sits next to train.npz.
vocab_path = data_dir + "train.vocab.json"
vocab = fh.read_json(vocab_path)

In [None]:
# Display the loaded vocabulary as the cell output.
vocab

## labels and covariates

In [None]:
# Label file: first row is the header, first column becomes the index.
label_csv = f"{data_dir}train.sentiment_reg.csv"
label = pd.read_csv(label_csv, header=0, index_col=0)

In [None]:
# Display the loaded label frame as the cell output.
label

# Booking

## load Booking data and save in SCHOLAR format

In [None]:
# Pick the Booking data root for the current machine: one path on macOS
# ("darwin"), another on every other platform.
raw_data_path = (
    "/Users/maximilianahrens/OneDrive - Nexus365/00_datasets/booking/booking_btr/"
    if sys.platform == "darwin"
    else "/nfs/home/maxa/data/booking_btr/"
)
# SCHOLAR-formatted files go into the "scholar/" subfolder of the data root.
output_dir = raw_data_path + "scholar/"
print(raw_data_path, "\n", output_dir)

## vocab

In [None]:
# The vocab CSV has no header row; column 0 holds the terms.
vocab_csv = f"{raw_data_path}preprocessed/booking_vocab.csv"
vocab_df = pd.read_csv(vocab_csv, header=None)
vocab = vocab_df[0].tolist()
len(vocab)

In [None]:
# Write the vocabulary list under the file name train.vocab.json.
vocab_json = f"{output_dir}train.vocab.json"
fh.write_to_json(vocab, vocab_json, indent=2, sort_keys=True)

## dtm

In [None]:
# Header-less document-term matrix CSV, taken as a raw NumPy array.
x_bow_raw = pd.read_csv(
    f"{raw_data_path}preprocessed/booking_dtm.csv",
    header=None,
).values

In [None]:
# Keep the counts as a plain 2-D ndarray. np.matrix is discouraged by NumPy
# and copies its input by default, duplicating the whole dense matrix;
# np.asarray reuses the existing array. Row slicing, .shape and the sparse
# constructors used below work the same on an ndarray.
x_bow = np.asarray(x_bow_raw)

In [None]:
# Row index separating train from test: the first 80% of rows are train.
n_docs = x_bow.shape[0]
cut = int(0.8 * n_docs)
cut

In [None]:
# Rows before `cut` form the train split, the remainder the test split.
x_bow_train, x_bow_test = x_bow[:cut], x_bow[cut:]
x_bow_train.shape, x_bow_test.shape

In [None]:
# train
sparse_Xtr = sparse.coo_matrix(x_bow_train).tocsr()
fh.save_sparse(sparse_Xtr, os.path.join(raw_data_path, "scholar", "train.npz"))
# test
sparse_Xte = sparse.coo_matrix(x_bow_test).tocsr()
fh.save_sparse(sparse_Xte, os.path.join(raw_data_path, "scholar", "test.npz"))

## doc IDs

In [None]:
# One id per row of each split: "train_0", "train_1", ... / "test_0", ...
n_train, n_test = x_bow_train.shape[0], x_bow_test.shape[0]
train_ids = [f"train_{i}" for i in range(n_train)]
test_ids = [f"test_{i}" for i in range(n_test)]

In [None]:
# Store the id lists as train.ids.json and test.ids.json.
train_ids_json = f"{output_dir}train.ids.json"
test_ids_json = f"{output_dir}test.ids.json"
fh.write_to_json(train_ids, train_ids_json, indent=2, sort_keys=True)
fh.write_to_json(test_ids, test_ids_json, indent=2, sort_keys=True)

In [None]:
# Load the demeaned reviewer scores as float32 arrays, plus the normalised
# numeric features. The original cell built its paths from `data_path`, which
# is never defined in this notebook (NameError); the files are addressed via
# raw_data_path instead, matching the label and covariate cells further down.
# NOTE: the "labels" cell below re-reads train_y_org / test_y_org as
# DataFrames, which is what the target-export cells rely on.
train_y_org = pd.read_csv(raw_data_path + "booking_train_targets_sample.csv",
                          header=0).reviewer_score_demeaned.values.astype("float32")
test_y_org = pd.read_csv(raw_data_path + "booking_test_targets_sample.csv",
                         header=0).reviewer_score_demeaned.values.astype("float32")
# load numfeatures
train_numfeat = pd.read_csv(raw_data_path + "booking_normed_train_features_sample.csv", header=0)
test_numfeat = pd.read_csv(raw_data_path + "booking_normed_test_features_sample.csv", header=0)

## labels

In [None]:
# Read the train/test target tables; the first row of each CSV is the header.
train_targets_csv = f"{raw_data_path}/booking_train_targets_sample.csv"
test_targets_csv = f"{raw_data_path}/booking_test_targets_sample.csv"
train_y_org = pd.read_csv(train_targets_csv, header=0)
test_y_org = pd.read_csv(test_targets_csv, header=0)

In [None]:
# Take the demeaned score column, key it by the train ids and export it.
train_y = train_y_org["reviewer_score_demeaned"]
train_y.index = train_ids
train_y.to_csv(f"{output_dir}train.target.csv")

In [None]:
# Take the demeaned score column, key it by the test ids and export it.
test_y = test_y_org["reviewer_score_demeaned"]
test_y.index = test_ids
test_y.to_csv(f"{output_dir}test.target.csv")

## covariates

In [None]:
# Normalised feature tables for both splits (first CSV row is the header).
train_features_csv = f"{raw_data_path}booking_normed_train_features_sample.csv"
test_features_csv = f"{raw_data_path}booking_normed_test_features_sample.csv"
train_covars_raw = pd.read_csv(train_features_csv, header=0)
test_covars_raw = pd.read_csv(test_features_csv, header=0)

In [None]:
# Choose which features to include: the first `n_covars` columns, as float32.
n_covars = 5
train_covars = train_covars_raw.iloc[:, :n_covars].astype("float32")
test_covars = test_covars_raw.iloc[:, :n_covars].astype("float32")

In [None]:
# Key the covariate rows by the same document ids as the other outputs.
train_covars.index, test_covars.index = train_ids, test_ids

In [None]:
# Export the covariate tables as train.covars.csv and test.covars.csv.
train_covars.to_csv(f"{output_dir}train.covars.csv")
test_covars.to_csv(f"{output_dir}test.covars.csv")

In [None]:
# Display the (rows, columns) of both covariate tables as a sanity check.
train_covars.shape, test_covars.shape

## check results

In [None]:
# Test-set predictions from a local Downloads folder; the first CSV column
# becomes the index.
predictions_csv = "/Users/maximilianahrens/Downloads/predictions_test.csv"
res_test = pd.read_csv(predictions_csv, index_col=0)

In [None]:
# Display the column labelled 0 of the predictions frame.
# NOTE(review): the frame is read with the default header row, so its column
# labels are probably strings; if this raises KeyError, res_test["0"] or
# res_test.iloc[:, 0] may be what is intended — confirm.
res_test[0]

# Yelp

## load Yelp data and save in SCHOLAR format

In [7]:
# Pick the Yelp data root for the current machine: one path on macOS
# ("darwin"), another on every other platform.
if sys.platform != "darwin":
    raw_data_path = "/nfs/home/maxa/data/yelp_btr/"
else:
    raw_data_path = "/Users/maximilianahrens/OneDrive - Nexus365/00_datasets/yelp/yelp_btr/"
# SCHOLAR-formatted files go into the "scholar/" subfolder of the data root.
output_dir = raw_data_path + "scholar/"
print(raw_data_path, "\n", output_dir)

/nfs/home/maxa/data/yelp_btr/ 
 /nfs/home/maxa/data/yelp_btr/scholar/


## vocab

In [8]:
# The vocab CSV has no header row; column 0 holds the terms.
vocab_csv = f"{raw_data_path}preprocessed/yelp_vocab.csv"
vocab_df = pd.read_csv(vocab_csv, header=None)
vocab = vocab_df[0].tolist()
len(vocab)

24680

In [15]:
# Write the vocabulary list under the file name train.vocab.json.
vocab_json = f"{output_dir}train.vocab.json"
fh.write_to_json(vocab, vocab_json, indent=2, sort_keys=True)

## dtm

In [17]:
# Header-less document-term matrix CSV, taken as a raw NumPy array.
x_bow_raw = pd.read_csv(
    f"{raw_data_path}preprocessed/yelp_dtm.csv",
    header=None,
).values

In [20]:
# Keep the counts as a plain 2-D ndarray. np.matrix is discouraged by NumPy
# and copies its input by default, duplicating the whole dense matrix;
# np.asarray reuses the existing array. Row slicing, .shape and the sparse
# constructors used below work the same on an ndarray.
x_bow = np.asarray(x_bow_raw)

In [23]:
# Row index separating train from test: the first 75% of rows are train.
n_docs = x_bow.shape[0]
cut = int(0.75 * n_docs)
cut

37500

In [24]:
# Rows before `cut` form the train split, the remainder the test split.
x_bow_train, x_bow_test = x_bow[:cut], x_bow[cut:]
x_bow_train.shape, x_bow_test.shape

((37500, 24680), (12500, 24680))

In [25]:
# train
sparse_Xtr = sparse.coo_matrix(x_bow_train).tocsr()
fh.save_sparse(sparse_Xtr, os.path.join(raw_data_path, "scholar", "train.npz"))
# test
sparse_Xte = sparse.coo_matrix(x_bow_test).tocsr()
fh.save_sparse(sparse_Xte, os.path.join(raw_data_path, "scholar", "test.npz"))

## doc IDs

In [26]:
# One id per row of each split: "train_0", "train_1", ... / "test_0", ...
n_train, n_test = x_bow_train.shape[0], x_bow_test.shape[0]
train_ids = [f"train_{i}" for i in range(n_train)]
test_ids = [f"test_{i}" for i in range(n_test)]

In [27]:
# Store the id lists as train.ids.json and test.ids.json.
train_ids_json = f"{output_dir}train.ids.json"
test_ids_json = f"{output_dir}test.ids.json"
fh.write_to_json(train_ids, train_ids_json, indent=2, sort_keys=True)
fh.write_to_json(test_ids, test_ids_json, indent=2, sort_keys=True)

In [30]:
# Full Yelp table (first CSV row is the header).
yelp_csv = f"{raw_data_path}/yelp_python.csv"
yelp_data = pd.read_csv(yelp_csv, header=0)

In [33]:
# Star ratings column of the Yelp table.
y_data = yelp_data["stars"]

In [34]:
# Mean star rating. The demeaned target values shown further down appear to
# be offset by this amount (e.g. 1 - 3.57826 = -2.57826).
y_data.mean()

3.57826

## labels

In [37]:
# Read the header-less demeaned target files for both splits.
train_targets_csv = f"{raw_data_path}/yelp_train_targets_sample_demeaned.csv"
test_targets_csv = f"{raw_data_path}/yelp_test_targets_sample_demeaned.csv"
train_y_org = pd.read_csv(train_targets_csv, header=None)
test_y_org = pd.read_csv(test_targets_csv, header=None)

In [43]:
# Column 0 holds the targets; key it by the train ids and export it.
train_y = train_y_org[0]
train_y.index = train_ids
train_target_csv = f"{output_dir}train.target.csv"
train_y.to_csv(train_target_csv)

In [58]:
# Display the id-indexed train targets as the cell output.
train_y

train_0       -2.57826
train_1        0.42174
train_2       -2.57826
train_3        0.42174
train_4        1.42174
                ...   
train_37495    1.42174
train_37496    0.42174
train_37497   -0.57826
train_37498    1.42174
train_37499    1.42174
Name: 0, Length: 37500, dtype: float64

In [45]:
# Column 0 holds the targets; key it by the test ids and export it.
test_y = test_y_org[0]
test_y.index = test_ids
test_target_csv = f"{output_dir}test.target.csv"
test_y.to_csv(test_target_csv)

In [46]:
# Display the id-indexed test targets as the cell output.
test_y

test_0       -2.57826
test_1        1.42174
test_2        0.42174
test_3       -1.57826
test_4        1.42174
               ...   
test_12495   -0.57826
test_12496    0.42174
test_12497   -0.57826
test_12498   -2.57826
test_12499    0.42174
Name: 0, Length: 12500, dtype: float64

## covariates

In [95]:
# Header-less normalised feature tables for both splits.
train_features_csv = f"{raw_data_path}yelp_normed_train_features_sample.csv"
test_features_csv = f"{raw_data_path}yelp_normed_test_features_sample.csv"
train_covars = pd.read_csv(train_features_csv, header=None)
test_covars = pd.read_csv(test_features_csv, header=None)

In [96]:
# Sentiment table (first CSV row is the header).
sentiment_csv = f"{raw_data_path}yelp_sentiment.csv"
sentiment = pd.read_csv(sentiment_csv, header=0)

In [97]:
# Split the sentiment rows at the same position as the document-term matrix.
train_sentiment = sentiment.iloc[:cut]
test_sentiment = sentiment.iloc[cut:]

In [98]:
# Display the raw sentiment values of the test split as a NumPy array.
test_sentiment.sentiment.values

array([-0.37846678, -1.30081119,  0.1853366 , ..., -1.17544172,
       -1.03532289,  1.58770156])

In [99]:
# Append sentiment as an extra covariate column. The values are assigned as
# bare arrays, so they are matched to rows by position, not by index.
train_covars["sentiment"] = train_sentiment["sentiment"].to_numpy()
test_covars["sentiment"] = test_sentiment["sentiment"].to_numpy()

In [100]:
# Key the covariate rows by the same document ids as the other outputs.
train_covars.index, test_covars.index = train_ids, test_ids

In [101]:
# Export the covariate tables as train.covars.csv and test.covars.csv.
train_covars.to_csv(f"{output_dir}train.covars.csv")
test_covars.to_csv(f"{output_dir}test.covars.csv")

In [102]:
# Display the (rows, columns) of both covariate tables as a sanity check.
train_covars.shape, test_covars.shape

((37500, 3), (12500, 3))

In [104]:
# Display the final train covariate table as the cell output.
train_covars

Unnamed: 0,0,1,sentiment
train_0,-2.173855,-0.891371,-0.699038
train_1,0.682144,-0.891371,0.444332
train_2,-0.378655,-0.891371,-0.189458
train_3,0.627744,-0.891371,-0.699038
train_4,0.818144,-0.891371,0.994843
...,...,...,...
train_37495,1.049344,0.632445,-0.823996
train_37496,0.668544,0.632445,1.587702
train_37497,-1.412255,0.632445,-0.699038
train_37498,0.695744,0.632445,-0.186316


## check results

In [None]:
# Test-set predictions from a local Downloads folder; the first CSV column
# becomes the index.
predictions_csv = "/Users/maximilianahrens/Downloads/predictions_test.csv"
res_test = pd.read_csv(predictions_csv, index_col=0)

In [None]:
# Display the column labelled 0 of the predictions frame.
# NOTE(review): the frame is read with the default header row, so its column
# labels are probably strings; if this raises KeyError, res_test["0"] or
# res_test.iloc[:, 0] may be what is intended — confirm.
res_test[0]