In [6]:
# This mounts your Google Drive to the Colab VM.
from google.colab import drive
drive.mount('/content/drive')

# TODO: Enter the foldername in your Drive where you have saved the unzipped
FOLDERNAME = 'UCY/NLP/AirBnB_project'
assert FOLDERNAME is not None, "[!] Enter the foldername."

# Now that we've mounted your Drive, this ensures that
# the Python interpreter of the Colab VM can load
# python files from within it.
import sys
ROOT_PATH = '/content/drive/My Drive/{}'.format(FOLDERNAME)
sys.path.append(ROOT_PATH)
%cd /content/drive/My\ Drive/$FOLDERNAME/

SUBMODULES = ['processing', 'utils', 'Dataset']

import sys
for module in SUBMODULES:
  sys.path.append('/content/drive/My Drive/{}/{}'.format(FOLDERNAME, module))

EMBEDDINGS_PATH = f'{ROOT_PATH}/embeddings'
DATASET_PATH = f'{ROOT_PATH}/Dataset'

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/My Drive/UCY/NLP/AirBnB_project


In [4]:
import pickle
import pandas as pd
from sklearn.model_selection import train_test_split
from dataloader import Dataloader

In [31]:
def _load_pickle(filename):
    """Unpickle ``filename`` from the ``EMBEDDINGS_PATH`` folder and return the object.

    NOTE(review): ``pickle.load`` can execute arbitrary code while loading;
    only use this on files produced by this project.
    """
    with open(f'{EMBEDDINGS_PATH}/{filename}', 'rb') as f:
        return pickle.load(f)


# Listings table as returned by the project's Dataloader.
data = Dataloader(
    listing_path=f'{DATASET_PATH}/listings',
    comments_path=f'{DATASET_PATH}/comments',
)
df = data.getListings()

# Every listings column whose name contains "score".
# (The misspelled variable name is kept in case other cells refer to it.)
scores_conlumns = [col for col in df.columns if "score" in col]
scores = df[scores_conlumns]

# Load the pre-processed data; one pickle file per feature group.
listing_name = _load_pickle('listing_name_embeddings.pkl')
listing_description = _load_pickle('listing_description_embeddings.pkl')
listing_neighborhood = _load_pickle('listing_neighborhood_overview_embeddings.pkl')
listing_host_about = _load_pickle('listing_host_about_embeddings.pkl')
listings_numeric = _load_pickle('listing_processed_data.pkl')
comments = _load_pickle('comments_embeddings.pkl')
comments_mean = _load_pickle('comments_mean_emb.pkl')

In [15]:
# Sanity check: print the shape of each object loaded from the pickle files.
loaded_objects = (
    ("listing_name", listing_name),
    ("listing_description", listing_description),
    ("listing_neighborhood", listing_neighborhood),
    ("listing_host_about", listing_host_about),
    ("listings_numeric", listings_numeric),
    ("comments", comments),
    ("comments_mean", comments_mean),
)
for label, loaded in loaded_objects:
    print(f"{label}:", loaded.shape)

listing_name: (6998, 384)
listing_description: (6998, 384)
listing_neighborhood: (6998, 384)
listing_host_about: (6998, 384)
listings_numeric: (6998, 34)
comments: (6998, 384)
comments_mean: (384,)


In [43]:
# A small container that pairs a feature table with its score table and adds
# a few convenience accessors.
class Dataset:
    """Feature frame ``x`` bundled with the score frame ``y`` for the same rows."""

    def __init__(self, x: pd.DataFrame, y: pd.DataFrame) -> None:
        """Store the feature frame ``x`` and the score frame ``y`` unchanged."""
        self.x = x
        self.y = y

    def get(self) -> tuple:
        """Return ``(x, y.review_scores_rating)``: all features plus the rating column."""
        return self.x, self.y.review_scores_rating

    def getEmbeddings(self) -> pd.DataFrame:
        """Return the columns of ``x`` whose name contains ``"embedding"``."""
        # Columns are taken from this instance's own frame, not from a notebook
        # global, so the object still works after being unpickled elsewhere.
        # str() keeps the name test safe for non-string column labels.
        embedding_columns = [col for col in self.x.columns if "embedding" in str(col)]
        return self.x[embedding_columns]

    def getNotEmbeddings(self) -> pd.DataFrame:
        """Return the columns of ``x`` whose name does not contain ``"embedding"``."""
        other_columns = [col for col in self.x.columns if "embedding" not in str(col)]
        return self.x[other_columns]

    def getAllScores(self) -> pd.DataFrame:
        """Return the full score frame ``y``."""
        return self.y



In [27]:
# Build the single feature table that feeds the train/test split. The split is
# generated once so that results stay comparable.
comment_column_names = [
    f'comments_average_embeddings_{i}' for i in range(comments.shape[1])
]
comments_pd = pd.DataFrame(comments, columns=comment_column_names)

# Column-wise concatenation of the numeric features and every embedding block.
feature_blocks = [
    listings_numeric,
    listing_name,
    listing_description,
    listing_neighborhood,
    listing_host_about,
    comments_pd,
]
dataset_pd = pd.concat(feature_blocks, axis=1)
dataset_pd.shape

(6998, 1954)

In [34]:
# Use review_scores_rating as the reference for which rows to drop: rows with
# no rating available are removed from both the features and the scores.
has_rating = df['review_scores_rating'].notna()
dataset_with_no_null_scores_x = dataset_pd[has_rating]
dataset_with_no_null_scores_y = scores[has_rating]

In [38]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the
# split identical across runs.
x_train, x_test, y_train, y_test = train_test_split(
    dataset_with_no_null_scores_x,
    dataset_with_no_null_scores_y,
    test_size=0.2,
    random_state=0,
)

In [39]:
# Wrap each split (features + scores) in a Dataset object.
dataset_train = Dataset(x=x_train, y=y_train)
dataset_test = Dataset(x=x_test, y=y_test)

In [42]:
# Persist both splits under the project root so other notebooks can reuse them.
# NOTE(review): pickle stores class instances by reference to their class, so
# whatever loads these files needs a compatible Dataset class available.
for pickle_path, split in (
    (f'{ROOT_PATH}/train_dataset.pkl', dataset_train),
    (f'{ROOT_PATH}/test_dataset.pkl', dataset_test),
):
    with open(pickle_path, 'wb') as f:
        pickle.dump(split, f)