In [5]:
import sys
from pathlib import Path
# Prepend the directory two levels above the current working directory to the
# import search path (presumably the repository root - confirm against the
# project layout). The path is relative, so it depends on where the kernel
# was started.
base_path = Path('..', '..')
sys.path.insert(0, str(base_path))

In [7]:
import tqdm
import torch
import pickle
import warnings
import vec2text
import numpy as np
import pandas as pd
from typing import List
import ot # !pip install POT
from datasets import load_dataset
from transformers import AutoModel, AutoTokenizer
from sentence_transformers import SentenceTransformer
warnings.filterwarnings(action='ignore')

# Load dataset

In [8]:
# Load the "CEBaB/CEBaB" dataset through `datasets.load_dataset`.
# The bare `dataset` expression displays the returned DatasetDict
# (its splits, feature names and row counts).
dataset = load_dataset("CEBaB/CEBaB")
dataset

Downloading metadata:   0%|          | 0.00/2.74k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/204k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/206k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/436k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.34M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/424k [00:00<?, ?B/s]

Generating validation split:   0%|          | 0/1673 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1689 [00:00<?, ? examples/s]

Generating train_exclusive split:   0%|          | 0/1755 [00:00<?, ? examples/s]

Generating train_inclusive split:   0%|          | 0/11728 [00:00<?, ? examples/s]

Generating train_observational split:   0%|          | 0/1755 [00:00<?, ? examples/s]

DatasetDict({
    validation: Dataset({
        features: ['id', 'original_id', 'edit_id', 'is_original', 'edit_goal', 'edit_type', 'edit_worker', 'description', 'review_majority', 'review_label_distribution', 'review_workers', 'food_aspect_majority', 'ambiance_aspect_majority', 'service_aspect_majority', 'noise_aspect_majority', 'food_aspect_label_distribution', 'ambiance_aspect_label_distribution', 'service_aspect_label_distribution', 'noise_aspect_label_distribution', 'food_aspect_validation_workers', 'ambiance_aspect_validation_workers', 'service_aspect_validation_workers', 'noise_aspect_validation_workers', 'opentable_metadata'],
        num_rows: 1673
    })
    test: Dataset({
        features: ['id', 'original_id', 'edit_id', 'is_original', 'edit_goal', 'edit_type', 'edit_worker', 'description', 'review_majority', 'review_label_distribution', 'review_workers', 'food_aspect_majority', 'ambiance_aspect_majority', 'service_aspect_majority', 'noise_aspect_majority', 'food_aspect_la

In [18]:
# Display the feature names and row count of the `train_exclusive` split.
dataset['train_exclusive']

Dataset({
    features: ['id', 'original_id', 'edit_id', 'is_original', 'edit_goal', 'edit_type', 'edit_worker', 'description', 'review_majority', 'review_label_distribution', 'review_workers', 'food_aspect_majority', 'ambiance_aspect_majority', 'service_aspect_majority', 'noise_aspect_majority', 'food_aspect_label_distribution', 'ambiance_aspect_label_distribution', 'service_aspect_label_distribution', 'noise_aspect_label_distribution', 'food_aspect_validation_workers', 'ambiance_aspect_validation_workers', 'service_aspect_validation_workers', 'noise_aspect_validation_workers', 'opentable_metadata'],
    num_rows: 1755
})

In [19]:
# Display the feature names and row count of the `train_inclusive` split,
# for comparison with `train_exclusive`.
dataset['train_inclusive']

Dataset({
    features: ['id', 'original_id', 'edit_id', 'is_original', 'edit_goal', 'edit_type', 'edit_worker', 'description', 'review_majority', 'review_label_distribution', 'review_workers', 'food_aspect_majority', 'ambiance_aspect_majority', 'service_aspect_majority', 'noise_aspect_majority', 'food_aspect_label_distribution', 'ambiance_aspect_label_distribution', 'service_aspect_label_distribution', 'noise_aspect_label_distribution', 'food_aspect_validation_workers', 'ambiance_aspect_validation_workers', 'service_aspect_validation_workers', 'noise_aspect_validation_workers', 'opentable_metadata'],
    num_rows: 11728
})

In [None]:
"""
It seems that train_inclusive includes the edited reviews, while train_exclusive contains only the original reviews.
""";

In [40]:
# Convert the `train_inclusive` split to pandas and keep only the identifier,
# edit-metadata and per-aspect majority-label columns.
train_df = dataset['train_inclusive'].to_pandas()
columns = [
    'id',
    'original_id',
    'edit_id',
    'is_original',
    'edit_type',
    'edit_goal',
    'food_aspect_majority',
    'ambiance_aspect_majority',
    'service_aspect_majority',
    'noise_aspect_majority',
]
filtered_train_df = train_df.loc[:, columns]
filtered_train_df

Unnamed: 0,id,original_id,edit_id,is_original,edit_type,edit_goal,food_aspect_majority,ambiance_aspect_majority,service_aspect_majority,noise_aspect_majority
0,0,0,0,True,,,,,Negative,unknown
1,1,0,1,False,noise,Negative,,,Negative,Negative
2,2,0,2,False,service,Positive,,,Positive,unknown
3,3,0,3,False,noise,Positive,,,Negative,Positive
4,1000000,1,0,True,,,Negative,,Negative,
...,...,...,...,...,...,...,...,...,...,...
11723,1754000001,1754,1,False,food,Negative,Negative,unknown,Positive,
11724,1754000002,1754,2,False,ambiance,Negative,Positive,unknown,Positive,
11725,1754000003,1754,3,False,service,Negative,Positive,unknown,Negative,
11726,1754000004,1754,4,False,food,unknown,unknown,unknown,Positive,


In [53]:
# Drop every row that holds an empty string ('') in any of the selected columns.
#
# `DataFrame.eq('')` does the comparison in one vectorized call. It replaces
# `applymap(lambda cell: cell == '')`, which is deprecated since pandas 2.1 and
# calls a Python lambda once per cell.
# Only exact '' matches are flagged: missing values (None/NaN) and the
# 'unknown' label do not compare equal to '' and are therefore kept.

# Step 1: Boolean DataFrame where True marks a cell equal to ''
empty_cells = filtered_train_df.eq('')

# Step 2: Boolean Series flagging rows with at least one empty string
rows_with_empty_cells = empty_cells.any(axis=1)

# Step 3: Keep only the rows that contain no empty string
filtered_train_df = filtered_train_df.loc[~rows_with_empty_cells]
filtered_train_df

Unnamed: 0,id,original_id,edit_id,is_original,edit_type,edit_goal,food_aspect_majority,ambiance_aspect_majority,service_aspect_majority,noise_aspect_majority
9,2000000,2,0,True,,,Positive,unknown,Negative,unknown
10,2000001,2,1,False,food,Negative,Negative,unknown,Negative,unknown
11,2000002,2,2,False,service,Positive,Positive,unknown,Positive,unknown
12,2000003,2,3,False,noise,Positive,Positive,unknown,Negative,Negative
13,2000004,2,4,False,ambiance,Positive,Positive,Positive,Negative,unknown
...,...,...,...,...,...,...,...,...,...,...
11702,1750000004,1750,4,False,food,Positive,Positive,Negative,unknown,Negative
11703,1750000005,1750,5,False,food,Negative,Negative,Negative,unknown,Negative
11704,1750000006,1750,6,False,service,Positive,unknown,Negative,unknown,Negative
11705,1750000007,1750,7,False,ambiance,unknown,unknown,unknown,unknown,Negative
