# Study on Yelp-Hat

Paper: https://davis.wpi.edu/dsrg/PROJECTS/YELPHAT/2020_ACL_Human_vs_Machine-2.pdf

Summary:
* __Do annotators carefully choose relevant words?__ Yes, as the collecting time and number of chosen words increase accross the sentence length.

Sigles:
* __HAM__ (Human Attention Map): what annotators denote
* __CAM__ (Consensus Attention Map): bitwise __AND__ operation of the HAMs
* __SAM__ (Super Attention Map): bitwise __OR__ operation of the HAMs. 


In [1]:
%load_ext autoreload
%autoreload 2

from IPython.display import display, HTML
import os
from os import path

import sys
sys.path.append("./../src")

cache_path = path.join(os.getcwd(), '..', '.cache')

DATASET_NAME='yelp-hat'

dataset_path = path.join(cache_path, 'dataset', DATASET_NAME)

tmp_path = path.join('.cache', '2022-07-29')
os.makedirs(tmp_path,exist_ok=True)

In [2]:
# Download dataset zip
URL='https://github.com/cansusen/Human-Attention-for-Text-Classification/archive/205c1552bc7be7ec48623d79d85d4c6fbfe62362.zip'

Download and extract dataset

In [3]:
from torchtext.utils import download_from_url, extract_archive
import shutil

zip_path = download_from_url(URL, root=dataset_path, path=path.join(dataset_path, f'{DATASET_NAME}.zip'))
files = extract_archive(from_path=zip_path, to_path=dataset_path)

for f in files:
    if f.endswith('.csv'):
        shutil.copy2(f, dataset_path)

2.56MB [00:00, 5.04MB/s]


A quoi ressemble la dataset?

In [4]:
import pandas as pd

df = pd.read_csv(path.join(dataset_path, 'ham_part1(50words).csv'))
df

Unnamed: 0,Input.label,Input.text,Answer.Q1Answer,Answer.html_output
0,1,Out in Twinsburg for work and wasn't expecting...,yes,<span>Out</span> <span>in</span> <span>Twinsbu...
1,1,Out in Twinsburg for work and wasn't expecting...,yes,<span>Out</span> <span>in</span> <span>Twinsbu...
2,1,Out in Twinsburg for work and wasn't expecting...,yes,<span>Out</span> <span>in</span> <span>Twinsbu...
3,0,Very slow. Never been in the drive at any othe...,no,"<span class=""active"">Very</span> <span class=""..."
4,0,Very slow. Never been in the drive at any othe...,no,"<span>Very</span> <span class=""active"">slow.</..."
...,...,...,...,...
895,0,I went here to get a snack before I went on th...,no,<span>I</span> <span>went</span> <span>here</s...
896,0,I went here to get a snack before I went on th...,no,<span>I</span> <span>went</span> <span>here</s...
897,0,Always packed for lunch. Probably because Pit...,no,<span>Always</span> <span>packed</span> <span>...
898,0,Always packed for lunch. Probably because Pit...,no,<span>Always</span> <span>packed</span> <span>...


In [5]:
import re
def generate_binary_human_attention_vector(html, num_words_in_review, max_words):
    # Function provided by the dataset :
    # https://github.com/cansusen/Human-Attention-for-Text-Classification/blob/master/generate_ham/sample_generate.ipynb

    p = re.compile('<span(.*?)/span>')
    all_span_items = p.findall(html)

    if html == '{}':
        print('Empty human annotation - This should never print')
        return [0] * max_words

    if len(all_span_items) == num_words_in_review + 1:
        if (all_span_items[num_words_in_review] == '><') or (all_span_items[num_words_in_review] == ' data-vivaldi-spatnav-clickable="1"><'):

            binarized_human_attention = [0] * max_words
            for i in range(0, len(all_span_items) - 1):
                if 'class="active"' in all_span_items[i]:
                    binarized_human_attention[i] = 1

        else:
            print('This should never print.')
    else:
        print('This should never print.')

    return binarized_human_attention

MAX_WORDS = 100
i = 0
html = df['Answer.html_output'][i]
num_highlighted = html.count('class="active"')
num_words_in_review = len(df['Input.text'][i].split())

binarized_human_attention = generate_binary_human_attention_vector(html, num_words_in_review, MAX_WORDS)


print("Number of words highlighted in this review:",num_highlighted)
print("Original annotation:", html)
print("Binarized attention map:",binarized_human_attention)

Number of words highlighted in this review: 6
Original annotation: <span>Out</span> <span>in</span> <span>Twinsburg</span> <span>for</span> <span>work</span> <span>and</span> <span>wasn't</span> <span>expecting</span> <span>to</span> <span>find</span> <span>a</span> <span>well</span> <span>reviewed</span> <span>sushi</span> <span>restaurant</span> <span>but</span> <span class="active">glad</span> <span>I</span> <span>did.</span> <span>It</span> <span>was</span> <span>quite</span> <span>busy</span> <span>for</span> <span>a</span> <span>Monday</span> <span>and</span> <span>the</span> <span>poor</span> <span>waitress</span> <span>was</span> <span>slammed</span> <span>but</span> <span>the</span> <span>sushi</span> <span>chef</span> <span>stepped</span> <span>in</span> <span>to</span> <span>help</span> <span>and</span> <span>was</span> <span class="active">very</span> <span class="active">friendly.</span> <span>The</span> <span class="active">presentation</span> <span>and</span> <span class

In [6]:
def tokenize(html):
    p = re.compile(r'<span[^>]*>(.+?)</span>')
    return p.findall(html)

def human_attention(html):

    p = re.compile('<span(.*?)/span>')
    all_span_items = p.findall(html)
    if all_span_items[-1] == '><': all_span_items = all_span_items[:-1]

    return ['class="active"' in span_item for span_item in all_span_items]

v_hat = human_attention(html)
print(v_hat)

# Check if we tokenize html, can our function "human_attention" reproduce the exact same length

for fpath in os.listdir(dataset_path):
    
    if fpath.endswith('.csv'):
        print('Check',fpath)
        df = pd.read_csv(path.join(dataset_path, fpath))
        print(len(df['Answer.html_output']))
        for html in df['Answer.html_output']:
            tokens = tokenize(html)
            v_hat = human_attention(html)
            if len(tokens) != len(v_hat):
                print(len(tokens), len(v_hat))
                print(' '.tokens)
                display(HTML(html))

[False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, True, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, True, True, False, True, False, True, False, True]
Check ham_part7.csv
2096
Check ham_part5.csv
2999
Check ham_part8(200words).csv
543
Check ham_part6(100words).csv
1314
Check ham_part4.csv
3000
Check ham_part3.csv
3000
Check ham_part1(50words).csv
900


Proceed dataset step by step

array([ True,  True, False, False])

In [17]:
df.columns != 'Answer.html_output' and df.columns != 'Answer.Q1Answer'

ValueError: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()

In [25]:
df = pd.read_csv(path.join(dataset_path, 'ham_part3.csv'))
display(HTML('<h3>Raw dataset</h3>'))
display(df.head())

for idx in range(3):
    clean_df[f'ham_{idx}'] = clean_df[f'ham_html_{idx}'].apply(yelp_hat_ham)

dfs = [df.loc[0::3, (df.columns != 'Answer.html_output') & (df.columns != 'Answer.Q1Answer')].reset_index(drop=True)]
dfs += [df.loc[idx::3, ['Answer.html_output','Answer.Q1Answer']].reset_index(drop=True).rename(columns={'Answer.html_output':'ham_html_', 'Answer.Q1Answer': 'human_label_'}).add_suffix(str(idx)) for idx in range(3) ]
clean_df = pd.concat(dfs,axis=1).rename(columns={'Input.label': 'label', 'Input.text': 'text'})
clean_df = clean_df[['text', 'ham_html_0', 'human_label_0', 'ham_html_1', 'human_label_1', 'ham_html_2', 'human_label_2', 'label']]
display(HTML('<h3>Clean up dataset</h3>'))
display(clean_df.head())

from data.yelp_hat.utils import yelp_hat_ham, yelp_hat_token
clean_df['text_tokens'] = clean_df['ham_html_0'].apply(yelp_hat_token)

display(HTML('<h3>Binarize annotation</h3>'))
display(clean_df.head())

Unnamed: 0,Input.label,Input.text,Answer.Q1Answer,Answer.html_output
0,1,The food quality and portion size was awesome....,yes,"<span>The</span> <span class=""active"">food</sp..."
1,1,The food quality and portion size was awesome....,yes,<span>The</span> <span>food</span> <span class...
2,1,The food quality and portion size was awesome....,yes,<span>The</span> <span>food</span> <span>quali...
3,1,Found these guys at taste of calgary and had t...,yes,<span>Found</span> <span>these</span> <span>gu...
4,1,Found these guys at taste of calgary and had t...,yes,<span>Found</span> <span>these</span> <span>gu...


Unnamed: 0,text,ham_html_0,human_label_0,ham_html_1,human_label_1,ham_html_2,human_label_2,label
0,The food quality and portion size was awesome....,"<span>The</span> <span class=""active"">food</sp...",yes,<span>The</span> <span>food</span> <span class...,yes,<span>The</span> <span>food</span> <span>quali...,yes,1
1,Found these guys at taste of calgary and had t...,<span>Found</span> <span>these</span> <span>gu...,yes,<span>Found</span> <span>these</span> <span>gu...,yes,<span>Found</span> <span>these</span> <span>gu...,yes,1
2,Love the bar on the roof. Nice and relaxing f...,"<span class=""active"">Love</span> <span>the</sp...",yes,"<span class=""active"">Love</span> <span class=""...",yes,"<span class=""active"">Love</span> <span class=""...",yes,1
3,Food is awesome. Service is great. What a team...,<span>Food</span> <span>is</span> <span class=...,yes,"<span class=""active"">Food</span> <span>is</spa...",yes,<span>Food</span> <span>is</span> <span class=...,yes,1
4,I had eaten here in 2014 and it was an experie...,<span>I</span> <span>had</span> <span>eaten</s...,no,<span>I</span> <span>had</span> <span>eaten</s...,no,"<span>I</span> <span class=""active"">had</span>...",no,0


Unnamed: 0,text,ham_html_0,human_label_0,ham_html_1,human_label_1,ham_html_2,human_label_2,label,text_tokens,ham_0,ham_1,ham_2
0,The food quality and portion size was awesome....,"<span>The</span> <span class=""active"">food</sp...",yes,<span>The</span> <span>food</span> <span class...,yes,<span>The</span> <span>food</span> <span>quali...,yes,1,"[The, food, quality, and, portion, size, was, ...","[0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, ..."
1,Found these guys at taste of calgary and had t...,<span>Found</span> <span>these</span> <span>gu...,yes,<span>Found</span> <span>these</span> <span>gu...,yes,<span>Found</span> <span>these</span> <span>gu...,yes,1,"[Found, these, guys, at, taste, of, calgary, a...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,Love the bar on the roof. Nice and relaxing f...,"<span class=""active"">Love</span> <span>the</sp...",yes,"<span class=""active"">Love</span> <span class=""...",yes,"<span class=""active"">Love</span> <span class=""...",yes,1,"[Love, the, bar, on, the, roof., Nice, and, re...","[1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, ...","[1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, ...","[1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, ..."
3,Food is awesome. Service is great. What a team...,<span>Food</span> <span>is</span> <span class=...,yes,"<span class=""active"">Food</span> <span>is</spa...",yes,<span>Food</span> <span>is</span> <span class=...,yes,1,"[Food, is, awesome., Service, is, great., What...","[0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, ...","[0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, ..."
4,I had eaten here in 2014 and it was an experie...,<span>I</span> <span>had</span> <span>eaten</s...,no,<span>I</span> <span>had</span> <span>eaten</s...,no,"<span>I</span> <span class=""active"">had</span>...",no,0,"[I, had, eaten, here, in, 2014, and, it, was, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, ..."


<div class="alert alert-block alert-info">
    <b>Note:</b> By reconcatenating for spacy and re tokenize, do we obtain the coherent length?
</div>    

In [26]:
from tqdm.notebook import tqdm
from bs4 import BeautifulSoup
import spacy 

nlp = spacy.load('en_core_web_sm')
soup = BeautifulSoup(html, 'html.parser')


def tokenize_ham(html):
    soup = BeautifulSoup(html, 'html.parser')
    tags = [str(tag.string) for tag in soup.find_all('span') if tag.string is not None]
    tokens = [str(tk.text) for doc in nlp.pipe(tags) for tk in doc]
    return tokens

files = [fpath for fpath in os.listdir(dataset_path) if fpath.endswith('.csv') and 'part7' not in fpath]

for fpath in tqdm(files, total=len(files)):
        
    df = pd.read_csv(path.join(dataset_path, fpath))

    dfs = [df.loc[0::3, df.columns != 'Answer.html_output'].reset_index(drop=True)]
    dfs += [df.loc[idx::3, ['Answer.html_output']].reset_index(drop=True).rename(columns={'Answer.html_output':'ham_html_'}).add_suffix(str(idx)) for idx in range(3) ]
    clean_df = pd.concat(dfs,axis=1).rename(columns={'Input.label': 'label', 'Input.text': 'text', 'Answer.Q1Answer':'human_label'})
    clean_df = clean_df[['text', 'ham_html_0', 'ham_html_1', 'ham_html_2', 'label', 'human_label']]

    clean_df['ham_tokens'] = clean_df['ham_html_0'].apply(tokenize_ham)
    clean_df['count_ham_tokens'] = clean_df['ham_tokens'].apply(lambda row: len(row))

    clean_df['text_tokens'] = [[tk.text for tk in doc if not tk.is_space] for doc in nlp.pipe(clean_df['text'].tolist())]
    clean_df['count_text_tokens'] = clean_df['text_tokens'].apply(lambda row: len(row))

    is_every_row_ok = (clean_df['count_text_tokens'] == clean_df['count_ham_tokens']).all()
    print(fpath, ': Same tokens between text and ham >', is_every_row_ok)

  0%|          | 0/6 [00:00<?, ?it/s]

ham_part5.csv : Same tokens between text and ham > True
ham_part8(200words).csv : Same tokens between text and ham > True
ham_part6(100words).csv : Same tokens between text and ham > True
ham_part4.csv : Same tokens between text and ham > True
ham_part3.csv : Same tokens between text and ham > True
ham_part1(50words).csv : Same tokens between text and ham > True


###### Split tokens in human attention maps by spacy, verify if `len(token) == len(annotation)`

In [27]:
def binarize_ham(html):
    soup = BeautifulSoup(html, 'html.parser')
    tags = [tag for tag in soup.find_all('span') if tag.string is not None]
    
    tag_annot = [int('active' in t.get('class', [])) for t in tags]
    tag_str = [str(t.string) for t in tags]
    
    ham = []
    
    for annot, splitted_tokens in zip(tag_annot, nlp.pipe(tag_str)):
        annotation = [annot * int(not tk.is_punct) for tk in splitted_tokens]
        ham += annotation
    
    return ham

html = clean_df['ham_html_0']
print(html[0])
print(tokenize_ham(html[0]))

<span>Out</span> <span>in</span> <span>Twinsburg</span> <span>for</span> <span>work</span> <span>and</span> <span>wasn't</span> <span>expecting</span> <span>to</span> <span>find</span> <span>a</span> <span>well</span> <span>reviewed</span> <span>sushi</span> <span>restaurant</span> <span>but</span> <span class="active">glad</span> <span>I</span> <span>did.</span> <span>It</span> <span>was</span> <span>quite</span> <span>busy</span> <span>for</span> <span>a</span> <span>Monday</span> <span>and</span> <span>the</span> <span>poor</span> <span>waitress</span> <span>was</span> <span>slammed</span> <span>but</span> <span>the</span> <span>sushi</span> <span>chef</span> <span>stepped</span> <span>in</span> <span>to</span> <span>help</span> <span>and</span> <span>was</span> <span class="active">very</span> <span class="active">friendly.</span> <span>The</span> <span class="active">presentation</span> <span>and</span> <span class="active">flavors</span> <span>were</span> <span class="active">gre

In [28]:
files = [fpath for fpath in os.listdir(dataset_path) if fpath.endswith('.csv') and 'part7' not in fpath]

for fpath in tqdm(files, total=len(files)):
        
    df = pd.read_csv(path.join(dataset_path, fpath))

    dfs = [df.loc[0::3, df.columns != 'Answer.html_output'].reset_index(drop=True)]
    dfs += [df.loc[idx::3, ['Answer.html_output']].reset_index(drop=True).rename(columns={'Answer.html_output':'annotation_html_'}).add_suffix(str(idx)) for idx in range(3) ]
    clean_df = pd.concat(dfs,axis=1).rename(columns={'Input.label': 'label', 'Input.text': 'text', 'Answer.Q1Answer':'human_label'})
    clean_df = clean_df[['text', 'ham_html_0', 'ham_html_1', 'ham_html_2', 'label', 'human_label']]

    clean_df['ham_tokens'] = clean_df['ham_html_0'].apply(binarize_ham)
    clean_df['count_ham_tokens'] = clean_df['ham_tokens'].apply(lambda row: len(row))

    clean_df['text_tokens'] = [[tk.text for tk in doc if not tk.is_space] for doc in nlp.pipe(clean_df['text'].tolist())]
    clean_df['count_text_tokens'] = clean_df['text_tokens'].apply(lambda row: len(row))

    is_every_row_ok = (clean_df['count_text_tokens'] == clean_df['count_ham_tokens']).all()
    print(fpath, ': Same tokens between text and ham >', is_every_row_ok)

  0%|          | 0/6 [00:00<?, ?it/s]

KeyError: "['ham_html_0', 'ham_html_1', 'ham_html_2'] not in index"

## Check sentence length

In [30]:
for fpath in tqdm(files, total=len(files)):
        
    df = pd.read_csv(path.join(dataset_path, fpath))
    display(HTML(f'<h3>{fpath}</h3>'))
    dfs = [df.loc[0::3, df.columns != 'Answer.html_output'].reset_index(drop=True)]
    dfs += [df.loc[idx::3, ['Answer.html_output']].reset_index(drop=True).rename(columns={'Answer.html_output':'ham_html_'}).add_suffix(str(idx)) for idx in range(3) ]
    clean_df = pd.concat(dfs,axis=1).rename(columns={'Input.label': 'label', 'Input.text': 'text', 'Answer.Q1Answer':'human_label'})
    clean_df = clean_df[['text', 'ham_html_0', 'ham_html_1', 'ham_html_2', 'label', 'human_label']]
    clean_df['text_tokens'] = [[tk.text for tk in doc if not tk.is_space] for doc in nlp.pipe(clean_df['text'].tolist())]
    
    clean_df['text_length'] = clean_df.text_tokens.str.len()
    display(clean_df.describe().transpose())

  0%|          | 0/6 [00:00<?, ?it/s]

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
label,1000.0,0.516,0.499994,0.0,0.0,1.0,1.0,1.0
text_length,1000.0,71.694,7.983865,55.0,66.0,71.0,78.0,97.0


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
label,181.0,0.508287,0.501318,0.0,0.0,1.0,1.0,1.0
text_length,181.0,230.071823,8.31734,210.0,224.0,230.0,235.0,252.0


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
label,438.0,0.513699,0.500384,0.0,0.0,1.0,1.0,1.0
text_length,438.0,115.276256,5.565827,101.0,112.0,114.0,118.0,144.0


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
label,1000.0,0.507,0.500201,0.0,0.0,1.0,1.0,1.0
text_length,1000.0,71.604,7.837641,55.0,65.0,71.0,78.0,101.0


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
label,1000.0,0.497,0.500241,0.0,0.0,0.0,1.0,1.0
text_length,1000.0,71.958,8.178944,53.0,65.0,72.0,78.0,107.0


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
label,300.0,0.483333,0.500557,0.0,0.0,0.0,1.0,1.0
text_length,300.0,58.006667,3.009454,50.0,56.0,57.0,60.0,72.0


### Handle case part7.csv

In [33]:
df = pd.read_csv(path.join(dataset_path, 'ham_part7.csv'))
df['ham_html_'] = df[f'Answer.html_output'].apply(yelp_hat_ham)
display(HTML('<h3>Part 7</h3>'))
display(df.head())

duplicates = df.groupby(df['Input.text'].tolist(),as_index=False).size()
display(HTML('<h3>Duplication du part 7</h3>'))
display(duplicates.head())

Unnamed: 0,Input.label,Input.text,Answer.Q1Answer,Answer.html_output,ham_html_
0,0,*knocks on door*\n*walks into restaurant*\n*go...,no,<span>*knocks</span> <span>on</span> <span>doo...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,0,*knocks on door*\n*walks into restaurant*\n*go...,no,<span>*knocks</span> <span>on</span> <span>doo...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,0,*knocks on door*\n*walks into restaurant*\n*go...,no,<span>*knocks</span> <span>on</span> <span>doo...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,0,$18.95 per person....Was really hungry for lun...,no,<span>$18.95</span> <span>per</span> <span>per...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,0,$18.95 per person....Was really hungry for lun...,no,<span>$18.95</span> <span>per</span> <span>per...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


Unnamed: 0,index,size
0,$18.95 per person....Was really hungry for lun...,3
1,*knocks on door*\n*walks into restaurant*\n*go...,3
2,1. Happy hour pricing is only available in the...,3
3,20 Minute wait per dish. Never coming here aga...,3
4,"2nd time eating here, We drive 35 miles from H...",3


In [56]:
dupli_2 = duplicates[duplicates['size'] < 3]

for d in tqdm(dupli_2['index'][:3]):
    display(df.loc[df['Input.text'] == d])
    
for dup_rows in [2, 3, 4]:
    print('Duplicate of', dup_rows, ':', sum(duplicates['size'] == dup_rows))

  0%|          | 0/3 [00:00<?, ?it/s]

Unnamed: 0,Input.label,Input.text,Answer.Q1Answer,Answer.html_output,ham_html_
197,0,"Cold coffee, cold donut, flies EVERYWHERE, bis...",no,"<span class=""active"">Cold</span> <span class=""...","[1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, ..."
198,0,"Cold coffee, cold donut, flies EVERYWHERE, bis...",no,"<span>Cold</span> <span>coffee,</span> <span>c...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


Unnamed: 0,Input.label,Input.text,Answer.Q1Answer,Answer.html_output,ham_html_
199,1,"Compared to others in the area, Scramble beats...",yes,<span>Compared</span> <span>to</span> <span>ot...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
200,1,"Compared to others in the area, Scramble beats...",yes,<span>Compared</span> <span>to</span> <span>ot...,"[0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, ..."


Unnamed: 0,Input.label,Input.text,Answer.Q1Answer,Answer.html_output,ham_html_
261,0,DON'T waste your Time Money or Abuse your Tumm...,no,"<span>DON'T</span> <span class=""active"">waste<...","[0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, ..."
262,0,DON'T waste your Time Money or Abuse your Tumm...,no,<span>DON'T</span> <span>waste</span> <span>yo...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, ..."


Duplicate of 2 : 130
Duplicate of 3 : 144
Duplicate of 4 : 351


In [61]:
# Drop all text that has only 2 reviews:
df2 = df[~df['Input.text'].isin(dupli_2['index'])]

# Verify if no more 2-row duplicate exist
duplicates_2 = df2.groupby(df2['Input.text'].tolist(),as_index=False).size()
display(duplicates_2['size'].unique())
print('>> Expected not to have "2" after drop')

array([3, 4])

>> Expected not to have "2" after drop


In [66]:
duplicates_2['size'].unique() == [3]

array([ True, False])

In [58]:
# Drop 4th annotation
df3 = df2.groupby('Input.text').head(3).reset_index(drop=True)

# Check if no more duplicate of 4:
duplicates_3 = df3.groupby(df3['Input.text'].tolist(),as_index=False).size()
display(duplicates_3['size'].unique())

print('Check correctly remove number of duplicate of 4', len(df3) + len(text_4) == len(df2))

array([3])

Check correctly remove number of duplicate of 4 True


In [60]:
clean_df = df3

In [69]:
duplicates_3['size'].unique()[0] == 3

True