# Dataset Summarize From Feedback

This notebook cleans the `openai/summarize_from_feedback` comparisons data and reformats it to follow the MT-Bench question format (`question_id`, `category`, `turns`).

In [106]:
import pandas as pd
import plotly as ply
import numpy as np
import os
from tqdm import tqdm
import pickle

In [107]:
from datasets import load_dataset, load_dataset_builder

In [108]:
# Create the builder for the `comparisons` configuration so that its `info`
# metadata (description, feature schema) can be inspected.
ds_builder = load_dataset_builder(
    "openai/summarize_from_feedback",
    "comparisons",
)

In [109]:
# Display the description string stored in the builder's metadata.
ds_builder.info.description

'Summarize from Feedback contains the human feedback data released by the "Learning to summarize from human feedback" paper.\n'

In [110]:
# Display the feature schema: nested `info` and `summaries` fields plus the
# `choice`, `worker`, `batch`, `split` and `extra` columns.
ds_builder.info.features

{'info': {'id': Value(dtype='string', id=None),
  'post': Value(dtype='string', id=None),
  'title': Value(dtype='string', id=None),
  'subreddit': Value(dtype='string', id=None),
  'site': Value(dtype='string', id=None),
  'article': Value(dtype='string', id=None)},
 'summaries': [{'text': Value(dtype='string', id=None),
   'policy': Value(dtype='string', id=None),
   'note': Value(dtype='string', id=None)}],
 'choice': Value(dtype='int32', id=None),
 'worker': Value(dtype='string', id=None),
 'batch': Value(dtype='string', id=None),
 'split': Value(dtype='string', id=None),
 'extra': {'confidence': Value(dtype='int32', id=None)}}

In [111]:
# Load only the first 5000 rows of the `train` split of the `comparisons`
# configuration (the slice is expressed inside the split string).
train_data = load_dataset(
    'openai/summarize_from_feedback',
    "comparisons",
    split='train[0:5000]',
)

Found cached dataset summarize_from_feedback (/Users/johnwang/.cache/huggingface/datasets/openai___summarize_from_feedback/comparisons/0.0.0/483f970ceb55b926b0a087ef4f678ab1b089bc8174a107a452c6152e88af7ff0)


In [112]:
# Display the Dataset repr (feature names and row count) as a quick check.
train_data

Dataset({
    features: ['info', 'summaries', 'choice', 'worker', 'batch', 'split', 'extra'],
    num_rows: 5000
})

In [113]:
# Convert the loaded Dataset into a pandas DataFrame for column-wise cleaning.
df = train_data.to_pandas()

In [115]:
# Sanity check: shape of the rows whose `split` value is 'train'.
df.loc[df['split'].eq('train')].shape

(5000, 7)

In [116]:
# Drop the batch, split, extra and worker columns from the frame.
df = df.drop(columns=["batch", "split", "extra", "worker"])

In [119]:
# Lift the fields needed for the prompt out of the nested `info` dicts into
# flat columns, in the order post, title, subreddit.
for field in ('post', 'title', 'subreddit'):
    df[field] = df['info'].str.get(field)

In [121]:
# The needed `info` fields were already copied into flat columns, so the
# nested column can be dropped.
df = df.drop(columns="info")
# Assign ids positionally. Wrapping them in a `pd.Series` would align on index
# labels and silently yield NaN / shifted ids if `df` ever lost its default
# 0..n-1 index (e.g. after filtering rows).
df["question_id"] = range(len(df))
# A scalar broadcasts to every row: all questions share the same category.
df["category"] = "summary"
df.head()

Unnamed: 0,summaries,choice,post,title,subreddit,question_id,category
0,[{'text': ' Mum is mad at me for not flying on...,1,My boyfriend and I are long distance. We have ...,Mother [51] not speaking to me [21] because of...,relationships,0,summary
1,[{'text': ' I have made sure my mother is comf...,1,My boyfriend and I are long distance. We have ...,Mother [51] not speaking to me [21] because of...,relationships,1,summary
2,[{'text': ' mum isn't speaking to me because I...,0,My boyfriend and I are long distance. We have ...,Mother [51] not speaking to me [21] because of...,relationships,2,summary
3,[{'text': ' Mum thought I was going to road tr...,0,My boyfriend and I are long distance. We have ...,Mother [51] not speaking to me [21] because of...,relationships,3,summary
4,[{'text': ' My landlord is harassing me and my...,1,My landlord left a falsified message taped to ...,Can I sue my property management company and l...,AskReddit,4,summary
...,...,...,...,...,...,...,...
4995,"[{'text': ' I already feel ready to have kids,...",1,My partner (male) and I (female) are nearly 23...,Has anyone ever thought for years that they'd ...,AskReddit,4995,summary
4996,[{'text': ' 23 year old couple of nearly 6 yea...,0,My partner (male) and I (female) are nearly 23...,Has anyone ever thought for years that they'd ...,AskReddit,4996,summary
4997,[{'text': ' Has anyone ever thought for years ...,1,My partner (male) and I (female) are nearly 23...,Has anyone ever thought for years that they'd ...,AskReddit,4997,summary
4998,[{'text': ' Partner and I have decided not to ...,0,My partner (male) and I (female) are nearly 23...,Has anyone ever thought for years that they'd ...,AskReddit,4998,summary


In [122]:
# Prepend a field label to each text column. This rewrites the columns in
# place, so running the cell a second time would prepend the label again.
field_prefixes = {'title': "TITLE: ", 'post': "POST: ", 'subreddit': "SUBREDDIT: "}
for column, prefix in field_prefixes.items():
    df[column] = prefix + df[column]

In [123]:
# Join the labelled subreddit, title and post into one space-separated string
# per row.
df['turns'] = df['subreddit'].str.cat([df['title'], df['post']], sep=" ")

# Create Questions JSONL (sff_test.jsonl)

In [124]:
# Take an independent copy holding only the three fields of a question record,
# so later edits to `questions` do not touch `df`.
questions = df.loc[:, ['question_id', 'category', 'turns']].copy()
questions.head()

Unnamed: 0,question_id,category,turns
0,0,summary,SUBREDDIT: relationships TITLE: Mother [51] no...
1,1,summary,SUBREDDIT: relationships TITLE: Mother [51] no...
2,2,summary,SUBREDDIT: relationships TITLE: Mother [51] no...
3,3,summary,SUBREDDIT: relationships TITLE: Mother [51] no...
4,4,summary,SUBREDDIT: AskReddit TITLE: Can I sue my prope...


In [125]:
# Put the instruction in front of every prompt, then wrap each prompt in a
# one-element list so that `turns` holds a list per question.
questions['turns'] = (
    'Given the context of the specified subreddit and title, summarize the post. '
    + questions['turns']
).map(lambda prompt: [prompt])

In [130]:
# Serialise all question rows into a single JSON string in `records`
# orientation. Evaluate `q_json` in a cell to inspect it; the output is long.
q_json = questions.to_json(orient="records")

In [134]:
import json
# Write the questions as JSONL: one JSON object per line.
# The encoding is pinned so the file does not depend on the platform's default
# locale encoding; `json.dumps` escapes non-ASCII by default, so the bytes
# written are the same as before.
with open("sff_test.jsonl", "w", encoding="utf-8") as jsonl_file:
    jsonl_file.writelines(
        json.dumps(record) + '\n' for record in questions.to_dict('records')
    )