# Dataset Summarize From Feedback

Take care of all the data cleaning to get the formatting to follow MT-Bench standards.

In [1]:
import pandas as pd
import plotly as ply
import numpy as np
import os
from tqdm import tqdm
import pickle

In [2]:
from datasets import load_dataset, load_dataset_builder

In [3]:
ds_builder = load_dataset_builder("openai/summarize_from_feedback", "comparisons")

In [4]:
ds_builder.info.description

'Summarize from Feedback contains the human feedback data released by the "Learning to summarize from human feedback" paper.\n'

In [5]:
ds_builder.info.features

{'info': {'id': Value(dtype='string', id=None),
  'post': Value(dtype='string', id=None),
  'title': Value(dtype='string', id=None),
  'subreddit': Value(dtype='string', id=None),
  'site': Value(dtype='string', id=None),
  'article': Value(dtype='string', id=None)},
 'summaries': [{'text': Value(dtype='string', id=None),
   'policy': Value(dtype='string', id=None),
   'note': Value(dtype='string', id=None)}],
 'choice': Value(dtype='int32', id=None),
 'worker': Value(dtype='string', id=None),
 'batch': Value(dtype='string', id=None),
 'split': Value(dtype='string', id=None),
 'extra': {'confidence': Value(dtype='int32', id=None)}}

In [6]:
train_data = load_dataset('openai/summarize_from_feedback', "comparisons", split ='train[0:5000]')

Found cached dataset summarize_from_feedback (/Users/johnwang/.cache/huggingface/datasets/openai___summarize_from_feedback/comparisons/0.0.0/483f970ceb55b926b0a087ef4f678ab1b089bc8174a107a452c6152e88af7ff0)


In [7]:
train_data

Dataset({
    features: ['info', 'summaries', 'choice', 'worker', 'batch', 'split', 'extra'],
    num_rows: 5000
})

In [8]:
df = train_data.to_pandas()

In [9]:
df[df['split']=='train'].shape

(5000, 7)

In [10]:
df = df.drop(["batch", "split", "extra", "worker"], axis="columns")

In [11]:
df['post'] = df['info'].str['post']
df['title'] = df['info'].str['title']
df['subreddit'] = df['info'].str['subreddit']

In [12]:
df = df.drop("info", axis="columns")
df["question_id"] = pd.Series(range(df.shape[0]))
df["category"] = pd.Series((["summary"]*df.shape[0]))
df.head()

Unnamed: 0,summaries,choice,post,title,subreddit,question_id,category
0,[{'text': ' Mum is mad at me for not flying on...,1,My boyfriend and I are long distance. We have ...,Mother [51] not speaking to me [21] because of...,relationships,0,summary
1,[{'text': ' I have made sure my mother is comf...,1,My boyfriend and I are long distance. We have ...,Mother [51] not speaking to me [21] because of...,relationships,1,summary
2,[{'text': ' mum isn't speaking to me because I...,0,My boyfriend and I are long distance. We have ...,Mother [51] not speaking to me [21] because of...,relationships,2,summary
3,[{'text': ' Mum thought I was going to road tr...,0,My boyfriend and I are long distance. We have ...,Mother [51] not speaking to me [21] because of...,relationships,3,summary
4,[{'text': ' My landlord is harassing me and my...,1,My landlord left a falsified message taped to ...,Can I sue my property management company and l...,AskReddit,4,summary


In [13]:
df['title'] = "TITLE: " + df['title']
df['post'] = "POST: " + df['post']
df['subreddit'] = "SUBREDDIT: " + df['subreddit']

In [14]:
df['turns'] = df['subreddit'] + " " + df['title'] + " " + df['post']

In [15]:
df.head()

Unnamed: 0,summaries,choice,post,title,subreddit,question_id,category,turns
0,[{'text': ' Mum is mad at me for not flying on...,1,POST: My boyfriend and I are long distance. We...,TITLE: Mother [51] not speaking to me [21] bec...,SUBREDDIT: relationships,0,summary,SUBREDDIT: relationships TITLE: Mother [51] no...
1,[{'text': ' I have made sure my mother is comf...,1,POST: My boyfriend and I are long distance. We...,TITLE: Mother [51] not speaking to me [21] bec...,SUBREDDIT: relationships,1,summary,SUBREDDIT: relationships TITLE: Mother [51] no...
2,[{'text': ' mum isn't speaking to me because I...,0,POST: My boyfriend and I are long distance. We...,TITLE: Mother [51] not speaking to me [21] bec...,SUBREDDIT: relationships,2,summary,SUBREDDIT: relationships TITLE: Mother [51] no...
3,[{'text': ' Mum thought I was going to road tr...,0,POST: My boyfriend and I are long distance. We...,TITLE: Mother [51] not speaking to me [21] bec...,SUBREDDIT: relationships,3,summary,SUBREDDIT: relationships TITLE: Mother [51] no...
4,[{'text': ' My landlord is harassing me and my...,1,POST: My landlord left a falsified message tap...,TITLE: Can I sue my property management compan...,SUBREDDIT: AskReddit,4,summary,SUBREDDIT: AskReddit TITLE: Can I sue my prope...


In [22]:
answers = df[['question_id', 'summaries']]
answers['model0'] = df['summaries'].str[0].str['text']
answers['model1'] = df['summaries'].str[1].str['text']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  answers['model0'] = df['summaries'].str[0].str['text']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  answers['model1'] = df['summaries'].str[1].str['text']


In [24]:
model0 = answers[['question_id', 'model0']].rename(columns={'model0':'answer'})
model1= answers[['question_id', 'model1']].rename(columns={'model1':'answer'})
model0.head()

Unnamed: 0,question_id,answer
0,0,Mum is mad at me for not flying on my own tri...
1,1,I have made sure my mother is comfortable wit...
2,2,mum isn't speaking to me because I booked a f...
3,3,Mum thought I was going to road trip with my ...
4,4,My landlord is harassing me and my neighbours...


# Create Questions JSONL: (sff_questions.jsonl)

In [124]:
questions = df[['question_id', 'category', 'turns']].copy()
questions.head()

Unnamed: 0,question_id,category,turns
0,0,summary,SUBREDDIT: relationships TITLE: Mother [51] no...
1,1,summary,SUBREDDIT: relationships TITLE: Mother [51] no...
2,2,summary,SUBREDDIT: relationships TITLE: Mother [51] no...
3,3,summary,SUBREDDIT: relationships TITLE: Mother [51] no...
4,4,summary,SUBREDDIT: AskReddit TITLE: Can I sue my prope...


In [125]:
questions['turns'] = 'Given the context of the specified subreddit and title, summarize the post. ' + questions['turns']
questions['turns'] = questions['turns'].apply(lambda x: [x])

In [130]:
q_json = questions.to_json(orient='records')
#q_json

In [134]:
import json
with open("sff_test.jsonl", "w") as file:
    for item in questions.to_dict('records'):
        json_line = json.dumps(item)
        file.write(json_line + '\n')