In [16]:
import os
import time

import openai
import pandas as pd
import tqdm

In [None]:
# Configure the API key and endpoint used by the `openai` client.
# The key is read from the DEEPSEEK_API_KEY environment variable so the real
# secret never has to be pasted into (and saved with) the notebook. The "####"
# placeholder is kept as a fallback so editing it in place still works.
openai.api_key = os.environ.get("DEEPSEEK_API_KEY", "####")
# Point the client at the DeepSeek endpoint instead of the default base URL.
openai.api_base = "https://api.deepseek.com/v1"

In [None]:
# Set the input path and load the source paragraphs into `df`.
# NOTE(review): the original note said only the first 100 rows are read, but
# read_csv is called without `nrows`, so the whole file is loaded — confirm
# which of the two is intended.
input_path = "~/original text.csv"
df = pd.read_csv(input_path)
# Optional: uncomment to skip the first 82 rows (presumably used to resume a
# partially completed run — confirm before relying on it).
# df = df.iloc[82:]

In [39]:
# Single "stepwise" prompt template, stored as a (prompt_id, template) pair.
# "{text}" is the placeholder that is filled with the paragraph via str.format.
# NOTE(review): the following cell assigns `prompts` again, so when the cells
# are run top to bottom this definition is overwritten before the generation
# loop uses it. The execution counts (In [39] here vs. In [35] there) show this
# cell was last run after that one.
prompts = [
("stepwise", '''Rephrase the following paragraph while preserving its meaning. Follow these steps:
1️:Split the paragraph into individual sentences.
2️:Rephrase each sentence naturally while keeping the overall flow.
3️:Combine the rephrased sentences into a coherent paragraph.

Paragraph: "{text}"
Rephrased version:''')
]

In [35]:
# Prompt templates as (prompt_id, template) pairs. "{text}" marks the spot where
# the paragraph is substituted via str.format.
# NOTE(review): the original heading announced five templates, but four are
# defined here; the "stepwise" template is assigned to `prompts` in a separate cell.
prompts = [
    (
        "rewrite_simple",
        "Rewrite the following paragraph:\n"
        'Paragraph: "{text}"\n'
        "Rewritten version:",
    ),
    (
        "how_would_you",
        "How would you rephrase this paragraph while preserving its original meaning?\n"
        'Paragraph: "{text}"\n'
        "Rephrased version:",
    ),
    (
        "rephrase_preserve",
        "Rephrase the following paragraph without changing the main content:\n"
        'Paragraph: "{text}"\n'
        "Rephrased version:",
    ),
    (
        "imagine_lm",
        "Imagine you are an advanced language model capable of rephrasing text "
        "while preserving its original meaning. If this were your paragraph, "
        "how would you naturally rephrase it?\n"
        "\n"
        'Paragraph: "{text}"\n'
        "Your rephrased version:",
    ),
]

In [41]:
# Run every prompt template against every paragraph and collect the outputs.
# `results` holds one dict per input row: {"original": ..., <prompt_id>: ...}.
# Assumes each row has a usable string in the 'paragraph' column — TODO confirm
# (a missing value would be formatted into the prompt as "nan").
results = []

# Transient failures (rate limit, timeout, dropped connection, server-side
# errors) are retried a few times before the error text is written into the
# dataset. Any other exception (bad key, invalid request, ...) is recorded
# immediately, exactly as before.
MAX_ATTEMPTS = 3
RETRY_BACKOFF_SECONDS = 2.0
TRANSIENT_ERRORS = (
    openai.error.RateLimitError,
    openai.error.Timeout,
    openai.error.APIConnectionError,
    openai.error.ServiceUnavailableError,
    openai.error.APIError,
)

for idx, row in df.iterrows():
    original = row['paragraph']
    output_row = {"original": original}

    for prompt_id, template in prompts:
        prompt = template.format(text=original)

        for attempt in range(1, MAX_ATTEMPTS + 1):
            try:
                response = openai.ChatCompletion.create(
                    model="deepseek-chat",
                    messages=[
                        {"role": "user", "content": prompt}
                    ],
                    temperature=0.7,
                    top_p=0.9,
                )
                raw_output = response["choices"][0]["message"]["content"]
                break
            except TRANSIENT_ERRORS as e:
                # Keep the latest error text so it is what gets stored if every
                # attempt fails; wait a little longer before each retry.
                raw_output = f"[ERROR] {e}"
                if attempt < MAX_ATTEMPTS:
                    time.sleep(RETRY_BACKOFF_SECONDS * attempt)
            except Exception as e:
                # Non-transient failure: retrying would not help, record it as data.
                raw_output = f"[ERROR] {e}"
                break

        output_row[prompt_id] = raw_output
        time.sleep(0.5)  # pause between requests; can be increased or removed

    results.append(output_row)
    # Progress marker: index label + 1, rendered in brackets by the list repr.
    print(f"{[idx+1]} ✅ Done")
    print(output_row)

[1] ✅ Done
{'original': 'When presented with these different realizations, the participant is asked to rate the pair on several qualities such as which is likely to be human generated, which follows the references well, which has good quality, and which exhibits good turn-to-turn coherency. For each of these the rater is asked to decide which in the pair showcases these qualities better. Note that the rater has the option of selecting both of them exhibit the quality of interest, or neither of them do. These were conducted in pairs to provide a frame of reference for the rater. We present the findings as paired results to account for grounding effects. Exact phrasings of these questions, several sample conversations, and details on our Turk setup can be found in Appendix A. We found inter-rater agreement in our studies about 75-80% of the time between 2 of the 3 users who judged samples, and about 10% of the time all 3 agreed unanimously. This is in light of 4 possible choices and 3 ra

In [None]:
# Write the collected results to CSV: the "original" column first, then one
# column per prompt id, in the order the templates appear in `prompts`.
col_order = ["original", *(entry[0] for entry in prompts)]
output_path = "~/deepseek_rephrased_prompt_5_format.csv"
pd.DataFrame(results).loc[:, col_order].to_csv(output_path, index=False)

print(f"\n✅ All done! Saved to: {output_path}")


✅ All done! Saved to: /Users/carina/Downloads/courses/paper/dataset for the paper/annotation/deepseek/deepseek_rephrased_prompt_5_format.csv


In [11]:

# Try every prompt template on a single sample paragraph and print each reply.
# `paragraph` is not assigned in any visible cell, so on a fresh kernel this
# cell failed with NameError; use the first input row as the sample.
paragraph = df["paragraph"].iloc[0]

for pid, template in prompts:
    full_prompt = template.format(text=paragraph)

    response = openai.ChatCompletion.create(
        model="deepseek-chat",
        messages=[
            {"role": "user", "content": full_prompt}
        ],
        temperature=0.7,
        top_p=0.9,
        max_tokens=512,  # upper bound on the reply length for this quick comparison
    )

    print(f"\n🟦 Prompt: {pid}")
    print(response["choices"][0]["message"]["content"])


🟦 Prompt: rewrite_simple
Here’s a refined version of your paragraph with improved clarity and flow:  

---  

Participants are presented with pairs of realizations and asked to evaluate them based on several criteria, such as perceived human origin, adherence to references, overall quality, and turn-to-turn coherence. For each criterion, raters must determine which of the two options performs better, though they may also indicate that both or neither meet the standard. This paired evaluation method ensures a consistent frame of reference and helps account for grounding effects. Additional details—including exact question phrasing, sample conversations, and the Mechanical Turk setup—are provided in Appendix A.  

In our study, inter-rater agreement was approximately 75–80% between two of the three evaluators, while unanimous agreement among all three occurred around 10% of the time. Given the four possible response choices and three raters, this level of consensus aligns with expectati

In [None]:
# Chat-style request: a system message sets the rephrasing role and the user
# message carries the paragraph itself (no instruction text around it).
chat_messages = [
    {
        "role": "system",
        "content": "You are a helpful assistant that rephrases English paragraphs.",
    },
    {
        "role": "user",
        "content": "When presented with these different realizations, the participant is asked to rate the pair on several qualities such as which is likely to be human generated, which follows the references well, which has good quality, and which exhibits good turn-to-turn coherency. For each of these the rater is asked to decide which in the pair showcases these qualities better. Note that the rater has the option of selecting both of them exhibit the quality of interest, or neither of them do. These were conducted in pairs to provide a frame of reference for the rater. We present the findings as paired results to account for grounding effects. Exact phrasings of these questions, several sample conversations, and details on our Turk setup can be found in Appendix A. We found inter-rater agreement in our studies about 75-80% of the time between 2 of the 3 users who judged samples, and about 10% of the time all 3 agreed unanimously. This is in light of 4 possible choices and 3 raters. It should be noted that our goal is not to make the distribution between model and human statistically different, but rather to make them as close as possible. We have taken several steps to assure the quality of our human evaluations as mentioned in the previous paragraph.Beyond that, any experiment with sufficient statistical power would need a prohibitively expensive number of samples per comparison.",
    },
]

response = openai.ChatCompletion.create(
    model="deepseek-chat",
    messages=chat_messages,
    temperature=0.7,
    top_p=0.9,
    max_tokens=512,
)

In [None]:
# Show the text of the first choice in the most recent `response`.
reply_message = response["choices"][0]["message"]
print(reply_message["content"])
