In [None]:
import pandas as pd
import numpy as np
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from datasets import load_dataset
import random
import os
from tqdm import tqdm

def set_seed(seed=42):
    """Seed every random number generator this script relies on.

    Seeds Python's ``random`` module, NumPy's global generator, and PyTorch
    (CPU plus all CUDA devices), then configures cuDNN for reproducible
    kernel selection.

    Args:
        seed: Integer seed applied to all generators. Defaults to 42.
    """
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # manual_seed_all seeds every GPU rather than only the current one, and
    # is silently ignored when CUDA is unavailable.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # Benchmark mode autotunes kernels and may pick different algorithms from
    # run to run, which defeats the deterministic flag above.
    torch.backends.cudnn.benchmark = False


set_seed()

# Run on the GPU when CUDA is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

# 1. Load human text (from Persuade corpus)
print("Loading Persuade corpus for human texts...")
persuade_dataset = load_dataset("persuade/persuade-corpus", split="train")
persuade_df = pd.DataFrame(persuade_dataset)

# Rows with generated == 0 are treated as human-written. Only the essay text
# is carried forward, and the label column is re-attached explicitly.
human_df = persuade_df.loc[persuade_df['generated'] == 0, ['text']].copy()
human_df['generated'] = 0

print(f"Number of human texts: {len(human_df)}")

# 2. Generate AI text using SlimPajama dataset as prompts
print("Loading SlimPajama dataset for prompts...")
# streaming=True: records are consumed one at a time by the loop below.
slimpajama_dataset = load_dataset("cerebras/SlimPajama-627B", split="train", streaming=True)
slimpajama_samples = []

# Sample texts from SlimPajama as prompts
num_samples = min(5000, len(human_df) * 2)
print(f"Sampling {num_samples} texts from SlimPajama...")

for sample in slimpajama_dataset:
    # Stop on the number of prompts actually collected, not the number of
    # records read: records too short to yield a prompt are skipped, so
    # counting reads would leave us with fewer prompts than announced.
    if len(slimpajama_samples) >= num_samples:
        break

    text = sample['text']
    sentences = text.split('.')
    if len(sentences) > 3:
        # Enough sentence breaks: use the first two sentences as the prompt.
        prompt = '.'.join(sentences[:2]) + '.'
    elif len(text) > 10:
        # Few sentence breaks but non-trivial text: use the first 100 characters.
        prompt = text[:100]
    else:
        # Too short to serve as a prompt.
        continue
    slimpajama_samples.append(prompt)

print(f"Sampling completed, obtained {len(slimpajama_samples)} text prompts")

# Load non-instruction-tuned models for text generation
print("Loading LLaMA 2 model...")
# The "-hf" repository holds the Transformers-format base checkpoint. The
# plain "meta-llama/Llama-2-7b" repository ships Meta's original checkpoint
# layout, which from_pretrained cannot load.
llama_model_name = "meta-llama/Llama-2-7b-hf"
llama_tokenizer = AutoTokenizer.from_pretrained(llama_model_name)
llama_model = AutoModelForCausalLM.from_pretrained(llama_model_name, torch_dtype=torch.float16).to(device)

print("Loading Mistral model...")
mistral_model_name = "mistralai/Mistral-7B-v0.1"
mistral_tokenizer = AutoTokenizer.from_pretrained(mistral_model_name)
mistral_model = AutoModelForCausalLM.from_pretrained(mistral_model_name, torch_dtype=torch.float16).to(device)

# Generate AI text
def generate_completion(prompt, model, tokenizer, max_length=200):
    """Sample a continuation of *prompt* and return prompt plus continuation.

    Args:
        prompt: Text the model should continue.
        model: Causal language model exposing ``generate`` and ``device``.
        tokenizer: Tokenizer paired with *model*. Its EOS token id is also
            used as the padding id during generation.
        max_length: Value forwarded to ``generate`` as ``max_length``.

    Returns:
        The prompt followed by the sampled continuation, as one string.
    """
    # Place the inputs on the model's own device instead of relying on a
    # module-level global.
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    input_ids = inputs["input_ids"]
    with torch.no_grad():
        outputs = model.generate(
            input_ids,
            # Forward the tokenizer's mask explicitly; pad and EOS share an id
            # here, so generate cannot reliably infer it.
            attention_mask=inputs.get("attention_mask"),
            max_length=max_length,
            temperature=0.8,
            top_p=0.95,
            do_sample=True,
            pad_token_id=tokenizer.eos_token_id,
        )

    generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    if generated_text.startswith(prompt):
        # The decoded text reproduces the prompt verbatim, so it already is
        # "prompt + completion".
        return generated_text

    # Decoding did not round-trip the prompt exactly, so a character offset of
    # len(prompt) would cut at the wrong place. Split at the token boundary.
    prompt_token_count = input_ids.shape[1]
    completion = tokenizer.decode(
        outputs[0][prompt_token_count:], skip_special_tokens=True
    )
    return prompt + completion

print("Starting AI text generation...")
ai_texts = []
llama_cutoff = len(slimpajama_samples) // 2
for i, prompt in enumerate(tqdm(slimpajama_samples)):
    # LLaMA 2 handles even-indexed prompts in the first half of the list;
    # every other prompt goes to Mistral.
    # NOTE(review): this gives roughly a 1:3 LLaMA:Mistral split — confirm
    # that is the intended ratio.
    use_llama = i % 2 == 0 and i < llama_cutoff
    try:
        if use_llama:
            generated_text = generate_completion(prompt, llama_model, llama_tokenizer)
            record = {"text": generated_text, "model": "llama2"}
        else:
            generated_text = generate_completion(prompt, mistral_model, mistral_tokenizer)
            record = {"text": generated_text, "model": "mistral"}
        ai_texts.append(record)

        if (i + 1) % 100 == 0:
            print(f"Generated {i+1}/{len(slimpajama_samples)} AI texts")
    except Exception as e:
        # Best effort: report the failure and move on to the next prompt.
        print(f"Error generating sample {i}: {e}")

# Create DataFrame with AI-generated texts
if not ai_texts:
    # Without this guard an empty result surfaces later as an opaque KeyError
    # on the 'text' column (or a ZeroDivisionError in the upsampling step).
    raise RuntimeError(
        "No AI texts were generated; cannot build the AI portion of the dataset."
    )

ai_df = pd.DataFrame(ai_texts)
ai_df = ai_df[['text']].copy()
ai_df['generated'] = 1

# Ensure the AI text count is at least 10x the human text count. When there
# are too few generations, upsample by repetition: tile the frame enough times
# to exceed the target, then draw exactly the target number of rows.
target_ai_count = 10 * len(human_df)
if len(ai_df) < target_ai_count:
    multiplier = target_ai_count // len(ai_df) + 1
    # ignore_index avoids carrying duplicated index labels into the tiled frame.
    ai_df = pd.concat([ai_df] * multiplier, ignore_index=True)
    ai_df = ai_df.sample(n=target_ai_count, random_state=42)

print(f"Number of AI-generated texts: {len(ai_df)}")

# 3. Combine datasets and shuffle
# Human rows are stacked on top of AI rows, then the row order is permuted
# with a fixed seed and the index renumbered.
combined_df = (
    pd.concat([human_df, ai_df], ignore_index=True)
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

# 4. Save the new training set
output_path = 'combined_training_data_large.csv'
combined_df.to_csv(output_path, index=False)
print(f"\nNew training data saved to: {output_path}")
print(f"Total samples: {len(combined_df)} (Human: {len(human_df)}, AI: {len(ai_df)})")

# 5. Save the raw generations: one row per generated text, together with the
# label of the model that produced it.
slimpajama_df = pd.DataFrame(data=ai_texts)
slimpajama_df.to_csv('slimpajama.csv', index=False)
print("SlimPajama dataset saved to: slimpajama.csv")

# 6. Create persuade_combined.csv (the original file with both human and AI texts)
# The AI portion is sampled from ai_df. Its size is the number of
# generated == 1 rows in the loaded Persuade frame, capped at len(ai_df).
persuade_ai_count = int((persuade_df['generated'] == 1).sum())
persuade_ai_size = min(persuade_ai_count, len(ai_df))

# Give persuade_df the bookkeeping columns if it does not already have them.
for column_name, default_value in (("prompt", ""), ("model", "human")):
    if column_name not in persuade_df.columns:
        persuade_df[column_name] = default_value

# Build the AI portion with the same bookkeeping columns.
persuade_ai_texts = ai_df.sample(n=persuade_ai_size, random_state=42).assign(
    prompt="",
    model="ai_generated",
)

# Keep only the columns the two frames have in common.
persuade_ai_cols = list(set(persuade_df.columns) & set(persuade_ai_texts.columns))
persuade_ai_texts = persuade_ai_texts[persuade_ai_cols]

# Any Persuade column the AI portion lacks is added and filled with None.
absent_columns = [
    name for name in persuade_df.columns if name not in persuade_ai_texts.columns
]
for absent in absent_columns:
    persuade_ai_texts[absent] = None

# Only the generated == 0 rows of the original Persuade frame are kept.
persuade_human = persuade_df.loc[persuade_df['generated'] == 0].copy()

# Stack both portions, shuffle with a fixed seed, and renumber the index.
persuade_combined = (
    pd.concat([persuade_human, persuade_ai_texts], ignore_index=True)
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

# Save persuade_combined
persuade_combined.to_csv('persuade_combined.csv', index=False)
print("Persuade combined dataset saved to: persuade_combined.csv")

# 7. Create test_essays.csv for evaluation
# Up to 200 rows are drawn from ai_df and given sequential "test_<n>" ids;
# only the id and text columns are written out.
# NOTE(review): these rows come from the same ai_df used to build the training
# set and are all labelled generated == 1 — confirm that is intended for an
# evaluation file.
test_size = min(200, len(ai_df))
test_essays = ai_df.sample(n=test_size, random_state=42)
test_essays = test_essays.assign(id=[f"test_{i}" for i in range(len(test_essays))])
test_essays = test_essays.loc[:, ['id', 'text']]
test_essays.to_csv('test_essays.csv', index=False)
print("Test essays file created: test_essays.csv")

print("\nAll datasets have been successfully generated.")